run black
rspeer committed Feb 8, 2022
1 parent 91195c7 commit 538145c
Showing 6 changed files with 137 additions and 148 deletions.
95 changes: 41 additions & 54 deletions wordfreq/__init__.py
@@ -18,7 +18,7 @@


CACHE_SIZE = 100000
DATA_PATH = pathlib.Path(resource_filename('wordfreq', 'data'))
DATA_PATH = pathlib.Path(resource_filename("wordfreq", "data"))

# We'll divide the frequency by 10 for each token boundary that was inferred.
# (We determined the factor of 10 empirically by looking at words in the
@@ -75,44 +75,43 @@ def read_cBpack(filename):
['blue', 'red']
]
"""
with gzip.open(filename, 'rb') as infile:
with gzip.open(filename, "rb") as infile:
data = msgpack.load(infile, raw=False)
header = data[0]
if (
not isinstance(header, dict) or header.get('format') != 'cB'
or header.get('version') != 1
not isinstance(header, dict)
or header.get("format") != "cB"
or header.get("version") != 1
):
raise ValueError("Unexpected header: %r" % header)
return data[1:]


def available_languages(wordlist='best'):
def available_languages(wordlist="best"):
"""
Given a wordlist name, return a dictionary of language codes to filenames,
representing all the languages in which that wordlist is available.
"""
if wordlist == 'best':
available = available_languages('small')
available.update(available_languages('large'))
if wordlist == "best":
available = available_languages("small")
available.update(available_languages("large"))
return available
elif wordlist == 'combined':
logger.warning(
"The 'combined' wordlists have been renamed to 'small'."
)
wordlist = 'small'
elif wordlist == "combined":
logger.warning("The 'combined' wordlists have been renamed to 'small'.")
wordlist = "small"

available = {}
for path in DATA_PATH.glob('*.msgpack.gz'):
if not path.name.startswith('_'):
list_name = path.name.split('.')[0]
name, lang = list_name.split('_')
for path in DATA_PATH.glob("*.msgpack.gz"):
if not path.name.startswith("_"):
list_name = path.name.split(".")[0]
name, lang = list_name.split("_")
if name == wordlist:
available[lang] = str(path)
return available


@lru_cache(maxsize=None)
def get_frequency_list(lang, wordlist='best', match_cutoff=None):
def get_frequency_list(lang, wordlist="best", match_cutoff=None):
"""
Read the raw data from a wordlist file, returning it as a list of
lists. (See `read_cBpack` for what this represents.)
@@ -123,27 +122,20 @@ def get_frequency_list(lang, wordlist='best', match_cutoff=None):
Looking up the alternate code 'por' will also get the same list.
"""
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
available = available_languages(wordlist)

# TODO: decrease the maximum distance. This distance is so high just
# because it allows a test where 'yue' matches 'zh', and maybe the
# distance between those is high because they shouldn't match.
best, _distance = langcodes.closest_match(
lang, list(available), max_distance=70
)
if best == 'und':
raise LookupError("No wordlist %r available for language %r"
% (wordlist, lang))
best, _distance = langcodes.closest_match(lang, list(available), max_distance=70)
if best == "und":
raise LookupError("No wordlist %r available for language %r" % (wordlist, lang))

if best != lang:
logger.warning(
"You asked for word frequencies in language %r. Using the "
"nearest match, which is %r."
% (lang, best)
"nearest match, which is %r." % (lang, best)
)

return read_cBpack(available[best])
@@ -161,9 +153,7 @@ def cB_to_freq(cB):
In general, x cB represents a frequency of 10 ** (x/100).
"""
if cB > 0:
raise ValueError(
"A frequency cannot be a positive number of centibels."
)
raise ValueError("A frequency cannot be a positive number of centibels.")
return 10 ** (cB / 100)


@@ -192,7 +182,7 @@ def zipf_to_freq(zipf):
words. For example, a word that occurs once per million words is at 3.0 on
the Zipf scale.
"""
return 10 ** zipf / 1e9
return 10**zipf / 1e9


def freq_to_zipf(freq):
@@ -204,16 +194,13 @@ def freq_to_zipf(freq):


@lru_cache(maxsize=None)
def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
def get_frequency_dict(lang, wordlist="best", match_cutoff=None):
"""
Get a word frequency list as a dictionary, mapping tokens to
frequencies as floating-point probabilities.
"""
if match_cutoff is not None:
warnings.warn(
"The `match_cutoff` parameter is deprecated",
DeprecationWarning
)
warnings.warn("The `match_cutoff` parameter is deprecated", DeprecationWarning)
freqs = {}
pack = get_frequency_list(lang, wordlist)
for index, bucket in enumerate(pack):
@@ -223,7 +210,7 @@ def get_frequency_dict(lang, wordlist='best', match_cutoff=None):
return freqs


def iter_wordlist(lang, wordlist='best'):
def iter_wordlist(lang, wordlist="best"):
"""
Yield the words in a wordlist in approximate descending order of
frequency.
@@ -258,12 +245,12 @@ def _word_frequency(word, lang, wordlist, minimum):
# If any word is missing, just return the default value
return minimum
# spread the frequency of digits over all digit combinations
freq = freqs[token] / (10. ** digits)
freq = freqs[token] / (10.0**digits)
one_over_result += 1.0 / freq

freq = 1.0 / one_over_result

if get_language_info(lang)['tokenizer'] == 'jieba':
if get_language_info(lang)["tokenizer"] == "jieba":
# If we used the Jieba tokenizer, we could tokenize anything to match
# our wordlist, even nonsense. To counteract this, we multiply by a
# probability for each word break that was inferred.
@@ -272,14 +259,14 @@ def _word_frequency(word, lang, wordlist, minimum):
# All our frequency data is only precise to within 1% anyway, so round
# it to 3 significant digits
unrounded = max(freq, minimum)
if unrounded == 0.:
return 0.
if unrounded == 0.0:
return 0.0
else:
leading_zeroes = math.floor(-math.log(unrounded, 10))
return round(unrounded, leading_zeroes + 3)


def word_frequency(word, lang, wordlist='best', minimum=0.):
def word_frequency(word, lang, wordlist="best", minimum=0.0):
"""
Get the frequency of `word` in the language with code `lang`, from the
specified `wordlist`.
@@ -306,7 +293,7 @@ def word_frequency(word, lang, wordlist='best', minimum=0.):
return _wf_cache[args]


def zipf_frequency(word, lang, wordlist='best', minimum=0.):
def zipf_frequency(word, lang, wordlist="best", minimum=0.0):
"""
Get the frequency of `word`, in the language with code `lang`, on the Zipf
scale.
@@ -334,23 +321,24 @@ def zipf_frequency(word, lang, wordlist='best', minimum=0.):


@lru_cache(maxsize=100)
def top_n_list(lang, n, wordlist='best', ascii_only=False):
def top_n_list(lang, n, wordlist="best", ascii_only=False):
"""
Return a frequency list of length `n` in descending order of frequency.
This list contains words from `wordlist`, of the given language.
If `ascii_only`, then only ascii words are considered.
"""
results = []
for word in iter_wordlist(lang, wordlist):
if (not ascii_only) or max(word) <= '~':
if (not ascii_only) or max(word) <= "~":
results.append(word)
if len(results) >= n:
break
return results


def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
ascii_only=False):
def random_words(
lang="en", wordlist="best", nwords=5, bits_per_word=12, ascii_only=False
):
"""
Returns a string of random, space separated words.
@@ -364,18 +352,17 @@ def random_words(lang='en', wordlist='best', nwords=5, bits_per_word=12,
You can restrict the selection of words to those written in ASCII
characters by setting `ascii_only` to True.
"""
n_choices = 2 ** bits_per_word
n_choices = 2**bits_per_word
choices = top_n_list(lang, n_choices, wordlist, ascii_only=ascii_only)
if len(choices) < n_choices:
raise ValueError(
"There aren't enough words in the wordlist to provide %d bits of "
"entropy per word." % bits_per_word
)
return ' '.join([random.choice(choices) for i in range(nwords)])
return " ".join([random.choice(choices) for i in range(nwords)])


def random_ascii_words(lang='en', wordlist='best', nwords=5,
bits_per_word=12):
def random_ascii_words(lang="en", wordlist="best", nwords=5, bits_per_word=12):
"""
Returns a string of random, space separated, ASCII words.
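For reference (not part of this commit), here is a minimal sketch of the scale conversions whose docstrings appear in the diff above. It assumes the wordfreq package is installed; the asserted values follow directly from the formulas shown, not from measured data.

    # Sketch only -- illustrates the conversions shown in the diff above.
    import math
    from wordfreq import zipf_to_freq, freq_to_zipf, cB_to_freq, word_frequency, zipf_frequency

    assert math.isclose(zipf_to_freq(3.0), 1e-6)  # once per million words is Zipf 3.0
    assert math.isclose(freq_to_zipf(1e-6), 3.0)  # inverse mapping
    assert math.isclose(cB_to_freq(-200), 0.01)   # x cB means 10 ** (x / 100)

    freq = word_frequency("the", "en")   # floating-point probability, rounded to 3 significant digits
    zipf = zipf_frequency("the", "en")   # the same quantity on the Zipf scale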
14 changes: 9 additions & 5 deletions wordfreq/chinese.py
@@ -3,11 +3,13 @@
import msgpack
import gzip

DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh.txt')
ORIG_DICT_FILENAME = resource_filename('wordfreq', 'data/jieba_zh_orig.txt')
SIMP_MAP_FILENAME = resource_filename('wordfreq', 'data/_chinese_mapping.msgpack.gz')
DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh.txt")
ORIG_DICT_FILENAME = resource_filename("wordfreq", "data/jieba_zh_orig.txt")
SIMP_MAP_FILENAME = resource_filename("wordfreq", "data/_chinese_mapping.msgpack.gz")
try:
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False)
SIMPLIFIED_MAP = msgpack.load(
gzip.open(SIMP_MAP_FILENAME), raw=False, strict_map_key=False
)
except TypeError:
# work around incompatibility between pure-Python msgpack and C msgpack
SIMPLIFIED_MAP = msgpack.load(gzip.open(SIMP_MAP_FILENAME), raw=False)
@@ -58,6 +60,8 @@ def jieba_tokenize(text, external_wordlist=False):
# those spans from the original text, even if it's in Traditional
# Chinese
tokens = []
for _token, start, end in jieba_tokenizer.tokenize(simplify_chinese(text), HMM=False):
for _token, start, end in jieba_tokenizer.tokenize(
simplify_chinese(text), HMM=False
):
tokens.append(text[start:end])
return tokens
81 changes: 40 additions & 41 deletions wordfreq/language_info.py
@@ -12,20 +12,19 @@
# characters, are covered by the \p{IsIdeo} check. Checking for
# Script=Hani and IsIdeo slows down our regexes with huge, redundant
# classes of characters. Instead, we'll list the exceptions below.

'Hira', # Hiragana
'Kana', # Katakana
'Thai', # Thai script
'Khmr', # Khmer script
'Laoo', # Lao script
'Mymr', # Burmese script
'Tale', # Tai Le script
'Talu', # Tai Lü script
'Lana', # Lanna script
"Hira", # Hiragana
"Kana", # Katakana
"Thai", # Thai script
"Khmr", # Khmer script
"Laoo", # Lao script
"Mymr", # Burmese script
"Tale", # Tai Le script
"Talu", # Tai Lü script
"Lana", # Lanna script
]


EXTRA_JAPANESE_CHARACTERS = 'ー々〻〆'
EXTRA_JAPANESE_CHARACTERS = "ー々〻〆"

# ー is a lengthening mark that's both hiragana and katakana. Unicode
# segmentation handles it as a special case, but we're overriding standard
@@ -54,7 +53,7 @@ def _language_in_list(language, targets, max_distance=10):
objects. `targets` can be any iterable of such languages.
"""
matched = closest_match(language, targets, max_distance=max_distance)
return matched[0] != 'und'
return matched[0] != "und"


@lru_cache(maxsize=None)
@@ -131,42 +130,42 @@ def get_language_info(language):
# Start the `info` dictionary with default values, including the 'script'
# value that we now know from `language_full`.
info = {
'script': language_full.script,
'tokenizer': 'regex',
'normal_form': 'NFKC',
'remove_marks': False,
'dotless_i': False,
'diacritics_under': None,
'transliteration': None,
'lookup_transliteration': None
"script": language_full.script,
"tokenizer": "regex",
"normal_form": "NFKC",
"remove_marks": False,
"dotless_i": False,
"diacritics_under": None,
"transliteration": None,
"lookup_transliteration": None,
}

if _language_in_list(language, ['ja', 'ko']):
info['tokenizer'] = 'mecab'
elif _language_in_list(language, ['zh', 'yue']):
info['tokenizer'] = 'jieba'
elif info['script'] in SPACELESS_SCRIPTS:
info['tokenizer'] = None
if _language_in_list(language, ["ja", "ko"]):
info["tokenizer"] = "mecab"
elif _language_in_list(language, ["zh", "yue"]):
info["tokenizer"] = "jieba"
elif info["script"] in SPACELESS_SCRIPTS:
info["tokenizer"] = None

# Cased alphabetic scripts get NFC normal form
if info['script'] in ['Latn', 'Grek', 'Cyrl']:
info['normal_form'] = 'NFC'
if info["script"] in ["Latn", "Grek", "Cyrl"]:
info["normal_form"] = "NFC"

if info['script'] in ['Arab', 'Hebr']:
info['remove_marks'] = True
if info["script"] in ["Arab", "Hebr"]:
info["remove_marks"] = True

if _language_in_list(language, ['tr', 'az', 'kk']):
info['dotless_i'] = True
info['diacritics_under'] = 'cedillas'
elif _language_in_list(language, ['ro']):
info['diacritics_under'] = 'commas'
if _language_in_list(language, ["tr", "az", "kk"]):
info["dotless_i"] = True
info["diacritics_under"] = "cedillas"
elif _language_in_list(language, ["ro"]):
info["diacritics_under"] = "commas"

if _language_in_list(language, ['sr']):
info['transliteration'] = 'sr-Latn'
elif _language_in_list(language, ['az']):
info['transliteration'] = 'az-Latn'
if _language_in_list(language, ["sr"]):
info["transliteration"] = "sr-Latn"
elif _language_in_list(language, ["az"]):
info["transliteration"] = "az-Latn"

if language.language == 'zh' and language.script != 'Hant':
info['lookup_transliteration'] = 'zh-Hans'
if language.language == "zh" and language.script != "Hant":
info["lookup_transliteration"] = "zh-Hans"

return info
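For reference (not part of this commit), a sketch of what the rules above produce. It assumes get_language_info accepts a language code string (the elided top of the function standardizes it into a langcodes object); exact values depend on the installed langcodes data.

    # Sketch only -- expected shape of the result, inferred from the code above.
    from wordfreq.language_info import get_language_info

    info = get_language_info("ja")
    # Per the rules above, Japanese should route to the MeCab tokenizer and keep NFKC:
    # info["tokenizer"] == "mecab", info["normal_form"] == "NFKC"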