# Term frequency of formal English corpora

In [1]:
import csv
from pathlib import Path
from typing import AnyStr

import nltk
from nltk.corpus import brown
from nltk.corpus import gutenberg
from nltk.corpus import reuters
from nltk.corpus import wordnet as wn

In [2]:
import ssl

# If this Python build exposes ssl._create_unverified_context, install it as
# the factory for default HTTPS contexts before fetching the corpora.
# NOTE(review): this presumably disables certificate verification for every
# later HTTPS request made in this kernel, not only the NLTK downloads --
# confirm that is acceptable before reusing this cell elsewhere.
_create_unverified_https_context = getattr(ssl, "_create_unverified_context", None)
if _create_unverified_https_context is not None:
    ssl._create_default_https_context = _create_unverified_https_context

# Fetch each corpus used by this notebook, in the same order as before.
for corpus_id in ('wordnet', 'brown', 'reuters', 'gutenberg'):
    nltk.download(corpus_id)

# Display the directories NLTK searches for data.
nltk.data.path

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\gutenberg.zip.


['C:\\Users\\USER/nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\share\\nltk_data',
 'S:\\dev\\seahrh\\nlp-examples\\env\\lib\\nltk_data',
 'C:\\Users\\USER\\AppData\\Roaming\\nltk_data',
 'C:\\nltk_data',
 'D:\\nltk_data',
 'E:\\nltk_data']

In [3]:
def to_str(bytes_or_str: AnyStr, encoding="utf-8") -> str:
    """Return ``bytes_or_str`` as a ``str``.

    Follows Effective Python Item 3 (know the difference between bytes,
    str and unicode).

    Args:
        bytes_or_str: The value to normalise.
        encoding: Codec used to decode ``bytes`` input; defaults to UTF-8.

    Returns:
        The decoded text when given ``bytes``; otherwise the argument itself,
        unchanged.

    Raises:
        UnicodeDecodeError: If ``bytes`` input is not valid in ``encoding``.
    """
    return (
        bytes_or_str.decode(encoding)
        if isinstance(bytes_or_str, bytes)
        else bytes_or_str
    )

In [4]:
%%time
# Gather every entry returned by each corpus reader's .words(), normalised to
# a lower-cased str, into one flat list (WordNet, Brown, Reuters, Gutenberg --
# in that order).
words = []
for corpus in (wn, brown, reuters, gutenberg):
    words.extend(to_str(token).lower() for token in corpus.words())
print(f"{len(words):,} words")

5,651,012 words
Wall time: 13 s


In [5]:
%%time
# Count how many times each lower-cased token occurs across the combined corpora.
fd = nltk.FreqDist(words)
# len(fd) is the number of distinct tokens, not the total token count printed
# by the previous cell.
print(f"{len(fd):,} words")

202,023 words
Wall time: 2.99 s


In [6]:
# Preview the 50 most frequent tokens with their counts. Tokens are not
# filtered, so punctuation and numerals appear alongside words.
fd.most_common(50)

[(',', 316785),
 ('the', 272831),
 ('.', 217779),
 ('and', 149943),
 ('of', 144458),
 ('to', 110615),
 ('in', 84171),
 ('a', 82259),
 (':', 50900),
 ('that', 46932),
 ('for', 42798),
 ('it', 42168),
 (';', 41657),
 ('he', 40621),
 ('said', 36774),
 ('i', 36166),
 ('was', 34346),
 ('is', 34212),
 ("'", 31462),
 ('with', 31067),
 ('his', 29071),
 ('be', 28850),
 ('not', 26373),
 ('as', 26357),
 ('s', 25599),
 ('on', 24582),
 ('-', 22633),
 ('"', 22238),
 ('but', 21927),
 ('from', 21663),
 ('at', 21564),
 ('by', 20920),
 ('you', 19873),
 ('they', 19319),
 ('mln', 18623),
 ('had', 18425),
 ('all', 18021),
 ('this', 17576),
 ('have', 17166),
 ('which', 16002),
 ('him', 15703),
 ('will', 15566),
 ('her', 14598),
 ('vs', 14529),
 ('are', 14464),
 ('or', 13780),
 ('1', 13397),
 ('were', 13247),
 ('an', 13195),
 ('one', 12474)]

In [7]:
# Spot-check the relative frequency (count / total tokens) of a few probe
# words; the first two are informal abbreviations, the last an everyday noun.
ws = ["wtf", "omg", "dog"]
print("\n".join(f"{w}\t\t{fd.freq(w)}" for w in ws))

wtf		0.0
omg		0.0
dog		3.3976215233660804e-05


In [8]:
# Persist the complete frequency table as a tab-separated file: a header row,
# then one row per distinct token, ordered from most to least frequent.
out_path = Path("output/formal_en.tsv")
# Create the target directory up front so a fresh checkout does not fail
# with FileNotFoundError when the file is opened.
out_path.parent.mkdir(parents=True, exist_ok=True)
# newline="" leaves line endings to the csv writer, as the csv module
# documentation requires; without it, text mode on Windows rewrites the
# "\n" terminator as "\r\n".
with out_path.open("w", encoding="utf-8", newline="") as f:
    writer = csv.writer(f, delimiter='\t', lineterminator='\n')
    writer.writerow(["Word", "Count"])
    writer.writerows(fd.most_common())