## Term Frequency Demo

In [31]:
import nltk 
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [32]:
stop_words = set(stopwords.words('english'))
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [33]:
# Read the Corpus
nltk.download('gutenberg')
words = nltk.Text(nltk.corpus.gutenberg.words('bryant-stories.txt'))


In [50]:
print(words[:500])

['[', 'Stories', 'to', 'Tell', 'to', 'Children', 'by', 'Sara', 'Cone', 'Bryant', '1918', ']', 'TWO', 'LITTLE', 'RIDDLES', 'IN', 'RHYME', 'There', "'", 's', 'a', 'garden', 'that', 'I', 'ken', ',', 'Full', 'of', 'little', 'gentlemen', ';', 'Little', 'caps', 'of', 'blue', 'they', 'wear', ',', 'And', 'green', 'ribbons', ',', 'very', 'fair', '.', '(', 'Flax', '.)', 'From', 'house', 'to', 'house', 'he', 'goes', ',', 'A', 'messenger', 'small', 'and', 'slight', ',', 'And', 'whether', 'it', 'rains', 'or', 'snows', ',', 'He', 'sleeps', 'outside', 'in', 'the', 'night', '.', '(', 'The', 'path', '.)', 'THE', 'LITTLE', 'YELLOW', 'TULIP', 'Once', 'there', 'was', 'a', 'little', 'yellow', 'Tulip', ',', 'and', 'she', 'lived', 'down', 'in', 'a', 'little', 'dark', 'house', 'under', 'the', 'ground', '.', 'One', 'day', 'she', 'was', 'sitting', 'there', ',', 'all', 'by', 'herself', ',', 'and', 'it', 'was', 'very', 'still', '.', 'Suddenly', ',', 'she', 'heard', 'a', 'little', '_tap', ',', 'tap', ',', 'tap_', 

In [8]:
# Convert to small letters
words = [word.lower() for word in words if word.isalpha()]
words = [word.lower() for word in words if word not in stop_words]

In [9]:
fDist = FreqDist(words)

In [10]:
print(len(words))

21718


In [11]:
print(len(set(words)))

3688


In [15]:
for x, v in fDist.most_common(10):
    print('{:<10}{:<10}'.format(x, v))

little    597       
said      453       
came      191       
one       183       
could     158       
king      141       
went      122       
would     112       
great     110       
day       107       


In [16]:
for x, v in fDist.most_common(10):
    print('{:<10}{:<10}'.format(x, v/len(fDist)))

little    0.1618763557483731
said      0.12283080260303687
came      0.05178958785249458
one       0.04962039045553145
could     0.042841648590021694
king      0.038232104121475055
went      0.03308026030368764
would     0.03036876355748373
great     0.02982646420824295
day       0.02901301518438178


In [17]:
fDist

FreqDist({'little': 597, 'said': 453, 'came': 191, 'one': 183, 'could': 158, 'king': 141, 'went': 122, 'would': 112, 'great': 110, 'day': 107, ...})

In [18]:
fDist['little']

597

In [19]:
fDist.items()



In [20]:
fDist.keys()



In [22]:
fDist.get('yes')

28

In [27]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
sent = 'This is an example sentence'
fdist = FreqDist()
for word in word_tokenize(sent):
    fdist[word.lower()] += 1

In [28]:
fdist

FreqDist({'this': 1, 'is': 1, 'an': 1, 'example': 1, 'sentence': 1})

In [30]:
# An equivalent way to do this is with the initializer:

fdist = FreqDist(word.lower() for word in word_tokenize(sent)) 
fdist

FreqDist({'this': 1, 'is': 1, 'an': 1, 'example': 1, 'sentence': 1})

### Add

In [35]:
#Add
FreqDist('abbb') + FreqDist('bcc')

FreqDist({'b': 4, 'c': 2, 'a': 1})

### Finding frequency of Common Alphabets/Words in two strings

In [36]:
# Intersection
FreqDist('abbb') & FreqDist('bcc')

FreqDist({'b': 1})

In [39]:
FreqDist('hello boy how are you') & FreqDist('hello girl who are you')

FreqDist({'o': 4, ' ': 4, 'h': 2, 'e': 2, 'l': 2, 'y': 1, 'w': 1, 'a': 1, 'r': 1, 'u': 1})

In [40]:
FreqDist('hello boy how are you'.split()) & FreqDist('hello hello girl who are you'.split())

FreqDist({'hello': 1, 'are': 1, 'you': 1})

### Union

In [41]:
# Union
FreqDist('abbb') | FreqDist('bcc')

FreqDist({'b': 3, 'c': 2, 'a': 1})

### Substract

In [42]:
# Substract
FreqDist('abbbc') - FreqDist('bccd')

FreqDist({'b': 2, 'a': 1})