# EXERCISE

Using the Brown's "Adventure" category as your test set, compare it with the remaining part of the Brown dataset and check:
* Vocabulary size difference. X
* The intersection between the 100 most frequent words. X
* Compare the most common Bigrams, with different measures
* Do the same with trigrams

In [154]:
from nltk.corpus import brown

## Separate "adventures" category from all others 

In [155]:
other_categories = brown.categories()
adventure_categories =other_categories.pop(0)

## Retrieve file ids for every category 

In [156]:
adventure_file_ids = brown.fileids(adventure_categories)
other_categories_file_ids = []
for category in other_categories:
    for fids in brown.fileids(category):
        other_categories_file_ids.append(fids)


## For every file ids, get the words and put in a set, obtaining a vocabulary

In [157]:
adventure_vocabulary_words = set(brown.words(fileids=adventure_file_ids))
other_vocabulary_words = set()
for category_fids in other_categories_file_ids:
        other_vocabulary_words=other_vocabulary_words.union(set(brown.words(fileids=category_fids)))

In [158]:
print(f"adventure_vocabulary_words: {len(adventure_vocabulary_words)}")
print(f"other_vocabulary_words: {len(other_vocabulary_words)}")
print(f"Vocabulary size difference: {len(other_vocabulary_words)-len(adventure_vocabulary_words)}")

adventure_vocabulary_words: 8874
other_vocabulary_words: 54339
Vocabulary size difference: 45465


# Intersection between the 100 most frequent words

### Get the word for "adventures" and other categories

In [159]:
adventure_words = brown.words(adventure_file_ids)
other_categories_words = brown.words(other_categories_file_ids)

###  Filtering: Tokenization - Normalization - Stemming - StopWord 

In [160]:

import string
from nltk.tokenize import word_tokenize

adventure_tokens = []
other_tokens = []

for word in adventure_vocabulary_words:
    tokens = word_tokenize(word)
    for token in tokens:
        adventure_tokens.append(token)
for word in other_categories_words:
    tokens = word_tokenize(word)
    for token in tokens:
        other_tokens.append(token)


In [161]:
punctuations = set(string.punctuation)
punctuations.add('\'')
punctuations.add('``')

# convert to lower-case
normalized_adventure_tokens = [word.lower() for word in adventure_tokens]
normalized_other_tokens = [word.lower() for word in other_tokens]


normalized_adventure_tokens = [w.translate(str.maketrans ('', '', string.punctuation)) for w in normalized_adventure_tokens]
normalized_other_tokens = [w.translate(str.maketrans ('', '', string.punctuation)) for w in normalized_other_tokens]

normalized_adventure_tokens = [w for w in normalized_adventure_tokens if len(w) >0]
normalized_other_tokens = [w for w in normalized_other_tokens if len(w) >0]


In [162]:
from nltk.stem import PorterStemmer
ps =PorterStemmer()
stemmed_adventure_words = [ps.stem(w) for w in normalized_adventure_tokens]
stemmed_other_words = [ps.stem(w) for w in normalized_other_tokens]

In [163]:
for w in stemmed_adventure_words:
    print(w)

protest
recruit
driver
confirm
possibl
jingl
field
writer
penetr
cheek
mortal
exalt
sud
spur
orchard
awhil
mop
hope
hobbl
german
right
schaffner
coachman
slam
remnant
emerg
cover
press
shall
nearer
hit
tilt
savag
firebug
card
base
fade
poolequip
qualif
progress
craft
flag
pint
delphin
s
mph
suspect
lunch
anim
implic
somebodi
weari
cousin
starv
blind
haze
pierc
view
pin
casual
hoawhup
rider
taught
bomb
express
kick
includ
hellrais
weekli
vanish
bodi
appetit
habitu
produc
a
will
run
sheriff
s
selfsatisfact
stilt
nice
killin
dose
nimbl
thumb
famou
howev
airplan
push
emot
shred
fire
platform
kill
must
thought
tapdanc
stuf
lighter
fleischman
berri
voic
look
present
sit
pronounc
window
tacloban
reaction
payment
strongli
mari
pronounc
henchmen
topmost
even
peel
allow
string
along
cloy
first
roadsid
row
pass
fuzzi
easili
eye
drink
hotel
add
ship
rapidli
begin
rose
ramey
radii
shot
conceal
bring
sick
inquest
surg
shake
think
repeat
spectacl
reveal
perman
compuls
gray
swing
girl
hang
would
incoh

In [164]:
import nltk
nltk.download('stopwords')
ignored_words = nltk.corpus.stopwords.words('english')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Salvatore\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Get frequency -> calculate intersection

In [165]:
from nltk.probability import FreqDist
CAP_FREQUENCY = 100

def brown_corpus_n_most_frequent_words(n,words):
    frequency_distribution = FreqDist(words)
    most_frequent_words = frequency_distribution.most_common(n)
    return [word for word,_ in most_frequent_words]

adventure_most_frequent_words = brown_corpus_n_most_frequent_words(CAP_FREQUENCY,stemmed_adventure_words)
other_categories_most_frequent_words = brown_corpus_n_most_frequent_words(CAP_FREQUENCY,stemmed_other_words)

adv_other_frequent_words_intersection = [value for value in adventure_most_frequent_words \
                                        if value in other_categories_most_frequent_words]

print(adv_other_frequent_words_intersection)


['s', 'nt', 'you', 'we', 'they', 'he', 'it', 'she', 'what', 'use', 'work', 'other', 'that', 'i', 'like', 'have', 'do', 'who', 'there', 'will', 'man']


# Compare the most common Bigrams, with different measures

In [166]:
import nltk.collocations as collocations
import string

punctuations = list(string.punctuation)
BEST_N_GRAMS = 10

bigram_measures = collocations.BigramAssocMeasures()
adventure_bigrams = collocations.BigramCollocationFinder.from_words(stemmed_adventure_words)
other_bigrams = collocations.BigramCollocationFinder.from_words(stemmed_other_words)

adventure_bigrams.apply_word_filter(lambda w: w.lower() in ignored_words)
other_bigrams.apply_word_filter(lambda w: w.lower() in ignored_words)


adventure_best_bmi = adventure_bigrams.nbest(bigram_measures.pmi,BEST_N_GRAMS)
other_best_bmi = other_bigrams.nbest(bigram_measures.pmi,BEST_N_GRAMS)

adventure_best_likelihood_ratio = adventure_bigrams.nbest(bigram_measures.likelihood_ratio,BEST_N_GRAMS)
other_best_likelihood_ratio = other_bigrams.nbest(bigram_measures.likelihood_ratio,BEST_N_GRAMS)

print("BMI COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_bmi[i]} | {other_best_bmi[i]}")

BMI COMPARISON
Adventure | Other
('1100', 'europ') | ('05mvm', '50percent')
('125', 'hacksaw') | ('060', 'jeanmari')
('1895', 'unmolest') | ('10000peryear', 'frenchborn')
('275', 'cept') | ('1068', '1159')
('700', 'swayback') | ('11inch', 'headroom')
('730', 'garvier') | ('11shot', 'hammerless')
('80', 'fondli') | ('1257700', 'nonfarm')
('a26', 'took') | ('147000', 'gpd')
('aaawww', 'fieri') | ('150milliamper', 'flashlighttyp')
('aaron', 'erect') | ('1671', 'nakoma')


In [167]:
print("LIKELIHOOD COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_likelihood_ratio[i]} | {other_best_likelihood_ratio[i]}")

LIKELIHOOD COMPARISON
Adventure | Other
('think', 'meet') | ('unit', 'state')
('wan', 'na') | ('new', 'york')
('ai', 'nt') | ('per', 'cent')
('ca', 'nt') | ('ca', 'nt')
('1100', 'europ') | ('rhode', 'island')
('125', 'hacksaw') | ('year', 'ago')
('1895', 'unmolest') | ('wo', 'nt')
('275', 'cept') | ('du', 'pont')
('700', 'swayback') | ('could', 'nt')
('730', 'garvier') | ('lo', 'angel')


# The same with Trigrams

In [168]:
trigram_measures = collocations.TrigramAssocMeasures()
adventure_trigrams = collocations.TrigramCollocationFinder.from_words(stemmed_adventure_words)
other_trigrams = collocations.TrigramCollocationFinder.from_words(stemmed_other_words)

adventure_trigrams.apply_word_filter(lambda w: w.lower() in ignored_words)
other_trigrams.apply_word_filter(lambda w: w.lower() in ignored_words)



adventure_best_bmi = adventure_trigrams.nbest(trigram_measures.pmi,BEST_N_GRAMS)
other_best_bmi = other_trigrams.nbest(trigram_measures.pmi,BEST_N_GRAMS)

adventure_best_likelihood_ratio = adventure_trigrams.nbest(trigram_measures.likelihood_ratio,BEST_N_GRAMS)
other_best_likelihood_ratio = other_trigrams.nbest(trigram_measures.likelihood_ratio,BEST_N_GRAMS)

print("BMI COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_bmi[i]} | {other_best_bmi[i]}")

BMI COMPARISON
Adventure | Other
('1100', 'europ', 'druther') | ('060', 'jeanmari', 'leclair')
('1895', 'unmolest', 'porch') | ('23036', 'riverboat', 'dalzellcousin')
('700', 'swayback', 'envi') | ('5foot', '11inch', 'headroom')
('aaawww', 'fieri', 'sold') | ('871', '892', 'alcin')
('aaron', 'erect', 'kept') | ('aber', 'kein', 'meinung')
('abl', 'piti', 'doran') | ('albican', 'pseudomona', 'pyocanea')
('ablaz', 'paradox', 'haunt') | ('alexei', 'zhitkov', 'lev')
('absent', 'altogeth', 'gift') | ('amphetamin', 'benzedrin', 'dexedrin')
('access', 'spoon', 'brawl') | ('amt', 'aber', 'kein')
('adio', 'talkin', 'vers') | ('anabel', 'brieff', 'flutist')


In [169]:
print("LIKELIHOOD COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_likelihood_ratio[i]} | {other_best_likelihood_ratio[i]}")

LIKELIHOOD COMPARISON
Adventure | Other
('50', 'think', 'meet') | ('unit', 'state', 'depart')
('think', 'meet', 'chrissak') | ('unit', 'state', 'govern')
('think', 'meet', 'mud') | ('v', 'unit', 'state')
('upstair', 'wan', 'na') | ('unit', 'state', 'code')
('wan', 'na', 'homicid') | ('unit', 'state', 'navi')
('pursu', 'think', 'meet') | ('unit', 'state', 'sec')
('sack', 'wan', 'na') | ('unit', 'state', 'supra')
('wan', 'na', 'brannon') | ('unit', 'state', '348')
('agreeabl', 'ai', 'nt') | ('unit', 'state', 'suprem')
('ai', 'nt', 'railroad') | ('unit', 'state', 'tax')
