# EXERCISE

Using the Brown's "Adventure" category as your test set, compare it with the remaining part of the Brown dataset and check:
* Vocabulary size difference. X
* The intersection between the 100 most frequent words. X
* Compare the most common Bigrams, with different measures
* Do the same with trigrams

In [29]:
from nltk.corpus import brown

## Separate "adventures" category from all others 

In [30]:
other_categories = brown.categories()
adventure_categories =other_categories.pop(0)

## Retrieve file ids for every category 

In [31]:
adventure_file_ids = brown.fileids(adventure_categories)
other_categories_file_ids = []
for category in other_categories:
    for fids in brown.fileids(category):
        other_categories_file_ids.append(fids)


## For every file ids, get the words and put in a set, obtaining a vocabulary

In [32]:
adventure_vocabulary_words = set(brown.words(fileids=adventure_file_ids))
other_vocabulary_words = set()
for category_fids in other_categories_file_ids:
        other_vocabulary_words=other_vocabulary_words.union(set(brown.words(fileids=category_fids)))

In [33]:
print(f"adventure_vocabulary_words: {len(adventure_vocabulary_words)}")
print(f"other_vocabulary_words: {len(other_vocabulary_words)}")
print(f"Vocabulary size difference: {len(other_vocabulary_words)-len(adventure_vocabulary_words)}")

adventure_vocabulary_words: 8874
other_vocabulary_words: 54339
Vocabulary size difference: 45465


# Intersection between the 100 most frequent words

### Get the word for "adventures" and other categories

In [34]:
adventure_words = brown.words(adventure_file_ids)
other_categories_words = brown.words(other_categories_file_ids)

###  Filtering: remove puntuactions

In [35]:
import string

punctuations = set(string.punctuation)
punctuations.add('``')
punctuations.add('\'\'')

adventure_words = [w for w in adventure_words if w not in punctuations]
other_categories_words = [w for w in other_categories_words if w not in punctuations]

### Get frequency -> calculate intersection

In [36]:
from nltk.probability import FreqDist
CAP_FREQUENCY = 100

def brown_corpus_n_most_frequent_words(n,words):
    frequency_distribution = FreqDist(words)
    most_frequent_words = frequency_distribution.most_common(n)
    return [word for word,_ in most_frequent_words]

adventure_most_frequent_words = brown_corpus_n_most_frequent_words(CAP_FREQUENCY,adventure_words)
other_categories_most_frequent_words = brown_corpus_n_most_frequent_words(CAP_FREQUENCY,other_categories_words)

adv_other_frequent_words_intersection = [value for value in adventure_most_frequent_words \
                                        if value in other_categories_most_frequent_words]

print(adv_other_frequent_words_intersection)


['the', 'and', 'a', 'of', 'to', 'was', 'in', 'his', 'he', 'I', 'had', 'He', 'that', 'it', 'on', 'her', 'him', 'The', 'with', 'you', 'for', 'at', 'as', 'said', 'out', 'from', 'were', 'she', 'up', '--', 'me', 'they', 'this', 'but', 'would', 'be', 'into', 'not', 'my', 'all', 'man', 'one', 'an', 'their', 'them', 'could', 'by', 'It', 'like', 'have', 'there', 'been', 'time', 'when', 'no', 'But', 'about', 'over', 'or', 'so', 'what', 'which', 'then', 'only', 'is', 'do', 'who', 'if', 'now', 'we', 'did', 'more', 'before', 'two', 'A', 'made']


# Compare the most common Bigrams, with different measures

In [37]:
import nltk.collocations as collocations
import string

punctuations = list(string.punctuation)
BEST_N_GRAMS = 10

bigram_measures = collocations.BigramAssocMeasures()
adventure_bigrams = collocations.BigramCollocationFinder.from_words(adventure_words)
other_bigrams = collocations.BigramCollocationFinder.from_words(other_categories_words)

adventure_bigrams.apply_word_filter(lambda w: w.lower() in punctuations)
other_bigrams.apply_word_filter(lambda w: w.lower() in punctuations)


adventure_best_bmi = adventure_bigrams.nbest(bigram_measures.pmi,BEST_N_GRAMS)
other_best_bmi = other_bigrams.nbest(bigram_measures.pmi,BEST_N_GRAMS)

adventure_best_likelihood_ratio = adventure_bigrams.nbest(bigram_measures.likelihood_ratio,BEST_N_GRAMS)
other_best_likelihood_ratio = other_bigrams.nbest(bigram_measures.likelihood_ratio,BEST_N_GRAMS)

print("BMI COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_bmi[i]} | {other_best_bmi[i]}")

BMI COMPARISON
Adventure | Other
('1', 'Europeans') | ('$10,000-per-year', 'French-born')
("1890's", 'Led') | ('$148.50', '2-4')
('28', 'Attack') | ('$16', 'Participating')
('600', 'fathoms') | ('$2,461,000', 'Inventories')
('Aircraft', 'Identification') | ('$590,000', 'Apologies')
('Alice', 'Rheumatics') | ('$79.89', 'nothing-down')
("Allen's", 'three-room') | ('$8.50', 'tab')
('Anthropology', '6') | ("'low", 'nigras')
('Association', 'meeting') | ('0.5-mv./m.', '50-percent')
('Autos', 'whizzed') | ('0.78', 'mEq')


In [38]:
print("LIKELIHOOD COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_likelihood_ratio[i]} | {other_best_likelihood_ratio[i]}")

LIKELIHOOD COMPARISON
Adventure | Other
('of', 'the') | ('of', 'the')
('had', 'been') | ('in', 'the')
('in', 'the') | ('the', 'the')
('on', 'the') | ('United', 'States')
('did', 'not') | ('to', 'be')
('It', 'was') | ('on', 'the')
('into', 'the') | ('had', 'been')
('Miss', 'Langford') | ('New', 'York')
('he', 'had') | ('have', 'been')
('I', "don't") | ('has', 'been')


# The same with Trigrams

In [40]:
trigram_measures = collocations.TrigramAssocMeasures()
adventure_trigrams = collocations.TrigramCollocationFinder.from_words(adventure_words)
other_trigrams = collocations.TrigramCollocationFinder.from_words(other_categories_words)

adventure_trigrams.apply_word_filter(lambda w: w.lower() in punctuations)
other_trigrams.apply_word_filter(lambda w: w.lower() in punctuations)

adventure_best_bmi = adventure_trigrams.nbest(trigram_measures.pmi,BEST_N_GRAMS)
other_best_bmi = other_trigrams.nbest(trigram_measures.pmi,BEST_N_GRAMS)

adventure_best_likelihood_ratio = adventure_trigrams.nbest(trigram_measures.likelihood_ratio,BEST_N_GRAMS)
other_best_likelihood_ratio = other_trigrams.nbest(trigram_measures.likelihood_ratio,BEST_N_GRAMS)

print("BMI COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_bmi[i]} | {other_best_bmi[i]}")

BMI COMPARISON
Adventure | Other
("Allen's", 'three-room', 'tarpapered') | ('$10,000-per-year', 'French-born', 'maitre')
('Blessed', 'Saint', 'Nicholas') | ('060', 'Jean-Marie', 'LeClair')
('Cigarette', 'butts', 'littered') | ("2'", 'Amenitskii', 'Noskova')
('Gaston', 'Berche', 'crimsoning') | ('275-degrees-F', '135*0C.', 'Dryer')
("Graves'", 'imaginative', 'interpretation') | ('2:30-:36', 'Riverboat', 'Dalzell-Cousin')
('Inc.', "They'd", 'peddled') | ('2:31', 'Armbro', 'Comet')
('Jed', 'Hawkins', 'lives') | ('5-foot', '11-inch', 'headroom')
("Nicolas's", 'whereabouts', 'Packing') | ('871', '892', "Alcinous'")
('Oil', 'Gas', 'Company') | ('Aiding', 'Leukemia', 'Stricken')
('Raft', 'River', 'turnoff') | ('Alexei', 'Zhitkov', 'Lev')


In [41]:
print("LIKELIHOOD COMPARISON")
print("Adventure | Other")
for i in range(BEST_N_GRAMS):
    print(f"{adventure_best_likelihood_ratio[i]} | {other_best_likelihood_ratio[i]}")

LIKELIHOOD COMPARISON
Adventure | Other
('out', 'of', 'the') | ('of', 'the', 'and')
('he', 'had', 'been') | ('the', 'number', 'of')
('of', 'the', 'hall') | ('the', 'is', 'of')
('front', 'of', 'the') | ('the', 'first', 'of')
('of', 'the', 'house') | ('of', 'the', 'same')
('one', 'of', 'the') | ('the', 'most', 'of')
('end', 'of', 'the') | ('the', 'part', 'of')
('side', 'of', 'the') | ('the', 'kind', 'of')
('He', 'had', 'been') | ('the', 'world', 'of')
('of', 'the', 'street') | ('the', 'end', 'of')
