# Advanced Task #5 - Collocations

# Step 1. Import tokens from both documents

In [395]:
import io

In [396]:
# doc name - import 
doc1name = "hamlet"
doc2name = "macbeth"

In [397]:
f1name = doc1name+"_tokens_processed.txt"
f2name = doc2name+"_tokens_processed.txt"

In [398]:
f1in = open("../data/"+f1name)
tokens1 = f1in.readlines()
tokens1 = [t.strip() for t in tokens1]
len(tokens1)

15457

In [399]:
tokens1[0:10]

['tragedie',
 'hamlet',
 'william',
 'shakespeare',
 '1599',
 'actus',
 'primus',
 'scoena',
 'prima',
 'enter']

In [400]:
f2in = open("../data/"+f2name)
tokens2 = f2in.readlines()
tokens2 = [t.strip() for t in tokens2]
len(tokens2)

10087

In [401]:
tokens2[0:10]

['tragedie',
 'macbeth',
 'william',
 'shakespeare',
 '1603',
 'actus',
 'primus',
 'scoena',
 'prima',
 'thunder']

# Step 2. Collocations - Simple Fxn

In [57]:
from nltk.collocations import *

In [402]:
# simple collocation fxn: highest frequency collocations for doc1
text1 = nltk.Text(tokens1)
text1.collocations()

good lord; rosincrance guildensterne; haue seene; exeunt enter; haue
heard; set downe; enter polonius; enter king; dost thou; thou hast;
sit downe; behinde arras; father death; thou art; good friend; horatio
marcellus; command ment; close consequence; christian buriall; mine
owne


In [404]:
# simple collocation fxn: highest frequency collocations for doc1
text2 = nltk.Text(tokens2)
text2.collocations()

exeunt scena; thane cawdor; knock knock; enter macbeth; scena secunda;
ten thousand; thou art; three witch; mine eye; malcolme donalbaine;
scena prima; good lord; burne cauldron; weyward sister; fire burne;
cauldron bubble; drum colour; mine owne; lord macb; worthy thane


# Step 3. Collocations - Bigrams

- https://realpython.com/nltk-nlp-python/#finding-collocations
- https://www.nltk.org/howto/collocations.html

In [418]:
# choose frequency of collocations to filter
filter_freq = 4

In [419]:
# advanced collocations
bigram_measures = nltk.collocations.BigramAssocMeasures()
# trigram_measures = nltk.collocations.TrigramAssocMeasures()

### Calcuate bigrams for doc1

In [420]:
finder1 = BigramCollocationFinder.from_words(text1)
finder1

<nltk.collocations.BigramCollocationFinder at 0x1bdd8a07e50>

In [421]:
# filter for collocations occurring n+ times
finder1.apply_freq_filter(filter_freq)

In [422]:
# finder1_nbest = finder1.nbest(bigram_measures.pmi, 1000)
finder1_nbest = finder1.nbest(bigram_measures.raw_freq, 1000)
finder1_nbest

[('good', 'lord'),
 ('enter', 'king'),
 ('wee', 'l'),
 ('haue', 'seene'),
 ('lord', 'hamlet'),
 ('enter', 'hamlet'),
 ('exeunt', 'enter'),
 ('hor', 'lord'),
 ('haue', 'heard'),
 ('lord', 'haue'),
 ('ophe', 'lord'),
 ('st', 'thou'),
 ('thou', 'hast'),
 ('enter', 'polonius'),
 ('father', 'death'),
 ('good', 'friend'),
 ('king', 'queene'),
 ('lord', 'polon'),
 ('come', 'againe'),
 ('dost', 'thou'),
 ('ile', 'haue'),
 ('let', 'see'),
 ('rosincrance', 'guildensterne'),
 ('set', 'downe'),
 ('thou', 'art'),
 ('would', 'haue'),
 ('king', 'haue'),
 ('king', 'oh'),
 ('let', 'come'),
 ('let', 'know'),
 ('mine', 'owne'),
 ('well', 'lord'),
 ('come', 'come'),
 ('e', 'ene'),
 ('enter', 'ghost'),
 ('enter', 'horatio'),
 ('hamlet', 'hamlet'),
 ('heauen', 'earth'),
 ('horatio', 'marcellus'),
 ('let', 'go'),
 ('lord', 'exeunt'),
 ('lord', 'king'),
 ('reynol', 'lord'),
 ('rosin', 'lord'),
 ('shall', 'heare'),
 ('sit', 'downe'),
 ('thy', 'selfe'),
 ('ti', 'true'),
 ('wilt', 'thou'),
 ('come', 'hither'),
 

In [423]:
len(finder1_nbest)

89

In [435]:
# frequency distribution of ngrams in doc1
finder1_fd = sorted(finder1.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
finder1_fd[0:10]

[(('good', 'lord'), 23),
 (('enter', 'king'), 15),
 (('wee', 'l'), 13),
 (('haue', 'seene'), 12),
 (('lord', 'hamlet'), 11),
 (('enter', 'hamlet'), 10),
 (('exeunt', 'enter'), 10),
 (('hor', 'lord'), 10),
 (('haue', 'heard'), 9),
 (('lord', 'haue'), 9)]

### Repeat these steps for doc2

In [425]:
# repeat for doc2
finder2 = BigramCollocationFinder.from_words(text2)
finder2.apply_freq_filter(filter_freq)
# finder2_nbest = finder2.nbest(bigram_measures.pmi, 1000)
finder2_nbest = finder2.nbest(bigram_measures.raw_freq, 1000)
finder2_nbest

[('enter', 'macbeth'),
 ('exeunt', 'scena'),
 ('thane', 'cawdor'),
 ('knock', 'knock'),
 ('lord', 'macb'),
 ('st', 'thou'),
 ('thou', 'art'),
 ('good', 'lord'),
 ('haue', 'done'),
 ('macb', 'haue'),
 ('enter', 'lady'),
 ('wee', 'l'),
 ('macb', 'thou'),
 ('macbeth', 'macb'),
 ('mine', 'eye'),
 ('would', 'st'),
 ('enter', 'malcolme'),
 ('enter', 'three'),
 ('euery', 'one'),
 ('macb', 'ile'),
 ('mine', 'owne'),
 ('scena', 'secunda'),
 ('three', 'witch'),
 ('thy', 'selfe'),
 ('worthy', 'thane'),
 ('would', 'haue'),
 ('1', 'murth'),
 ('alarum', 'enter'),
 ('borne', 'woman'),
 ('come', 'come'),
 ('dy', 'de'),
 ('enter', 'banquo'),
 ('enter', 'king'),
 ('enter', 'macduffe'),
 ('enter', 'rosse'),
 ('haile', 'king'),
 ('haile', 'macbeth'),
 ('hath', 'made'),
 ('haue', 'seene'),
 ('macb', 'bring'),
 ('macbeth', 'macbeth'),
 ('malcolme', 'donalbaine'),
 ('old', 'man'),
 ('rosse', 'angus'),
 ('scena', 'prima'),
 ('see', 'thee'),
 ('sir', 'macb'),
 ('ten', 'thousand'),
 ('tertia', 'enter'),
 ('thy'

In [426]:
len(finder2_nbest)

53

In [434]:
# frequencies of ngrams in doc1
finder2_fd = sorted(finder2.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
finder2_fd[0:10]

[(('enter', 'macbeth'), 18),
 (('exeunt', 'scena'), 15),
 (('thane', 'cawdor'), 13),
 (('knock', 'knock'), 10),
 (('lord', 'macb'), 10),
 (('st', 'thou'), 9),
 (('thou', 'art'), 9),
 (('good', 'lord'), 8),
 (('haue', 'done'), 8),
 (('macb', 'haue'), 8),
 (('enter', 'lady'), 7),
 (('wee', 'l'), 7),
 (('macb', 'thou'), 6),
 (('macbeth', 'macb'), 6),
 (('mine', 'eye'), 6),
 (('would', 'st'), 6),
 (('enter', 'malcolme'), 5),
 (('enter', 'three'), 5),
 (('euery', 'one'), 5),
 (('macb', 'ile'), 5),
 (('mine', 'owne'), 5),
 (('scena', 'secunda'), 5),
 (('three', 'witch'), 5),
 (('thy', 'selfe'), 5),
 (('worthy', 'thane'), 5),
 (('would', 'haue'), 5),
 (('1', 'murth'), 4),
 (('alarum', 'enter'), 4),
 (('borne', 'woman'), 4),
 (('come', 'come'), 4),
 (('dy', 'de'), 4),
 (('enter', 'banquo'), 4),
 (('enter', 'king'), 4),
 (('enter', 'macduffe'), 4),
 (('enter', 'rosse'), 4),
 (('haile', 'king'), 4),
 (('haile', 'macbeth'), 4),
 (('hath', 'made'), 4),
 (('haue', 'seene'), 4),
 (('macb', 'bring'),

### Calculate shared bigrams in the two docs. 

In [428]:
# find overlapping collocations in the two docs
overlapping_bigrams = [ngram for ngram in finder1_nbest if ngram in finder2_nbest]

# overlapping collocations between the two docs!

# if this list is empty, go back up to the line...
# filter_freq = 4
# and choose a smaller number, such as 2 or 3

overlapping_bigrams

[('good', 'lord'),
 ('enter', 'king'),
 ('wee', 'l'),
 ('haue', 'seene'),
 ('st', 'thou'),
 ('thou', 'art'),
 ('would', 'haue'),
 ('mine', 'owne'),
 ('come', 'come'),
 ('thy', 'selfe'),
 ('hath', 'made'),
 ('haue', 'done'),
 ('would', 'st')]

### Note that the frequency of shared bigrams is not necessarily the same in the two docs. 

In [429]:
# find doc1 collocations also appearing in doc2
doc1_bigrams_overlap = [ngram for ngram in finder1_fd if ngram[0] in finder2_nbest]

In [430]:
# overlapping collocations between the two docs - with freq dist from doc1

# if this list is empty, go back up to the code: filter_freq = 4
# and choose a smaller number, such as 3 or 2

doc1_bigrams_overlap

[(('good', 'lord'), 23),
 (('enter', 'king'), 15),
 (('wee', 'l'), 13),
 (('haue', 'seene'), 12),
 (('st', 'thou'), 9),
 (('thou', 'art'), 7),
 (('would', 'haue'), 7),
 (('mine', 'owne'), 6),
 (('come', 'come'), 5),
 (('thy', 'selfe'), 5),
 (('hath', 'made'), 4),
 (('haue', 'done'), 4),
 (('would', 'st'), 4)]

In [431]:
# find doc2 collocations also appearing in doc1
doc2_bigrams_overlap = [ngram for ngram in finder2_fd if ngram[0] in finder1_nbest]

In [432]:
# overlapping collocations between the two docs - with freq dist from doc2
doc2_bigrams_overlap

[(('st', 'thou'), 9),
 (('thou', 'art'), 9),
 (('good', 'lord'), 8),
 (('haue', 'done'), 8),
 (('wee', 'l'), 7),
 (('would', 'st'), 6),
 (('mine', 'owne'), 5),
 (('thy', 'selfe'), 5),
 (('would', 'haue'), 5),
 (('come', 'come'), 4),
 (('enter', 'king'), 4),
 (('hath', 'made'), 4),
 (('haue', 'seene'), 4)]

# Step 4. Collocations - Trigrams

- https://realpython.com/nltk-nlp-python/#finding-collocations
- https://www.nltk.org/howto/collocations.html

In [436]:
# choose frequency of collocations to filter
filter_freq = 2

In [437]:
# trigrams
trigram_measures = nltk.collocations.TrigramAssocMeasures()

In [438]:
finder1 = TrigramCollocationFinder.from_words(text1)
finder1

<nltk.collocations.TrigramCollocationFinder at 0x1bdd892c3d0>

In [439]:
# filter for collocations occurring n+ times
finder1.apply_freq_filter(filter_freq)

In [440]:
# finder1_nbest = finder1.nbest(trigram_measures.pmi, 1000)
finder1_nbest = finder1.nbest(trigram_measures.raw_freq, 1000)
finder1_nbest

[('enter', 'king', 'queene'),
 ('exeunt', 'enter', 'hamlet'),
 ('would', 'st', 'thou'),
 ('enter', 'hamlet', 'horatio'),
 ('enter', 'polonius', 'pol'),
 ('exeunt', 'manet', 'hamlet'),
 ('good', 'lord', 'polon'),
 ('hor', 'good', 'lord'),
 ('lord', 'exeunt', 'enter'),
 ('reynol', 'good', 'lord'),
 ('sit', 'downe', 'let'),
 ('(', 'say', ')'),
 ('buried', 'christian', 'buriall'),
 ('charge', 'thee', 'speake'),
 ('christian', 'buriall', 'clo'),
 ('clay', 'made', 'guest'),
 ('comicall', 'historicall', 'pastorall'),
 ('deere', 'brother', 'death'),
 ('dost', 'thou', 'heare'),
 ('dye', 'sleepe', 'sleepe'),
 ('enter', 'enter', 'queene'),
 ('enter', 'horatio', 'marcellus'),
 ('enter', 'king', 'king'),
 ('ere', 'go', 'bed'),
 ('exeunt', 'enter', 'horatio'),
 ('exeunt', 'scena', 'secunda'),
 ('father', 'much', 'offended'),
 ('follow', 'exeunt', 'enter'),
 ('giue', 'cup', 'let'),
 ('go', 'exeunt', 'enter'),
 ('god', 'blesse', 'sir'),
 ('god', 'buy', 'ye'),
 ('goe', 'ile', 'follow'),
 ('good', 'frie

In [441]:
len(finder1_nbest)

88

In [443]:
# frequencies of ngrams in doc1
finder1_fd = sorted(finder1.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
finder1_fd[0:10]

[(('enter', 'king', 'queene'), 7),
 (('exeunt', 'enter', 'hamlet'), 5),
 (('would', 'st', 'thou'), 4),
 (('enter', 'hamlet', 'horatio'), 3),
 (('enter', 'polonius', 'pol'), 3),
 (('exeunt', 'manet', 'hamlet'), 3),
 (('good', 'lord', 'polon'), 3),
 (('hor', 'good', 'lord'), 3),
 (('lord', 'exeunt', 'enter'), 3),
 (('reynol', 'good', 'lord'), 3)]

In [444]:
# repeat for doc2
finder2 = TrigramCollocationFinder.from_words(text2)
finder2.apply_freq_filter(filter_freq)
finder2_nbest = finder2.nbest(trigram_measures.raw_freq, 1000)
finder2_nbest

[('knock', 'knock', 'knock'),
 ('enter', 'macbeth', 'macb'),
 ('enter', 'three', 'witch'),
 ('exeunt', 'scena', 'secunda'),
 ('good', 'lord', 'macb'),
 ('burne', 'cauldron', 'bubble'),
 ('enter', 'malcolme', 'seyward'),
 ('exeunt', 'scena', 'quarta'),
 ('exeunt', 'scena', 'tertia'),
 ('fire', 'burne', 'cauldron'),
 ('lord', 'macb', 'haue'),
 ('scena', 'prima', 'enter'),
 ('scena', 'secunda', 'enter'),
 ('thou', 'speak', 'st'),
 ('three', 'witch', '1'),
 ('thunder', 'enter', 'three'),
 ('trouble', 'fire', 'burne'),
 ('appar', 'macbeth', 'macbeth'),
 ('ban', 'good', 'lord'),
 ('byrnane', 'wood', 'come'),
 ('cauldron', 'bubble', '2'),
 ('child', 'shall', 'king'),
 ('colour', 'enter', 'malcolme'),
 ('come', 'come', 'come'),
 ('come', 'night', 'lady'),
 ('doe', 'ile', 'doe'),
 ('double', 'double', 'toyle'),
 ('double', 'toyle', 'trouble'),
 ('drum', 'colour', 'enter'),
 ('enter', 'lady', 'lady'),
 ('enter', 'macbeth', 'banquo'),
 ('enter', 'macbeth', 'lady'),
 ('enter', 'macduffe', 'macd'),

In [445]:
len(finder2_nbest)

76

In [447]:
# frequencies of ngrams in doc2
finder2_fd = sorted(finder2.ngram_fd.items(), key=lambda t: (-t[1], t[0]))
finder2_fd[0:10]

[(('knock', 'knock', 'knock'), 6),
 (('enter', 'macbeth', 'macb'), 5),
 (('enter', 'three', 'witch'), 4),
 (('exeunt', 'scena', 'secunda'), 4),
 (('good', 'lord', 'macb'), 4),
 (('burne', 'cauldron', 'bubble'), 3),
 (('enter', 'malcolme', 'seyward'), 3),
 (('exeunt', 'scena', 'quarta'), 3),
 (('exeunt', 'scena', 'tertia'), 3),
 (('fire', 'burne', 'cauldron'), 3)]

### Calculated shared trigrams in the two docs. 

In [448]:
# find overlapping collocations in the two docs
overlapping_trigrams = [ngram for ngram in finder1_nbest if ngram in finder2_nbest]

In [449]:
# overlapping collocations between the two docs!

# if this list is null, go back up to the line...
# filter_freq = 2
# and choose a smaller number, such as 2 or 3

overlapping_trigrams

[('would', 'st', 'thou'),
 ('exeunt', 'scena', 'secunda'),
 ('scena', 'secunda', 'enter')]

### Note that the frequency of overlapping trigrams is not the same for each document 

In [264]:
# find doc1 collocations also appearing in doc2
doc1_trigrams_overlap = [ngram for ngram in finder1_fd if ngram[0] in finder2_nbest]

In [265]:
# overlapping collocations between the two docs - with freq dist from doc1

# if this list is empty, go back up to the code: filter_freq = 4
# and choose a smaller number, such as 3 or 2

doc1_trigrams_overlap

[(('would', 'st', 'thou'), 4),
 (('exeunt', 'scena', 'secunda'), 2),
 (('scena', 'secunda', 'enter'), 2)]

In [266]:
# find doc2 collocations also appearing in doc1
doc2_bigrams_overlap = [ngram for ngram in finder2_fd if ngram[0] in finder1_nbest]

In [267]:
# overlapping collocations between the two docs - with freq dist from doc2
doc2_bigrams_overlap

[(('exeunt', 'scena', 'secunda'), 4),
 (('scena', 'secunda', 'enter'), 3),
 (('would', 'st', 'thou'), 2)]

# Step 5. Collocations containing a keyword

In [473]:
# choose a keyword relevant to your documents

### in both documents
# keyword = "king"
# keyword = "lord"
# keyword = "thou"

### in doc1
keyword = "hamlet"
# keyword = "father"

### in doc2
# keyword = "macb"
# keyword = "macbeth"
# keyword = "lady"

In [474]:
# set the minimum ngram frequency
freq_min = 1

### Bigrams containing a keyword

In [475]:
# bigrams in doc1
finder1 = BigramCollocationFinder.from_words(text1)
finder1.apply_freq_filter(freq_min)
finder1_nbest = finder1.nbest(bigram_measures.raw_freq, 1000)
finder1_keyword = [ngram for ngram in finder1_nbest if keyword in ngram]
finder1_keyword

[('lord', 'hamlet'),
 ('enter', 'hamlet'),
 ('hamlet', 'hamlet'),
 ('hamlet', 'come'),
 ('king', 'hamlet'),
 ('hamlet', 'hor'),
 ('hamlet', 'horatio'),
 ('hamlet', 'oh'),
 ('hamlet', 'thou'),
 ('manet', 'hamlet'),
 ('cause', 'hamlet'),
 ('cosin', 'hamlet'),
 ('drinke', 'hamlet'),
 ('exit', 'hamlet'),
 ('good', 'hamlet'),
 ('hamlet', 'giue'),
 ('hamlet', 'good'),
 ('hamlet', 'mother'),
 ('hamlet', 'polonius'),
 ('hamlet', 'prince'),
 ('madnesse', 'hamlet'),
 ('oh', 'hamlet'),
 ('tragedie', 'hamlet'),
 ('&', 'hamlet'),
 ('accord', 'hamlet'),
 ('adue', 'hamlet')]

In [470]:
# bigrams in doc2
finder2 = BigramCollocationFinder.from_words(text2)
finder2.apply_freq_filter(freq_min)
finder2_nbest = finder2.nbest(bigram_measures.raw_freq, 1000)
finder2_keyword = [ngram for ngram in finder2_nbest if keyword in ngram]
finder2_keyword

[('enter', 'macbeth'),
 ('macbeth', 'macb'),
 ('haile', 'macbeth'),
 ('macbeth', 'macbeth'),
 ('macbeth', 'banquo'),
 ('macbeth', 'haile'),
 ('macbeth', 'shall'),
 ('appar', 'macbeth'),
 ('macbeth', 'hath'),
 ('macbeth', 'lady'),
 ('macbeth', 'lenox'),
 ('macbeth', 'macd'),
 ('tragedie', 'macbeth'),
 ('3', 'macbeth'),
 ('banquo', 'macbeth')]

In [463]:
ngrams_overlapping_keyword = [ngram for ngram in finder1_keyword if ngram in finder2_keyword]
ngrams_overlapping_keyword

[]

### Trigrams containing a keyword

In [471]:
# trigrams in doc1
finder1 = TrigramCollocationFinder.from_words(text1)
finder1.apply_freq_filter(freq_min)
finder1_nbest = finder1.nbest(trigram_measures.raw_freq, 1000)
finder1_keyword = [ngram for ngram in finder1_nbest if keyword in ngram]
finder1_keyword

[]

In [472]:
# trigrams in doc2
finder2 = TrigramCollocationFinder.from_words(text2)
finder2.apply_freq_filter(freq_min)
finder2_nbest = finder2.nbest(trigram_measures.raw_freq, 1000)
finder2_keyword = [ngram for ngram in finder2_nbest if keyword in ngram]
finder2_keyword

[('enter', 'macbeth', 'macb'),
 ('appar', 'macbeth', 'macbeth'),
 ('enter', 'macbeth', 'banquo'),
 ('enter', 'macbeth', 'lady'),
 ('haile', 'macbeth', 'haile'),
 ('macbeth', 'haile', 'thee'),
 ('macbeth', 'macbeth', 'macbeth'),
 ('quinta', 'enter', 'macbeth'),
 ('1', 'appar', 'macbeth'),
 ('1', 'banquo', 'macbeth'),
 ('1', 'haile', 'macbeth'),
 ('1', 'lesser', 'macbeth'),
 ('2', 'appar', 'macbeth'),
 ('2', 'haile', 'macbeth'),
 ('3', 'haile', 'macbeth'),
 ('3', 'macbeth', '2'),
 ('3', 'meet', 'macbeth'),
 ('alarum', 'enter', 'macbeth'),
 ('awake', 'exeunt', 'macbeth'),
 ('banq', 'worthy', 'macbeth'),
 ('banquo', 'macbeth', 'haile'),
 ('banquo', 'sits', 'macbeth'),
 ('better', 'macbeth', 'one'),
 ('blacke', 'macbeth', 'seeme')]

In [458]:
ngrams_overlapping_keyword = [ngram for ngram in finder1_keyword if ngram in finder2_keyword]
ngrams_overlapping_keyword

[]

# Step 6. Report Results in a Word Document

- ### top bigrams in doc1 and doc2

    1. Top 10 (highest frequency) bigrams in doc1
    2. Top 10 (highest frequency) bigrams in doc2
    3. Overlapping bigrams in doc1 and doc2
    
- ### top trigrams in doc1 and doc2
    
    4. Top 10 (highest frequency) trigrams in doc1
    5. Top 10 (highest frequency) trigrams in doc2
    6. Overlapping trigrams in doc1 and doc2
    
- ### shared keyword
    7. Overlapping bigrams in doc1 and doc2 containing a shared keyword
    8. Overlapping trigrams in doc1 and doc2 containing a shared keyword
    
- ### keyword unique to doc1
    9. Bigrams in doc1 containing a keyword unique to doc1
    10. Trigrams in doc1 containing a keyword unique to doc1
    
- ### keyword unique to doc2
    11. Bigrams in doc2 containing a keyword unique to doc2
    12. Trigrams in doc2 containing a keyword unique to doc2
