In [37]:
import docx
import nltk
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.probability import FreqDist

Preparation

In [38]:
doc=docx.Document('datasets/Brexit.docx')

def get_doc_text(doc):
    doc_paragraphs = []

    for para in doc.paragraphs:
        doc_paragraphs.append(para.text)

    doc_text = ' '.join(doc_paragraphs)
    return doc_text

brexit_text = get_doc_text(doc)

Question 1:

In [39]:
def GetNGrams(str, n):
    tokens = word_tokenize(str)
    return ngrams(tokens, n)

print(list(GetNGrams(brexit_text, 2)))
print(list(GetNGrams(brexit_text, 3)))
print(list(GetNGrams(brexit_text, 4)))

[('Brexit', 'is'), ('is', 'the'), ('the', 'impending'), ('impending', 'withdrawal'), ('withdrawal', 'of'), ('of', 'the'), ('the', 'United'), ('United', 'Kingdom'), ('Kingdom', '('), ('(', 'UK'), ('UK', ')'), (')', 'from'), ('from', 'the'), ('the', 'European'), ('European', 'Union'), ('Union', '('), ('(', 'EU'), ('EU', ')'), (')', '.'), ('.', 'In'), ('In', 'a'), ('a', 'referendum'), ('referendum', 'on'), ('on', '23'), ('23', 'June'), ('June', '2016'), ('2016', ','), (',', 'a'), ('a', 'majority'), ('majority', 'of'), ('of', 'British'), ('British', 'voters'), ('voters', 'supported'), ('supported', 'leaving'), ('leaving', 'the'), ('the', 'EU'), ('EU', '.'), ('.', 'On'), ('On', '29'), ('29', 'March'), ('March', '2017'), ('2017', ','), (',', 'the'), ('the', 'UK'), ('UK', 'government'), ('government', 'invoked'), ('invoked', 'Article'), ('Article', '50'), ('50', 'of'), ('of', 'the'), ('the', 'Treaty'), ('Treaty', 'on'), ('on', 'European'), ('European', 'Union'), ('Union', '.'), ('.', 'The'), 

In [40]:
# nltk.download('averaged_perceptron_tagger')

# from collections import Counter
# pos_counts = Counter(tag for word,tag in brexit_pos)
# print("Brexit Total POS:")
# print(dict(pos_counts.items()))

Question 2:

In [41]:
fdist_noun = FreqDist()

def NounsCount(txt):
    nouns_count = 0

    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    for word, tag in brexit_pos:
        if tag in ['NN', 'NNP', 'NNS', 'NNPS']:
            nouns_count = nouns_count + 1
            fdist_noun[word.upper()] += 1
    
    return nouns_count

print("Nouns Count: ", NounsCount(brexit_text))

Nouns Count:  195


In [42]:
def PronounsCount(txt):
    pronouns_count = 0

    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    for word, tag in brexit_pos:
        if tag in ['PRP', 'PRP$', 'WP', 'WP$']:
            pronouns_count = pronouns_count + 1
    
    return pronouns_count

print("Pronouns Count: ", PronounsCount(brexit_text))

Pronouns Count:  6


In [43]:
def AdjectivesCount(txt):
    adjective_count = 0

    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    for word, tag in brexit_pos:
        if tag in ['JJ', 'JJS', 'JJR']:
            adjective_count = adjective_count + 1
    
    return adjective_count

print("Adjectives Count: ", AdjectivesCount(brexit_text))

Adjectives Count:  41


In [44]:
def VerbsCount(txt):
    verbs_count = 0

    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    for word, tag in brexit_pos:
        if tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']:
            verbs_count = verbs_count + 1
    
    return verbs_count

print("Verbs Count: ", VerbsCount(brexit_text))

Verbs Count:  67


In [45]:
def AdverbsCount(txt):
    adverbs_count = 0

    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    for word, tag in brexit_pos:
        if tag in ['RB', 'RBS', 'RBR']:
            adverbs_count = adverbs_count + 1
    
    return adverbs_count

print("Adverbs Count: ", AdverbsCount(brexit_text))

Adverbs Count:  7


In [46]:
#nltk.download('maxent_ne_chunker')
#nltk.download('words')
#%pip install svgling

Question 3:

In [47]:
# GPE ORGANIZATION PERSON
fdist_geopolitical = FreqDist()
fdist_person = FreqDist()

def GeoPoliticalCount(txt):
    count = 0
    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    ner = nltk.ne_chunk(brexit_pos)
    for chunk in ner:
        if type(chunk) == nltk.tree.Tree:
            if chunk.label() == 'GPE':
                count = count + 1
                fdist_geopolitical[chunk.leaves()[0][0].upper()] += 1
    return count    

def PersonsCount(txt):
    count = 0
    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    ner = nltk.ne_chunk(brexit_pos)
    for chunk in ner:
        if type(chunk) == nltk.tree.Tree:
            if chunk.label() == 'PERSON':
                count = count + 1
                fdist_person[chunk.leaves()[0][0].upper()] += 1
    return count 

def OrganizationsCount(txt):
    count = 0
    brexit_pos = nltk.pos_tag(word_tokenize(txt))
    ner = nltk.ne_chunk(brexit_pos)
    for chunk in ner:
        if type(chunk) == nltk.tree.Tree:
            if chunk.label() == 'ORGANIZATION':
                count = count + 1
    return count        

print("GeoPoliticalCount: ", GeoPoliticalCount(brexit_text))
print("PersonsCount: ", PersonsCount(brexit_text))
print("OrganizationsCount: ", OrganizationsCount(brexit_text))

GeoPoliticalCount:  19
PersonsCount:  6
OrganizationsCount:  29


Question 4:

In [48]:
print("Most frequent Noun: ", fdist_noun.most_common(5))
print("Most frequent GeoPolitical Entity: ", fdist_geopolitical.most_common(5))
print("Most frequent Person: ", fdist_person.most_common(5))

Most frequent Noun:  [('UK', 13), ('EU', 8), ('UNION', 7), ('BREXIT', 6), ('EUROPEAN', 6)]
Most frequent GeoPolitical Entity:  [('EU', 8), ('EUROPEAN', 4), ('BREXIT', 2), ('BRITISH', 1), ('EURATOM', 1)]
Most frequent Person:  [('THERESA', 2), ('BREXIT', 2), ('DAVID', 1), ('CAMERON', 1)]


In [50]:
fdist_bigrams = FreqDist(GetNGrams(brexit_text, 2))
print("Most frequent BiGram: ", fdist_bigrams.most_common(5))

Most frequent BiGram:  [((',', 'the'), 7), (('the', 'UK'), 6), (('the', 'European'), 5), (('European', 'Union'), 5), (('the', 'EU'), 5)]
