In [1]:
from bs4 import BeautifulSoup
from nltk.util import ngrams
from collections import defaultdict
from nltk import trigrams
from nltk.tokenize import RegexpTokenizer
import requests

In [2]:
# load text from blog
response = requests.get("https://maxsiollun.wordpress.com/great-speeches-in-nigerias-history/")

In [3]:
# parse html and take only paragraph contents
soup = BeautifulSoup(response.text,'html.parser')
sentence = soup.find_all('p',text=True)

In [4]:
print(len(sentence))

511


### Preprocess Text

In [5]:
# marge all string lines into single line string
note=''
for line in sentence[50:100]:
    note+=str(line)

In [6]:
# convert text to lower case
sentence=note.lower()

In [7]:
# convert sentence into tokens and extract all punctuations
tokenizer = RegexpTokenizer(r'\w+')
tk_sentence=tokenizer.tokenize(sentence)
tk_sentence

['p',
 'on',
 'this',
 'occasion',
 'i',
 'wish',
 'to',
 'place',
 'on',
 'record',
 'the',
 'nation',
 's',
 'gratitude',
 'to',
 'the',
 'organization',
 'of',
 'african',
 'unity',
 'for',
 'its',
 'splendid',
 'diplomatic',
 'and',
 'moral',
 'support',
 'for',
 'the',
 'federal',
 'cause',
 'i',
 'thank',
 'particularly',
 'the',
 'chairman',
 'of',
 'the',
 'consultative',
 'committee',
 'on',
 'nigeria',
 'his',
 'imperial',
 'majesty',
 'haile',
 'selassie',
 'i',
 'and',
 'the',
 'other',
 'members',
 'of',
 'the',
 'committee',
 'i',
 'also',
 'thank',
 'the',
 'president',
 'of',
 'the',
 'oau',
 'general',
 'assembly',
 'presidents',
 'mobutu',
 'boumedienne',
 'and',
 'ahidjo',
 'who',
 'presided',
 'over',
 'oau',
 'summit',
 'discussions',
 'of',
 'the',
 'nigerian',
 'crisis',
 'the',
 'enemies',
 'of',
 'africa',
 'were',
 'restrained',
 'by',
 'the',
 'demonstration',
 'of',
 'such',
 'solid',
 'support',
 'i',
 'thank',
 'the',
 'secretary',
 'general',
 'of',
 'the

In [8]:
# trigram
gram_sentence=list(ngrams(tk_sentence, 3))
gram_sentence

[('p', 'on', 'this'),
 ('on', 'this', 'occasion'),
 ('this', 'occasion', 'i'),
 ('occasion', 'i', 'wish'),
 ('i', 'wish', 'to'),
 ('wish', 'to', 'place'),
 ('to', 'place', 'on'),
 ('place', 'on', 'record'),
 ('on', 'record', 'the'),
 ('record', 'the', 'nation'),
 ('the', 'nation', 's'),
 ('nation', 's', 'gratitude'),
 ('s', 'gratitude', 'to'),
 ('gratitude', 'to', 'the'),
 ('to', 'the', 'organization'),
 ('the', 'organization', 'of'),
 ('organization', 'of', 'african'),
 ('of', 'african', 'unity'),
 ('african', 'unity', 'for'),
 ('unity', 'for', 'its'),
 ('for', 'its', 'splendid'),
 ('its', 'splendid', 'diplomatic'),
 ('splendid', 'diplomatic', 'and'),
 ('diplomatic', 'and', 'moral'),
 ('and', 'moral', 'support'),
 ('moral', 'support', 'for'),
 ('support', 'for', 'the'),
 ('for', 'the', 'federal'),
 ('the', 'federal', 'cause'),
 ('federal', 'cause', 'i'),
 ('cause', 'i', 'thank'),
 ('i', 'thank', 'particularly'),
 ('thank', 'particularly', 'the'),
 ('particularly', 'the', 'chairman'),


In [9]:
# Create Word Model
word_model = defaultdict(lambda: defaultdict(lambda: 0))

for sentence in tk_sentence:
    for first_word, second_word, word_label in trigrams(tk_sentence,pad_left=True,pad_right=True):
        word_model[(first_word, second_word)][word_label] += 1
dict(word_model)

{(None, None): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'p': 1559}),
 (None, 'p'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'on': 1559}),
 ('p', 'on'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'this': 1559}),
 ('on', 'this'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'occasion': 1559}),
 ('this',
  'occasion'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>, {'i': 1559}),
 ('occasion', 'i'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'wish': 1559}),
 ('i', 'wish'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'to': 1559}),
 ('wish', 'to'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'make': 1559, 'place': 1559}),
 ('to', 'place'): defaultdict(<function __main__.<lambda>.<locals>.<lambda>>,
             {'on': 1559}),
 ('place', 'on'): defaultdict(<function 

In [10]:
# convert the word occurance scores into probabilities
for words_train in word_model:
    total_count = float(sum(word_model[words_train].values()))
    for word_test in word_model[words_train]:
        word_model[words_train][word_test] /= total_count

In [11]:
dict(word_model['patience', 'sacrifice'])

{'loyalty': 1.0}

In [12]:
dict(word_model['us', 'as'])

{'we': 1.0}

In [13]:
dict(word_model['share', 'the'])

{'victory': 1.0}

In [14]:
dict(word_model['should', 'not'])

{'be': 1.0}