In [1]:
import re
import collections
import pandas as pd
import numpy as np
import nltk
from nltk.util import ngrams
from nltk.corpus import gutenberg

In [2]:
# Load the complete raw text of Lewis Carroll's "Alice in Wonderland" from NLTK's Gutenberg corpus.
# NOTE(review): presumably requires the corpus to be available locally (e.g. nltk.download('gutenberg')) — confirm.
text = gutenberg.raw(fileids='carroll-alice.txt')

In [4]:
def bi_grams_model(text):
    """Tokenise ``text`` and pair every token with the token that follows it.

    Parameters
    ----------
    text : str
        Raw text to split into tokens.

    Returns
    -------
    list of tuple
        ``(token, next_token)`` pairs, in the order they occur in ``text``.
    """
    tokens = nltk.word_tokenize(text)
    # ngrams() yields lazily; materialise the pairs so callers receive a list
    return list(ngrams(tokens, 2))

In [5]:
def frequency(text):
    """Build a bigram transition table for ``text``.

    The text is tokenised via ``bi_grams_model``. For every pair of adjacent
    tokens the number of co-occurrences is counted and divided by the total
    number of occurrences of the first token of the pair.

    Parameters
    ----------
    text : str
        Raw text to build the table from.

    Returns
    -------
    pandas.DataFrame
        Row labels are the preceding token, column labels the following
        token. Each cell holds ``count(previous, next) / count(previous)``;
        pairs that never occur hold 0.
    """
    bi_grams = bi_grams_model(text)

    # For every token, count how often each other token directly follows it.
    follower_counts = collections.defaultdict(collections.Counter)
    for previous_word, next_word in bi_grams:
        follower_counts[previous_word][next_word] += 1

    # Unigram counts. Every token except the very last one opens exactly one
    # bigram, so the counts can be recovered from the bigrams already in hand
    # instead of tokenising the whole text a second time.
    unigram_counts = collections.Counter(
        previous_word for previous_word, _ in bi_grams
    )
    if bi_grams:
        # account for the final token, which never opens a bigram
        unigram_counts[bi_grams[-1][1]] += 1
    else:
        # fewer than two tokens: there are no bigrams to derive counts from
        unigram_counts.update(nltk.word_tokenize(text))
    word_counts = pd.Series(unigram_counts)

    # Rows: preceding token, columns: following token, cells: raw pair counts.
    df = pd.DataFrame.from_dict(dict(follower_counts), orient='index')
    # Normalise each row by the unigram count of its token. div() aligns on the
    # row labels, so a token without any follower (one that only occurs as the
    # last token of the text) still ends up with a row; that row is all NaN.
    df = df.div(word_counts, axis=0)
    # Pairs that never occur become probability 0.
    return df.fillna(0)

In [6]:
def predict(word, df=None):
    """Predict the token most likely to follow ``word``.

    Parameters
    ----------
    word : str
        Token to look up; it must match a row label of the table exactly.
    df : pandas.DataFrame, optional
        Precomputed table as returned by ``frequency``. When omitted, the
        table is rebuilt from the module-level ``text`` on every call, so
        pass it in when predicting repeatedly.

    Returns
    -------
    str or None
        The follower with the highest probability (the first one in column
        order when several tie). ``None`` is returned, after printing a
        message, when ``word`` is not in the table or has no follower.
    """
    if df is None:
        df = frequency(text)

    if word not in df.index:
        print('Word not found')
        return None

    row = df.loc[word, :]
    followers = row[row != 0]
    if followers.empty:
        # A token can have a row but no follower (e.g. it only occurs as the
        # very last token of the text); taking max() of nothing used to raise
        # ValueError here.
        print('No following word found')
        return None

    # idxmax returns the label of the first maximum, i.e. the same follower
    # the manual "index of max" lookup selected.
    return followers.idxmax()

In [7]:
# Most probable token to follow 'Alice' according to the bigram table
predict('Alice')

','

##### As a check, let's look at which words most often follow "Alice"

In [8]:
# Build the bigram table and list the ten most probable tokens following 'Alice'
df = frequency(text)
df.loc['Alice'].sort_values(ascending = False).head(10)

,          0.197970
.          0.137056
was        0.043147
;          0.040609
thought    0.030457
could      0.027919
had        0.027919
said       0.027919
did        0.025381
's         0.022843
Name: Alice, dtype: float64

### Validating the model

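As a sanity check, compare the shape of the DataFrame with the number of unique tokens in the story:
the number of rows and the number of columns should both equal the number of unique tokens.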

In [9]:
# Dimensions of the bigram table as (rows, columns)
(frequency(text)).shape

(3185, 3185)

In [10]:
# Number of distinct tokens produced by the tokeniser, for comparison with the table's shape
x = np.array(nltk.word_tokenize(text))
len(np.unique(x))

3185