#### Steps:
* Use N-gram model
* Get the vocabulary with frequency count ( remove words with min_freq = 2 or 3 ...)
* Get the sentence Probability
* For target sentence word prediction make probaility matrix.
* Predict

### Task

```python
corpus = '''I like cats. This dog is like a mouse.'''

user_entered_sent = 'I like'

suggesstion = []```

After running the algo
<br/>
```python
suggesstion = [('word1', 0.1), ('word2', 0.012), ...]
```

In [1]:
# first import necessary libraries
import nltk
import numpy as np
import pandas as pd

In [2]:
corpus = '''I like cats. !
            This dog is like a mouse.'''

In [3]:
# get the vocab
def get_vocab(corpus):
    '''
    return a dict e.g {'word1': freq1,
                        'word2': freq2,
                        ...}
    '''
    
    # preprocess the data
    # sent = nltk.sent_tokenize(corpus)
    words = nltk.word_tokenize(corpus)
    
    # remove punctuations and stop words NOT
    
    vocab = {}
    
    for w in words:
        vocab[w] = vocab.get(w,0) + 1
        
    return vocab    

In [4]:
vocab_with_freq = get_vocab(corpus)

In [5]:
vocab_with_freq = get_vocab(corpus)
vocab = set([key for key in vocab_with_freq.keys()])

In [6]:
vocab

{'!', '.', 'I', 'This', 'a', 'cats', 'dog', 'is', 'like', 'mouse'}

In [7]:
# Test the above function
print(get_vocab(corpus))
vocab = set([key for key in get_vocab(corpus).keys()])
print(vocab)

{'I': 1, 'like': 2, 'cats': 1, '.': 2, '!': 1, 'This': 1, 'dog': 1, 'is': 1, 'a': 1, 'mouse': 1}
{'mouse', 'dog', 'cats', 'like', 'This', 'is', 'a', '!', 'I', '.'}


In [8]:
sent = ['I', 'like']
k = 1
V = len(vocab)

In [9]:
sent[:-1]

['I']

In [10]:
total_words = 0
for val in vocab_with_freq.values():
    total_words += val
total_words

12

In [11]:
def p(word):
    return vocab_with_freq.get(word[0])/total_words

# total_words = sum of all freq values
# use functional programming for above task

In [12]:
def get_n_gram(corpus, n):
    '''
    return: a dictionary of n-grams words with freq (n-gram words are a tuple)
    '''
    
    sent = nltk.sent_tokenize(corpus)
    #print(sent)
    data = []
    # tokenize each sentence into words
    for s in sent:
        data.append(nltk.word_tokenize(s))
    
    start_token = ['<s>']*n
    end_token = ['</s>']
    #print(data)
    #print(start_token, end_token)
    
    res = {}
    # iterate through each sentence in data which is list of list
    for sentence in data:
        # add start and end tokens
        sentence = tuple(start_token + sentence + end_token)
        
        r = len(sentence)-n+1
        if r < 0:
            continue
        for i in range(r):
            t = sentence[i:i+n]
            
            # add this n-gram to dict-res
            res[t] = res.get(t,0)+1
    return res       

In [13]:
# make distribution_matrix
def word_dist_matrix(corpus):
    n = 4 # set value, say  n = 4
    
    # get unigram + bigram
    unigram = get_n_gram(corpus, n=1) # dictionary of words
    bigram = get_n_gram(corpus, n=2) # dictionary of bi-gram words
    
    list_of_all_words = []
    for key in unigram.keys():
        list_of_all_words.append(key)
    for key in bigram.keys():
        list_of_all_words.append(key)
    
    l = len(list_of_all_words)
    
    # initialize the matrix
    matrix = np.zeros((l,l))
    m = pd.DataFrame(matrix, index=list_of_all_words, columns=list_of_all_words)
    
    # fill the matrix
    all_word_with_freq = {**unigram, **bigram}
    for row in list_of_all_words:
        for col in list_of_all_words:
            key = row+col # define this function
            m.loc[[row],[col]] = all_word_with_freq.get(key, 0)
    
    return m, list_of_all_words

In [14]:
# define count function
## ! this function won't run since sent is in list convert it to sentence for searching into matrix
def count(sent, m):
    
    
    
    # find all permutations of sent then return the sum of each counts
    perm = (len(sent)-1) + 2
    print('Perm', perm)
    
    c = 0
    print(c)
    print('-------------')
    # let's loop through matrix m
    c += m._get_value(('<s>',),tuple(sent))
    print(c)
    c += m._get_value(tuple(sent),('</s>',))
    print(c)
    
    for i in range(1,perm-1):
        c += m._get_value(tuple(sent[:i]),tuple(sent[i:]))
        print(c)
    return c      

In [15]:
m, _ = word_dist_matrix(corpus)

In [16]:
m

Unnamed: 0,"(<s>,)","(I,)","(like,)","(cats,)","(.,)","(</s>,)","(!,)","(This,)","(dog,)","(is,)",...,"(., </s>)","(<s>, !)","(!, </s>)","(<s>, This)","(This, dog)","(dog, is)","(is, like)","(like, a)","(a, mouse)","(mouse, .)"
"(<s>,)",3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(I,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(like,)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(cats,)",0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(.,)",0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(</s>,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(!,)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(This,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(dog,)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"(is,)",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
m._get_value(tuple(sent),('</s>',))

0.0

In [18]:
# test the above function
sent = ['I', 'like']
count(sent, m)

Perm 3
0
-------------
0.0
0.0
1.0


1.0

In [19]:
x=m.loc[[tuple(sent)],[('</s>',)]]
x

Unnamed: 0,"(</s>,)"
"(I, like)",0.0


In [20]:
x._get_value(('I','like'),('</s>',))

0.0

In [21]:
m.loc[[('<s>',)], [('<s>',)]]

Unnamed: 0,"(<s>,)"
"(<s>,)",3.0


In [22]:
# define the probability function
# recursive programming
def probability(sent):
    
    if len(sent) == 1:
        return p(sent)
    
    # apply k-smoothing for prob
    sent_minus_one = sent[:-1]
    return ((count(sent, m) + k)/(count(sent_minus_one, m) + k*V))*probability(sent_minus_one)  

In [23]:
# test this function
count(['I', 'like'], m)

Perm 3
0
-------------
0.0
0.0
1.0


1.0

In [24]:
unigram = get_n_gram(corpus, n=1) # dictionary of words
bigram = get_n_gram(corpus, n=2) 
print(unigram)
print('--------------------------------------------------------')
print(bigram)
print()
print('================================================================================')
all_word_with_freq = {**unigram, **bigram}
print(all_word_with_freq)

{('<s>',): 3, ('I',): 1, ('like',): 2, ('cats',): 1, ('.',): 2, ('</s>',): 3, ('!',): 1, ('This',): 1, ('dog',): 1, ('is',): 1, ('a',): 1, ('mouse',): 1}
--------------------------------------------------------
{('<s>', '<s>'): 3, ('<s>', 'I'): 1, ('I', 'like'): 1, ('like', 'cats'): 1, ('cats', '.'): 1, ('.', '</s>'): 2, ('<s>', '!'): 1, ('!', '</s>'): 1, ('<s>', 'This'): 1, ('This', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1, ('like', 'a'): 1, ('a', 'mouse'): 1, ('mouse', '.'): 1}

{('<s>',): 3, ('I',): 1, ('like',): 2, ('cats',): 1, ('.',): 2, ('</s>',): 3, ('!',): 1, ('This',): 1, ('dog',): 1, ('is',): 1, ('a',): 1, ('mouse',): 1, ('<s>', '<s>'): 3, ('<s>', 'I'): 1, ('I', 'like'): 1, ('like', 'cats'): 1, ('cats', '.'): 1, ('.', '</s>'): 2, ('<s>', '!'): 1, ('!', '</s>'): 1, ('<s>', 'This'): 1, ('This', 'dog'): 1, ('dog', 'is'): 1, ('is', 'like'): 1, ('like', 'a'): 1, ('a', 'mouse'): 1, ('mouse', '.'): 1}


In [25]:
print(sent)
probability(sent)

['I', 'like']
Perm 3
0
-------------
0.0
0.0
1.0
Perm 2
0
-------------
1.0
1.0


0.015151515151515152

In [26]:
def suggest_a_word(corpus, user_entered_sent):
    
    prob = [] # (word, probabilty)
    for word in vocab:
        pred_sent = user_entered_sent + ' ' + word
        prob.append(tuple(word, probability(pred_sent)))
    
    # print max 10 suggestion
    # get the index of top 10 prob and put it into suggestion
    return prob

# suggestion = suggest_a_word(corpus, user_entered_sent)
# print(suggestion)

In [27]:
user_entered_sent = 'I like'
for word in vocab:
    pred_sent = user_entered_sent + ' ' + word
    print(pred_sent)

I like mouse
I like dog
I like cats
I like like
I like This
I like is
I like a
I like !
I like I
I like .


In [28]:
suggest_a_word(corpus, 'I like')

Perm 13
0
-------------


KeyError: ('I', ' ', 'l', 'i', 'k', 'e', ' ', 'm', 'o', 'u', 's', 'e')