# N-Gram Language Modelling

## Tri-Gram Model

In [1]:
from nltk.corpus import brown
from nltk.tokenize import word_tokenize

In [2]:
# Load the Brown corpus as one flat sequence of word tokens (punctuation included).
corpus = brown.words()

In [3]:
# Case-fold every token so that e.g. "The" and "the" share one vocabulary entry,
# then report the corpus size and the number of distinct lower-cased tokens.
lowercase = [token.lower() for token in corpus]
vocab = set(lowercase)

print(f"Number of words in the corpus: {len(corpus)}")
print(f"Vocab of the corpus : {len(vocab)}")

Number of words in the corpus: 1161192
Vocab of the corpus : 49815


In [4]:
# Peek at a short slice of the lower-cased tokens to see what the data looks like.
print(f"Snippet of the corpus: {lowercase[90:107]}")

Snippet of the corpus: ['in', 'the', 'hard-fought', 'primary', 'which', 'was', 'won', 'by', 'mayor-nominate', 'ivan', 'allen', 'jr.', '.', '``', 'only', 'a', 'relative']


In [5]:
# Count every trigram together with the bigram that forms its two-word context.
# A bigram is only counted when a third word follows it, so the very last
# bigram of the corpus (which has no following word) is not counted.
bigram_counts = {}
trigram_counts = {}

# Walk three aligned views of the token list: token i, i+1 and i+2.
for first, second, third in zip(lowercase, lowercase[1:], lowercase[2:]):
    bigram = (first, second)
    trigram = (first, second, third)

    # dict.get returns 0 for an unseen key, so no explicit membership test is needed.
    bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
    trigram_counts[trigram] = trigram_counts.get(trigram, 0) + 1

In [6]:
# Sanity-check the counts with two bigrams that occur in the corpus snippet printed earlier.
print(f"Bigram count for (was, won) is: {bigram_counts[('was', 'won')]}")
print(f"Bigram count for (which, was) is: {bigram_counts[('which', 'was')]}")

Bigram count for (was, won) is: 2
Bigram count for (which, was) is: 155


In [9]:
def nextgram(input_, trigram_counts_dict, bigram_counts_dict, corpus_vocab):
    """Return the five most likely next words under an add-one smoothed trigram model.

    The last two tokens of ``input_`` (lower-cased, then split with
    ``word_tokenize``) form the context ``(w1, w2)``. Every candidate ``w`` in
    ``corpus_vocab`` is scored as::

        P(w | w1, w2) = (count(w1, w2, w) + 1) / (count(w1, w2) + V)

    where ``V = len(corpus_vocab)``.

    Args:
        input_: Prompt text; must contain at least two tokens.
        trigram_counts_dict: Mapping ``(w1, w2, w3) -> count``.
        bigram_counts_dict: Mapping ``(w1, w2) -> count``.
        corpus_vocab: Sized iterable of candidate next words.

    Returns:
        A list of up to five ``(word, probability)`` tuples, highest
        probability first. Candidates with equal probability keep the
        iteration order of ``corpus_vocab``, so ties are ordered arbitrarily
        when the vocabulary is a ``set``.

    Raises:
        ValueError: If ``input_`` contains fewer than two tokens.
    """
    input_tokens = word_tokenize(input_.lower())
    if len(input_tokens) < 2:
        raise ValueError(
            "nextgram needs at least two tokens of context, "
            f"got {len(input_tokens)}: {input_!r}"
        )

    first, second = input_tokens[-2:]

    # The context count and vocabulary size are the same for every candidate,
    # so the smoothed denominator is computed once instead of once per word.
    denominator = bigram_counts_dict.get((first, second), 0) + len(corpus_vocab)

    all_probs = {
        vocab_word: (trigram_counts_dict.get((first, second, vocab_word), 0) + 1) / denominator
        for vocab_word in corpus_vocab
    }

    # sorted() is stable even with reverse=True, so tied candidates stay in
    # vocabulary iteration order.
    return sorted(all_probs.items(), key=lambda item: item[1], reverse=True)[:5]

In [11]:
# Top-5 continuations; only the last two tokens ('world', 'is') are used as context.
# Candidates with equal smoothed probability appear in the iteration order of
# `vocab` (a set), so the ranking among ties is arbitrary and may vary between runs.
nextgram("the world is", trigram_counts, bigram_counts, vocab)

[('dominated', 4.013002126891127e-05),
 ('the', 4.013002126891127e-05),
 ('still', 4.013002126891127e-05),
 ('flowing', 4.013002126891127e-05),
 ('played', 4.013002126891127e-05)]

In [15]:
# Append the previous top prediction ('dominated') to the prompt and predict again.
nextgram("the world is dominated", trigram_counts, bigram_counts, vocab)

[('by', 6.022040668847984e-05),
 ('contrast', 2.0073468896159946e-05),
 ('highwayman', 2.0073468896159946e-05),
 ('volatilization', 2.0073468896159946e-05),
 ('hamiltonians', 2.0073468896159946e-05)]

In [16]:
# Append the previous top prediction ('by') and predict again.
nextgram("the world is dominated by", trigram_counts, bigram_counts, vocab)

[('the', 0.0001003492152691366),
 ('either', 4.013968610765464e-05),
 ('two', 4.013968610765464e-05),
 ('a', 4.013968610765464e-05),
 ('money', 4.013968610765464e-05)]

In [17]:
# Append the previous top prediction ('the') and predict again.
nextgram("the world is dominated by the", trigram_counts, bigram_counts, vocab)

[('time', 0.0006841014815683515),
 ('united', 0.0002736405926273406),
 ('fact', 0.0002736405926273406),
 ('way', 0.000254094836011102),
 ('secretary', 0.00023454907939486337)]

In [18]:
# Append the previous top prediction ('time') and predict again.
nextgram("the world is dominated by the time", trigram_counts, bigram_counts, vocab)

[('of', 0.0009187872008948188),
 ('the', 0.00041944633084328687),
 (',', 0.00041944633084328687),
 ('.', 0.00033955179163504176),
 ('and', 0.00023968361762473535)]

In [19]:
# Append the previous top prediction ('of') and predict again.
nextgram("the world is dominated by the time of", trigram_counts, bigram_counts, vocab)

[('the', 0.0002806229830223095),
 ('his', 0.00012026699272384694),
 ('her', 8.017799514923129e-05),
 ('year', 8.017799514923129e-05),
 ('flight', 6.013349636192347e-05)]

In [20]:
# Append the previous top prediction ('the') and predict again.
nextgram("the world is dominated by the time of the", trigram_counts, bigram_counts, vocab)

[('united', 0.002183699522945643),
 ('new', 0.0018141503729086877),
 ('most', 0.0013942081569576026),
 ('world', 0.0012094335819391251),
 ('``', 0.0011758382046630383)]

In [48]:
def nextNgrams(input_, Ngrams, trigram_counts_dict, bigram_counts_dict, corpus_vocab):
    """Greedily extend ``input_`` by ``Ngrams`` words using the trigram model.

    At every step the candidate with the highest add-one smoothed probability
    ``(count(w1, w2, w) + 1) / (count(w1, w2) + V)`` is appended, where
    ``(w1, w2)`` is the current two-token context and
    ``V = len(corpus_vocab)``.

    The prompt is tokenized once. Afterwards the context is rolled forward
    with the predicted vocabulary token itself rather than re-tokenizing the
    growing string: a tokenizer may split a predicted token (for example a
    contraction) into pieces, which would corrupt the next context.

    Args:
        input_: Prompt text; must contain at least two tokens when
            ``Ngrams > 0``.
        Ngrams: Number of words to append. Values ``<= 0`` return ``input_``
            unchanged.
        trigram_counts_dict: Mapping ``(w1, w2, w3) -> count``.
        bigram_counts_dict: Mapping ``(w1, w2) -> count``.
        corpus_vocab: Sized iterable of candidate next words.

    Returns:
        ``input_`` followed by the generated words, separated by single spaces.

    Raises:
        ValueError: If words must be generated but the prompt has fewer than
            two tokens or ``corpus_vocab`` is empty.
    """
    if Ngrams <= 0:
        return input_

    context = word_tokenize(input_.lower())[-2:]
    if len(context) < 2:
        raise ValueError(
            "nextNgrams needs at least two tokens of context, "
            f"got {len(context)}: {input_!r}"
        )

    vocab_size = len(corpus_vocab)
    if vocab_size == 0:
        raise ValueError("corpus_vocab is empty; there is no word to predict")

    for _ in range(Ngrams):
        first, second = context
        # Constant for every candidate at this step, so compute it once.
        denominator = bigram_counts_dict.get((first, second), 0) + vocab_size

        def smoothed_prob(vocab_word):
            trigram_count = trigram_counts_dict.get((first, second, vocab_word), 0)
            return (trigram_count + 1) / denominator

        # max() keeps the first best candidate in iteration order, which is the
        # same tie rule as taking element 0 of a stable descending sort.
        best_word = max(corpus_vocab, key=smoothed_prob)

        input_ = input_ + " " + best_word
        context = [second, best_word]

    return input_

In [51]:
# Greedily generate 10 more words from the same starting prompt.
nextNgrams("the world is", 10,trigram_counts, bigram_counts, vocab)

'the world is dominated by the time of the united states , and'

#### Inference: 
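Greedy trigram generation conditions only on the previous two words, so longer generated sentences stay locally plausible but fail to remain semantically coherent.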

## Four Gram Model 