In [1]:
import os
import nltk
import math

In [2]:
from nltk import sent_tokenize,word_tokenize
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [3]:
def preprocessing(words):
    mod_words = []
    symbols = ['.',',',':','?','!','<','>','(',')','#','--','-','$','@','%','``',';',"''"]
    for word in words:
        if word not in stopWords:
            if word.isalpha():
                if len(word) > 2 :
                    mod_words.append(word.lower())
    return mod_words

In [4]:
def bigram_generator(words,bigram_dict):
    for i in range(len(words)-1):
        if words[i] in bigram_dict:
            if words[i+1] in bigram_dict[words[i]]:
                bigram_dict[words[i]][words[i+1]] += 1
            else:
                bigram_dict[words[i]][words[i+1]] = 1
        else:
            bigram_dict[words[i]] = {}
            bigram_dict[words[i]][words[i+1]] = 1
    return bigram_dict

In [5]:
def bigram_freq_generator(bigram_dict,size):
    for key in bigram_dict.keys():
        for k in bigram_dict[key].keys():
            bigram_dict[key][k] += 1 # Laplace Smoothing
    bigram_freq = {}
    for key in bigram_dict.keys():
        bigram_freq[key] = {}
        count = len(bigram_dict[key])
        denom = count + size
        for k in bigram_dict[key].keys():
            bigram_freq[key][k] = math.log10(bigram_dict[key][k]/denom)
            
    return bigram_freq

In [6]:
def trigram_generator(words,trigram_dict):
    for i in range(len(words)-2):
        if words[i] in trigram_dict:
            if words[i+1] in trigram_dict[words[i]]:
                if words[i+2] in trigram_dict[words[i]][words[i+1]]:
                    trigram_dict[words[i]][words[i+1]][words[i+2]] += 1
                else:
                    trigram_dict[words[i]][words[i+1]][words[i+2]] = 1
            else:
                trigram_dict[words[i]][words[i+1]] = {}
                trigram_dict[words[i]][words[i+1]][words[i+2]] = 1
        else:
            trigram_dict[words[i]] ={}
            trigram_dict[words[i]][words[i+1]] = {}
            trigram_dict[words[i]][words[i+1]][words[i+2]] = 1
    return trigram_dict

In [7]:
def trigram_freq_generator(trigram_dict,size):
    for keys in trigram_dict.keys():
        for key in trigram_dict[keys].keys():
            for k in trigram_dict[keys][key].keys():
                trigram_dict[keys][key][k] += 1 # Laplace Smoothing
    trigram_freq = {}
    for keys in trigram_dict.keys():
        trigram_freq[keys] = {}
        for key in trigram_dict[keys].keys():
            trigram_freq[keys][key] = {}
            count = len(trigram_dict[keys][key])
            denom = count + size
            for k in trigram_dict[keys][key].keys():
                trigram_freq[keys][key][k] = math.log10(trigram_dict[keys][key][k]/denom)
                
    return trigram_freq

In [21]:
#Bigram Sentence generator
def maxBigram(key,bigram_freq):
    m = max(bigram_freq[key].values())
    for k,v in bigram_freq[key].items():
        if v == m :
            print(k,end = ' ')
            return k
    

In [24]:
def maxTrigram(key1,key2,trigram_freq):
    m = max(trigram_freq[key1][key2].values())
    for k,v in trigram_freq[key1][key2].items():
        if v == m :
            print(k,end=' ')
            return k

In [8]:
my_path = '20_newsgroups'
folder = 'rec.motorcycles'
path = os.path.join(my_path,folder)
motor_text = ''
for file in os.listdir(path):
    file_path = os.path.join(path,file)
    with open(file_path,'r') as f:
        motor_text += f.read() + ' '

In [9]:
sentences = sent_tokenize(motor_text)

In [10]:
bigram_dict = {}
bigrams_count = 0
for sent in sentences:
    words = preprocessing(word_tokenize(sent))
    bigrams_count += len(words)
    words.insert(0,'<s>')
    bigram_dict = bigram_generator(words,bigram_dict)
    
    
    

In [11]:
trigram_dict = {}
trigram_count = 0
for sent in sentences:
    words = preprocessing(word_tokenize(sent))
    trigram_count += len(words)
    words.insert(0,'<s>')
    words.insert(0,'<s>')
    
    trigram_dict = trigram_generator(words,trigram_dict)

In [12]:

bigram_freq = bigram_freq_generator(bigram_dict,bigrams_count)
trigram_freq = trigram_freq_generator(trigram_dict,trigram_count)

In [27]:
key = '<s>' # Starting Bigram
for i in range(15):
    key = maxBigram(key,bigram_freq)

the grateful dead seemed like least could newsgroups path rochester udel gatech concert duke infante 

In [26]:
key1 = '<s>'
key2 = '<s>'
for i in range(15):
    key = maxTrigram(key1,key2,trigram_freq)
    key1 = key2
    key2 = key

the grateful dead seemed like least could newsgroups path rochester udel gatech agate robinson from 

Q2) Given Input retun Log probability

In [31]:
def findBigramProbability(key1,key2,bigram_freq,size):
    if key1 in bigram_freq.keys():
        if key2 in bigram_freq[key1].keys():
            return bigram_freq[key1][key2]
        else:
            count = len(bigram_freq[key1])
            denom = count+size
            return 1/denom
    else:
        return 1/(1000+size)

In [32]:
sent = 'the grateful dead seemed like least could newsgroups path rochester udel gatech agate robinson from '
words = preprocessing(word_tokenize(sent))
words.insert(0,'<s>')
prob_sum = 0
for i in range(len(words)-1):
    prob_sum += findBigramProbability(words[i],words[i+1],bigram_freq,bigrams_count)
    
print(prob_sum)

-40.74911186251159


In [None]:
#1)what if first key in bigram does not exist? How do we smooth it?
