In [8]:
#importing libraries
import os,re,math,pickle
import numpy as np
import pandas as pd
from collections import defaultdict,Counter

In [9]:
#loading dataset
DATA="../data/data.csv"
TEST="../data/test.csv"
OUT_MODEL="../models/trigram_model_full.pkl"
os.makedirs(os.path.dirname(OUT_MODEL),exist_ok=True)

In [11]:
#read data
df=pd.read_csv(DATA)
corpus=df['text'].dropna().astype(str).tolist()

In [12]:
#preprocess helper
tok_re=re.compile(r"[a-z0-9']+")
def preprocess(s):
    return tok_re.findall(s.lower())

print(preprocess("Hello, I'm 20."))

['hello', "i'm", '20']


In [13]:
#building tokens and vocab with <unk> (min_count)
tokens=[ ['<s>']+preprocess(s)+['</s>'] for s in corpus ]
ctr=Counter(w for sent in tokens for w in sent)
min_count=2   # change to 1 if very small train set
vocab=[w for w,c in ctr.items() if c>=min_count]
for t in ['<s>','</s>','<unk>']:
    if t not in vocab: vocab.append(t)
vocab=sorted(set(vocab))
v2i={w:i for i,w in enumerate(vocab)}
i2v={i:w for w,i in v2i.items()}
V=len(vocab)
train_mapped=[[ w if w in v2i else '<unk>' for w in sent ] for sent in tokens]
print('vocab:',V,'train_sents:',len(train_mapped))

vocab: 64804 train_sents: 40000


In [17]:
#building sparse counts efficiently
bigram_next=defaultdict(Counter)
bigram=Counter()    
bigram2=defaultdict(Counter)
unigrams=Counter()
tot_unigrams=0

for s in train_mapped:
    L=len(s)
    for i in range(L-2):
        w1,w2,w3=s[i],s[i+1],s[i+2]
        bigram_next[(w1,w2)][w3]+=1
        bigram[(w1,w2)]+=1
        bigram2[w2][w3]+=1
        unigrams[w3]+=1
        tot_unigrams+=1
print('unique bigrams:',len(bigram_next),'tot_unigrams:',tot_unigrams)
#precomputing denominators
bigram_denom={bn:count for bn,count in bigram.items()}
bigram2_denom={w2:sum(cnt.values()) for w2,cnt in bigram2.items()}
unigram_denom=tot_unigrams

unique bigrams: 1943148 tot_unigrams: 9409877


In [18]:
#hyperparams
alpha=1.0
lam3,lam2,lam1=0.6,0.3,0.1
def prob_trigram(w1,w2,w3):
    bn=(w1,w2)
    if bn in bigram_next:
        num=bigram_next[bn].get(w3,0)+alpha
        den=bigram_denom.get(bn,0)+alpha*V
        return num/den
    return 0.0

def prob_bigram(w2,w3):
    num=bigram2[w2].get(w3,0)+alpha
    den=bigram2_denom.get(w2,0)+alpha*V
    return num/den if den>0 else 0.0

def prob_unigram(w3):
    return (unigrams.get(w3,0)+alpha) / (unigram_denom+alpha*V) if unigram_denom>0 else 1.0/V

def prob_combined(w1,w2,w3):
    p3=prob_trigram(w1,w2,w3)
    p2=prob_bigram(w2,w3)
    p1=prob_unigram(w3)
    return lam3*p3 + lam2*p2 + lam1*p1

In [19]:
#greedy prediction and generation 
def predict_next(w1,w2):
    bn=(w1,w2)
    if bn in bigram_next and bigram_next[bn]:
        return max(bigram_next[bn].items(), key=lambda x:x[1])[0]
    if bigram2[w2]:
        return max(bigram2[w2].items(), key=lambda x:x[1])[0]
    return max(unigrams.items(), key=lambda x:x[1])[0] if unigrams else '</s>'

def generate(start=('the','quick'),maxlen=20):
    w1,w2=start
    out=[w1,w2]
    for _ in range(maxlen):
        nxt=predict_next(w1,w2)
        out.append(nxt)
        if nxt=='</s>': break
        w1,w2=w2,nxt
    return ' '.join(out)

In [20]:
#testing and save model
print('sample prob:',prob_combined('the','quick','brown'))
print('predict:',predict_next('the','quick'))
print('gen:',generate(('the','quick')))

to_save={
    'vocab':vocab,'v2i':v2i,'i2v':i2v,
    'bigram_next':bigram_next,'bigram':bigram,'bigram_denom':bigram_denom,
    'bigram2':bigram2,'bigram2_denom':bigram2_denom,
    'unigrams':unigrams,'unigram_denom':unigram_denom,
    'tot_unigrams':tot_unigrams,
    'alpha':alpha,'lam':(lam3,lam2,lam1)
}
with open(OUT_MODEL,'wb') as f:
    pickle.dump(to_save,f,protocol=pickle.HIGHEST_PROTOCOL)
print('saved model ->',OUT_MODEL)

sample prob: 1.844581262577135e-05
predict: and
gen: the quick and the film is a very good and the film is a very good and the film is a very
saved model -> ../models/trigram_model_full.pkl


In [12]:
#tests
print('vocab size:',V)
print('observed bigrams:',len(bigram_next))
print('sample prob:',trigram_prob('the','quick','brown'))
print('predict:',predict_next('the','quick'))
print('gen:',generate(('the','quick')))

vocab size: 64804
observed bigrams: 1943148
sample prob: 1.5422816514751925e-05
predict: and
gen: the quick and the film is a very good and the film is a very good and the film is a very


In [21]:
#evaluation test on test.csv
df_test=pd.read_csv(TEST)
test_sents=df_test['text'].dropna().astype(str).tolist()

unk_present = '<unk>' in v2i
def map_tokens(s):
    toks=preprocess(s)
    if unk_present:
        return ['<s>'] + [ (w if w in v2i else '<unk>') for w in toks ] + ['</s>']
    return ['<s>'] + toks + ['</s>']

test_tokens=[ map_tokens(s) for s in test_sents ]

#coverage
flat=[w for sent in test_tokens for w in sent if w not in ('<s>','</s>')]
known=sum(1 for w in flat if w in v2i and w!='<unk>')
coverage = known/len(flat) if flat else 0.0
print('test tokens:',len(flat),'known tokens:',known,'coverage:',coverage)

#computing perplexity
sum_log=0.0; N=0
for sent in test_tokens:
    for i in range(len(sent)-2):
        w1,w2,w3=sent[i],sent[i+1],sent[i+2]
        p=prob_combined(w1,w2,w3)
        if p<=0: p=1e-12
        sum_log += math.log(p); N+=1

ppl=math.exp(-sum_log/N) if N>0 else float('inf')
print('test trigrams:',N,'perplexity:',ppl)

test tokens: 1178808 known tokens: 1168208 coverage: 0.9910078655726802
test trigrams: 1178808 perplexity: 1764.2275509946078
