In [46]:
import csv
import time
from collections import Counter as ctr

import pandas as pd
from tqdm import tqdm

In [47]:
# Load the space-delimited corpus as (word, tag, <third column>); the third
# column is read only so each line parses and is dropped straight away.
# NOTE(review): assumes one token per line with three fields -- confirm against train.txt.
#
# keep_default_na=False: by default pandas converts literal tokens such as
#   "null", "NA" or "nan" to NaN, and those rows are then silently removed by
#   the dropna() further down. Words must stay strings.
# quoting=csv.QUOTE_NONE: a bare '"' token is text, not the start of a quoted field.
train = pd.read_csv(
    "train.txt",
    sep=" ",
    names=['word', 'tag', 'drop'],
    keep_default_na=False,
    quoting=csv.QUOTE_NONE,
).drop(columns='drop')
# Distinct tag labels, in order of first appearance.
tags = train['tag'].unique()
# Constant returned by the probability estimators for events never counted.
smoother = 1e-5
# Number of token rows; denominator of the unigram estimates P(T) and P(W).
total = len(train)

### Emission Function $P(T_i|W_i)$

In [48]:
# Emission statistics, keyed by tag:
#   word_tag_counts[tag] -> Counter of the words observed with that tag
#   word_tag_totals[tag] -> number of token rows carrying that tag
word_tag_counts = {
    tag: ctr(train.loc[train['tag'] == tag, 'word'])
    for tag in tags
}
word_tag_totals = {
    tag: sum(per_word.values())
    for tag, per_word in word_tag_counts.items()
}

# P(W|T)
def Pwt(W='', T=''):
    """Estimate P(W | T): the probability of word ``W`` given tag ``T``.

    Returns the count of ``W`` under ``T`` divided by the total count for
    ``T``. A word that was never counted under ``T`` gets the constant
    ``smoother`` instead. ``T`` must be a key of ``word_tag_counts``;
    otherwise a ``KeyError`` is raised.
    """
    per_word = word_tag_counts[T]
    hits = per_word.get(W)  # .get() so a miss is reported as None, not 0
    if hits is None:
        return smoother
    return hits / word_tag_totals[T]
# P(T): unigram tag frequencies over all token rows.
tag_counts = ctr(train['tag'])
def Ptag(T=''):
    """Estimate P(T): the share of token rows tagged ``T``.

    ``tag_counts`` is a Counter, so a tag that was never seen has count 0
    and the result is ``0.0``.
    """
    occurrences = tag_counts[T]
    return occurrences / total
# P(W): unigram word frequencies over all token rows.
word_counts = ctr(train['word'])
def Pword(W=''):
    """Estimate P(W): the share of token rows whose word is ``W``.

    A word absent from ``word_counts`` gets the constant ``smoother``
    rather than zero, so callers can divide by the result.
    """
    seen = word_counts.get(W)  # None only when W was never counted
    return smoother if seen is None else seen / total
# P(T|W)
def Ptw(T='', W=''):
    """Estimate P(T | W) via Bayes' rule: P(W | T) * P(T) / P(W).

    Built from ``Pwt``, ``Ptag`` and ``Pword``; whatever fallback values
    those helpers return for unseen events flow straight into the result.
    """
    likelihood = Pwt(W=W, T=T)
    prior = Ptag(T)
    evidence = Pword(W)
    return (likelihood * prior) / evidence

### Transition Function $P(T_i|T_{i-1})$

In [49]:
# Attach each row's predecessor tag (the tag one row up), then discard every
# row that has a missing value in any column. The first row always goes,
# because it has no predecessor.
train = train.assign(prev_tag=train['tag'].shift(1)).dropna()

In [50]:
# tag_bigram_counts[tag] -> Counter of the tags found on the row immediately
# before a row tagged `tag`.
tag_bigram_counts = {
    tag: ctr(train.loc[train['tag'] == tag, 'prev_tag'])
    for tag in tags
}

In [51]:
# P(T_i|T_{i-1})
def PtagPrev(tag='', prev_tag=''):
    """Estimate the transition probability P(tag | prev_tag).

    The numerator is how often ``prev_tag`` was recorded directly before
    ``tag`` (``tag_bigram_counts``); the denominator is the number of rows
    tagged ``prev_tag`` (``word_tag_totals``).

    When ``prev_tag`` has no recorded total (unknown label, ``None`` or NaN
    for a row without a predecessor) or ``tag`` has no bigram table, the
    constant ``smoother`` is returned instead of raising ``KeyError``. This
    is the same fallback ``Pwt`` and ``Pword`` use for unseen events. A pair
    of known tags that simply never occurred together still yields ``0.0``.
    """
    prev_total = word_tag_totals.get(prev_tag)
    followers = tag_bigram_counts.get(tag)
    if not prev_total or followers is None:
        # No evidence about this context at all: return the same constant for
        # every candidate so the transition term stays neutral.
        return smoother
    return followers[prev_tag] / prev_total

### Greedy Decoder

In [54]:
# Distinct word forms of the training frame, in order of first appearance.
words = train.loc[:, 'word'].unique()

# Module-level result list; the decoding cell rebinds this name to the
# decoder's return value.
guesses = []
def greedy_decoder(df, use_predicted_prev=False):
    """Tag every row of ``df`` with its highest-scoring tag, one row at a time.

    For each row the score of a candidate tag ``t`` is
    ``Ptw(T=t, W=word) * PtagPrev(tag=t, prev_tag=context)``; the best
    candidate over the module-level ``tags`` is kept.

    Parameters
    ----------
    df : DataFrame
        Must provide ``word`` and ``prev_tag`` columns. A ``tag`` column is
        not needed for decoding.
    use_predicted_prev : bool, default False
        ``False`` keeps the original behaviour: the context for every row is
        the ``prev_tag`` value stored in that row. On labelled data this is
        the gold previous tag, so rows are scored independently of earlier
        predictions.
        ``True`` decodes greedily: from the second row on, the context is the
        tag chosen for the previous row. The first row still uses its stored
        ``prev_tag``.

    Returns
    -------
    list
        One predicted tag per row, in row order. A new list is built on each
        call, so repeated calls do not accumulate earlier results.
    """
    predictions = []
    previous_choice = None
    rows = zip(df['word'], df['prev_tag'])
    # total= lets tqdm draw a real progress bar instead of a bare counter.
    for word, stored_prev in tqdm(rows, total=len(df)):
        if use_predicted_prev and previous_choice is not None:
            context = previous_choice
        else:
            context = stored_prev
        scores = {
            t: Ptw(T=t, W=word) * PtagPrev(tag=t, prev_tag=context)
            for t in tags
        }
        previous_choice = max(scores, key=scores.get)
        predictions.append(previous_choice)
    return predictions

In [56]:
# Run the decoder over the training frame and keep the predicted tags, one per row.
guesses = greedy_decoder(train)

211722it [00:07, 27119.59it/s]
