In [1]:
import pandas as pd
from tqdm import tqdm
import time
from collections import Counter as ctr

In [5]:
for i in tqdm(range(100)):
    time.sleep(0.1)

100%|██████████| 100/100 [00:10<00:00,  9.47it/s]


In [6]:
train = pd.read_csv('train.txt', delimiter=' ', names=['word', 'tag', 'tag2'])

In [7]:
train

Unnamed: 0,word,tag,tag2
0,Confidence,NN,B-NP
1,in,IN,B-PP
2,the,DT,B-NP
3,pound,NN,I-NP
4,is,VBZ,B-VP
...,...,...,...
211722,to,TO,B-PP
211723,San,NNP,B-NP
211724,Francisco,NNP,I-NP
211725,instead,RB,B-ADVP


In [8]:
tags = list(set(train.tag))
total = len(train)
smoother = float(1e-5)

len(tags), tags

(44,
 ['VBP',
  ')',
  'IN',
  'VBG',
  'EX',
  'NNPS',
  ':',
  'RB',
  '$',
  'WP',
  'VBZ',
  'RBR',
  'JJR',
  'NNS',
  'VBD',
  'DT',
  ',',
  '.',
  'RBS',
  '#',
  'SYM',
  'CD',
  'MD',
  "''",
  '``',
  'UH',
  'NN',
  'PRP',
  'PDT',
  'CC',
  'FW',
  'PRP$',
  'POS',
  'JJ',
  'WDT',
  'NNP',
  'WRB',
  'WP$',
  'JJS',
  'TO',
  'VB',
  'VBN',
  'RP',
  '('])

### Estimate Transition Probabilities $P(T_i|T_{i-1})$

### Estimate Emission Probabilities $P(T_i|W_{i-1})$

In [9]:
#P(W|T)
word_tag_counts = {}
word_tag_totals = {}
for tag in tags:
    sub_train = train[train.tag == tag]
    word_tag_counts[tag] = ctr(sub_train.word)
    word_tag_totals[tag] = sum(word_tag_counts[tag].values())

def Pwt(W='', T=''):
    if W not in word_tag_counts[T]:
        return smoother
    return word_tag_counts[T][W] / word_tag_totals[T]
    
    
#P(T)
tag_counts = ctr(train.tag)
def Ptag(T=''):
    return tag_counts[T] / total

# P(W)
word_counts = ctr(train.word)
def Pword(W=''):
    if W not in word_counts:
        return smoother
    return word_counts[W] / total

def Ptw(T='', W=''):
    return Pwt(W, T) * Ptag(T) / Pword(W)

In [10]:
Ptag(T='DT')

0.08659736358612742

In [11]:
Pword(W='thebigllama')

1e-05

In [12]:
Pwt("the", "DT")

0.5018816471229888

In [13]:
Ptw("DT", "the")

0.9981559822106519

In [14]:
def Pall_tags(word):
    return {t:Ptw(T=t, W=word) for t in tags}

In [15]:
ethe = Pall_tags('the')

In [16]:
max(ethe, key=ethe.get)

'DT'

### Greedy Decoder

In [17]:
example_sentence = ['the', 'dog', 'plays']

In [18]:
for word in example_sentence:
    emission = Pall_tags(word)
    max_tag = max(emission, key=emission.get)
    print(word, max_tag)
    

the DT
dog NN
plays VBZ


In [19]:
guesses = []
for i,row in tqdm(train.iterrows()):
    word=row['word']
    tag = row['tag']
    emission = Pall_tags(word)
    max_tag = max(emission, key=emission.get)
    guesses.append(max_tag)

211727it [00:05, 36517.67it/s]


In [20]:
train['guess'] = guesses

In [21]:
sum(train.guess == train.tag) / total

0.9649595941944107

In [22]:
train['ptag'] = train['tag'].shift(1)

In [23]:
train

Unnamed: 0,word,tag,tag2,guess,ptag
0,Confidence,NN,B-NP,NN,
1,in,IN,B-PP,IN,NN
2,the,DT,B-NP,DT,IN
3,pound,NN,I-NP,NN,DT
4,is,VBZ,B-VP,VBZ,NN
...,...,...,...,...,...
211722,to,TO,B-PP,TO,PRP
211723,San,NNP,B-NP,NNP,TO
211724,Francisco,NNP,I-NP,NNP,NNP
211725,instead,RB,B-ADVP,RB,NNP
