In [1]:
import nltk
import re
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize

In [2]:
file = open('Brown_tagged_train.txt', 'r', encoding = 'utf8')
linesOG = []

for i in file:
    linesOG.append(i.replace('\n', '').split())

In [3]:
tagged_words = []
for line in linesOG:
    for i in range(len(line)):
        line[i] = line[i].split('/')
        tagged_words.append(line[i])
        if(len(line[i]) > 2):
            for j in range(1, len(line[i])-1):
                line[i][0] = line[i][0] + '/' + line[i][j]
            for j in range(len(line[i])-2):
                line[i].pop(-2)

In [4]:
lines = linesOG
len(tagged_words)

543149

In [5]:
words = [pair[0] for pair in tagged_words]
words[:10]

['At',
 'that',
 'time',
 'highway',
 'engineers',
 'traveled',
 'rough',
 'and',
 'dirty',
 'roads']

In [6]:
vocab = list(set(pair[0] for pair in tagged_words))
len(vocab)

33643

In [7]:
tags = list(set(pair[1] for pair in tagged_words))
len(tags)

12

In [8]:
tags

['ADP',
 'PRON',
 'NOUN',
 'DET',
 'PRT',
 'CONJ',
 'ADV',
 'X',
 'NUM',
 'ADJ',
 'VERB',
 '.']

In [9]:
# by tags
from collections import defaultdict

def bytag(data):
    bytags = defaultdict(list)
    for pair in data:
        bytags[pair[1]].append(pair[0])
    return bytags

## Part 1

### Calculating emission and transition probabilities

In [10]:
# emission probability
def prob_word_given_tag(word, tag, bytags):
    count_tag = len(bytags[tag])
    count = bytags[tag].count(word)
    return count/count_tag

In [11]:
def calcemission(data):
    emissionProb = defaultdict(lambda: defaultdict(float))
    bytags = bytag(data)
    words = [pair[0] for pair in data]
    words = list(set(words))
    for tag in tags:
        for word in words:
            emissionProb[tag][word] = prob_word_given_tag(word, tag, bytags)
    return emissionProb

In [12]:
# transition probability
def prob_tag2_given_tag1(tag1, tag2, data):
    tag1count = 0
    tag1tag2count = 0
    for i, pair in enumerate(data):
        if(pair[1] == tag1):
            tag1count += 1
            if((i+1) < len(data)):
                if(data[i+1][1] == tag2):
                    tag1tag2count += 1
    return tag1tag2count/tag1count

In [13]:
def calctransition(data):
    transitionProb = defaultdict(lambda: defaultdict(float))
    for tag1 in tags:
        for tag2 in tags:
            transitionProb[tag1][tag2] = prob_tag2_given_tag1(tag1, tag2, data)
    return transitionProb

### Viterbi algorithm

In [14]:
def viterbi(test_words, eprobs, tprobs):
    pred_tags = []
    for i, pair in enumerate(test_words):
        probmax = 0.0
        # NOUN is used as the default tag under the assumption that most unknown words would be nouns
        tagmax = None
        for tag in tags:
            eprob = eprobs[tag][pair[0]]
            if(i==0):
                tprob = tprobs['.'][tag]
            else:
                tprob = tprobs[pred_tags[i-1][1]][tag]
            stateprob = eprob * tprob    
            if(stateprob > probmax):
                probmax = stateprob
                tagmax = tag
        pred_tags.append([pair[0], tagmax])
    
    return pred_tags

### Testing using 3 fold CV

In [15]:
def accuracy(test, pred):
    return np.sum(test[1]==pred[1])

In [None]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=3)
accs = []
eproblist = []
for train, test in kf.split(tagged_words):
    x = [tagged_words[i] for i in train]
    y = [tagged_words[i] for i in test]
    eprobs = calcemission(x)
    tprobs = calctransition(x)
    preds = viterbi(y, eprobs, tprobs)
    accs.append(accuracy(y, preds))
    eproblist.append(eprobs)

### Part 2

### A

In [None]:
from gensim.models import Word2Vec