In [1]:
!ls

assignment_2.ipynb  brown.txt  question.pdf


In [2]:
from tqdm.auto import tqdm
from collections import defaultdict, OrderedDict
import numpy as np
from collections import Counter
import random, itertools
from sklearn.model_selection import KFold
import pickle

In [3]:
class txt_data():
    '''
    Master class to load dataset
    '''
    def __init__(self,path,split=0.9):
        self.path = path
        
        # list of sentences of list of tuples defining pos_tag and words.
        self.data = self.___sent_to_tags___(self.___open___(self)) 
#         random.shuffle(self.data)
        self.train =  self.data[:int(len(self.data)*split)]
        self.val = self.data[int(len(self.data)*split):]
        
        self.pos_tags = [i for i in set([j[0] for i in self.data for j in i])] #list of unique pos_tags
        self.vocab = [i for i in set([j[1] for i in self.data for j in i])] # list of vocab
            
    
    def ___open___(self,path):
        '''
        To read a text file
        path: path to the text file
        '''
        with open(self.path, "r") as f:
            return f.readlines()
    
    def ___sent_to_tags___(self,sents):
        '''
        '''
        dummy = []
        for i in tqdm(sents):
            d = []
            for j in i.split():
                try: d.append((j.split('_')[1],j.split('_')[0].lower()))
                except: pass #print(f"--{i}--{j}\n")
            dummy.append(d)
        return dummy

### Converitng emission/transition matrix to prob

In [4]:
def prob(matrix): 
    dummy = []
    for i in np.array(matrix):
        if np.sum(i) !=0: dummy.append(i/np.sum(i))
        else: dummy.append(i)
    return np.array(dummy)

### Smoothing

In [5]:
def smooting(matrix, name='simple'):
    if name == 'simple':
        matrix[matrix == 0] = 0.000000001*matrix.mean()
        return matrix

### emission matrix

In [6]:
def e_matrix(data,vocab,pos_tags):
    '''
    emission matrix: count of words given the pos_tag
    
    data = list of sents with the tags: format is list of list of sentences with tag and word as a tuple
    vocab = list of vocab
    pos_tags = list of pos_tags
    
    return list of pos_tags and a list of matrix (pos_tags,vocab) with count(vocab/pos_tags)
    '''
    data = [i for j in data for i in j] # list of pos_tag and word pair: [('VB', 'suppose'), ('NP', 'lauren'),()]
    dummy = {s[0]:[] for s in data} # dictioanry with tag and list of words as key, value pair, {'AT' :['the', 'the', 'a', 'the'],}
    for i in data:
        dummy[i[0]].append(i[1])
        
    e_matrix = [] # emission count matrix with tag and count of words as row, column pair.
    for i in tqdm(pos_tags):
        if i in dummy.keys():
            count = Counter(dummy[i]) # count of words in a list. 
            e_matrix.append([count[j] for j in vocab]) # vocab(50k) loop takes time not the man loop
        else: e_matrix.append([0 for j in vocab])
    
    return pos_tags, prob(e_matrix)

### transition matrix

In [7]:
def t_matrix(data, pos_tags, n_gram=2):
    '''
    transition matrix: count of current_pos-tag given the last_pos-tag
    
    data = list of sents with the tags: format is list of list of sentences with tag and word as a tuple
    pos_tags = list of pos_tags
    n_gram = look back + 1 i.e, number of words tollok back to calculate the transition matrix
    
    return a list of matrix (pos_tags**n_gram,pos_tags) with count(pos_tags/pos_tags**_gram)
    
    '''
    
    data = [[j[0]  for j in i] for i in data] # list of tags for each sentence: [['AT', 'NN', CS', '.'],[],] 
    #list of n_grams of tags, for 2 grams [['AT', 'NN'], ['NN', 'MD'], ['MD', 'HV'],[],]
    tags = [i[j:j+n_gram] for i in data for j in range(len(i) - n_gram + 1)]
    print(f" Number of {n_gram} grams present in the dataset: {len(tags)/1000000} million")
    # list of tags of cartesian product all the possible combination of pos_tag pair
    keys = [' '.join([j for j in i]) for i in list(itertools.product(pos_tags, repeat=n_gram-1))]
    keys.append("<s>") # start symbol <s>
     
    tags_ = {i:[] for i in keys} # initialize an empty dictionary, dict with
    for i in tags:
        a = ''
        for j in i[:-1]:
            a+= f"{j} "
        tags_[a.strip()].append(i[-1])
    tags_['<s>'] = [j for i in data for j in i] # list of all the tags in the dataset to count the start_prob
    
    t_matrix = [] # transition count matrix with n_gram tag and count of pos_tags as key, value pair.
    for i in tqdm(keys):
        count = Counter(tags_[i])
        d = [count[j] for j in pos_tags]
        t_matrix.append(d)

    return keys[:-1], prob(t_matrix[:-1]), prob([t_matrix[-1]])[0] 

# Viterbi algorithm

In [8]:
def decoding(obs, pos_tags, t_tag, s_prob, t_prob, e_prob, n_gram, t_2=None):
    '''
    Viterbi decoding: "https://en.wikipedia.org/wiki/Viterbi_algorithm"
    
    obs: list of words in the sentence
    pos_tags: list of pos_tags
    s_prob: starting probabilty
    t_prob: transition probability
    e_prob: emission probability
    n_gram: number of words to look back +1
    
    return: top path
    '''
    path = { s:[] for s in pos_tags} # list of all the previous path pos_tags took
    curr_prob = {s:s_prob[s]*e_prob[s][obs[0]] for s in pos_tags} # first word/time-step prob

    for i in range(1, len(obs)):
        prev_prob = curr_prob
        curr_prob = {}
        for curr_state in pos_tags:
            if i > n_gram - 1 and n_gram > 2: # for the n_gram case
                max_prob, state = max(((prev_prob[last_state]*t_prob[f"{path[curr_state][-1:][0]} {last_state}"][curr_state]*e_prob[curr_state][obs[i]], last_state) 
                                           for last_state in pos_tags))
            elif n_gram > 2: # simple look back at the last word 
                max_prob, state = max(((prev_prob[last_state]*t_2[last_state][curr_state]*e_prob[curr_state][obs[i]], last_state) 
                                           for last_state in pos_tags))
            else:
                max_prob, state = max(((prev_prob[last_state]*t_prob[last_state][curr_state]*e_prob[curr_state][obs[i]], last_state) 
                                           for last_state in pos_tags))
            curr_prob[curr_state] = max_prob
            path[curr_state].append(state)

    # find the final largest probability
    max_prob = -1
    max_path = None
    for l, p in path.items():
        p.append(l)
        if curr_prob[l] > max_prob:
            max_path = p
            max_prob = curr_prob[l]
            
    return max_path



# Experimentation

In [9]:
path = "brown.txt"
brown = txt_data(path)
train, val = brown.train, brown.val
# pos_tags, vocab = brown.pos_tags, brown.vocab

HBox(children=(FloatProgress(value=0.0, max=55145.0), HTML(value='')))




In [10]:
n_splits = 3
cv = KFold(n_splits=n_splits, random_state=42, shuffle=True)
folds = {} # stores the data of train and test for n_split folds
for r, index in tqdm(enumerate(cv.split(brown.data)), total = n_splits):
    train = [j for k,j in enumerate(brown.data) if k in index[0]]
    test = [j for k,j in enumerate(brown.data) if k in index[1]]
    folds[f"fold_{r}"]= {
                            'train': train,
                            'test': test,
                            'pos_tags': brown.pos_tags,
                            'vocab': brown.vocab
    }

HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))




## Training loop

In [42]:
n_gram = 2 # markov assumption length: if it's 2 we need trigrams, thus value would be 3 (assumption length + 1) 
exp_results = {} # saving the results in the dict

print(f"Number of folds-/-n_gram used: {len(folds)}-/-{n_gram}\n\n")
for fold, data in tqdm(folds.items()):
    save_dict = {}
    print(f"\n\n{'-'*15}: {fold} \n\nlength of train and test data : {len(data['train'])}/{len(data['test'])}")
    
    save_dict['n_gram'] =  n_gram
    
    train = data['train']
    val = data['test']
    pos_tags = data['pos_tags']
    vocab = data['vocab']
    
#     save_dict['train_data'] =  train
#     save_dict['val_data'] = val
    save_dict['pos_tags'] = pos_tags
    save_dict['vocab'] = vocab

    print(f"Creating the emission and transition probability matrix: ")
    e_tag, e_prob = e_matrix(train, vocab, pos_tags) # emission prob and pos_tags
    print(f"Finished emission prob matrix")
    # transition prob and tags
    if n_gram == 3: # if n_gram is 3 we also need 2_gram prob for the first index
        t_tag, t_prob_2, start_probability = t_matrix(train, pos_tags, 2) 
        t_tag, t_prob, start_probability = t_matrix(train, pos_tags, n_gram) 
    if n_gram == 2: t_tag, t_prob, start_probability = t_matrix(train, pos_tags, n_gram) 
    print(f"Finished tansition prob matrix")
    # smoothing the probabilites
    e_prob[e_prob == 0] = 0.000000001*e_prob.mean()
    t_prob[t_prob == 0] = 0.000000001*t_prob.mean()
    
    if n_gram == 3: 
        save_dict['t_prob_t_prob_2']  = (t_prob,t_prob_2)
        save_dict['e_prob'] = e_prob
    elif n_gram == 2:
        save_dict['t_prob']  = (t_prob)
        save_dict['e_prob'] = e_prob
    
    print(f"Creating emission and transition probability dict for O(1) searching")
    # creating emission and transition probability dict for O(1) searching
    if n_gram == 3: t_2 = {i: {pos_tags[f]:j for f,j in enumerate(t_prob_2[r])} for r,i in tqdm(enumerate(pos_tags), total=len(pos_tags))}
    else: t_2 = None
    transition_probability = {i: {pos_tags[f]:j for f,j in enumerate(t_prob[r])} for r,i in tqdm(enumerate(t_tag), total=len(t_tag))}
    emission_probability = {i: {vocab[f]:j for f,j in enumerate(e_prob[r])} for r,i in tqdm(enumerate(e_tag), total=len(e_tag))}
    start_prob = {i:start_probability[r] for r,i in enumerate(pos_tags)} 
    # start_prob = {i:1.0 for r,i in enumerate(t_tag)} # if wanted the same starting prob for all pos_tags
    
    print(f"Starting viterbi decoding: ")
    out = []
    true = []
    for i in tqdm(val[:1]):
        true.append(i)
        obs = [j[1] for j in i]
        out.append(decoding(obs, pos_tags, t_tag, start_prob, transition_probability, emission_probability, n_gram, t_2))
    
    save_dict['true_val'] = true
    save_dict['pred_val'] = out
    
    print(f"starting metric calcualtion")
    true = [i[0] for j in true for i in j]
    out = [j for i in out for j in i]
    assert len(true) == len(out)

    
    #pos to index and index to pos dict for easy searching
    pos_index = {i:e for e,i in enumerate(pos_tags)}
    index_pos = {e:i for e,i in enumerate(pos_tags)}
    
    save_dict['pos_index'] = pos_index
    save_dict['index_pos'] = index_pos
    
    # adding the true and predictions in a single list
    expected = [pos_index[i] for i in  true]
    predicted = [pos_index[i] for i in  out]
    
#     labels = sorted(pos_index.values())
    labels = set(expected+predicted)
    save_dict['labels'] = labels
    
    
    eps = 0 #0.1e-15

    dummy = OrderedDict()
    for i in labels:
        dummy[str(i)] = {str(j) : eps for j in labels}
    for r, i in enumerate(expected):
        dummy[str(i)][str(predicted[r])] +=1

    # confusion matrix
    conf_m = np.array([[j for j in i.values()] for i in dummy.values()]) # confusion matrix
    save_dict['conf_matrix'] =  conf_m
    
    #classwise 
#     pre = np.sum(conf_m,axis=0)
#     rec = np.sum(conf_m,axis=1)
#     precision = [i[r] for r,i in enumerate(conf_m)]/pre 
#     recall = [i[r] for r,i in enumerate(conf_m)]/rec 
#     save_dict['classwise_prec_and_rec'] = (precision, recall)
    
    # micro precision and recall
    precision = sum([i[r] for r,i in enumerate(conf_m)])/np.sum(pre) 
    recall = sum([i[r] for r,i in enumerate(conf_m)])/np.sum(rec)
    print("micro--",precision, recall)
    save_dict['micro_precision_and_recall'] = (precision, recall)
    
    # accuracy
    acc = len([i for i in range(len(expected)) if expected[i]==predicted[i]])/len(expected)
    print(f"Accuracy: {acc}")
    save_dict['accuracy'] = acc
    
    # micro f1 score
    f_1 = 2*((precision*recall)/(precision+recall))
    print(f"F1 score: {f_1}")
    save_dict['micro_f1'] = f_1
    
    exp_results[fold] = save_dict

Number of folds-/-n_gram used: 3-/-2




HBox(children=(FloatProgress(value=0.0, max=3.0), HTML(value='')))



---------------: fold_0 

length of train and test data : 36763/18382
Creating the emission and transition probability matrix: 


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Finished emission prob matrix
 Number of 2 grams present in the dataset: 0.738769 million


HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))


Finished tansition prob matrix
Creating emission and transition probability dict for O(1) searching


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Starting viterbi decoding: 


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


starting metric calcualtion
micro-- 0.813953488372093 0.813953488372093
Accuracy: 0.813953488372093
F1 score: 0.8139534883720931


---------------: fold_1 

length of train and test data : 36763/18382
Creating the emission and transition probability matrix: 


  precision = [i[r] for r,i in enumerate(conf_m)]/pre
  recall = [i[r] for r,i in enumerate(conf_m)]/rec


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Finished emission prob matrix
 Number of 2 grams present in the dataset: 0.737799 million


HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))


Finished tansition prob matrix
Creating emission and transition probability dict for O(1) searching


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Starting viterbi decoding: 


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


starting metric calcualtion
micro-- 0.84 0.84
Accuracy: 0.84
F1 score: 0.8399999999999999


---------------: fold_2 

length of train and test data : 36764/18381
Creating the emission and transition probability matrix: 


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Finished emission prob matrix
 Number of 2 grams present in the dataset: 0.735526 million


HBox(children=(FloatProgress(value=0.0, max=473.0), HTML(value='')))


Finished tansition prob matrix
Creating emission and transition probability dict for O(1) searching


HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=472.0), HTML(value='')))


Starting viterbi decoding: 


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


starting metric calcualtion
micro-- 0.8285714285714286 0.8285714285714286
Accuracy: 0.8285714285714286
F1 score: 0.8285714285714286



In [43]:
exp_results['fold_0'].keys()

dict_keys(['n_gram', 'pos_tags', 'vocab', 't_prob', 'e_prob', 'true_val', 'pred_val', 'pos_index', 'index_pos', 'labels', 'conf_matrix', 'classwise_prec_and_rec', 'micro_precision_and_recall', 'accuracy', 'micro_f1'])

In [44]:
exp_results['fold_0']['micro_f1']

0.8139534883720931

In [45]:
with open("exp_results.pk", "wb")as f:
    pickle.dump(exp_results, f, protocol=pickle.HIGHEST_PROTOCOL)

In [39]:
with open('exp_results.pk', 'rb') as f:
    exp_results = pickle.load(f)


In [None]:
# txt = [[(j[1], pos_tags, t_tag, start_prob, transition_probability, emission_probability) for j in i] for i in val]
# from concurrent.futures import ProcessPoolExecutor

# def run(Viterbit, txt):
#     with ProcessPoolExecutor(max_workers=32) as executor:
#         results = list(tqdm((executor.map(Viterbit, txt)), total=len(txt)))
#     return results

# print("starting the processes")
# temp = run(Viterbit, txt)
# print("saving the files")

In [None]:
pos_index = {i:e for e,i in enumerate(pos_tags)}
index_pos = {e:i for e,i in enumerate(pos_tags)}

In [None]:
expected = [pos_index[i] for i in  true]
predicted = [pos_index[i] for i in  out]

In [None]:
len(expected), len(predicted)

# Confusion matrix

In [None]:
# expected = [1, 1, 0,2,2,2,3,3,4,0,0,4,4,4,4,4,4,4,4,7]
# predicted = [1, 0, 0,2,1,2,3,3,3,4,3,3,4,3,4,4,4,4,4,6]

# # expected = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0]
# # predicted = [1, 0, 0, 1, 0, 0, 1, 1, 1, 0]

# # Counter(expected)
labels = set(expected+predicted)

In [None]:
# labels = sorted(pos_index.values())

In [None]:
eps = 0.1e-200

dummy = OrderedDict()
for i in labels:
    dummy[str(i)] = {str(j) : eps for j in labels}
for r, i in enumerate(expected):
    dummy[str(i)][str(predicted[r])] +=1
    
conf_m = np.array([[j for j in i.values()] for i in dummy.values()]) # confusion matrix
# dummy, conf_m

In [None]:
pre = np.sum(conf_m,axis=0)
rec = np.sum(conf_m,axis=1)
#classwise 
precision = [i[r] for r,i in enumerate(conf_m)]/pre 
recall = [i[r] for r,i in enumerate(conf_m)]/rec 
# print("--",precision, recall)
#overall macro
print("macro--",sum(precision)/ len(precision), sum(recall)/len(recall))

#overall micro
precision = sum([i[r] for r,i in enumerate(conf_m)])/np.sum(pre) 
recall = sum([i[r] for r,i in enumerate(conf_m)])/np.sum(rec)
print("micro--",precision, recall)

print(f"Accuracy: {len([i for i in range(len(expected)) if expected[i]==predicted[i]])/len(expected)}")

f"F1 score: {2*((precision*recall)/(precision+recall))}"

In [None]:
from sklearn.metrics import confusion_matrix
results = confusion_matrix(expected, predicted)
from sklearn.metrics import precision_score, recall_score, f1_score
# precision_score(expected, predicted, average =None), recall_score(expected, predicted, average =None)

In [None]:
precision_score(expected, predicted, average ="macro"), recall_score(expected, predicted, average ="macro")

In [None]:
precision_score(expected, predicted, average ="weighted"), recall_score(expected, predicted, average ="weighted")

In [None]:
precision_score(expected, predicted, average ="micro"), recall_score(expected, predicted, average ="micro")

In [None]:
f1_score(expected, predicted, average='micro'),f1_score(expected, predicted, average='macro'),f1_score(expected, predicted, average='weighted')