In [205]:
import numpy as np
import pandas as pd
from nltk.corpus import treebank

In [206]:
tagged_sentences = treebank.tagged_sents()[0:]

### Function

In [207]:
def getTransitions(POSTaggedSent):
    tag_count = {}
    transitions = {}
    unique_tags = set()

    for sentence in POSTaggedSent:
        for i in range(0, len(sentence)-1):
            tag = sentence[i][1]
            next_tag = sentence[i+1][1]

            if(tag not in tag_count):
                tag_count[tag] = 1
            else:
                tag_count[tag] += 1

            if((tag,next_tag) not in transitions):
                transitions[(tag,next_tag)] = 1
            else:
                transitions[(tag,next_tag)] += 1

            unique_tags.add(tag)
        
    for transition in transitions:
        first_tag = transition[0]
        first_tag_cnt = tag_count[first_tag]
        transitions[transition] = transitions[transition]/first_tag_cnt

    return tag_count,transitions,unique_tags
            

In [208]:
def getEmissions(POSTaggedSent):
    word_count = {}
    emissions = {}
    unqiue_words = set()

    for sentence in POSTaggedSent:
        for i in range(0, len(sentence)-1):
            word = sentence[i][0]
            tag = sentence[i][1]

            if(word not in word_count):
                word_count[word] = 1
            else:
                word_count[word] += 1

            if((word,tag) not in emissions):
                emissions[(word,tag)] = 1
            else:
                emissions[(word,tag)] += 1
            
            unqiue_words.add(word)

    for emission in emissions:
        word = emission[0]
        emissions[emission] = emissions[emission]/word_count[word]

    return word_count, emissions, unqiue_words
        

In [209]:
def getTransitionEmissionTables(POSTaggedSentences):
    _, transition_probs, unique_tags = getTransitions(tagged_sentences)
    _, emission_probs, unique_words = getEmissions(tagged_sentences)


    Transition = pd.DataFrame(index=(list(unique_tags)), columns=(list(unique_tags)))
    Transition.fillna(0.0,inplace=True)

    for transition in transition_probs:
        Transition.loc[transition[0],transition[1]] = transition_probs[transition]

    Emission = pd.DataFrame(index=(list(unique_words)), columns=(list(unique_tags)))
    Emission.fillna(0.0,inplace=True)

    for emission in emission_probs:
        Emission.loc[emission[0],emission[1]] = emission_probs[emission]

    return Transition,Emission

### Get transition and emission table

In [210]:
transition_table, emmision_table = getTransitionEmissionTables(tagged_sentences)

In [211]:
transition_table

Unnamed: 0,NNPS,WDT,SYM,WP,VBD,VBG,NNS,RBR,RP,EX,...,",",'',JJS,$,JJ,NNP,-NONE-,UH,MD,NN
NNPS,0.008368,0.0,0.0,0.0,0.041841,0.0,0.008368,0.0,0.0,0.0,...,0.138075,0.0,0.0,0.0,0.0,0.330544,0.008368,0.0,0.033473,0.020921
WDT,0.0,0.0,0.0,0.0,0.004494,0.0,0.01573,0.0,0.0,0.002247,...,0.0,0.0,0.002247,0.0,0.011236,0.011236,0.858427,0.0,0.002247,0.006742
SYM,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
WP,0.0,0.0,0.0,0.0,0.0,0.0,0.008299,0.0,0.0,0.0,...,0.004149,0.0,0.0,0.0,0.004149,0.020747,0.780083,0.0,0.0,0.012448
VBD,0.0,0.0,0.0,0.000657,0.0,0.017417,0.019389,0.002629,0.015445,0.0,...,0.008873,0.000329,0.0,0.013802,0.044364,0.030233,0.273086,0.000329,0.0,0.029576
VBG,0.000685,0.0,0.0,0.002055,0.00137,0.002055,0.094521,0.003425,0.019178,0.0,...,0.017808,0.00137,0.0,0.006164,0.069178,0.031507,0.075342,0.0,0.0,0.146575
NNS,0.0,0.014557,0.0,0.008768,0.074442,0.01158,0.012572,0.001323,0.000331,0.0,...,0.119107,0.004301,0.0,0.000496,0.018362,0.001158,0.040695,0.0,0.026634,0.022498
RBR,0.0,0.0,0.0,0.0,0.007353,0.014706,0.0,0.0,0.0,0.0,...,0.022059,0.0,0.0,0.0,0.375,0.0,0.022059,0.0,0.007353,0.0
RP,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,0.0,0.0,...,0.027778,0.009259,0.0,0.009259,0.060185,0.023148,0.115741,0.0,0.0,0.050926
EX,0.0,0.0,0.0,0.0,0.136364,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056818,0.0


In [212]:
emmision_table

Unnamed: 0,NNPS,WDT,SYM,WP,VBD,VBG,NNS,RBR,RP,EX,...,",",'',JJS,$,JJ,NNP,-NONE-,UH,MD,NN
prejudice,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
margins,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
invariably,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ire,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
noodles,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Veselich,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Even,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
battery-operated,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
FAX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Viterbi