In [1]:
import pandas as pd
import pyconll
from nltk.tokenize import RegexpTokenizer
import numpy as np

In [2]:
def compute_trasition_matrix(possible_tags,train):
    transition_matrix = np.zeros((len(possible_tags), len(possible_tags)), dtype='float32')
    for i,t1 in enumerate(possible_tags):
        for j,t2 in enumerate(possible_tags):
            transition_matrix[i][j] =  compute_transition_probability(train,t1,t2)
    return transition_matrix

In [3]:
def compute_emission_matrix(possible_tags, sentence, train):
    tokenizer = RegexpTokenizer(r'\w+')
    sentence_tokens = tokenizer.tokenize(sentence)
    emission_matrix = np.zeros((len(possible_tags), len(sentence_tokens)), dtype='float32')
    for i,t in enumerate(possible_tags):
        for j,w in enumerate(sentence_tokens):
            emission_matrix[i][j] = compute_emission_probability(train,t,w)
    return emission_matrix   

In [4]:
#t2_given_t1
def compute_transition_probability(train,tag1,tag2):
    count_t1_before_t2 = 0
    count_t1 = 0
    for sentence in train:
        for i in range (len(sentence)):
            if sentence[i-1].upos == tag1 and sentence[i].upos == tag2 and i != 0:
                count_t1_before_t2 = count_t1_before_t2 + 1
            if sentence[i].upos == tag1:
                count_t1 = count_t1 + 1
    return count_t1_before_t2/count_t1

In [5]:
def compute_initial_transition_probabilities(possible_tags, train):
    initial_probabilities = np.zeros((1,len(possible_tags)), dtype='float32')
    for i,t in enumerate(possible_tags):
        initial_probabilities[0][i] = tag_initial_state_probability(train, t)
    return initial_probabilities

In [6]:
def tag_initial_state_probability(train, tag):
    count_initial_t = 0
    for sentence in train:
        if sentence[0].upos == tag:
            count_initial_t = count_initial_t + 1
                
    return count_initial_t/len(train)

In [7]:
#word_given_tag
def compute_emission_probability(train,tag,word):
    count_tag_word = 0
    count_tag = 0
    count_word = 0
    for sentence in train:
        for token in sentence:
            if token.form == word:
                count_word = count_word + 1
            if token.upos == tag and token.form == word:
                count_tag_word = count_tag_word + 1
            if token.upos == tag:
                count_tag = count_tag + 1
    if count_word == 0:
        return 1/14
    return count_tag_word/count_tag

In [8]:
train = pyconll.load_from_file('la_llct-ud-train.conllu')

In [9]:
tokenizer = RegexpTokenizer(r'\w+')

In [10]:
start = ['START']
possible_tags = ['ADJ','ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN',
                     'NUM', 'PART', 'PRON','PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']

sentence = "Δημήτριος δ̓ ὁ Πολιορκητὴς οὐ δαιμονίως ἤρα Λαμίας τῆς αὐλητρίδος, ἐξ ἧς ἔσχε καὶ θυγατέρα Φίλαν"

In [11]:
transition_matrix = pd.DataFrame(compute_trasition_matrix(possible_tags, train), columns = list(possible_tags), index=list(possible_tags))
#transition_matrix.to_csv('transition_matrix.csv', index = False)
#transition_matrix = pd.read_csv('transition_matrix.csv')

initial_transition_probabilities = pd.DataFrame(compute_initial_transition_probabilities(possible_tags, train), columns = list(possible_tags), index=list(start))

emission_matrix = pd.DataFrame(compute_emission_matrix(possible_tags, sentence, train), columns = list(tokenizer.tokenize(sentence)), index=list(possible_tags))

In [12]:
pd.set_option('display.notebook_repr_html', True)
transition_matrix

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,VERB,X
ADJ,0.089186,0.022273,0.005023,0.034594,0.047578,0.006729,0.283196,0.000758,0.000284,0.008435,0.213819,0.252867,0.000569,0.034499,0.00019
ADP,0.090429,0.002025,0.008323,0.0,0.0,0.244686,0.353729,0.006355,0.0,0.105331,0.147677,0.000562,0.000169,0.040659,5.6e-05
ADV,0.011566,0.214399,0.048576,0.017782,0.013879,0.043516,0.080382,0.00506,0.029059,0.061009,0.129536,0.032529,0.010843,0.30172,0.000145
AUX,0.089726,0.109915,0.013459,0.0,0.026021,0.018394,0.061014,0.001795,0.000449,0.154329,0.014805,0.275011,0.005384,0.229699,0.0
CCONJ,0.05447,0.157828,0.050239,0.004051,0.007563,0.089943,0.263798,0.029621,0.010354,0.037364,0.05456,0.004952,0.059242,0.175925,9e-05
DET,0.061589,0.056543,0.076478,0.001415,0.024611,0.055128,0.447487,0.007629,0.000554,0.033225,0.076724,0.060604,0.014705,0.083308,0.0
NOUN,0.101177,0.092977,0.023564,0.007742,0.076263,0.151512,0.082509,0.018716,0.000892,0.027326,0.077879,0.180141,0.007091,0.151778,0.000434
NUM,0.015735,0.189908,0.034183,0.003256,0.051546,0.010309,0.421595,0.022789,0.0,0.003798,0.000543,0.185567,0.001085,0.059685,0.0
PART,0.0,0.015009,0.011257,0.223265,0.043152,0.001876,0.071295,0.0,0.0,0.009381,0.001876,0.0,0.0,0.622889,0.0
PRON,0.005215,0.08203,0.049471,0.020414,0.039115,0.100954,0.13992,0.009835,0.001714,0.079049,0.253241,0.020861,0.003129,0.19103,0.004023


In [13]:
initial_transition_probabilities

Unnamed: 0,ADJ,ADP,ADV,AUX,CCONJ,DET,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,VERB,X
START,0.046508,0.022911,0.070243,0.000412,0.191384,0.034984,0.139388,0.001372,0.000549,0.025792,0.009878,0.378241,0.003841,0.074359,0.000137


In [31]:
print(emission_matrix)

       Δημήτριος         δ         ὁ  Πολιορκητὴς        οὐ  δαιμονίως  \
ADJ     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
ADP     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
ADV     0.000000  0.071429  0.000000     0.071429  0.063072   0.000074   
CCONJ   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
DET     0.000000  0.071429  0.070930     0.071429  0.000000   0.000000   
INTJ    0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
NOUN    0.000031  0.071429  0.000000     0.071429  0.000000   0.000000   
NUM     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
PART    0.000000  0.071429  0.000000     0.071429  0.006604   0.000000   
PRON    0.000000  0.071429  0.001264     0.071429  0.000000   0.000000   
PUNCT   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
SCONJ   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
VERB    0.000000  0.071429  0.000000  

In [32]:

def viterbi_algorithm(sentence, possible_tags, transition_matrix, emission_matrix):
    print("Viterbi Algorithm...")
    states = []    
    sentence_tokens = tokenizer.tokenize(sentence)
    
    for key,word in enumerate(sentence_tokens):
        p = []
        for t,tag in enumerate(possible_tags):
            s = 0
            if key == 0:
                trasition_p = initial_transition_probabilities.iloc[0][tag]
            else:
                trasition_p = transition_matrix.loc[states[s - 1]][tag]
            emission_p = emission_matrix.iloc[t,key]
            state_probability = emission_p * trasition_p
            p.append(state_probability)
            s = s + 1
        pmax = max(p)
        state_max = possible_tags[p.index(pmax)]
        states.append(state_max)
            
    print(list(zip(sentence_tokens,states)))


In [33]:
#test
viterbi_algorithm(sentence, possible_tags, transition_matrix, emission_matrix)

Viterbi Algorithm...
[('Δημήτριος', 'NOUN'), ('δ', 'VERB'), ('ὁ', 'DET'), ('Πολιορκητὴς', 'NOUN'), ('οὐ', 'ADV'), ('δαιμονίως', 'ADV'), ('ἤρα', 'VERB'), ('Λαμίας', 'PUNCT'), ('τῆς', 'DET'), ('αὐλητρίδος', 'NOUN'), ('ἐξ', 'ADP'), ('ἧς', 'PRON'), ('ἔσχε', 'VERB'), ('καὶ', 'CCONJ'), ('θυγατέρα', 'NOUN'), ('Φίλαν', 'VERB')]
