In [16]:
import pandas as pd
import pyconll
from nltk.tokenize import RegexpTokenizer
import numpy as np

In [4]:
def compute_trasition_matrix(possible_tags,train):
    transition_matrix = np.zeros((len(possible_tags), len(possible_tags)), dtype='float32')
    for i,t1 in enumerate(possible_tags):
        for j,t2 in enumerate(possible_tags):
            transition_matrix[i][j] =  compute_transition_probability(train,t1,t2)
    return transition_matrix

In [5]:
def compute_emission_matrix(possible_tags, sentence, train):
    tokenizer = RegexpTokenizer(r'\w+')
    sentence_tokens = tokenizer.tokenize(sentence)
    emission_matrix = np.zeros((len(possible_tags), len(sentence_tokens)), dtype='float32')
    for i,t in enumerate(possible_tags):
        for j,w in enumerate(sentence_tokens):
            emission_matrix[i][j] = compute_emission_probability(train,t,w)
    return emission_matrix   

In [6]:
#t2_given_t1
def compute_transition_probability(train,tag1,tag2):
    count_t1_before_t2 = 0
    count_t1 = 0
    for sentence in train:
        for i in range (len(sentence)):
            if sentence[i-1].upos == tag1 and sentence[i].upos == tag2 and i != 0:
                count_t1_before_t2 = count_t1_before_t2 + 1
            if sentence[i].upos == tag1:
                count_t1 = count_t1 + 1
    return count_t1_before_t2/count_t1

In [7]:
def compute_initial_transition_probabilities(possible_tags, train):
    initial_probabilities = np.zeros((1,len(possible_tags)), dtype='float32')
    for i,t in enumerate(possible_tags):
        initial_probabilities[0][i] = tag_initial_state_probability(train, t)
    return initial_probabilities

In [8]:
def tag_initial_state_probability(train, tag):
    count_initial_t = 0
    count_t = 0
    for sentence in train:
        if sentence[0].upos == tag:
            count_initial_t = count_initial_t + 1
        for token in sentence:
            if token.upos == tag:
                count_t = count_t + 1
                
    return count_initial_t/count_t

In [9]:
#word_given_tag
def compute_emission_probability(train,tag,word):
    count_tag_word = 0
    count_tag = 0
    count_word = 0
    for sentence in train:
        for token in sentence:
            if token.form == word:
                count_word = count_word + 1
            if token.upos == tag and token.form == word:
                count_tag_word = count_tag_word + 1
            if token.upos == tag:
                count_tag = count_tag + 1
    if count_word == 0:
        return 1/14
    return count_tag_word/count_tag

In [10]:
train = pyconll.load_from_file('grc_perseus-ud-train.conllu')

In [11]:
tokenizer = RegexpTokenizer(r'\w+')

In [12]:
start = ['START']
possible_tags = ['ADJ','ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN',
                 'NUM', 'PART', 'PRON', 'PUNCT', 'SCONJ', 'VERB', 'X']

sentence = "Δημήτριος δ̓ ὁ Πολιορκητὴς οὐ δαιμονίως ἤρα Λαμίας τῆς αὐλητρίδος, ἐξ ἧς ἔσχε καὶ θυγατέρα Φίλαν"

In [37]:
transition_matrix = pd.DataFrame(compute_trasition_matrix(possible_tags, train), columns = list(possible_tags), index=list(possible_tags))
#transition_matrix.to_csv('transition_matrix.csv', index = False)
#transition_matrix = pd.read_csv('transition_matrix.csv')

initial_transition_probabilities = pd.DataFrame(compute_initial_transition_probabilities(possible_tags, train), columns = list(possible_tags), index=list(start))

emission_matrix = pd.DataFrame(compute_emission_matrix(possible_tags, sentence, train), columns = list(tokenizer.tokenize(sentence)), index=list(possible_tags))

In [38]:
print(transition_matrix)

         ADJ       ADP       ADV     CCONJ       DET      INTJ      NOUN  \
0   0.075908  0.045142  0.041618  0.023214  0.026291  0.000392  0.280192   
1   0.144166  0.002418  0.028884  0.005090  0.146456  0.000000  0.487721   
2   0.090145  0.046916  0.116627  0.017409  0.051638  0.000074  0.116480   
3   0.141324  0.051427  0.103462  0.005467  0.076129  0.000607  0.290747   
4   0.166345  0.039027  0.102092  0.018400  0.022110  0.000000  0.526191   
5   0.241935  0.002688  0.064516  0.000000  0.002688  0.075269  0.381720   
6   0.139245  0.046763  0.042743  0.035410  0.027892  0.000153  0.139859   
7   0.141463  0.034146  0.024390  0.048780  0.009756  0.000000  0.326829   
8   0.124441  0.063101  0.110720  0.026047  0.012620  0.000000  0.206765   
9   0.086120  0.038200  0.082718  0.018177  0.029646  0.000972  0.127527   
10  0.047969  0.026868  0.054920  0.047279  0.020016  0.004979  0.051075   
11  0.069152  0.023452  0.152736  0.015634  0.036079  0.000000  0.078773   
12  0.101454

In [30]:
print(initial_transition_probabilities)

            ADJ       ADP       ADV     CCONJ       DET      INTJ      NOUN  \
START  0.054875  0.066675  0.211567  0.107107  0.092744  0.540323  0.041546   

            NUM      PART      PRON  PUNCT     SCONJ      VERB         X  
START  0.082927  0.029349  0.198484    0.0  0.169573  0.055198  0.131579  


In [31]:
print(emission_matrix)

       Δημήτριος         δ         ὁ  Πολιορκητὴς        οὐ  δαιμονίως  \
ADJ     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
ADP     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
ADV     0.000000  0.071429  0.000000     0.071429  0.063072   0.000074   
CCONJ   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
DET     0.000000  0.071429  0.070930     0.071429  0.000000   0.000000   
INTJ    0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
NOUN    0.000031  0.071429  0.000000     0.071429  0.000000   0.000000   
NUM     0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
PART    0.000000  0.071429  0.000000     0.071429  0.006604   0.000000   
PRON    0.000000  0.071429  0.001264     0.071429  0.000000   0.000000   
PUNCT   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
SCONJ   0.000000  0.071429  0.000000     0.071429  0.000000   0.000000   
VERB    0.000000  0.071429  0.000000  

In [32]:

def viterbi_algorithm(sentence, possible_tags, transition_matrix, emission_matrix):
    print("Viterbi Algorithm...")
    states = []    
    sentence_tokens = tokenizer.tokenize(sentence)
    
    for key,word in enumerate(sentence_tokens):
        p = []
        for t,tag in enumerate(possible_tags):
            s = 0
            if key == 0:
                trasition_p = initial_transition_probabilities.iloc[0][tag]
            else:
                trasition_p = transition_matrix.loc[states[s - 1]][tag]
            emission_p = emission_matrix.iloc[t,key]
            state_probability = emission_p * trasition_p
            p.append(state_probability)
            s = s + 1
        pmax = max(p)
        state_max = possible_tags[p.index(pmax)]
        states.append(state_max)
            
    print(list(zip(sentence_tokens,states)))


In [33]:
#test
viterbi_algorithm(sentence, possible_tags, transition_matrix, emission_matrix)

Viterbi Algorithm...
[('Δημήτριος', 'NOUN'), ('δ', 'VERB'), ('ὁ', 'DET'), ('Πολιορκητὴς', 'NOUN'), ('οὐ', 'ADV'), ('δαιμονίως', 'ADV'), ('ἤρα', 'VERB'), ('Λαμίας', 'PUNCT'), ('τῆς', 'DET'), ('αὐλητρίδος', 'NOUN'), ('ἐξ', 'ADP'), ('ἧς', 'PRON'), ('ἔσχε', 'VERB'), ('καὶ', 'CCONJ'), ('θυγατέρα', 'NOUN'), ('Φίλαν', 'VERB')]
