In [22]:
import sys
import numpy as np
# Validation helpers. This import runs before any sys.path change below, so
# `validate_utils` must already be importable at this point (presumably from
# the notebook's own directory — TODO confirm).
from validate_utils import validate_transition_matrix, validate_emission_matrix, validate_initial_state

# Put the parent directory on the import path so the `src` package resolves.
sys.path.insert(1, "..")
from src.scrapper import parse_conllu_file
from src.tagger import HiddenMarkovModelTrainer, HiddenMarkovModelTagger
# NOTE(review): this runs after every import in this cell, so it cannot affect
# any of them — confirm whether it is still needed.
sys.path.insert(1, ".")

In [23]:
# Parse the training file, then build a trainer on it and fit the tagger.
c = parse_conllu_file(filepath="../datasets/en_partut-ud-train.conllu")
trainer = HiddenMarkovModelTrainer(corpus=c)
tagger = trainer.train()

## TESTING OF MATRICES

Now we are going to check that all the probabilities sum to 1.

In [None]:
# Run the three validators on the trained tagger's parameters and report the
# outcome. A failure is printed with its exception type and message instead of
# being silently discarded, so a broken model cannot pass unnoticed.
# Assumes the validators signal failure by raising — TODO confirm.
try:
    validate_transition_matrix(tagger.transition_matrix)
    validate_emission_matrix(tagger.emission_matrix)
    validate_initial_state(tagger.initial_state)
except Exception as err:
    # `Exception` (not a bare `except`) lets KeyboardInterrupt and SystemExit
    # propagate, so the cell can still be interrupted.
    print(f"PROBABILITY VALIDATION FAILED: {type(err).__name__}: {err}")
else:
    # Reached only when none of the validators raised.
    print("ALL PROBABILITIES SUM 1 :)")

## TESTING WITH EXAMPLES


In [26]:
# Hand-built two-tag model ("N", "V") over a three-word vocabulary, small
# enough to verify the decoder by hand.
# The scores are negative numbers; presumably log-probabilities — TODO confirm.

# Transition scores, keyed tag -> tag.
A = {
    "N": {"N": -3, "V": -1},
    "V": {"N": -1, "V": -3},
}

# Emission scores, keyed word -> tag.
B = {
    "they": {"N": -2, "V": -10},
    "can": {"N": -3, "V": -1},
    "fish": {"N": -3, "V": -3},
}

# Initial-state scores, keyed by tag.
PI = {"N": -1, "V": -3}

vocabulary = ["they", "can", "fish"]
tagset = ["N", "V"]

tagger_1 = HiddenMarkovModelTagger(
    transition_matrix=A,
    emission_matrix=B,
    initial_state=PI,
    tagset=tagset,
    vocabulary=vocabulary,
)

In [32]:
# Decode the toy sentence with the hand-built tagger, then print the three
# returned values, each followed by a blank line.
viterbi, best_path, best_prob = tagger_1.viterbi_best_path("they can fish")

for result in (viterbi, best_path, best_prob):
    print(result, "\n")

[[ -3.  -9.  -9.]
 [-13.  -5. -11.]] 

[('they', 'N'), ('can', 'V'), ('fish', 'N')] 

-9.0 



In [18]:
# NOTE(review): this parse/train step repeats the one in the earlier training
# cell with the same file — confirm whether the second run is needed.
c = parse_conllu_file(filepath="../datasets/en_partut-ud-train.conllu")
trainer = HiddenMarkovModelTrainer(corpus=c)
tagger = trainer.train()

# Sentence to tag; adjacent literals concatenate into one string.
sent = (
    "The labor-participation rate for women with post-secondary education is 64%, "
    "far exceeding the 35% rate for those with only a primary or middle-school education."
)

# Only the decoded path is displayed; the other two return values are unused here.
matrix, path, prob = tagger.viterbi_best_path(sentence=sent)
print(path)

[('The', 'noun'), ('labor-participation', 'noun'), ('rate', 'noun'), ('for', 'adp'), ('women', 'noun'), ('with', 'adp'), ('post-secondary', 'noun'), ('education', 'noun'), ('is', 'aux'), ('64%,', 'verb'), ('far', 'adv'), ('exceeding', 'verb'), ('the', 'det'), ('35%', 'noun'), ('rate', 'noun'), ('for', 'adp'), ('those', 'det'), ('with', 'adp'), ('only', 'adv'), ('a', 'det'), ('primary', 'noun'), ('or', 'cconj'), ('middle-school', 'noun'), ('education.', 'noun')]


In [17]:
# Parse the en_eslspok dev-split CoNLL-U file and keep the result in `corpus`.
corpus = parse_conllu_file("../datasets/en_eslspok-ud-dev.conllu")