In [1]:
import pickle
import nltk
from nltk.tag.hmm import HiddenMarkovModelTagger as hmm_tag
from collections import Counter
from nltk.metrics import ConfusionMatrix
import numpy as np

In [2]:
with open("corpus_data/pickled/dev.pickle", "rb") as file:
    dev = pickle.load(file)

In [3]:
with open("corpus_data/pickled/test.pickle", "rb") as file:
    test = pickle.load(file)

In [4]:
with open("corpus_data/pickled/train.pickle", "rb") as file:
    train = pickle.load(file)

In [5]:
train_class = [(train[i][0], train[i][2]) for i in range(len(train))]
dev_class = [(dev[i][0], dev[i][2]) for i in range(len(dev))]
test_class = [(test[i][0], test[i][2]) for i in range(len(test))]

In [6]:
index = []
for i in range(len(train_class)-1):
    if train_class[i][0][-1] == '.' and train_class[i+1][0][0].isupper():
        index.append(i+1)

train_sents = [train_class[:index[0]]]
for i in range(len(index)-1):
    sent = train_class[index[i]:index[i+1]]
    train_sents.append(sent)

In [7]:
index = []
for i in range(len(test_class)-1):
    if test_class[i][0][-1] == '.' and test_class[i+1][0][0].isupper():
        index.append(i+1)

test_sents = [test_class[:index[0]]]
for i in range(len(index)-1):
    sent = test_class[index[i]:index[i+1]]
    test_sents.append(sent)

In [8]:
tagger = hmm_tag.train(train_sents)

In [16]:
words = []
tags = []
for sent in range(len(test_sents)):
    for word in range(len(test_sents[sent])):
        words.append(test_sents[sent][word][0])
        tags.append(test_sents[sent][word][1])

In [17]:
tagged_words = tagger.tag(words)
ref = [tagged_words[i][1] for i in range(len(tagged_words))]

In [18]:
tagsFDist = nltk.FreqDist(tags)
tagsFDist

FreqDist({'O': 16693, 'P': 2025, 'M': 1564, 'T': 828})

In [19]:
#tags = len(tagged_words)*['O']

In [20]:
cm = ConfusionMatrix(ref, tags)
print(cm)

  |     M     O     P     T |
--+-------------------------+
M |  <578>  550   239    29 |
O |   503<13634>  699   391 |
P |   162   572  <546>  105 |
T |   321  1937   541  <303>|
--+-------------------------+
(row = reference; col = test)



In [21]:
labels = set('M O P T'.split())

true_positives = Counter()
false_negatives = Counter()
false_positives = Counter()

for i in labels:
    for j in labels:
        if i == j:
            true_positives[i] += cm[i,j]
        else:
            false_negatives[i] += cm[i,j]
            false_positives[j] += cm[i,j]

f1_ind = []
for i in sorted(labels):
    if true_positives[i] == 0:
        fscore = 0
    else:
        precision = true_positives[i] / float(true_positives[i]+false_positives[i])
        recall = true_positives[i] / float(true_positives[i]+false_negatives[i])
        fscore = 2 * (precision * recall) / float(precision + recall)
    f1_ind.append(fscore)
    print(i, fscore)

M 0.3905405405405405
O 0.8542606516290727
P 0.3202346041055718
T 0.15419847328244274


In [24]:
f1_mean = (f1_ind[0]+f1_ind[2]+f1_ind[3])/3
f1_mean

0.28832453930951835