# Bigram HMMs

In [1]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

# Load the NLTK treebank corpus as tagged sentences mapped to the universal tagset.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Fix random_state so the 80/20 split is the same on every run; without it the
# held-out sentences (and therefore every reported score) change per execution.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [2]:
# Build an English HMM tagger in its "bigram" configuration, then fit it on
# the training sentences.
model = HiddenMarkovModel(extend_to="bigram", language="en")
model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [4]:
# Score the bigram tagger on the held-out sentences: overall accuracy plus
# per-tag F1, with result_type "%" as the requested result format.
# NOTE(review): random_size/seed presumably pick a seeded random subset of
# test sentences - confirm against the xtagger documentation.
model.evaluate(
    test_set,
    random_size=20,
    seed=15,
    eval_metrics=['acc', 'classwise_f1'],
    result_type="%",
)

HBox(children=(FloatProgress(value=0.0, max=568.0), HTML(value='')))




{'acc': 89.08450704225352,
 'classwise_f1': {'CONJ': 38.46153846153846,
  'PRON': 100.0,
  'VERB': 87.71929824561403,
  'NOUN': 87.97468354430379,
  'ADV': 72.72727272727272,
  '.': 100.0,
  'ADJ': 80.0,
  'PRT': 100.0,
  'ADP': 95.55555555555556,
  'NUM': 100.0,
  'DET': 98.87640449438202,
  'X': 100.0}}

In [5]:
# Pre-tokenised example sentence; predict() is given the list of tokens.
s = [
    "There", "are", "no", "two", "words", "in", "the",
    "English", "language", "more", "harmful", "than", "good", "job",
]

# Bare last expression so the notebook displays the (token, tag) pairs.
model.predict(s)

HBox(children=(FloatProgress(value=0.0, max=14.0), HTML(value='')))




[('There', 'DET'),
 ('are', 'VERB'),
 ('no', 'DET'),
 ('two', 'NUM'),
 ('words', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('English', 'ADJ'),
 ('language', 'NOUN'),
 ('more', 'ADV'),
 ('harmful', 'ADJ'),
 ('than', 'ADP'),
 ('good', 'ADJ'),
 ('job', 'NOUN')]

# Trigram HMMs

In [6]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

# Load the NLTK treebank corpus as tagged sentences mapped to the universal tagset.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Fix random_state so the 80/20 split is the same on every run; an unseeded
# split makes the scores below irreproducible.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build an English HMM tagger in its "trigram" configuration, then fit it on
# the training sentences.
model = HiddenMarkovModel(extend_to="trigram", language="en")
model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=1728.0), HTML(value='')))




In [8]:
# Score the trigram tagger on the held-out sentences: overall accuracy plus
# averaged recall, with result_type "%" as the requested result format.
# NOTE(review): random_size/seed presumably pick a seeded random subset of
# test sentences - confirm against the xtagger documentation.
model.evaluate(
    test_set,
    random_size=20,
    seed=15,
    eval_metrics=['acc', 'avg_recall'],
    result_type="%",
)

HBox(children=(FloatProgress(value=0.0, max=77184.0), HTML(value='')))




{'acc': 90.67164179104478,
 'avg_recall': {'weigted': 90.67164179104478,
  'micro': 90.67164179104478,
  'macro': 91.21351833618937}}

In [9]:
# Pre-tokenised example sentence (12 tokens); predict() is given the list.
s = [
    "Oh", "my", "dear", "God", "are", "you",
    "one", "of", "those", "single", "tear", "people",
]

# NOTE(review): the recorded output has only 11 pairs, with the last two
# tokens merged as 'tear,people' - looks like a final-step issue in the
# trigram decoding; confirm against the xtagger implementation.
model.predict(s)

HBox(children=(FloatProgress(value=0.0, max=1584.0), HTML(value='')))




[('Oh', 'X'),
 ('my', 'PRON'),
 ('dear', 'CONJ'),
 ('God', 'NOUN'),
 ('are', 'VERB'),
 ('you', 'PRON'),
 ('one', 'NUM'),
 ('of', 'ADP'),
 ('those', 'DET'),
 ('single', 'ADJ'),
 ('tear,people', 'CONJ')]

# Deleted Interpolation HMMs

In [10]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

# Load the NLTK treebank corpus as tagged sentences mapped to the universal tagset.
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# Fix random_state so the 80/20 split is the same on every run; an unseeded
# split makes the scores below irreproducible.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Build an English HMM tagger in its "deleted_interpolation" configuration,
# then fit it on the training sentences. The recorded output of fit() prints
# three lambda weights (λ1, λ2, λ3).
model = HiddenMarkovModel(extend_to="deleted_interpolation", language="en")
model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=3456.0), HTML(value='')))


λ1: 0.32782403380141667, λ2: 0.2937989312787374, λ3: 0.3783770349198459


In [12]:
# Score the deleted-interpolation tagger on the held-out sentences: overall
# accuracy plus a per-tag precision/recall/F1 report (printed, per the
# recorded output, rather than returned in the result dict).
# NOTE(review): random_size/seed presumably pick a seeded random subset of
# test sentences - confirm against the xtagger documentation.
model.evaluate(
    test_set,
    random_size=20,
    seed=15,
    eval_metrics=['acc', 'report'],
    result_type="%",
)

HBox(children=(FloatProgress(value=0.0, max=76176.0), HTML(value='')))


              precision    recall  f1-score   support

        CONJ       0.17      1.00      0.29        11
        PRON       1.00      1.00      1.00        10
        VERB       0.98      0.86      0.91        56
        NOUN       0.97      0.79      0.87       139
         ADV       1.00      0.77      0.87        13
           .       1.00      1.00      1.00        81
         ADJ       0.92      0.68      0.78        34
         PRT       0.90      1.00      0.95        18
         ADP       0.98      0.96      0.97        52
         NUM       1.00      0.77      0.87        39
         DET       1.00      1.00      1.00        49
           X       1.00      0.96      0.98        27

   micro avg       0.88      0.88      0.88       529
   macro avg       0.91      0.90      0.87       529
weighted avg       0.96      0.88      0.91       529
 samples avg       0.88      0.88      0.88       529



{'acc': 88.09073724007561}

In [13]:
# Pre-tokenised example sentence; predict() is given the list of tokens.
s = [
    "The", "next", "Charlie", "Parker",
    "would", "never", "be", "discouraged",
]

# NOTE(review): the recorded output tags 'Parker' and 'discouraged' as CONJ,
# and the report above shows CONJ precision of 0.17 - CONJ may be acting as
# a fallback tag for unseen words; confirm against the xtagger implementation.
model.predict(s)

HBox(children=(FloatProgress(value=0.0, max=1152.0), HTML(value='')))




[('The', 'DET'),
 ('next', 'ADJ'),
 ('Charlie', 'NOUN'),
 ('Parker', 'CONJ'),
 ('would', 'VERB'),
 ('never', 'ADV'),
 ('be', 'VERB'),
 ('discouraged', 'CONJ')]