# Morphological Support For Unknown Words

In [1]:
from xtagger import HiddenMarkovModel
from xtagger import EnglishRegExTagger

import nltk
from sklearn.model_selection import train_test_split

# Load the tagged sentences of NLTK's treebank corpus, mapped to the universal
# tagset. NOTE: this needs the `treebank` and `universal_tagset` NLTK resources
# to be installed locally (e.g. via `nltk.download(...)`).
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# 80/20 train/test split. `random_state` is fixed so that a fresh
# "Restart Kernel -> Run All" always yields the same partition; without it the
# split (and every metric computed from it) changes on each run.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [2]:
# Suffix heuristics: every token ending in one of these suffixes maps to VERB.
verb_suffixes = ("ing", "ed", "es")
rules = [(r".*" + suffix + "$", "VERB") for suffix in verb_suffixes]

# Regex tagger in "morphological" mode, fed with the hand-written rules above.
# NOTE(review): use_default=False presumably disables the library's built-in
# rule set so only `rules` applies — confirm against the xtagger docs.
morphological_tagger = EnglishRegExTagger(rules=rules, use_default=False, mode="morphological")

In [3]:
# Constructor options for the HMM tagger. The regex tagger is attached here;
# later cells opt into it per call with `morphological=True`.
hmm_options = dict(
    extend_to="bigram",
    language="en",
    morphological=morphological_tagger,
)
model = HiddenMarkovModel(**hmm_options)

# Train on the training portion of the split.
model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [4]:
# Evaluation settings gathered in one place so they are easy to tune.
# NOTE(review): random_size/seed presumably select a seeded random sample of
# 30 test sentences — confirm against the xtagger docs.
eval_options = {
    "random_size": 30,
    "seed": 15,
    "eval_metrics": ['acc', 'report'],
    "result_type": "%",
    "morphological": True,  # use the regex tagger attached to the model
}

# Bare last expression so the notebook displays the returned metrics.
model.evaluate(test_set, **eval_options)

HBox(children=(FloatProgress(value=0.0, max=750.0), HTML(value='')))


              precision    recall  f1-score   support

         ADJ       0.51      0.87      0.64        54
           .       1.00      1.00      1.00        87
         PRT       0.91      0.97      0.94        30
         ADP       0.97      0.95      0.96        64
           X       1.00      0.98      0.99        58
        NOUN       0.96      0.85      0.90       213
         DET       1.00      0.97      0.98        65
        VERB       0.93      0.86      0.89       103
        CONJ       0.94      1.00      0.97        17
         ADV       0.76      0.65      0.70        20
        PRON       1.00      1.00      1.00        20
         NUM       1.00      0.79      0.88        19

   micro avg       0.91      0.91      0.91       750
   macro avg       0.91      0.91      0.91       750
weighted avg       0.93      0.91      0.91       750
 samples avg       0.91      0.91      0.91       750



{'acc': 90.66666666666666}

In [6]:
# Tag one whitespace-tokenized example sentence with the regex tagger enabled.
sentence = "The existence of human mankind is insignificant".split()
model.predict(sentence, morphological=True)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




[('The', 'DET'),
 ('existence', 'NOUN'),
 ('of', 'ADP'),
 ('human', 'ADJ'),
 ('mankind', 'ADJ'),
 ('is', 'VERB'),
 ('insignificant', 'ADJ')]

# Prior Support For Computational Efficiency

In [7]:
from xtagger import HiddenMarkovModel
from xtagger import EnglishRegExTagger

import nltk
from sklearn.model_selection import train_test_split

# Reload the tagged sentences of NLTK's treebank corpus (universal tagset).
# NOTE: this needs the `treebank` and `universal_tagset` NLTK resources to be
# installed locally (e.g. via `nltk.download(...)`).
nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))

# 80/20 train/test split; this rebinds `train_set` / `test_set`.
# `random_state` is fixed so the partition is identical on every run instead
# of being reshuffled each time the cell executes.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Suffix heuristics: every token ending in one of these suffixes maps to VERB.
verb_suffixes = ("ing", "ed", "es")
rules = [(r".*" + suffix + "$", "VERB") for suffix in verb_suffixes]

# Regex tagger in "prior" mode, fed with the hand-written rules above.
# NOTE(review): use_default=False presumably disables the library's built-in
# rule set so only `rules` applies — confirm against the xtagger docs.
prior_tagger = EnglishRegExTagger(rules=rules, use_default=False, mode="prior")

In [12]:
# Constructor options for the HMM tagger. The prior regex tagger is attached
# here; later cells opt into it per call with `prior=True`.
# Note: this rebinds `model`, replacing any previously built model.
hmm_options = dict(
    extend_to="bigram",
    language="en",
    prior=prior_tagger,
)
model = HiddenMarkovModel(**hmm_options)

# Train on the training portion of the split.
model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [13]:
# Metrics to compute: accuracy plus a per-tag classification report.
metrics = ['acc', 'report']

# NOTE(review): random_size/seed presumably select a seeded random sample of
# 30 test sentences — confirm against the xtagger docs. `prior=True` turns on
# the prior tagger attached to the model. Kept as the bare last expression so
# the notebook displays the returned metrics.
model.evaluate(test_set, random_size=30, seed=15, eval_metrics=metrics, result_type="%", prior=True)

HBox(children=(FloatProgress(value=0.0, max=873.0), HTML(value='')))


              precision    recall  f1-score   support

         ADJ       0.38      0.81      0.51        48
           .       1.00      1.00      1.00       126
         PRT       0.93      0.97      0.95        29
         ADP       0.97      0.99      0.98        74
           X       1.00      0.94      0.97        70
         DET       1.00      0.98      0.99        63
        VERB       0.77      0.95      0.85       114
        CONJ       1.00      0.94      0.97        18
         ADV       0.87      0.68      0.76        19
        PRON       1.00      1.00      1.00        11
        NOUN       0.97      0.72      0.83       265
         NUM       1.00      0.83      0.91        36

   micro avg       0.88      0.88      0.88       873
   macro avg       0.91      0.90      0.89       873
weighted avg       0.92      0.88      0.89       873
 samples avg       0.88      0.88      0.88       873



{'acc': 87.62886597938144}

In [16]:
# Tag one whitespace-tokenized example sentence with the prior tagger enabled.
sentence = "The existence of human mankind is over prioritized".split()
model.predict(sentence, prior=True)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




[('The', 'DET'),
 ('existence', 'ADJ'),
 ('of', 'ADP'),
 ('human', 'ADJ'),
 ('mankind', 'ADJ'),
 ('is', 'VERB'),
 ('over', 'ADP'),
 ('prioritized', 'VERB')]