# Hidden Markov Model (bigram)

In [3]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [4]:
model = HiddenMarkovModel(
    extend_to = "bigram",
    language = "en",
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [5]:
model.evaluate(
    test_set,
    random_size = 30,
    seed = 15,
    eval_metrics = ['acc', 'avg_recall'],
    result_type = "%",
)

HBox(children=(FloatProgress(value=0.0, max=702.0), HTML(value='')))




{'acc': 90.5982905982906,
 'avg_recall': {'weigted': 90.5982905982906,
  'micro': 90.5982905982906,
  'macro': 94.78558992447881}}

# Hidden Markov Model (trigram)

In [None]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [6]:
model = HiddenMarkovModel(
    extend_to = "trigram",
    language = "en",
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=1728.0), HTML(value='')))




In [7]:
model.evaluate(
    test_set,
    random_size = 30,
    seed = 15,
    eval_metrics = ['acc', 'avg_recall'],
    result_type = "%",
)

HBox(children=(FloatProgress(value=0.0, max=101088.0), HTML(value='')))




{'acc': 90.31339031339031,
 'avg_recall': {'weigted': 90.31339031339031,
  'micro': 90.31339031339031,
  'macro': 94.70444338704142}}

In [8]:
s = ["There", "are", "no", "two", "words", "in", "the", "English", 
     "language", "more", "harmful", "than", "good", "job"]

model.predict(s)

HBox(children=(FloatProgress(value=0.0, max=2016.0), HTML(value='')))




[('There', 'DET'),
 ('are', 'VERB'),
 ('no', 'DET'),
 ('two', 'NUM'),
 ('words', 'NOUN'),
 ('in', 'ADP'),
 ('the', 'DET'),
 ('English', 'ADJ'),
 ('language', 'NOUN'),
 ('more', 'ADJ'),
 ('harmful', 'NUM'),
 ('than', 'ADP'),
 ('good', 'ADJ'),
 ('job', 'NOUN')]

# Hidden Markov Model (deleted interpolation)

In [None]:
from xtagger import HiddenMarkovModel

import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [9]:
model = HiddenMarkovModel(
    extend_to = "deleted_interpolation",
    language = "en",
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=3456.0), HTML(value='')))


λ1: 0.32609235044537627, λ2: 0.2947175148251991, λ3: 0.3791901347294246


In [10]:
model.evaluate(
    test_set,
    random_size = 30,
    seed = 15,
    eval_metrics = ['acc', 'avg_recall'],
    result_type = "%",
)

HBox(children=(FloatProgress(value=0.0, max=101088.0), HTML(value='')))




{'acc': 90.17094017094017,
 'avg_recall': {'weigted': 90.17094017094017,
  'micro': 90.17094017094017,
  'macro': 94.26575829762105}}

# Hidden Markov Model (morphological support)

In [11]:
from xtagger import HiddenMarkovModel
from xtagger import EnglishRegExTagger

import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [12]:
rules = [
    (r'.*ing$', 'VERB'),
    (r'.*ed$',  'VERB'),
    (r'.*es$',  'VERB')
]

morphological_tagger = EnglishRegExTagger(
    rules = rules,
    use_default = False,
    mode = "morphological"
)

In [13]:
model = HiddenMarkovModel(
    extend_to = "bigram",
    language = "en",
    morphological = morphological_tagger,
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [14]:
model.evaluate(
    test_set,
    random_size = 30,
    seed = 15,
    eval_metrics = ['acc', 'report'],
    result_type = "%",
    morphological = True,
)

HBox(children=(FloatProgress(value=0.0, max=717.0), HTML(value='')))


              precision    recall  f1-score   support

         NUM       0.37      1.00      0.54        22
        PRON       1.00      1.00      1.00        30
        CONJ       0.94      1.00      0.97        15
        VERB       0.92      0.94      0.93        90
         DET       1.00      0.98      0.99        64
         PRT       0.94      1.00      0.97        17
        NOUN       0.97      0.84      0.90       198
         ADP       1.00      0.97      0.98        90
         ADJ       0.87      0.77      0.82        44
           .       1.00      1.00      1.00        85
           X       1.00      0.92      0.96        37
         ADV       0.90      0.76      0.83        25

   micro avg       0.92      0.92      0.92       717
   macro avg       0.91      0.93      0.91       717
weighted avg       0.95      0.92      0.93       717
 samples avg       0.92      0.92      0.92       717



{'acc': 91.77126917712691}

# Hidden Markov Model (prior support)

In [25]:
from xtagger import HiddenMarkovModel
from xtagger import EnglishRegExTagger

import nltk
import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [26]:
prior_tagger = EnglishRegExTagger(
    rules = rules,
    use_default = True,
    mode = "prior"
)

In [27]:
model = HiddenMarkovModel(
    extend_to = "bigram",
    language = "en",
    prior = prior_tagger,
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [28]:
model.evaluate(
    test_set,
    random_size = 30,
    seed = 15,
    eval_metrics = ['acc', 'avg_recall'],
    result_type = "%",
    prior = True,
)

HBox(children=(FloatProgress(value=0.0, max=809.0), HTML(value='')))




{'acc': 36.46477132262052,
 'avg_recall': {'weigted': 36.46477132262052,
  'micro': 36.46477132262052,
  'macro': 21.42495914871835}}

# Hidden Markov Model (user defined metric)

In [29]:
from xtagger import HiddenMarkovModel
from xtagger import EnglishRegExTagger
from xtagger import xMetrics

import nltk
import nltk
from sklearn.model_selection import train_test_split

nltk_data = list(nltk.corpus.treebank.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

In [30]:
model = HiddenMarkovModel(
    extend_to = "bigram",
    language = "en"
)

model.fit(train_set)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [31]:
class MyAcc(xMetrics):
    def __init__(self, y_true, y_pred, tags):
        super(MyAcc, self).__init__(y_true, y_pred, tags)
        
    def __call__(self):
        import numpy as np
        acc = 0
        for gt, pred in zip(self.y_true, self.y_pred):
            gt_index   = np.where(gt == 1)[0].item()
            pred_index = np.where(pred == 1)[0].item()
            
            if gt_index == pred_index:
                acc += 1
                
        return acc / self.y_true.shape[0]

In [32]:
model.evaluate(
    test_set,
    random_size = 10,
    seed = 15,
    eval_metrics = ['acc', 'classwise_precision', MyAcc],
    result_type = "%",
    prior = True,
)

HBox(children=(FloatProgress(value=0.0, max=306.0), HTML(value='')))




{'acc': 94.11764705882352,
 'classwise_precision': {'NUM': 51.724137931034484,
  'PRON': 100.0,
  'CONJ': 100.0,
  'VERB': 100.0,
  'DET': 96.96969696969697,
  'PRT': 100.0,
  'NOUN': 100.0,
  'ADP': 97.5609756097561,
  'ADJ': 100.0,
  '.': 100.0,
  'X': 100.0,
  'ADV': 50.0},
 'MyAcc': 0.9411764705882353}

# LSTM

In [1]:
import nltk
from sklearn.model_selection import train_test_split

import torch
from xtagger import LSTMForTagging
from xtagger import xtagger_dataset_to_df, df_to_torchtext_data


nltk_data = list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

df_train = xtagger_dataset_to_df(train_set)
df_test = xtagger_dataset_to_df(test_set)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator, TEXT, TAGS = df_to_torchtext_data(
    df_train, 
    df_test, 
    device, 
    batch_size = 32
)

Number of training examples: 8758
Number of testing examples: 2190
Unique tokens in TEXT vocabulary: 17385
Unique tokens in TAGS vocabulary: 13


In [2]:
input_dim = len(TEXT.vocab)
out_dim = len(TAGS.vocab)
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
tag_pad_idx = TAGS.vocab.stoi[TAGS.pad_token]


model = LSTMForTagging(
    input_dim, 
    out_dim, 
    TEXT, 
    TAGS, 
    cuda=True
)

The model has 2,372,625 trainable parameters


In [3]:
model.fit(
    train_iterator,
    test_iterator, 
    epochs = 3,
    eval_metrics=["acc", "avg_f1"]
)

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 94.89803741183685, 'avg_f1': {'weighted': 94.83446190272467, 'micro': 94.89803741183685, 'macro': 80.29061953217816}}, 'train': {'acc': 95.86812804453723, 'avg_f1': {'weighted': 95.8163241747707, 'micro': 95.86812804453723, 'macro': 81.04660665829542}}, 'eval_loss': 0.15775534607793973, 'train_loss': 0.13121628212015124}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 96.79546151487274, 'avg_f1': {'weighted': 96.80615410670855, 'micro': 96.79546151487274, 'macro': 81.93059608544998}}, 'train': {'acc': 98.36029923451636, 'avg_f1': {'weighted': 98.34978517904864, 'micro': 98.36029923451636, 'macro': 83.17007350815857}}, 'eval_loss': 0.09765658047104227, 'train_loss': 0.053714395731850696}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 97.34360625574978, 'avg_f1': {'weighted': 97.34907310177117, 'micro': 97.34360625574978, 'macro': 82.48313661755341}}, 'train': {'acc': 99.14173045697055, 'avg_f1': {'weighted': 99.12927282176535, 'micro': 99.14173045697055, 'macro': 83.87137727244681}}, 'eval_loss': 0.08171060540969821, 'train_loss': 0.028569950229304766}



In [4]:
model.evaluate(valid_iterator)

HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))




{'acc': 97.34355534259703}

In [5]:
s = ["There", "are", "no", "two", "words", "in", "the", "English", 
     "language", "more", "harmful", "than", "good", "job"]
model.predict(s)

([('there', 'DET'),
  ('are', 'VERB'),
  ('no', 'DET'),
  ('two', 'NUM'),
  ('words', 'NOUN'),
  ('in', 'ADP'),
  ('the', 'DET'),
  ('english', 'NOUN'),
  ('language', 'NOUN'),
  ('more', 'ADV'),
  ('harmful', 'ADJ'),
  ('than', 'ADP'),
  ('good', 'ADJ'),
  ('job', 'NOUN')],
 ['harmful'])

# LSTM With Checkpointing

In [6]:
import nltk
from sklearn.model_selection import train_test_split

import torch
from xtagger import LSTMForTagging
from xtagger import xtagger_dataset_to_df, df_to_torchtext_data


nltk_data = list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

df_train = xtagger_dataset_to_df(train_set)
df_test = xtagger_dataset_to_df(test_set)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator, TEXT, TAGS = df_to_torchtext_data(
    df_train, 
    df_test, 
    device, 
    batch_size = 32
)

Number of training examples: 8758
Number of testing examples: 2190
Unique tokens in TEXT vocabulary: 17430
Unique tokens in TAGS vocabulary: 13


In [6]:
input_dim = len(TEXT.vocab)
out_dim = len(TAGS.vocab)
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
tag_pad_idx = TAGS.vocab.stoi[TAGS.pad_token]


model = LSTMForTagging(
    input_dim, 
    out_dim, 
    TEXT, 
    TAGS, 
    cuda=True
)

The model has 2,372,625 trainable parameters


In [9]:
from xtagger import Checkpointing
checkpointing = Checkpointing(
    model_path = "./", 
    model_name = "lstm_tagger.pt", 
    monitor = "eval_acc",
    mode = "maximize",
    verbose = 1
)

In [11]:
model.fit(
    train_iterator,
    test_iterator, 
    epochs = 3,
    eval_metrics=["acc", "avg_f1"],
    checkpointing = checkpointing
)

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


Model is saved with eval_acc = 94.959368291935
{'eval': {'acc': 94.959368291935, 'avg_f1': {'weighted': 94.89236698344878, 'micro': 94.959368291935, 'macro': 80.32638441433846}}, 'train': {'acc': 96.01648882703162, 'avg_f1': {'weighted': 95.98749387384105, 'micro': 96.01648882703162, 'macro': 81.11227149023092}}, 'eval_loss': 0.15850219357272852, 'train_loss': 0.12928859462594464}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


Model is saved with eval_acc = 96.82421036491874
{'eval': {'acc': 96.82421036491874, 'avg_f1': {'weighted': 96.78855053753105, 'micro': 96.82421036491874, 'macro': 82.02406188757104}}, 'train': {'acc': 98.48256398360783, 'avg_f1': {'weighted': 98.46880943368681, 'micro': 98.48256398360783, 'macro': 83.30675889478326}}, 'eval_loss': 0.09872278036630672, 'train_loss': 0.05145323381888388}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


Model is saved with eval_acc = 97.2669426556271
{'eval': {'acc': 97.2669426556271, 'avg_f1': {'weighted': 97.24130569304464, 'micro': 97.2669426556271, 'macro': 82.4032560805827}}, 'train': {'acc': 99.10403618649966, 'avg_f1': {'weighted': 99.09503083629303, 'micro': 99.10403618649966, 'macro': 85.09768069604601}}, 'eval_loss': 0.08675712697963783, 'train_loss': 0.029156454481918663}



In [12]:
model = LSTMForTagging(input_dim, out_dim, TEXT, TAGS, cuda=True)
model = checkpointing.load(model)
model.evaluate(valid_iterator)

The model has 2,372,625 trainable parameters


HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))




{'acc': 97.26689027311932}

# User Defined Models With x-tagger Wrapper (toy example)

In [14]:
import nltk
from sklearn.model_selection import train_test_split

import torch
from xtagger import LSTMForTagging
from xtagger import xtagger_dataset_to_df, df_to_torchtext_data


nltk_data = list(nltk.corpus.conll2000.tagged_sents(tagset='universal'))
train_set, test_set = train_test_split(nltk_data,train_size=0.8,test_size=0.2)

df_train = xtagger_dataset_to_df(train_set)
df_test = xtagger_dataset_to_df(test_set)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, valid_iterator, test_iterator, TEXT, TAGS = df_to_torchtext_data(
    df_train, 
    df_test, 
    device, 
    batch_size = 32
)

Number of training examples: 8758
Number of testing examples: 2190
Unique tokens in TEXT vocabulary: 17533
Unique tokens in TAGS vocabulary: 13


In [15]:
import torch
import torch.nn as nn

class CNNTagger(nn.Module):
    def __init__(self, n_tags,  out_channels = 1):
        super(CNNTagger, self).__init__()
        self.n_tags = n_tags
        self.out_channels = out_channels
        self.cnn1 = nn.Conv1d(
            in_channels = 1, 
            out_channels = self.out_channels, 
            kernel_size=3, 
            padding=1
        )
        
        self.cnn2 = nn.Conv1d(
            in_channels = 1, 
            out_channels = self.out_channels, 
            kernel_size=5, 
            padding=2
        )
        
        self.cnn3 = nn.Conv1d(
            in_channels = 1, 
            out_channels = self.out_channels, 
            kernel_size=7, 
            padding=3
        )
        
        self.dropout = nn.Dropout()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
        self.fcn_1 = nn.Linear(self.out_channels, 5)
        self.fcn_out = nn.Linear(5, self.n_tags)
        
        
    def forward(self, x):
        x = x[:,None,:].cuda().float()
        out1 = self.relu(self.cnn1(x))
        out2 = self.relu(self.dropout(self.cnn2(x)))
        out3 = self.relu(self.dropout(self.cnn3(x)))
        out = (out1 + out2 + out3) / 3 # B x 3 x 7
        out = self.fcn_1(out.permute(0,2,1))
        out = self.fcn_out(out)
        return out

In [18]:
from xtagger import PyTorchTrainer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNTagger(13,3).to(device)
optimizer = torch.optim.Adam(model.parameters())
criterion = torch.nn.CrossEntropyLoss(ignore_index = 0)


trainer = PyTorchTrainer(
    model = model,
    criterion = criterion,
    optimizer = optimizer,
    device = device,
    train_iterator = train_iterator,
    val_iterator = test_iterator,
    TAGS = TAGS,
    TEXT = TEXT
)

In [19]:
trainer.train(3)

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 14.896630406987038}, 'train': {'acc': 14.798311782471538}, 'eval_loss': 16.651905405348625, 'train_loss': 20.64160191055632}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 30.601727088537878}, 'train': {'acc': 30.296686464970445}, 'eval_loss': 2.688630974811056, 'train_loss': 2.7951144632631846}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 30.593858803627278}, 'train': {'acc': 30.31157120427144}, 'eval_loss': 2.305877951608188, 'train_loss': 2.32932799837015}

