# LSTM part-of-speech tagging on CoNLL-2000 with xtagger

In [1]:
import nltk
from sklearn.model_selection import train_test_split

import torch
from xtagger import LSTMForTagging
from xtagger import xtagger_dataset_to_df, df_to_torchtext_data


# Fixed seed: pins which sentences land in the train/test split and starts torch's
# global RNG from a known state, so a Restart & Run All does not reshuffle the data.
SEED = 42
torch.manual_seed(SEED)

# CoNLL-2000 tagged sentences, with the tags mapped to the universal tagset.
nltk_data = list(nltk.corpus.conll2000.tagged_sents(tagset="universal"))

# 80/20 split at sentence level; random_state makes the split repeatable.
train_set, test_set = train_test_split(
    nltk_data,
    train_size=0.8,
    test_size=0.2,
    random_state=SEED,
)

# Convert both splits to the DataFrame layout produced by xtagger's helper.
df_train = xtagger_dataset_to_df(train_set)
df_test = xtagger_dataset_to_df(test_set)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Build the batch iterators and the TEXT / TAGS fields (whose vocabularies are read
# in the next cell) from the two DataFrames.
train_iterator, valid_iterator, test_iterator, TEXT, TAGS = df_to_torchtext_data(
    df_train,
    df_test,
    device,
    batch_size=32,
)

Number of training examples: 8758
Number of testing examples: 2190
Unique tokens in TEXT vocabulary: 17493
Unique tokens in TAGS vocabulary: 13


In [2]:
# Vocabulary sizes, passed to the model as its input and output dimensions.
input_dim = len(TEXT.vocab)
out_dim = len(TAGS.vocab)

# Indices of the padding tokens in each vocabulary.
# NOTE(review): neither index is referenced in the cells shown here — kept in case
# later cells rely on them.
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
tag_pad_idx = TAGS.vocab.stoi[TAGS.pad_token]

model = LSTMForTagging(
    input_dim,
    out_dim,
    TEXT,
    TAGS,
    # Derive the flag from the device the iterators were built on instead of
    # hard-coding a GPU, so the model and the data agree on CPU-only machines too.
    cuda=(device.type == "cuda"),
)

The model has 2,383,425 trainable parameters


In [3]:
# Train for three epochs. After each epoch the logged dict reports accuracy and the
# averaged F1 scores for the training data and for the iterator passed second.
fit_metrics = ["acc", "avg_f1"]
model.fit(train_iterator, test_iterator, epochs=3, eval_metrics=fit_metrics)

HBox(children=(FloatProgress(value=0.0, max=822.0), HTML(value='')))

Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 94.98248585395896, 'avg_f1': {'weighted': 94.90493183476315, 'micro': 94.98248585395896, 'macro': 80.34477052748954}}, 'train': {'acc': 96.17178222123528, 'avg_f1': {'weighted': 96.142111475462, 'micro': 96.17178222123528, 'macro': 81.30081984670088}}, 'eval_loss': 0.1621395515790884, 'train_loss': 0.12737846896596197}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 96.67231225220371, 'avg_f1': {'weighted': 96.63656727463874, 'micro': 96.67231225220371, 'macro': 81.89577716131792}}, 'train': {'acc': 98.37940389869947, 'avg_f1': {'weighted': 98.37041670404544, 'micro': 98.37940389869947, 'macro': 83.19774994401263}}, 'eval_loss': 0.10416840400168861, 'train_loss': 0.05449008959325126}
Evaluating...


HBox(children=(FloatProgress(value=0.0, max=343.0), HTML(value='')))


{'eval': {'acc': 97.22660610493091, 'avg_f1': {'weighted': 97.18670726372295, 'micro': 97.22660610493091, 'macro': 82.32854302506409}}, 'train': {'acc': 99.11849613316211, 'avg_f1': {'weighted': 99.1068425147923, 'micro': 99.11849613316211, 'macro': 83.87096084768301}}, 'eval_loss': 0.0886973857447721, 'train_loss': 0.02978493071358352}



In [4]:
# Score the trained model on valid_iterator and display the resulting metrics dict.
final_scores = model.evaluate(valid_iterator)
final_scores

HBox(children=(FloatProgress(value=0.0, max=69.0), HTML(value='')))




{'acc': 97.2265527262929}

In [5]:
# Tag one hand-written, pre-tokenised sentence. The displayed result is a pair:
# the (token, tag) list and a second list of tokens (here ['harmful']).
# NOTE(review): the second list presumably holds out-of-vocabulary words — confirm
# against the xtagger documentation.
s = (
    "There are no two words in the English "
    "language more harmful than good job"
).split()
model.predict(s)

([('there', 'DET'),
  ('are', 'VERB'),
  ('no', 'DET'),
  ('two', 'NUM'),
  ('words', 'NOUN'),
  ('in', 'ADP'),
  ('the', 'DET'),
  ('english', 'NOUN'),
  ('language', 'NOUN'),
  ('more', 'ADJ'),
  ('harmful', 'NOUN'),
  ('than', 'ADP'),
  ('good', 'ADJ'),
  ('job', 'NOUN')],
 ['harmful'])