In [1]:
!nvidia-smi

Fri Dec  4 01:11:25 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.100      Driver Version: 440.100      CUDA Version: 10.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce RTX 208...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 27%   38C    P2    61W / 260W |   6276MiB / 11019MiB |      1%      Default |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 208...  Off  | 00000000:1B:00.0 Off |                  N/A |
| 27%   35C    P0    78W / 260W |     11MiB / 11019MiB |     45%      Default |
+-------------------------------+----------------------+----------------------+
|   2  GeForce RTX 208...  Off  | 00000000:1C:00.0 Off |                  N/A |
| 27%   

In [2]:
import pandas as pd
from ast import literal_eval
import re
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim



seed = 1
torch.manual_seed(seed)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


from flair.datasets import ColumnCorpus

import numpy as np
np.random.seed(42)

## Load corpus

In [3]:
cuda_device = torch.device('cuda:1')
n_gpu = torch.cuda.device_count()

for i in range(n_gpu):
    print(torch.cuda.get_device_name(i))

GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti
GeForce RTX 2080 Ti


In [4]:
data_folder = './data'
corpora = ColumnCorpus(data_folder,
                      {0: 'text', 3 : 'toxic'},
                      train_file='train.conll2003', 
                      test_file='trial.conll2003',
                      dev_file='dev.conll2003')

print(corpora.obtain_statistics())

2020-12-04 01:11:32,194 Reading data from data
2020-12-04 01:11:32,195 Train: data/train.conll2003
2020-12-04 01:11:32,196 Dev: data/dev.conll2003
2020-12-04 01:11:32,197 Test: data/trial.conll2003
{
    "TRAIN": {
        "dataset": "TRAIN",
        "total_number_of_documents": 6351,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 267161,
            "min": 1,
            "max": 236,
            "avg": 42.06597386238388
        }
    },
    "TEST": {
        "dataset": "TEST",
        "total_number_of_documents": 690,
        "number_of_documents_per_class": {},
        "number_of_tokens_per_tag": {},
        "number_of_tokens": {
            "total": 28240,
            "min": 1,
            "max": 216,
            "avg": 40.927536231884055
        }
    },
    "DEV": {
        "dataset": "DEV",
        "total_number_of_documents": 1588,
        "number_of_documents_per_class": {},
        "number_

## CRF

In [5]:
import nltk


def create_dataset(flair_dataset, preprocess=False):
    dataset = []
    
    for sent in flair_dataset:
        tokens = [w.text for w in sent]
        labels = [w.get_tag('toxic').value for w in sent]
        
        if preprocess:
            dataset.append((list(zip(tokens, [nltk.pos_tag([tok])[0][1] for tok in tokens])), labels))
        else:
            dataset.append((tokens, labels))
        
    return dataset

In [6]:
train_dataset = create_dataset(corpora.train, preprocess=True)
dev_dataset = create_dataset(corpora.dev, preprocess=True)
test_dataset= create_dataset(corpora.test, preprocess=True)

In [10]:
train_dataset[:5]

[([('Another', 'DT'),
   ('violent', 'NN'),
   ('and', 'CC'),
   ('aggressive', 'JJ'),
   ('immigrant', 'NN'),
   ('killing', 'VBG'),
   ('a', 'DT'),
   ('innocent', 'NN'),
   ('and', 'CC'),
   ('intelligent', 'NN'),
   ('US', 'NN'),
   ('Citizen', 'NN'),
   ('...', ':'),
   ('.', '.'),
   ('Sarcasm', 'NN')],
  ['O',
   'B-SPAN',
   'I-SPAN',
   'I-SPAN',
   'I-SPAN',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O']),
 ([('I', 'PRP'),
   ('am', 'VBP'),
   ('56', 'CD'),
   ('years', 'NNS'),
   ('old', 'JJ'),
   (',', ','),
   ('I', 'PRP'),
   ('am', 'VBP'),
   ('not', 'RB'),
   ('your', 'PRP$'),
   ('fucking', 'VBG'),
   ('junior', 'NN'),
   ('pal', 'NN'),
   ('.', '.'),
   ('What', 'WP'),
   ('you', 'PRP'),
   ('are', 'VBP'),
   ('saying', 'VBG'),
   ('makes', 'VBZ'),
   ('no', 'DT'),
   ('sense', 'NN'),
   ('.', '.'),
   ('I', 'PRP'),
   ('do', 'VB'),
   ("n't", 'RB'),
   ('know', 'VB'),
   ('what', 'WP'),
   ('you', 'PRP'),
   ('are', 'VBP'),
   ('basin

In [11]:
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
import numpy as np


def word_features(sentence, i):
    # Get the current word and POS
    word = sentence[i][0]
    pos = sentence[i][1]
    
    features = { "bias": 1.0,
                 "word.lower()": word.lower(),
                 "word[-3:]": word[-3:],
                 "word[-2:]": word[-2:],
                 "word.isupper()": word.isupper(),
                 "word.istitle()": word.istitle(),
                 "word.isdigit()": word.isdigit(),
                 "pos": pos,
                 "pos[:2]": pos[:2], # Generalized POS
               }
   
    # If this is not the first word in the sentence...
    if i > 0:
        # Get the sentence's previous word and POS
        prev_word = sentence[i-1][0] if len(sentence) > 1 else ''
        prev_pos = sentence[i-1][1] if len(sentence) > 1 else ''
        # Add characteristics of the sentence's previous word and POS to the feature dictionary
        features.update({ "-1:word.lower()": prev_word.lower(),
                          "-1:word.istitle()": prev_word.istitle(),
                          "-1:word.isupper()": prev_word.isupper(),
                          "-1:pos": prev_pos,
                          "-1:pos[:2]": prev_pos[:2],
                        })
        
    # Otherwise, add 'BOS' (beginning of sentence) to the feature dictionary
    else:
        features["BOS"] = True

    # If this is not the last word in the sentence...
    if i < len(sentence)-1:
        # Get the sentence's next word and POS
        next_word = sentence[i+1][0] if len(sentence) > 1 else ''
        next_pos = sentence[i+1][1] if len(sentence) > 1 else ''
        # Add characteristics of the sentence's previous next and POS to the feature dictionary
        features.update({ "+1:word.lower()": next_word.lower(),
                          "+1:word.istitle()": next_word.istitle(),
                          "+1:word.isupper()": next_word.isupper(),
                          "+1:pos": next_pos,
                          "+1:pos[:2]": next_pos[:2],
                        })
        
    # Otherwise, add 'EOS' (end of sentence) to the feature dictionary
    else:
        features["EOS"] = True
    
    return features


def sentence_features(sentence):
    return [word_features(sentence, i) for i in range(len(sentence))]

In [12]:
X = [sentence_features(sentence[0]) for sentence in train_dataset]
y = [sentence[1] for sentence in train_dataset]

In [13]:
y[1]

['O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-SPAN',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O']

In [14]:
X[1]

[{'bias': 1.0,
  'word.lower()': 'i',
  'word[-3:]': 'I',
  'word[-2:]': 'I',
  'word.isupper()': True,
  'word.istitle()': True,
  'word.isdigit()': False,
  'pos': 'PRP',
  'pos[:2]': 'PR',
  'BOS': True,
  '+1:word.lower()': 'am',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:pos': 'VBP',
  '+1:pos[:2]': 'VB'},
 {'bias': 1.0,
  'word.lower()': 'am',
  'word[-3:]': 'am',
  'word[-2:]': 'am',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'pos': 'VBP',
  'pos[:2]': 'VB',
  '-1:word.lower()': 'i',
  '-1:word.istitle()': True,
  '-1:word.isupper()': True,
  '-1:pos': 'PRP',
  '-1:pos[:2]': 'PR',
  '+1:word.lower()': '56',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:pos': 'CD',
  '+1:pos[:2]': 'CD'},
 {'bias': 1.0,
  'word.lower()': '56',
  'word[-3:]': '56',
  'word[-2:]': '56',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': True,
  'pos': 'CD',
  'pos[:2]': 'CD',
  '-1:word.lower()'

In [15]:
model = sklearn_crfsuite.CRF()
model.fit(X, y)

CRF(algorithm=None, all_possible_states=None, all_possible_transitions=None,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=None,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [16]:
pred_data = test_dataset[1]
X_pred = [sentence_features(sentence[0]) for sentence in [pred_data]]
preds = model.predict(X_pred)

list(zip([e[0] for e in pred_data[0]], preds[0]))

[('How', 'O'),
 ('about', 'O'),
 ('we', 'O'),
 ('stop', 'O'),
 ('protecting', 'O'),
 ('idiots', 'B-SPAN'),
 ('and', 'O'),
 ('let', 'O'),
 ('nature', 'O'),
 ('add', 'O'),
 ('some', 'O'),
 ('bleach', 'O'),
 ('to', 'O'),
 ('the', 'O'),
 ('gene', 'O'),
 ('pool', 'O'),
 ('.', 'O'),
 ('We', 'O'),
 ('can', 'O'),
 ('always', 'O'),
 ('submit', 'O'),
 ('their', 'O'),
 ('names', 'O'),
 ('for', 'O'),
 ('the', 'O'),
 ('Darwin', 'O'),
 ('awards', 'O'),
 ('.', 'O')]

In [17]:
from seqeval.metrics import f1_score

X_test = [sentence_features(sentence[0]) for sentence in test_dataset]
preds = model.predict(X_test)
f1_score = f1_score([e[1] for e in test_dataset], preds)

print(f'F1 score: {f1_score}')

F1 score: 0.5230179028132992


## BI-LSTM

In [18]:
# Example network

lstm = nn.LSTM(input_size=3, hidden_size=3) 
inputs = [torch.randn(1, 3) for _ in range(5)]  # for a sequence of length 5


hidden = (torch.randn(1, 1, 3),
          torch.randn(1, 1, 3))
for i in inputs:
    out, hidden = lstm(i.view(1, 1, -1), hidden)

inputs = torch.cat(inputs).view(len(inputs), 1, -1)
hidden = (torch.randn(1, 1, 3), torch.randn(1, 1, 3))  # clean out hidden state
out, hidden = lstm(inputs, hidden)
print(out)
print(hidden)

tensor([[[-0.0187,  0.1713, -0.2944]],

        [[-0.3521,  0.1026, -0.2971]],

        [[-0.3191,  0.0781, -0.1957]],

        [[-0.1634,  0.0941, -0.1637]],

        [[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward>)
(tensor([[[-0.3368,  0.0959, -0.0538]]], grad_fn=<StackBackward>), tensor([[[-0.9825,  0.4715, -0.0633]]], grad_fn=<StackBackward>))


In [19]:
word_to_ix = {}
tag_to_ix = {}

word_to_ix['PAD'] = 0
tag_to_ix['PAD'] = 0

for sent, tags in train_dataset:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)

print('Vocabulary to convert words to indexes:')
print(dict(list(word_to_ix.items())[:100]))

print()

print('Vocabulary to convert tags to indexes:')
print(tag_to_ix)

Vocabulary to convert words to indexes:
{'PAD': 0, ('Another', 'DT'): 1, ('violent', 'NN'): 2, ('and', 'CC'): 3, ('aggressive', 'JJ'): 4, ('immigrant', 'NN'): 5, ('killing', 'VBG'): 6, ('a', 'DT'): 7, ('innocent', 'NN'): 8, ('intelligent', 'NN'): 9, ('US', 'NN'): 10, ('Citizen', 'NN'): 11, ('...', ':'): 12, ('.', '.'): 13, ('Sarcasm', 'NN'): 14, ('I', 'PRP'): 15, ('am', 'VBP'): 16, ('56', 'CD'): 17, ('years', 'NNS'): 18, ('old', 'JJ'): 19, (',', ','): 20, ('not', 'RB'): 21, ('your', 'PRP$'): 22, ('fucking', 'VBG'): 23, ('junior', 'NN'): 24, ('pal', 'NN'): 25, ('What', 'WP'): 26, ('you', 'PRP'): 27, ('are', 'VBP'): 28, ('saying', 'VBG'): 29, ('makes', 'VBZ'): 30, ('no', 'DT'): 31, ('sense', 'NN'): 32, ('do', 'VB'): 33, ("n't", 'RB'): 34, ('know', 'VB'): 35, ('what', 'WP'): 36, ('basing', 'VBG'): 37, ('this', 'DT'): 38, ('on', 'IN'): 39, ('The', 'DT'): 40, ('cheap', 'NN'): 41, ('black', 'JJ'): 42, ('market', 'NN'): 43, ('crap', 'NN'): 44, ('is', 'VBZ'): 45, ('still', 'RB'): 46, ('coming'

In [20]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence


class BasicLstmTagger(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, 
                 tagset_size, dropout_rate=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)
        self.dropout = nn.Dropout(p=dropout_rate)
        self.hidden2tag = nn.Linear(hidden_dim*2, tagset_size)

    def forward(self, sentences, lengths):
        embeds = self.word_embeddings(sentences)

        packed_embeds = pack_padded_sequence(embeds.transpose(0, 1), lengths, enforce_sorted=False)
        packed_lstm_out, _ = self.lstm(packed_embeds)
        lstm_out, _ = pad_packed_sequence(packed_lstm_out)

        tag_space = self.hidden2tag(self.dropout(lstm_out.transpose(0, 1)))
        tag_scores = F.log_softmax(tag_space, dim=-1)
        return tag_scores

In [21]:
model = BasicLstmTagger(embedding_dim=300, 
                        hidden_dim=200, 
                        vocab_size=len(word_to_ix), 
                        tagset_size=len(tag_to_ix)).cuda()

In [22]:
def prepare_tokens(batch_tokens, word_to_ix):
    batch_token_ids = [[word_to_ix.get(tok, 0) for tok in inst] for inst in batch_tokens]
    batch_max_len = max([len(s) for s in batch_token_ids])
    
    batch_token_ids_padded = word_to_ix['PAD']*np.ones((len(batch_token_ids), batch_max_len))
    lengths = []
    for i in range(len(batch_token_ids)):
        cur_len = len(batch_token_ids[i])
        batch_token_ids_padded[i][:cur_len] = batch_token_ids[i]
        lengths.append(cur_len)

    #since all data are indices, we convert them to torch LongTensors
    return torch.LongTensor(batch_token_ids_padded), lengths


def prepare_tags(batch_tags, tag_to_ix):
    batch_tag_ids = [[tag_to_ix[tag] for tag in inst] for inst in batch_tags]
    batch_max_len = max([len(s) for s in batch_tag_ids])

    batch_tag_ids_padded = tag_to_ix['PAD']*np.ones((len(batch_tag_ids), batch_max_len))
    for i in range(len(batch_tag_ids)):
        cur_len = len(batch_tag_ids[i])
        batch_tag_ids_padded[i][:cur_len] = batch_tag_ids[i]

    #since all data are indices, we convert them to torch LongTensors
    return torch.LongTensor(batch_tag_ids_padded)


def prepare_instance_for_training(batch_instances, word_to_ix, tag_to_ix):
    tokens, lengths = prepare_tokens([inst[0] for inst in batch_instances], word_to_ix)
    tags = prepare_tags([inst[1] for inst in batch_instances], tag_to_ix)

    return tokens, lengths, tags

In [23]:
print(train_dataset[1])

([('I', 'PRP'), ('am', 'VBP'), ('56', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('I', 'PRP'), ('am', 'VBP'), ('not', 'RB'), ('your', 'PRP$'), ('fucking', 'VBG'), ('junior', 'NN'), ('pal', 'NN'), ('.', '.'), ('What', 'WP'), ('you', 'PRP'), ('are', 'VBP'), ('saying', 'VBG'), ('makes', 'VBZ'), ('no', 'DT'), ('sense', 'NN'), ('.', '.'), ('I', 'PRP'), ('do', 'VB'), ("n't", 'RB'), ('know', 'VB'), ('what', 'WP'), ('you', 'PRP'), ('are', 'VBP'), ('basing', 'VBG'), ('this', 'DT'), ('on', 'IN'), ('.', '.'), ('The', 'DT'), ('cheap', 'NN'), ('black', 'JJ'), ('market', 'NN'), ('crap', 'NN'), ('is', 'VBZ'), ('still', 'RB'), ('coming', 'VBG'), ('up', 'RB'), ('from', 'IN'), ('Mexican', 'JJ'), ('nationals', 'NNS'), (',', ','), ('and', 'CC'), ('the', 'DT'), ('good', 'JJ'), ('stuff', 'NN'), ('from', 'IN'), ('BC', 'NN'), ('and', 'CC'), ('Humbolt', 'NN'), ('.', '.'), ('The', 'DT'), ('I5', 'NN'), ('corridor', 'NN'), ('is', 'VBZ'), ('still', 'RB'), ('just', 'RB'), ('as', 'IN'), ('busy', 'JJ'), (','

In [24]:
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm


N_EPOCHS = 10
LEARNING_RATE = 0.1
BATCH_SIZE = 8

loss_function = nn.NLLLoss(ignore_index=tag_to_ix['PAD'])
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE)

f_preprocess = lambda batch: prepare_instance_for_training(batch, word_to_ix, tag_to_ix)
train_dataloader = DataLoader(train_dataset, 
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=1,
                              collate_fn=f_preprocess)

try:
    model.train()
    for epoch in range(N_EPOCHS):
        print(f'Epoch #{epoch}')

        for sentences, lengths, targets in tqdm(train_dataloader):
            # Move data to the GPU.
            sentences = sentences.cuda()
            targets = targets.cuda()

            # Pytorch accumulates gradients. We clear them out before each instance.
            model.zero_grad()

            # Run our forward pass.
            tag_scores = model(sentences, lengths)

            # Compute the loss, gradients, and update the parameters. 
            loss = loss_function(tag_scores.reshape(-1, tag_scores.shape[-1]), targets.reshape(-1))
            loss.backward()
            optimizer.step()

except KeyboardInterrupt:
    pass

Epoch #0


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #1


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #2


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #3


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #4


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #5


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #6


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #7


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #8


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))


Epoch #9


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




In [25]:
# Evaluation

pred_dataset = [inst[0] for inst in test_dataset]

f_preprocess = lambda batch: prepare_tokens(batch, word_to_ix)
pred_dataloader = DataLoader(pred_dataset, 
                             batch_size=100, 
                             num_workers=1,
                             collate_fn=f_preprocess)

model.eval()

all_preds = []
all_probas = []
with torch.no_grad():
    for sentences, lengths in pred_dataloader:
        sentences = sentences.cuda()

        tag_scores = model(sentences, lengths)
        probas, pred_tags = tag_scores.max(dim=-1)
        probas = torch.exp(probas)
        
        pred_tags = pred_tags.cpu().tolist()
        probas = probas.cpu().tolist()
        all_preds += pred_tags
        all_probas += probas

In [26]:
# Restoring tag strings

ix_to_tag = {v: k for k, v in tag_to_ix.items()}

pred_tags = []
pred_probas = []
for pred_sent, preds, probas in zip(pred_dataset, all_preds, all_probas):
    pred_tags.append([ix_to_tag[tag_idx] for tag_idx in preds[:len(pred_sent)]])
    pred_probas.append(probas[:len(pred_sent)])

idx = 21
list(zip(pred_dataset[idx], pred_tags[idx], pred_probas[idx]))

[(('It', 'PRP'), 'O', 0.9478151798248291),
 (('did', 'VBD'), 'O', 0.9802039265632629),
 (("n't", 'RB'), 'O', 0.9680961966514587),
 (('take', 'VB'), 'O', 0.9443789720535278),
 (('much', 'JJ'), 'O', 0.9567553997039795),
 (('rope', 'NN'), 'O', 0.8854822516441345),
 (('for', 'IN'), 'O', 0.9473097920417786),
 (('the', 'DT'), 'O', 0.9666978120803833),
 (('village', 'NN'), 'O', 0.9211320281028748),
 (('idiot', 'NN'), 'B-SPAN', 0.8248081803321838),
 (('to', 'TO'), 'O', 0.9412413835525513),
 (('hang', 'NN'), 'O', 0.8933104872703552),
 (('himself', 'PRP'), 'O', 0.8529583811759949),
 (('.', '.'), 'O', 0.998956561088562)]

In [27]:
from seqeval.metrics import f1_score

f1_score(pred_tags, [inst[1] for inst in test_dataset])

0.5025252525252525

## AllenNLP LSTM model

In [5]:
from torch.utils.data import DataLoader
from allennlp.data import allennlp_collate
from allennlp.training.util import evaluate
from allennlp.predictors import SentenceTaggerPredictor
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

In [7]:
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer


indexer = SingleIdTokenIndexer()
#reader = Conll2003DatasetReader(token_indexers={'tokens':indexer})
reader = SequenceTaggingDatasetReader(token_indexers={'tokens':indexer})
train_dataset = reader.read('data/train.seq')
dev_dataset = reader.read('data/dev.seq')
test_dataset = reader.read('data/trial.seq')

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




In [75]:
print(train_dataset[0])

Instance with fields:
 	 tokens: TextField of length 15 with text: 
 		[Another, violent, and, aggressive, immigrant, killing, a, innocent, and, intelligent, US, Citizen,
		..., ., Sarcasm]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 metadata: MetadataField (print field.metadata to see specific information). 
 	 tags: SequenceLabelField of length 15 with labels:
 		['O', 'B-SPAN', 'I-SPAN', 'I-SPAN', 'I-SPAN', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
 		in namespace: 'labels'. 



In [76]:
print(dev_dataset[0])

Instance with fields:
 	 tokens: TextField of length 8 with text: 
 		[Nice, come-back, to, quite, a, stupid, comment, .]
 		and TokenIndexers : {'tokens': 'SingleIdTokenIndexer'} 
 	 metadata: MetadataField (print field.metadata to see specific information). 
 	 tags: SequenceLabelField of length 8 with labels:
 		['O', 'O', 'O', 'O', 'O', 'B-SPAN', 'O', 'O']
 		in namespace: 'labels'. 



In [77]:
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
test_dataset.index_with(vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=6351.0, style=ProgressStyle(descript…




In [78]:
print(vocab)

Vocabulary with namespaces:
 	Non Padded Namespaces: {'*tags', '*labels'}
 	Namespace: tokens, Size: 21875 
 	Namespace: labels, Size: 3 



In [79]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders.pytorch_seq2seq_wrapper import LstmSeq2SeqEncoder
from allennlp.modules.token_embedders import Embedding


embedder = Embedding(embedding_dim=300, vocab=vocab)
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
encoder = LstmSeq2SeqEncoder(input_size=embedder.get_output_dim(), hidden_size=200, bidirectional=True)

In [80]:
from allennlp.models import SimpleTagger


model = SimpleTagger(text_field_embedder=text_field_embedder, 
                     vocab=vocab, 
                     encoder=encoder,
                     calculate_span_f1=True,
                     label_encoding='IOB1').cuda(device=cuda_device)

In [81]:
model

SimpleTagger(
  (text_field_embedder): BasicTextFieldEmbedder(
    (token_embedder_tokens): Embedding()
  )
  (encoder): LstmSeq2SeqEncoder(
    (_module): LSTM(300, 200, batch_first=True, bidirectional=True)
  )
  (tag_projection_layer): TimeDistributed(
    (_module): Linear(in_features=400, out_features=3, bias=True)
  )
)

In [82]:
import torch.optim as optim
from datetime import datetime

from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler
from allennlp.data import allennlp_collate
from allennlp.training import GradientDescentTrainer

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')

optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0.0)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=8, 
                               collate_fn=allennlp_collate, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1, factor=0.5)

trainer = GradientDescentTrainer(
    model=model,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=10,
    optimizer=optimizer,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    grad_clipping=1.
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [83]:
from allennlp.training.util import evaluate

test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9471671388101983,
 'accuracy3': 1.0,
 'precision-overall': 0.753246753246753,
 'recall-overall': 0.29896907216494845,
 'f1-measure-overall': 0.4280442804427637,
 'loss': 0.30225372101579395}

## On Glove

In [96]:
embedding_dim = 300
glove_file = f'https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.{embedding_dim}d.txt.gz'

embedder = Embedding(embedding_dim=embedding_dim, vocab=vocab, 
                     pretrained_file=glove_file, trainable=False)
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
encoder = LstmSeq2SeqEncoder(input_size=embedder.get_output_dim(), hidden_size=200, bidirectional=True)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [97]:
from allennlp.models import SimpleTagger


model = SimpleTagger(text_field_embedder=text_field_embedder, 
                     vocab=vocab, 
                     encoder=encoder,
                     calculate_span_f1=True,
                     label_encoding='IOB1').cuda(device=cuda_device)

In [98]:
import torch.optim as optim

from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler
from allennlp.training import GradientDescentTrainer


optimizer = optim.SGD(model.parameters(), lr=0.1, weight_decay=0.)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=8, 
                               collate_fn=allennlp_collate, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1, factor=0.5)

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')

trainer = GradientDescentTrainer(
    model=model,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=10,
    optimizer=optimizer,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    grad_clipping=1., 
    patience = 2
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




In [99]:
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9316495623972633,
 'accuracy3': 1.0,
 'precision-overall': 0.07377049180327863,
 'recall-overall': 0.0029004189494038026,
 'f1-measure-overall': 0.005581395348829929,
 'loss': 0.3618400309767042}

## With CRF output layer

In [94]:
from allennlp_models.tagging import CrfTagger


model = CrfTagger(text_field_embedder=text_field_embedder, 
                  vocab=vocab, 
                  encoder=encoder,
                  calculate_span_f1=True,
                  label_encoding='IOB1').cuda(device=cuda_device)

In [95]:
from datetime import datetime
from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler

optimizer = optim.SGD(model.parameters(), lr=0.05, weight_decay=0.001)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=8, 
                               collate_fn=allennlp_collate, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1, factor=0.5)

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')

trainer = GradientDescentTrainer(
    model=model,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=8,
    optimizer=optimizer,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    grad_clipping=1.
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




In [93]:
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9307011331444759,
 'accuracy3': 1.0,
 'precision-overall': 0.06666666666666661,
 'recall-overall': 0.007216494845360824,
 'f1-measure-overall': 0.013023255813935859,
 'loss': 773.1597551618304}

## With convolutional character encoder

In [8]:
from allennlp.data.token_indexers import SingleIdTokenIndexer, TokenCharactersIndexer
from allennlp.data.dataset_readers import Conll2003DatasetReader
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader

token_indexer = SingleIdTokenIndexer()
char_indexer = TokenCharactersIndexer(min_padding_length=5)
#reader = SequenceTaggingDatasetReader(token_indexers={'tokens' : token_indexer, 
#                                                      'chars' : char_indexer})
reader = Conll2003DatasetReader(token_indexers={'tokens': token_indexer, 
                                                'chars': char_indexer})
train_dataset = reader.read('data/train.conll2003')
test_dataset = reader.read('data/trial.conll2003')
dev_dataset = reader.read('data/dev.conll2003')

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




In [9]:
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
test_dataset.index_with(vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=6351.0, style=ProgressStyle(descript…




In [10]:
from allennlp.modules.token_embedders import TokenCharactersEncoder
from allennlp.modules.seq2vec_encoders import CnnEncoder
from allennlp.modules.token_embedders import Embedding


char_embedding_size = 300
char_embedder = TokenCharactersEncoder(embedding=Embedding(embedding_dim=char_embedding_size, 
                                                           num_embeddings=300, 
                                                           trainable=True), 
                                       encoder=CnnEncoder(embedding_dim=char_embedding_size, 
                                                          num_filters=30, ngram_filter_sizes=(2, 3, 4, 5)))

embedding_dim = 300
glove_file = f'https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.{embedding_dim}d.txt.gz'
token_embedder = Embedding(embedding_dim=embedding_dim, vocab=vocab, 
                           pretrained_file=glove_file, trainable=False)

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [11]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder

text_field_embedder = BasicTextFieldEmbedder({'tokens': token_embedder, 
                                              'chars' : char_embedder})

In [12]:
token_embedder.get_output_dim()

300

In [13]:
char_embedder.get_output_dim()

120

In [19]:
from allennlp_models.tagging import CrfTagger
from allennlp.modules.seq2seq_encoders.pytorch_seq2seq_wrapper import LstmSeq2SeqEncoder


input_dim = token_embedder.get_output_dim() + char_embedder.get_output_dim()
encoder = LstmSeq2SeqEncoder(input_size=input_dim, hidden_size=300, bidirectional=True)

model = CrfTagger(text_field_embedder=text_field_embedder, 
                  vocab=vocab, 
                  encoder=encoder,
                  calculate_span_f1=True,
                  label_encoding='IOB1').cuda(device=cuda_device)
                  #label_encoding='IOB1').cpu()

In [22]:
from datetime import datetime
from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler
from allennlp.training import GradientDescentTrainer

import torch.optim as optim
from transformers import AdamW
import transformers
from datetime import datetime

from allennlp.training.learning_rate_schedulers import LinearWithWarmup
from torch.utils.data import DataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.learning_rate_schedulers import SlantedTriangular

import math


optimizer = optim.SGD(model.parameters(), lr=0.03, weight_decay=0.001)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=8, 
                               collate_fn=allennlp_collate, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1, factor=0.5)

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')

trainer = GradientDescentTrainer(
    model=model,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=10,
    optimizer=optimizer,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    grad_clipping=1.,
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [23]:
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9379603399433428,
 'accuracy3': 1.0,
 'precision-overall': 0.0,
 'recall-overall': 0.0,
 'f1-measure-overall': 0.0,
 'loss': 889.3219517299107}

## ELMO

In [7]:
from allennlp.data.dataset_readers.sequence_tagging import SequenceTaggingDatasetReader
from allennlp.data.token_indexers import ELMoTokenCharactersIndexer

token_indexer = ELMoTokenCharactersIndexer()
reader = SequenceTaggingDatasetReader(token_indexers={'elmo_tokens': token_indexer})

train_dataset = reader.read('data/train.seq')
dev_dataset = reader.read('data/dev.seq')
test_dataset = reader.read('data/trial.seq')

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




In [33]:
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
test_dataset.index_with(vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=6351.0, style=ProgressStyle(descript…




In [34]:
from allennlp.modules.token_embedders import ElmoTokenEmbedder

elmo_embedder = ElmoTokenEmbedder()
text_field_embedder = BasicTextFieldEmbedder({"elmo_tokens": elmo_embedder})

In [35]:
from allennlp_models.tagging import CrfTagger
from allennlp.modules.seq2seq_encoders.pytorch_seq2seq_wrapper import LstmSeq2SeqEncoder


encoder = LstmSeq2SeqEncoder(input_size=elmo_embedder.get_output_dim(), 
                             hidden_size=300, bidirectional=True)

model = CrfTagger(text_field_embedder=text_field_embedder, 
                  vocab=vocab, 
                  encoder=encoder,
                  calculate_span_f1=True,
                  label_encoding='IOB1').cuda(device=cuda_device)

In [36]:
from datetime import datetime
from allennlp.training.learning_rate_schedulers import ReduceOnPlateauLearningRateScheduler
from allennlp.training import GradientDescentTrainer


optimizer = optim.SGD(model.parameters(), lr=0.05, weight_decay=0.001)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=8, 
                               collate_fn=allennlp_collate, shuffle=True)
dev_data_loader = DataLoader(dataset=dev_dataset, batch_size=100, collate_fn=allennlp_collate)
lr_scheduler = ReduceOnPlateauLearningRateScheduler(optimizer, patience=1, factor=0.5)

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')

trainer = GradientDescentTrainer(
    model=model,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    data_loader=train_data_loader,
    validation_data_loader=dev_data_loader,
    num_epochs=10,
    optimizer=optimizer,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    grad_clipping=1.
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=794.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=16.0), HTML(value='')))




In [37]:
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9069759206798866,
 'accuracy3': 1.0,
 'precision-overall': 0.0822281167108753,
 'recall-overall': 0.031958762886597936,
 'f1-measure-overall': 0.04602821083886094,
 'loss': 1481.4861711774554}

## BERT

In [8]:
from allennlp.data.token_indexers import PretrainedTransformerMismatchedIndexer
from allennlp.data.vocabulary import Vocabulary


BERT_MODEL = 'bert-base-cased'
indexer = PretrainedTransformerMismatchedIndexer(model_name=BERT_MODEL)

In [9]:
wordpieces, offsets = indexer._allennlp_tokenizer.intra_word_tokenize(['We', 'are', 'living', 'in', 'Mönchengladbach'])
wordpieces

[[CLS], We, are, living, in, M, ##ö, ##nchen, ##g, ##lad, ##bach, [SEP]]

In [10]:
reader = SequenceTaggingDatasetReader(token_indexers={'tokens' : indexer})
train_dataset = reader.read('data/train.seq')
dev_dataset = reader.read('data/dev.seq')
test_dataset = reader.read('data/trial.seq')

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




HBox(children=(FloatProgress(value=1.0, bar_style='info', description='reading instances', layout=Layout(width…




In [11]:
vocab = Vocabulary.from_instances(train_dataset.instances)
train_dataset.index_with(vocab)
dev_dataset.index_with(vocab)
test_dataset.index_with(vocab)

HBox(children=(FloatProgress(value=0.0, description='building vocab', max=6351.0, style=ProgressStyle(descript…




In [12]:
from allennlp.modules.token_embedders import PretrainedTransformerMismatchedEmbedder
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.seq2seq_encoders import PassThroughEncoder


embedder = PretrainedTransformerMismatchedEmbedder(model_name=BERT_MODEL)
text_field_embedder = BasicTextFieldEmbedder({'tokens': embedder})
seq2seq_encoder = PassThroughEncoder(input_dim=embedder.get_output_dim())

In [13]:
embedder

PretrainedTransformerMismatchedEmbedder(
  (_matched_embedder): PretrainedTransformerEmbedder(
    (transformer_model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense):

In [14]:
from allennlp.models import SimpleTagger


model = SimpleTagger(text_field_embedder=text_field_embedder, 
                      vocab=vocab, 
                      encoder=seq2seq_encoder,
                      calculate_span_f1=True,
                      label_encoding='IOB1').cuda(device=cuda_device)

In [15]:
import torch.optim as optim
from transformers import AdamW
import transformers
from datetime import datetime

from allennlp.training.learning_rate_schedulers import LinearWithWarmup
from torch.utils.data import DataLoader
from allennlp.training import GradientDescentTrainer
from allennlp.training.learning_rate_schedulers import SlantedTriangular

import math


num_epochs = 4
batch_size = 10
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)

optimizer = optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
train_data_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, 
                               collate_fn=allennlp_collate, shuffle=True)
val_data_loader = DataLoader(dataset=dev_dataset, batch_size=50, collate_fn=allennlp_collate)
lr_scheduler = LinearWithWarmup(optimizer, 
                                num_epochs=num_epochs, 
                                warmup_steps=(steps_per_epoch*num_epochs)*0.1, 
                                num_steps_per_epoch=steps_per_epoch)

date_time = datetime.now()
date_str = date_time.strftime('%m/%d/%Y')
time_str = date_time.strftime('%H:%M:%S')


trainer = GradientDescentTrainer(
    model=model,
    optimizer=optimizer,
    data_loader=train_data_loader,
    validation_data_loader=val_data_loader,
    num_epochs=num_epochs,
    cuda_device=cuda_device,
    learning_rate_scheduler=lr_scheduler,
    num_gradient_accumulation_steps=1,
    serialization_dir=f'./workdir/{date_str}/{time_str}',
    grad_clipping=1.
)

try:
    metrics = trainer.train()
except KeyboardInterrupt:
    pass

You provided a validation dataset but patience was set to None, meaning that early stopping is disabled


HBox(children=(FloatProgress(value=0.0, max=636.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=636.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=636.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=636.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))




In [16]:
test_dataloader = DataLoader(test_dataset, batch_size=100, collate_fn=allennlp_collate)
evaluate(model, test_dataloader, cuda_device=cuda_device)

HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




{'accuracy': 0.9515226628895184,
 'accuracy3': 1.0,
 'precision-overall': 0.6449086161879894,
 'recall-overall': 0.5092783505154639,
 'f1-measure-overall': 0.5691244239630843,
 'loss': 0.2455632154430662}