# Lemmatization using Attention Mechanism Preloaded Model

In [1]:
# Import library
import random
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import spacy
import pickle
import csv
from utils import process_sentence, load_obj, save_obj, save_checkpoint, load_checkpoint, predict, evaluate
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data import Field, BucketIterator, TabularDataset, Dataset
from sklearn.model_selection import train_test_split

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 13.6MB/s]                    
2020-09-03 23:47:40 INFO: Downloading default packages for language: en (English)...
2020-09-03 23:47:41 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-09-03 23:47:44 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-09-03 23:47:44 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-09-03 23:47:45 INFO: Use device: gpu
2020-09-03 23:47:45 INFO: Loading: tokenize
2020-09-03 23:47:50 INFO: Loading: pos
2020-09-03 23:47:51 INFO: Loading: lemma
2020-09-03 23:47:51 INFO: Loading: depparse
2020-09-03 23:47:52 INFO: Loading: sentiment
2020-09-03 23:47:52 INFO: Loading: ner
20

In [4]:
spacy_input = spacy.load("en")
spacy_output = spacy.load("en")

def tokenize_input(text):
    return [token.text for token in spacy_input.tokenizer(text)]

def tokenize_output(text):
    return [token.text for token in spacy_output.tokenizer(text)]

In [5]:
inputText = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_input, lower=True)
outputText = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_output, lower=True)

In [8]:
fields = {"InputText": ("inputText", inputText), "OutputText": ("outputText", outputText)}

In [10]:
inputText.vocab = load_obj("inputText")
outputText.vocab = load_obj("outputText")

In [12]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, inputText):
        src_mask = inputText.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, inputText, outputText):
        src_seq_length, N = inputText.shape
        trg_seq_length, N = outputText.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(inputText) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(outputText) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(inputText)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [14]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = True

# Training hyperparameters
num_epochs = 10
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(inputText.vocab)
trg_vocab_size = len(outputText.vocab)
embedding_size = 512    # default: 512
num_heads = 8
num_encoder_layers = 3  # 6 in paper
num_decoder_layers = 3
dropout = 0.10
max_len = 70           # max_len=70 for old model
forward_expansion = 4
src_pad_idx = inputText.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

In [16]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

In [25]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [20]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [22]:
pad_idx = inputText.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [24]:
if load_model:
    load_checkpoint(torch.load("./model/my_checkpoint.pth.tar"), model, optimizer)

=> Loading checkpoint


RuntimeError: CUDA out of memory. Tried to allocate 190.00 MiB (GPU 0; 6.00 GiB total capacity; 4.13 GiB already allocated; 189.56 MiB free; 4.21 GiB reserved in total by PyTorch)

In [13]:
src = "The feasibility study estimates that it would take passengers about four minutes to cross the Potomac River on the gondola."

prediction = process_sentence(model, src, inputText, outputText, device)
prediction = prediction[:-1]  # remove <eos> token

print(prediction)

['the', 'feasibility', 'study', 'estimate', 'that', 'it', 'would', 'take', 'passenger', 'about', 'four', 'minute', 'to', 'cross', 'the', 'potomac', 'river', 'on', 'the', 'gondola', '.']


## Evaluation

In [32]:
# Loading the test dataset
test_data = pd.read_csv("./training/test/test-sample.txt", header=0, names=['InputText', 'OutputText'], sep='\t', encoding='utf-8')
test_data.shape

(10220, 2)

In [33]:
# test_data = test_data.sample(n=5000)
# test_data = test_data[:1000]
# test_data = test_data.reset_index()
# test_data = test_data.drop(['index'], axis=1)
test_data

Unnamed: 0,InputText,OutputText
0,I do nt remember hearing the phrase white guil...,i do nt remember hear the phrase white guilt v...
1,Growing up black in the s I never had the impr...,grow up black in the s i never have the impres...
2,When I would stray into the wrong restaurant i...,when i would stray into the wrong restaurant i...
3,I can see now that possibly she was but then a...,i can see now that possibly she be but then al...
4,If there was guilt it was mine for having made...,if there be guilt it be mine for have make a i...
...,...,...
10215,The second six variables are the intraindividu...,the second six variable be the intraindividual...
10216,The effect size is the sex difference expresse...,the effect size be the sex difference express ...
10217,Thus the largest sex difference is in movement...,thus the large sex difference be in movement t...
10218,This is the only statistically significant sex...,this be the only statistically significant sex...


In [34]:
# Making predictions
predictions, targets = predict(test_data["InputText"], test_data["OutputText"], model, inputText, outputText, device)

100%|██████████| 10220/10220 [41:30<00:00,  4.10it/s]


In [35]:
# Evaluating with score
score = evaluate(targets, predictions)
print(f"Accuracy {score * 100:.2f}")

Accuracy 22.75


## Evaluation with Stanza

In [36]:
import stanza
stanza.download('en')

nlp = stanza.Pipeline(processors = "tokenize,mwt,lemma,pos")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 10.1MB/s]
2020-09-03 21:50:02 INFO: Downloading default packages for language: en (English)...
2020-09-03 21:50:03 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-09-03 21:50:06 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-09-03 21:50:06 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-09-03 21:50:06 INFO: Use device: gpu
2020-09-03 21:50:06 INFO: Loading: tokenize
2020-09-03 21:50:06 INFO: Loading: pos
2020-09-03 21:50:06 INFO: Loading: lemma
2020-09-03 21:50:07 INFO: Done loading processors!


In [37]:
def extract_lemma(df):
    prediction = []
    for iter in tqdm(range(len(df))):
        doc = nlp(df[iter])
        for sent in doc.sentences:
            lemma = []
            for wrd in sent.words:
                lemma.append(str(wrd.lemma).lower())
                # word.append(str(wrd.text))
            prediction.append(lemma)
            # target.append(word)
        #return a dataframe
    return prediction

In [38]:
def predictionStanza(data):
    prediction = []
    for iter in tqdm(range(len(data))):
        doc = nlp(data[iter])
        text = ""
        for sent in doc.sentences:
            for wrd in sent.words:
                text = text + wrd.lemma + " "
        lemma = text.split()
        prediction.append(lemma)
    return prediction

In [39]:
predictStanza = predictionStanza(test_data["InputText"])

100%|██████████| 10220/10220 [06:10<00:00, 27.62it/s]


In [40]:
score = evaluate(targets, predictStanza)
print(f"Accuracy {score * 100:.2f}")

Accuracy 24.69
