# Lemmatization using Attention Mechanism Preloaded Model

In [1]:
# Import library
import random
import torch
import torch.nn as nn
import pandas as pd
import torch.optim as optim
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import spacy
import pickle
import csv
from utils import process_sentence, load_obj, save_obj, save_checkpoint, load_checkpoint, predict, evaluate
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data import Field, BucketIterator, TabularDataset, Dataset
from sklearn.model_selection import train_test_split

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 20.4MB/s]
2020-08-31 13:13:17 INFO: Downloading default packages for language: en (English)...
2020-08-31 13:13:18 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-08-31 13:13:21 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-08-31 13:13:21 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| pos       | ewt       |
| lemma     | ewt       |
| depparse  | ewt       |
| sentiment | sstplus   |
| ner       | ontonotes |

2020-08-31 13:13:22 INFO: Use device: gpu
2020-08-31 13:13:22 INFO: Loading: tokenize
2020-08-31 13:13:24 INFO: Loading: pos
2020-08-31 13:13:25 INFO: Loading: lemma
2020-08-31 13:13:25 INFO: Loading: depparse
2020-08-31 13:13:25 INFO: Loading: sentiment
2020-08-31 13:13:26 INFO: Loading: ner
2020-08-31 13:13:27 IN

In [2]:
spacy_input = spacy.load("en")
spacy_output = spacy.load("en")

def tokenize_input(text):
    return [token.text for token in spacy_input.tokenizer(text)]

def tokenize_output(text):
    return [token.text for token in spacy_output.tokenizer(text)]

In [3]:
inputText = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_input, lower=True)
outputText = Field(init_token="<sos>", eos_token="<eos>", tokenize=tokenize_output, lower=True)

In [4]:
fields = {"InputText": ("inputText", inputText), "OutputText": ("outputText", outputText)}

In [5]:
inputText.vocab = load_obj("inputText")
outputText.vocab = load_obj("outputText")

In [6]:
class Transformer(nn.Module):
    def __init__(
        self,
        embedding_size,
        src_vocab_size,
        trg_vocab_size,
        src_pad_idx,
        num_heads,
        num_encoder_layers,
        num_decoder_layers,
        forward_expansion,
        dropout,
        max_len,
        device,
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)

        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout,
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = src_pad_idx

    def make_src_mask(self, inputText):
        src_mask = inputText.transpose(0, 1) == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, inputText, outputText):
        src_seq_length, N = inputText.shape
        trg_seq_length, N = outputText.shape

        src_positions = (
            torch.arange(0, src_seq_length)
            .unsqueeze(1)
            .expand(src_seq_length, N)
            .to(self.device)
        )

        trg_positions = (
            torch.arange(0, trg_seq_length)
            .unsqueeze(1)
            .expand(trg_seq_length, N)
            .to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(inputText) + self.src_position_embedding(src_positions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embedding(outputText) + self.trg_position_embedding(trg_positions))
        )

        src_padding_mask = self.make_src_mask(inputText)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [7]:
# We're ready to define everything we need for training our Seq2Seq model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

load_model = True
save_model = True

# Training hyperparameters
num_epochs = 10
learning_rate = 3e-4
batch_size = 32

# Model hyperparameters
src_vocab_size = len(inputText.vocab)
trg_vocab_size = len(outputText.vocab)
embedding_size = 512    # default: 512
num_heads = 8
num_encoder_layers = 3  # 6 in paper
num_decoder_layers = 3
dropout = 0.10
max_len = 70
forward_expansion = 4
src_pad_idx = inputText.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0

In [8]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_len,
    device,
).to(device)

In [9]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [10]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [11]:
pad_idx = inputText.vocab.stoi["<pad>"]
criterion = nn.CrossEntropyLoss(ignore_index = pad_idx)

In [12]:
if load_model:
    load_checkpoint(torch.load("./model/my_checkpoint.pth.tar"), model, optimizer)

=> Loading checkpoint


In [13]:
src = "The feasibility study estimates that it would take passengers about four minutes to cross the Potomac River on the gondola."

prediction = process_sentence(model, src, inputText, outputText, device)
prediction = prediction[:-1]  # remove <eos> token

print(prediction)

['the', 'feasibility', 'study', 'estimate', 'that', 'it', 'would', 'take', 'passenger', 'about', 'four', 'minute', 'to', 'cross', 'the', 'potomac', 'river', 'on', 'the', 'gondola', '.']


## Evaluation

In [14]:
# Loading the test dataset
test_data = pd.read_csv("./training/test/test-sample.txt", header=0, names=['InputText', 'OutputText'], sep='\t', encoding='utf-8')
test_data.shape

(10220, 2)

In [15]:
test_data = test_data.sample(n=2000)
# test_data = test_data[:1000]
test_data = test_data.reset_index()
test_data = test_data.drop(['index'], axis=1)
test_data

Unnamed: 0,InputText,OutputText
0,The stenciled images are of things an insect r...,the stenciled image be of thing a insect repea...
1,The justice of any particular holding is a mat...,the justice of any particular holding be a mat...
2,The major psychiatric illnesses are diseases,the major psychiatric illness be disease
3,Anyone who went through this curriculum got a ...,anyone who go through this curriculum get a li...
4,It was in that car that Max and I would drive ...,it be in that car that maximum and i would dri...
...,...,...
1995,And it would consider how to get students to a...,and it would consider how to get student to ac...
1996,After the third condition another trials of th...,after the third condition another trial of the...
1997,Santayana was criticized by pragmatists for hi...,santayana be criticize by pragmatist for his i...
1998,He would not assume an adversarial relationshi...,he would not assume a adversarial relationship...


In [16]:
# Making predictions
predictions, targets = predict(test_data["InputText"], test_data["OutputText"], model, inputText, outputText, device)

100%|██████████| 2000/2000 [24:42<00:00,  1.35it/s]


In [17]:
# Evaluating with score
score = evaluate(targets, predictions)
print(f"Accuracy {score * 100:.2f}")

Accuracy 22.20


## Evaluation with Stanza

In [18]:
import stanza
stanza.download('en')

nlp = stanza.Pipeline(processors = "tokenize,mwt,lemma,pos")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.1.0.json: 122kB [00:00, 13.6MB/s]
2020-08-31 13:38:17 INFO: Downloading default packages for language: en (English)...
2020-08-31 13:38:17 INFO: File exists: C:\Users\shoeb\stanza_resources\en\default.zip.
2020-08-31 13:38:20 INFO: Finished downloading models and saved to C:\Users\shoeb\stanza_resources.
2020-08-31 13:38:20 INFO: Loading these models for language: en (English):
| Processor | Package |
-----------------------
| tokenize  | ewt     |
| pos       | ewt     |
| lemma     | ewt     |

2020-08-31 13:38:20 INFO: Use device: gpu
2020-08-31 13:38:20 INFO: Loading: tokenize
2020-08-31 13:38:20 INFO: Loading: pos
2020-08-31 13:38:21 INFO: Loading: lemma
2020-08-31 13:38:21 INFO: Done loading processors!


In [19]:
def extract_lemma(df):
    prediction = []
    for iter in tqdm(range(len(df))):
        doc = nlp(df[iter])
        for sent in doc.sentences:
            lemma = []
            for wrd in sent.words:
                lemma.append(str(wrd.lemma).lower())
                # word.append(str(wrd.text))
            prediction.append(lemma)
            # target.append(word)
        #return a dataframe
    return prediction

In [20]:
def predictionStanza(data):
    prediction = []
    for iter in tqdm(range(len(data))):
        doc = nlp(data[iter])
        text = ""
        for sent in doc.sentences:
            for wrd in sent.words:
                text = text + wrd.lemma + " "
        lemma = text.split()
        prediction.append(lemma)
    return prediction

In [21]:
predictStanza = predictionStanza(test_data["InputText"])

100%|██████████| 2000/2000 [01:12<00:00, 27.54it/s]


In [22]:
score = evaluate(targets, predictStanza)
print(f"Accuracy {score * 100:.2f}")

Accuracy 23.60


# test

In [14]:
import json

testData = []
print("Started Reading JSON file which contains multiple JSON document")
with open('./data/test.json') as f:
    for jsonObj in f:
        testDict = json.loads(jsonObj)
        testData.append(testDict)

Started Reading JSON file which contains multiple JSON document


In [15]:
inText = []
outText = []
for iter in tqdm(range(len(testData))):
    inText.append([testData[iter]['InputText']])
    outText.append([testData[iter]['OutputText']])
# testData[0]["OutputText"]

100%|██████████| 916/916 [00:00<00:00, 918694.99it/s]


In [16]:
outText[0]

['oil start flow from the Swanson River oil field in 1957 .']

In [17]:
inText[0]

['Oil started flowing from the Swanson River oil field in 1957 .']

In [18]:
inputData = pd.DataFrame(inText)
targetData = pd.DataFrame(outText)
inputData.columns = ["InputText"]
targetData.columns = ["OutputText"]

In [19]:
inputData.head()

Unnamed: 0,InputText
0,Oil started flowing from the Swanson River oil...
1,According to the National Institute of Mental ...
2,If you did n't find what you were looking for ...
3,We were terrified when the epidemic started be...
4,It 's this detection technique that caught Tyl...


In [20]:
inputData["InputText"][0]

'Oil started flowing from the Swanson River oil field in 1957 .'

In [21]:
target = targetData["OutputText"].apply(tokenize_input)

In [22]:
target[0]

['oil',
 'start',
 'flow',
 'from',
 'the',
 'Swanson',
 'River',
 'oil',
 'field',
 'in',
 '1957',
 '.']