## Imports 

In [2]:
import numpy as np
import json
import re
import math
import copy
from sklearn.metrics import classification_report

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate

## Some useful constants

In [3]:
EMB_SIZE = 300
PAD_TOKEN = '<PAD>'
UNK_TOKEN = '<UNK>'
TRAIN_BATCH_SIZE = 32
TEST_EVAL_BATCH_SIZE = 32
# You can define your own constant in here
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## Load the data

This data is based on
<a href="http://www.cs.cornell.edu/people/pabo/movie-review-data/">this link</a>
and contains movie reviews sentiment-analysis.

In [4]:
with open('./dataset.json') as f:
    all_dataset = json.load(f)
    
for section in all_dataset.keys():
    l = len(all_dataset[section])
    print(f"{section} lenght is: {l}")

train lenght is: 8000
eval lenght is: 2000
test lenght is: 662


## Download and extract the embeddings

In [None]:
!wget https://nlp.stanford.edu/data/glove.6B.zip

In [None]:
!unzip ./glove.6B.zip -d "./glove/"

Archive:  ./glove.6B.zip
  inflating: ./glove/glove.6B.50d.txt  
  inflating: ./glove/glove.6B.100d.txt  
  inflating: ./glove/glove.6B.200d.txt  
  inflating: ./glove/glove.6B.300d.txt  


## Create embedding matrix and useful functions

In [5]:
word_list = []
emb_list = []
with open(f'./glove/glove.6B.{EMB_SIZE}d.txt','r') as f:
    for line in f.read().strip().split('\n'):
        values = line.split()
        word = values[0]
        emb = values[1:]
        word_list.append(word)
        emb_list.append(emb)
        
emb_matrix = np.array(emb_list, 'float32')

In [6]:
# We initialize <UNK> token as an average of all embedings
unk_emb = np.mean(emb_matrix, axis=0, keepdims=True)
word_list.append(UNK_TOKEN)
emb_matrix = np.vstack((emb_matrix, unk_emb))

# We initialize <PAD> token as zeroes
pad_emb = np.zeros((1, EMB_SIZE))
word_list.append(PAD_TOKEN)
emb_matrix = np.vstack((emb_matrix, pad_emb))

In [7]:
reverse_map = {word: id for (id, word) in enumerate(word_list)}

def word_to_ids(word: str) -> list:
    word = word.strip()
    if word == "":
        return []
    if word in reverse_map:
        return [reverse_map[word]]
    elif word[-3:] in ["n't", "'re"]:
        return word_to_ids(word[:-3]) + word_to_ids(word[-3:])
    elif word[-2:] in ["'s", "'d", "'m"]:
        return word_to_ids(word[:-2]) + word_to_ids(word[-2:])
    else:
        word = word.replace("'", "")
        if word in reverse_map:
            return [reverse_map[word]]
    return [reverse_map[UNK_TOKEN]]
    
def id_to_word(id: int) -> str:
    return word_list[id]

## Tokenizer and sentence useful tools

In [8]:
def tokenizer(sentence: str) -> list:
    sentence = sentence.strip()
    return re.split("[ -]+", sentence)

def sentence_to_ids(sentence: str) -> list:
    return sum(map(word_to_ids, tokenizer(sentence)), [])

def ids_to_sentence(ids: list) -> list:
    return ' '.join(map(id_to_word, ids))

## Build embedding layer

In [9]:
matrix_len = len(word_list)
weights_matrix = np.zeros((matrix_len, EMB_SIZE))
words_found = 0

for i, word in enumerate(word_list):
    try: 
        weights_matrix[i] = emb_matrix[word_to_ids(word)]
        words_found += 1
    except KeyError:
        weights_matrix[i] = np.random.normal(scale=0.6, size=(EMB_SIZE, ))

def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': weights_matrix})
    if non_trainable:
        print("NOT TRAIN EMBED")
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

In [10]:
def train(model, iterator, optimizer, criterion):
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        x, trg = batch
        optimizer.zero_grad()
        x = x.type(torch.LongTensor).to(device)
        output = model(x)

        loss = criterion(output, trg.to(device))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    return epoch_loss / len(iterator)

def evaluate(model, iterator, criterion):
    model.eval()
    epoch_loss = 0
    
    with torch.no_grad():
        for i, batch in enumerate(iterator):
            x, trg = batch
            x = x.type(torch.LongTensor).to(device)
            output = model(x)

            loss = criterion(output, trg.to(device))
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

def get_all_targets_and_predicted(model, iterator):
    all_trg = []
    all_prd = []
    with torch.no_grad():
        for batch in iterator:
            x, trg = batch
            x = x.type(torch.LongTensor).to(device)
            output = model(x)
            
            prd = output.argmax(1).tolist()
            
            all_trg += trg
            all_prd += prd
    return all_trg, all_prd

In [11]:
class LSTM(nn.Module):
    def __init__(self, weights_matrix, hidden_size, num_layers, output_size, train_embedding=False):
        super(LSTM, self).__init__()

        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix, not train_embedding)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_().to(device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).requires_grad_().to(device)
        x = self.embedding(x)
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        out = out[:, -1,:]
        out = self.fc(out)
        return out

In [12]:
def pad_features(review, seq_length):
    review_len = len(review)
    
    if review_len <= seq_length:
        zeroes = list(np.zeros(seq_length-review_len))
        new = zeroes+list(review)
    elif review_len > seq_length:
        new = review[0:seq_length]    
    return np.array(new, dtype='int')

In [13]:
class Method2Dataset(Dataset):
    def __init__(self, datadict):
        self.data = [(pad_features(sentence_to_ids(sentence), 50), semantic) for sentence, semantic in datadict]

    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

train_dataset = Method2Dataset(all_dataset['train'])
eval_dataset = Method2Dataset(all_dataset['eval'])
test_dataset = Method2Dataset(all_dataset['test'])

train_dataloader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, drop_last=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=TEST_EVAL_BATCH_SIZE, shuffle=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=TEST_EVAL_BATCH_SIZE, shuffle=True, drop_last=True)

In [14]:
input_size = 1
hidden_size = 100
num_layers = 2
output_size = 2

num_epochs = 10
learning_rate = 1e-3

def train_and_report(weights_matrix, train_embed):
    weights_matrix = torch.tensor(weights_matrix).clone()
    lstm = LSTM(weights_matrix, hidden_size, num_layers, output_size, train_embed).float().to(device)
    optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate, weight_decay=1e-3)
    criterion = nn.CrossEntropyLoss()
    best_model_wight = None
    min_loss = 1e8
    for epoch in range(num_epochs):
        train_loss = train(lstm, train_dataloader, optimizer, criterion)
        valid_loss = evaluate(lstm, eval_dataloader, criterion)

        if min_loss > valid_loss:
            min_loss = valid_loss
            best_model_wight = copy.deepcopy(lstm.state_dict())
        
        print(f'Epoch: {epoch+1:02}')
        print(f'\tTrain Loss: {train_loss:.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f}')
    
    # Load best model
    lstm.load_state_dict(best_model_wight)
    
    print('__________________TRAIN DATASET__________________')
    trg, prd = get_all_targets_and_predicted(lstm, train_dataloader)
    print(classification_report(trg, prd))
    print('__________________EVAL DATASET__________________')
    trg, prd = get_all_targets_and_predicted(lstm, eval_dataloader)
    print(classification_report(trg, prd))
    print('__________________TEST DATASET__________________')
    trg, prd = get_all_targets_and_predicted(lstm, test_dataloader)
    print(classification_report(trg, prd))

## Freeze embeding layer

In [15]:
train_and_report(weights_matrix, False)

NOT TRAIN EMBED
Epoch: 01
	Train Loss: 0.582
	 Val. Loss: 0.528
Epoch: 02
	Train Loss: 0.495
	 Val. Loss: 0.498
Epoch: 03
	Train Loss: 0.471
	 Val. Loss: 0.483
Epoch: 04
	Train Loss: 0.457
	 Val. Loss: 0.501
Epoch: 05
	Train Loss: 0.431
	 Val. Loss: 0.479
Epoch: 06
	Train Loss: 0.420
	 Val. Loss: 0.458
Epoch: 07
	Train Loss: 0.400
	 Val. Loss: 0.466
Epoch: 08
	Train Loss: 0.386
	 Val. Loss: 0.469
Epoch: 09
	Train Loss: 0.365
	 Val. Loss: 0.459
Epoch: 10
	Train Loss: 0.344
	 Val. Loss: 0.493
__________________TRAIN DATASET__________________
              precision    recall  f1-score   support

           0       0.82      0.85      0.83      4000
           1       0.84      0.81      0.83      4000

    accuracy                           0.83      8000
   macro avg       0.83      0.83      0.83      8000
weighted avg       0.83      0.83      0.83      8000

__________________EVAL DATASET__________________
              precision    recall  f1-score   support

           0       0.76

## Train Embedding layer

In [57]:
train_and_report(weights_matrix, True)

Epoch: 01
	Train Loss: 0.572
	 Val. Loss: 0.520
Epoch: 02
	Train Loss: 0.471
	 Val. Loss: 0.479
Epoch: 03
	Train Loss: 0.409
	 Val. Loss: 0.493
Epoch: 04
	Train Loss: 0.349
	 Val. Loss: 0.499
Epoch: 05
	Train Loss: 0.281
	 Val. Loss: 0.535
Epoch: 06
	Train Loss: 0.221
	 Val. Loss: 0.598
Epoch: 07
	Train Loss: 0.190
	 Val. Loss: 0.689
Epoch: 08
	Train Loss: 0.147
	 Val. Loss: 0.640
Epoch: 09
	Train Loss: 0.136
	 Val. Loss: 0.848
Epoch: 10
	Train Loss: 0.099
	 Val. Loss: 0.754
__________________TRAIN DATASET__________________
              precision    recall  f1-score   support

           0       0.85      0.91      0.88      4000
           1       0.90      0.84      0.87      4000

    accuracy                           0.87      8000
   macro avg       0.88      0.87      0.87      8000
weighted avg       0.88      0.87      0.87      8000

__________________EVAL DATASET__________________
              precision    recall  f1-score   support

           0       0.76      0.83      