In [1]:
import pandas as pd

In [2]:
import spacy
import re
nlp = spacy.load('en_core_web_lg')
def tokenize(sentence):
    sentence = sentence.lower()
    sent = nlp(sentence)
    tokens = [token.lemma_ for token in sent if not token.is_stop]
    return tokens

def load_dataset(path):
    df = pd.read_csv(path, header=None, sep=';')
    df.columns=['sentence', 'sentiment']
    mapping = {'sadness': 1,
              'fear' : 1,
              'anger' : 1,
              'love' : 0,
              'surprise' : 0,
              'joy' : 0}
    df['sentiment'] = df.sentiment.map(mapping)
    df['sentence'] = df['sentence'].apply(tokenize)
    return df

In [3]:
df_train = load_dataset('train.txt')
df_val = load_dataset('val.txt')
df_test = load_dataset('test.txt')

In [4]:
import numpy as np
def X_y_builder(df):
    X = list([sent for sent in df.sentence ])
    y = np.array([label for label in df.sentiment])
    return X, y
X_train, y_train = X_y_builder(df_train)
X_val, y_val = X_y_builder(df_val)
X_test, y_test = X_y_builder(df_test)

In [5]:
import gensim.downloader as api
from gensim.models import Word2Vec

model_wiki = api.load('fasttext-wiki-news-subwords-300')
model_w2v = Word2Vec(X_train, size = 300, min_count = 2).wv  


In [6]:
from torchtext.vocab import build_vocab_from_iterator

    
vocab = build_vocab_from_iterator(X_train, specials=['<unk>'])
vocab.set_default_index(vocab['<unk>'])

In [7]:
len(vocab)

11857

In [8]:
def transform_vocab(X):
    return [vocab(sent) for sent in X]
X_train_tok = transform_vocab(X_train)
X_val_tok = transform_vocab(X_val)
X_test_tok = transform_vocab(X_test)

In [9]:
x_train_max_len = max([len(i) for i in X_train])
x_test_max_len = max([len(i) for i in X_test])
x_validation_max_len = max([len(i) for i in X_val])

MAX_LEN = max(x_train_max_len, x_test_max_len, x_validation_max_len)
MAX_LEN

34

In [10]:
import numpy as np
def padding(X):
    X_pad =  np.array([np.pad(sent, 
                              (0, MAX_LEN-len(sent)), 
                              constant_values=(0,0)) 
                       for sent in X])
    return X_pad

X_train_tok = padding(X_train_tok)
X_val_tok = padding(X_val_tok)
X_test_tok = padding(X_test_tok)

In [11]:
DICT_SIZE = len(vocab)
def create_weight_matrix(model, second_model=False):
    '''
    Accepts word embedding model
    and the second model, if provided
    Returns weight matrix of size m*n, where
    m - size of the dictionary
    n - size of the word embedding vector
    '''
    vector_size = model.get_vector('like').shape[0]
    w_matrix = np.zeros((DICT_SIZE, vector_size))
    skipped_words = []

    for index in range(1, len(vocab)):
        word = vocab.lookup_token(index)
        if index < DICT_SIZE:
            if word in model.index2word: 
                w_matrix[index] = model.get_vector(word)
        else:
            if second_model:
                if word in second_model.index2word:
                    w_matrix[index] = second_model.get_vector(word)
                else:
                    skipped_words.append(word)
            else:
                skipped_words.append(word)
 
    print(f'{len(skipped_words)} words were skipped. Some of them:')
    print(skipped_words[:50])
    return w_matrix

In [12]:
weight_matrix = create_weight_matrix(model_wiki, model_w2v)

0 words were skipped. Some of them:
[]


In [13]:
weight_matrix.shape

(11857, 300)

In [14]:
import torch
from torch.utils.data import TensorDataset, DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
BATCH_SIZE = 32

def dataset_creator(X, y):
    X = torch.from_numpy(X).to(device)
    y = torch.from_numpy(y).to(device)
    dataset = TensorDataset(X, y)
    return DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

dataloader_train = dataset_creator(X_train_tok, y_train)
dataloader_val = dataset_creator(X_val_tok, y_val)
dataloader_test = dataset_creator(X_test_tok, y_test)



In [15]:
from torch import nn

class BRNN(nn.Module):
    def __init__(self, weight, output_dim): 
        super().__init__()     
        self.embedding = nn.Embedding.from_pretrained(weight).float()
        
        self.lstm1 = nn.LSTM(weight.shape[-1], 128, bidirectional=True, batch_first=True)
        self.dropout1 = nn.Dropout(p=0.2)
        
        self.lstm2 = nn.LSTM(128*2, 256, bidirectional=True, batch_first=True)
        self.dropout2 = nn.Dropout(p=0.2)
        
        self.lstm3 = nn.LSTM(256*2, 128, bidirectional=True, batch_first=True)
        self.dropout3 = nn.Dropout(p=0.2)
        
        self.fc = nn.Linear(128*2, output_dim)
        
    def forward(self, text):  
        embedded = self.embedding(text)   
          
        out_lstm1, _ = self.lstm1(text)
        out_lstm1 = self.dropout1(out_lstm1)
        
        out_lstm2, _ = self.lstm2(out_lstm1)
        out_lstm2 = self.dropout2(out_lstm2)
        
        out_lstm3, _ = self.lstm3(out_lstm2)
        out_lstm3 = self.dropout3(out_lstm3)

        out = self.fc(out_lstm3[:,-1,:])
        return out

In [16]:
num_classes = 2
model = BRNN(weight=torch.tensor(weight_matrix).to(device), output_dim=num_classes)
model.to(device)



BRNN(
  (embedding): Embedding(11857, 300)
  (lstm1): LSTM(300, 128, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (lstm2): LSTM(256, 256, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.2, inplace=False)
  (lstm3): LSTM(512, 128, batch_first=True, bidirectional=True)
  (dropout3): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [17]:
import torch.optim as optim

optimizer = optim.SGD(model.parameters(), lr=0.01) #optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()


In [18]:
def binary_accuracy(preds, y):
    correct = preds.argmax(axis=1)==y
    acc = correct.sum() / len(correct)
    return acc


In [19]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    for batch in iterator:
        predictions = model.forward(batch[0])
        loss = criterion(predictions, batch[1])  
        acc = binary_accuracy(predictions, batch[1]) 
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()


        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)



In [20]:
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad(): 
        for batch in iterator:
            predictions = model(batch[0])
            loss = criterion(predictions, batch[1])       
            acc = binary_accuracy(predictions, batch[1])

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [21]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [22]:
import time


N_EPOCHS = 5

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_acc = train(model, dataloader_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, dataloader_val, criterion)
    valid_loss, valid_acc = 0,0

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

RuntimeError: input must have 3 dimensions, got 2

In [None]:
model.eval()
for batch in dataloader_train:
    print(model(batch[0]))
    print(batch[1])
    break