# LSTM in PyTorch

In [None]:
#library imports
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
import spacy
#import jovian
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


In [None]:
import os
import re
import collections
import timeit

In [None]:
from sklearn.model_selection import train_test_split


## Basic LSTM in Pytorch with random numbers

## Multiclass Text Classification

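We are going to predict item ratings based on customer reviews, based on this dataset from Kaggle:
https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews

Note: the loading cells below read a folder-per-class corpus from `data/` (20 newsgroup categories), so the labels in this version of the notebook are topic indices (0-19) rather than ratings.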

In [None]:
# Notebook shell escape: extract data.zip into the working directory.
# Presumably this creates the 'data' folder read by the cells below — verify.
!unzip data.zip

In [None]:
# Map each class folder under `mypath` to an integer label.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the label assignment reproducible across runs and machines. Plain files
# are skipped because only folders hold documents.
mypath = 'data'
folders = sorted(
    f for f in os.listdir(mypath)
    if os.path.isdir(os.path.join(mypath, f))
)
folder_indexes = {v: i for i, v in enumerate(folders)}
folder_indexes

{'alt.atheism': 10,
 'comp.graphics': 3,
 'comp.os.ms-windows.misc': 12,
 'comp.sys.ibm.pc.hardware': 9,
 'comp.sys.mac.hardware': 16,
 'comp.windows.x': 17,
 'misc.forsale': 13,
 'rec.autos': 8,
 'rec.motorcycles': 7,
 'rec.sport.baseball': 4,
 'rec.sport.hockey': 18,
 'sci.crypt': 19,
 'sci.electronics': 1,
 'sci.med': 6,
 'sci.space': 5,
 'soc.religion.christian': 15,
 'talk.politics.guns': 0,
 'talk.politics.mideast': 2,
 'talk.politics.misc': 14,
 'talk.religion.misc': 11}

In [None]:
# Load the corpus: every file under data/<folder>/ becomes one cleaned,
# lower-cased document in X, with its folder's integer label in y.
classes = []
X = []
y = []
mypath = 'data'
# Sorted so that document order (and the vocabulary indices built from it
# later) is reproducible; os.listdir order is arbitrary. Plain files are
# skipped because they cannot be listed as class folders.
folders = sorted(
    f for f in os.listdir(mypath)
    if os.path.isdir(os.path.join(mypath, f))
)

# Cleaning patterns, compiled once instead of once per file. Raw strings avoid
# the invalid-escape warning that '\s+' triggers in a normal string literal.
email_re = re.compile(r'[\w\.-]+@[\w\.-]+')
non_text_re = re.compile("[^a-zA-Z,.']")
dots_re = re.compile(r'\.{2,}')
spaces_re = re.compile(r'\s+')

for folder in folders:
    f_path = os.path.join(mypath, folder)
    label = int(folder_indexes[folder])
    for fname in sorted(os.listdir(f_path)):
        path = os.path.join(f_path, fname)
        with open(path, 'r', errors='ignore', encoding="utf8") as f:
            cur = f.read().lower()
        cur = email_re.sub(' ', cur)      # drop e-mail addresses
        cur = non_text_re.sub('  ', cur)  # keep only letters , . '
        cur = dots_re.sub(' ', cur)       # runs of two or more dots -> space
        cur = spaces_re.sub(' ', cur)     # collapse whitespace runs
        X.append(cur)
        y.append(label)


In [None]:
# Two-column frame: cleaned document text and its integer class label.
df = pd.DataFrame(dict(doc=X, labels=y))

In [None]:
# Tokenization
# Only the tokenizer is used below, so a blank English pipeline is sufficient.
# The 'en' shortcut accepted by spacy.load() was removed in spaCy v3, while
# spacy.blank('en') works without downloading a model package.
tok = spacy.blank('en')

# Compiled once rather than on every call.
# Matches punctuation, digits and the \r, \t, \n control characters.
_PUNCT_DIGIT_RE = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
_NON_ASCII_RE = re.compile(r"[^\x00-\x7F]+")


def tokenize (text):
    """Split ``text`` into a list of token strings.

    Runs of non-ASCII characters are replaced by a space, the text is
    lower-cased, punctuation and digits are replaced by spaces, and the result
    is passed through the spaCy English tokenizer.

    Args:
        text: the raw document string.

    Returns:
        A list with the text of each token, in order.
    """
    text = _NON_ASCII_RE.sub(" ", text)
    nopunct = _PUNCT_DIGIT_RE.sub(" ", text.lower())
    return [token.text for token in tok.tokenizer(nopunct)]

In [None]:
# Count the number of occurrences of each token across all documents.
counts = Counter()
for doc in df['doc']:
    counts.update(tokenize(doc))

In [None]:
# Remove words seen fewer than twice, printing the vocabulary size before and after.
print("num_words before:",len(counts.keys()))
rare_words = [w for w, freq in counts.items() if freq < 2]
for w in rare_words:
    del counts[w]
print("num_words after:",len(counts.keys()))

num_words before: 85667
num_words after: 55497


In [None]:
# Build the vocabulary. Index 0 is the empty padding token, index 1 is "UNK",
# and each remaining word takes the next index in `counts` iteration order.
words = ["", "UNK"] + list(counts)
vocab2index = {w: idx for idx, w in enumerate(words)}

In [None]:
def encode_sentence(text, vocab2index, N=500):
    """Convert ``text`` into a fixed-length array of vocabulary ids.

    The text is split with ``tokenize``. Tokens missing from ``vocab2index``
    are mapped to the ``"UNK"`` id. The id sequence is cut to ``N`` entries or
    right-padded with zeros up to ``N``.

    Args:
        text: document string to encode.
        vocab2index: mapping from token to integer id; must contain "UNK".
        N: length of the returned id array.

    Returns:
        A pair ``(ids, length)``: the ``N``-element integer array and
        ``min(N, number of tokens)``, the count of non-padding positions.
    """
    token_ids = np.array(
        [vocab2index.get(token, vocab2index["UNK"]) for token in tokenize(text)]
    )
    n_used = min(N, len(token_ids))
    padded = np.zeros(N, dtype=int)
    padded[:n_used] = token_ids[:n_used]
    return padded, n_used

In [None]:
# Encode every document as a (padded id array, true length) pair.
# dtype=object is required: the pair mixes an array with a scalar, and
# NumPy >= 1.24 raises ValueError when asked to build such a ragged array
# implicitly (older versions produced the same object array with a warning).
df['encoded'] = df['doc'].apply(
    lambda x: np.array(encode_sentence(x, vocab2index), dtype=object)
)

In [None]:
# Preview the first rows of the frame (columns: doc, labels, encoded).
df.head()

Unnamed: 0,doc,labels,encoded
0,a few comments on the atf's botched handling o...,0,"[[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ..."
1,"in article qvh n , gedaliah friedenberg writes...",0,"[[19, 295, 296, 297, 14, 298, 299, 300, 19, 29..."
2,.acns.colostate.edu douglas craig holland writ...,0,"[[40, 547, 1, 548, 549, 550, 551, 300, 552, 40..."
3,has anyone noticed or commented on the fact th...,0,"[[106, 668, 657, 249, 669, 5, 6, 670, 31, 671,..."
4,in article r ito c. d. tavares writes it's har...,0,"[[19, 295, 553, 702, 652, 40, 56, 40, 703, 300..."


In [None]:
# Check how balanced the dataset is: number of documents per class label.
Counter(df['labels'])

In [None]:
# Hold out 20% of the documents for validation; stratifying on the labels
# keeps the class proportions the same in both splits.
X = list(df['encoded'])
y = list(df['labels'])

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
)

#### Pytorch Dataset

In [None]:
class ReviewsDataset(Dataset):
    """Dataset over pre-encoded documents and their labels.

    Each element of ``X`` is indexed as ``(token_ids, length)``, where
    ``token_ids`` is a NumPy array; ``Y`` holds the matching labels.
    """

    def __init__(self, X, Y):
        self.X, self.y = X, Y

    def __len__(self):
        # The label sequence defines the dataset size.
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(token ids as an int32 tensor, label, length)`` for ``idx``."""
        sample = self.X[idx]
        token_ids = torch.from_numpy(sample[0].astype(np.int32))
        return token_ids, self.y[idx], sample[1]

In [None]:
# Wrap the training and validation splits in Dataset objects.
train_ds, valid_ds = ReviewsDataset(X_train, y_train), ReviewsDataset(X_valid, y_valid)

In [None]:
# Loader configuration; only the training loader shuffles its samples.
batch_size = 5000
vocab_size = len(words)
train_dl = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
val_dl = DataLoader(dataset=valid_ds, batch_size=batch_size)

In [None]:
def train_model(model, epochs=10, lr=0.001):
    """Train ``model`` as a classifier with Adam and cross-entropy loss.

    Batches are read from the module-level ``train_dl``. After every epoch the
    model is scored on ``val_dl`` through ``validation_metrics``; the metrics
    are printed only on epochs where ``epoch % 5 == 1``.

    Args:
        model: module called as ``model(x, lengths)`` that returns class logits.
        epochs: number of passes over ``train_dl``.
        lr: learning rate handed to Adam.
    """
    # Frozen parameters (requires_grad == False) are left out of the optimizer.
    trainable = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss, n_seen = 0.0, 0
        for tokens, labels, lengths in train_dl:
            tokens, labels = tokens.long(), labels.long()
            logits = model(tokens, lengths)
            optimizer.zero_grad()
            batch_loss = F.cross_entropy(logits, labels)
            batch_loss.backward()
            optimizer.step()
            # Weight the batch mean by batch size so the epoch figure is a per-sample mean.
            n_batch = labels.shape[0]
            running_loss += batch_loss.item()*n_batch
            n_seen += n_batch
        val_loss, val_acc, val_rmse = validation_metrics(model, val_dl)
        if epoch % 5 == 1:
            print("train loss %.3f, val loss %.3f, val accuracy %.3f, and val rmse %.3f" % (running_loss/n_seen, val_loss, val_acc, val_rmse))

def validation_metrics(model, valid_dl):
    """Evaluate a classifier on ``valid_dl``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: module called as ``model(x, lengths)`` that returns class logits.
        valid_dl: iterable of ``(x, y, lengths)`` batches.

    Returns:
        A tuple ``(loss, accuracy, rmse)``:
        per-sample mean cross-entropy, fraction of correct argmax predictions
        (a 0-dim tensor), and the root mean squared error between predicted
        and true class indices over all samples.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    correct = 0
    total = 0
    sum_loss = 0.0
    sum_sq_err = 0.0
    # No autograd graph is needed for evaluation.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.long()
            y_hat = model(x, l)
            loss = F.cross_entropy(y_hat, y)
            pred = torch.max(y_hat, 1)[1]
            n = y.shape[0]
            correct += (pred == y).float().sum()
            total += n
            sum_loss += loss.item()*n
            # Accumulate squared errors and take one square root at the end:
            # averaging per-batch RMSE values depends on how the data happens
            # to be batched and is not the RMSE of the whole set.
            sum_sq_err += ((pred - y).float() ** 2).sum().item()
    return sum_loss/total, correct/total, np.sqrt(sum_sq_err/total)

### LSTM with fixed length input

In [None]:
class LSTM_fixed_len(torch.nn.Module) :
    """LSTM classifier that reads every padded sequence in full.

    Token ids are embedded (id 0 is the padding index), passed through dropout
    and a single-layer batch-first LSTM; the final hidden state is projected to
    ``num_classes`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits. Defaults to 20, the value that
            was previously hard-coded.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=20) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for the batch of token ids ``x``.

        ``l`` (the sequence lengths) is accepted so the model can be called as
        ``model(x, lengths)`` like the other models, but it is not used here.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        # ht[-1] is the final hidden state of the last LSTM layer.
        return self.linear(ht[-1])

In [None]:
# Fixed-length LSTM classifier: 50-d embeddings, hidden state of size 5.
model_fixed = LSTM_fixed_len(vocab_size=vocab_size, embedding_dim=50, hidden_dim=5)

In [None]:
# Train the fixed-length classifier for 30 epochs.
train_model(model=model_fixed, epochs=30, lr=0.01)

train loss 3.012, val loss 3.004, val accuracy 0.052, and val rmse 7.430
train loss 2.982, val loss 2.985, val accuracy 0.058, and val rmse 6.500
train loss 2.963, val loss 2.974, val accuracy 0.062, and val rmse 6.070
train loss 2.940, val loss 2.969, val accuracy 0.061, and val rmse 6.106
train loss 2.916, val loss 2.971, val accuracy 0.062, and val rmse 6.088


KeyboardInterrupt: ignored

In [None]:
# Continue training the same model for another 30 epochs.
train_model(model=model_fixed, epochs=30, lr=0.01)

In [None]:
# Continue training the same model for another 30 epochs.
train_model(model=model_fixed, epochs=30, lr=0.01)

### LSTM with variable length input

In [None]:
class LSTM_variable_input(torch.nn.Module) :
    """LSTM classifier that skips padding by packing sequences by length.

    Token ids are embedded (id 0 is the padding index), passed through dropout,
    packed according to their true lengths and fed to a single-layer
    batch-first LSTM; the final hidden state is projected to ``num_classes``
    logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        num_classes: number of output logits. Defaults to 20 to match the
            labels produced by this notebook's data loading and the
            ``LSTM_fixed_len`` model; the previous hard-coded 5 made
            cross-entropy fail on labels 5-19.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_classes=20) :
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dropout = nn.Dropout(0.3)
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)

    def forward(self, x, s):
        """Return class logits for token ids ``x`` with true lengths ``s``."""
        x = self.embeddings(x)
        x = self.dropout(x)
        # pack_padded_sequence needs CPU lengths that are all > 0. A document
        # that is empty after cleaning has length 0, so it is treated as one
        # padding step instead of raising.
        lengths = torch.as_tensor(s, dtype=torch.int64).cpu().clamp(min=1)
        x_pack = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        out_pack, (ht, ct) = self.lstm(x_pack)
        # ht[-1] is the final hidden state of the last LSTM layer.
        out = self.linear(ht[-1])
        return out

In [None]:
# Variable-length LSTM classifier: 50-d embeddings, hidden state of size 50.
model = LSTM_variable_input(vocab_size=vocab_size, embedding_dim=50, hidden_dim=50)

In [None]:
# Train the current `model` for 30 epochs.
train_model(model=model, epochs=30, lr=0.1)

train loss 1.328, val loss 1.250, val accuracy 0.515, and val rmse 1.312
train loss 1.031, val loss 1.063, val accuracy 0.577, and val rmse 1.017
train loss 0.904, val loss 0.995, val accuracy 0.603, and val rmse 0.941
train loss 0.849, val loss 1.000, val accuracy 0.599, and val rmse 0.940
train loss 0.845, val loss 1.009, val accuracy 0.598, and val rmse 0.921
train loss 0.834, val loss 1.005, val accuracy 0.593, and val rmse 0.902


In [None]:
# Continue training the current `model` for 30 more epochs at a lower learning rate.
train_model(model=model, epochs=30, lr=0.05)

train loss 0.828, val loss 1.000, val accuracy 0.599, and val rmse 0.920
train loss 0.790, val loss 0.989, val accuracy 0.605, and val rmse 0.894
train loss 0.775, val loss 0.992, val accuracy 0.614, and val rmse 0.884
train loss 0.755, val loss 0.994, val accuracy 0.597, and val rmse 0.883
train loss 0.738, val loss 0.987, val accuracy 0.608, and val rmse 0.872
train loss 0.741, val loss 1.005, val accuracy 0.611, and val rmse 0.888


In [None]:
# Continue training the current `model` for another 30 epochs.
train_model(model=model, epochs=30, lr=0.05)

train loss 0.758, val loss 1.028, val accuracy 0.616, and val rmse 0.884
train loss 0.725, val loss 0.994, val accuracy 0.621, and val rmse 0.877
train loss 0.715, val loss 0.999, val accuracy 0.607, and val rmse 0.881
train loss 0.707, val loss 1.008, val accuracy 0.608, and val rmse 0.879
train loss 0.698, val loss 1.018, val accuracy 0.615, and val rmse 0.890
train loss 0.686, val loss 1.017, val accuracy 0.603, and val rmse 0.893


### LSTM with pretrained Glove word embeddings

Download weights from : https://nlp.stanford.edu/projects/glove/

In [None]:
def load_glove_vectors(glove_file="./data/glove.6B/glove.6B.50d.txt"):
    """Load word vectors from a whitespace-separated text file.

    Each non-blank line holds a word followed by its vector components.

    Args:
        glove_file: path to the vector file.

    Returns:
        A dict mapping each word to a NumPy float array of its components.
    """
    word_vectors = {}
    # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
    # everywhere, and non-ASCII words would then fail to decode or be garbled.
    with open(glove_file, encoding="utf-8") as f:
        for line in f:
            split = line.split()
            if not split:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            word_vectors[split[0]] = np.array([float(x) for x in split[1:]])
    return word_vectors

In [None]:
def get_emb_matrix(pretrained, word_counts, emb_size = 50):
    """Create an embedding matrix from pretrained word vectors.

    Row 0 is the all-zero padding vector and row 1 is a random vector for
    unknown words. Every word in ``word_counts`` gets the next row, in
    iteration order: its pretrained vector when available, otherwise a random
    vector drawn uniformly from [-0.25, 0.25).

    Args:
        pretrained: mapping from word to its vector of length ``emb_size``.
        word_counts: iterable of vocabulary words (e.g. a Counter).
        emb_size: dimensionality of the embedding vectors.

    Returns:
        A tuple ``(W, vocab, vocab_to_idx)``: the float32 matrix of shape
        ``(len(word_counts) + 2, emb_size)``, an array of the words in row
        order, and the word-to-row mapping.
    """
    vocab_size = len(word_counts) + 2
    # Include the padding token so the mapping has the same two reserved
    # entries as the vocabulary built earlier in the notebook.
    vocab_to_idx = {"": 0, "UNK": 1}
    vocab = ["", "UNK"]
    # Row 0 stays zero: it is the padding vector.
    W = np.zeros((vocab_size, emb_size), dtype="float32")
    W[1] = np.random.uniform(-0.25, 0.25, emb_size) # vector for unknown words
    for i, word in enumerate(word_counts, start=2):
        # Look the word up in the `pretrained` argument, not in a global.
        if word in pretrained:
            W[i] = pretrained[word]
        else:
            W[i] = np.random.uniform(-0.25, 0.25, emb_size)
        vocab_to_idx[word] = i
        vocab.append(word)
    return W, np.array(vocab), vocab_to_idx

In [None]:
# Load the GloVe vectors from the default path and build the embedding matrix
# for the pruned vocabulary; this rebinds `vocab2index`.
word_vecs = load_glove_vectors()
pretrained_weights, vocab, vocab2index = get_emb_matrix(
    pretrained=word_vecs,
    word_counts=counts,
)

In [None]:
class LSTM_glove_vecs(torch.nn.Module) :
    """LSTM classifier on top of frozen pretrained word embeddings.

    The embedding table is initialised from ``glove_weights`` and excluded
    from gradient updates. Embedded tokens pass through dropout and a
    single-layer batch-first LSTM; the final hidden state is projected to
    ``num_classes`` logits.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        glove_weights: NumPy array of shape ``(vocab_size, embedding_dim)``
            copied into the embedding table.
        num_classes: number of output logits. Defaults to 20 to match the
            labels produced by this notebook's data loading and the
            ``LSTM_fixed_len`` model; the previous hard-coded 5 made
            cross-entropy fail on labels 5-19.
    """
    def __init__(self, vocab_size, embedding_dim, hidden_dim, glove_weights, num_classes=20) :
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embeddings.weight.data.copy_(torch.from_numpy(glove_weights))
        self.embeddings.weight.requires_grad = False ## freeze embeddings
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return class logits for the batch of token ids ``x``.

        ``l`` (the sequence lengths) is accepted so the model can be called as
        ``model(x, lengths)``, but it is not used here.
        """
        x = self.embeddings(x)
        x = self.dropout(x)
        lstm_out, (ht, ct) = self.lstm(x)
        # ht[-1] is the final hidden state of the last LSTM layer.
        return self.linear(ht[-1])

In [None]:
# Classifier with frozen pretrained embeddings: 50-d vectors, hidden state of size 50.
model = LSTM_glove_vecs(
    vocab_size=vocab_size,
    embedding_dim=50,
    hidden_dim=50,
    glove_weights=pretrained_weights,
)

In [None]:
# Train the current `model` for 30 epochs.
train_model(model=model, epochs=30, lr=0.1)

train loss 1.281, val loss 1.255, val accuracy 0.556, and val rmse 1.355
train loss 1.210, val loss 1.207, val accuracy 0.556, and val rmse 1.354
train loss 1.206, val loss 1.204, val accuracy 0.556, and val rmse 1.354
train loss 1.201, val loss 1.202, val accuracy 0.556, and val rmse 1.354
train loss 1.173, val loss 1.168, val accuracy 0.557, and val rmse 1.352
train loss 1.131, val loss 1.122, val accuracy 0.562, and val rmse 1.249


In [None]:
# Continue training the current `model` for 30 more epochs at a lower learning rate.
train_model(model=model, epochs=30, lr=0.05)

train loss 1.112, val loss 1.113, val accuracy 0.556, and val rmse 1.349
train loss 1.061, val loss 1.051, val accuracy 0.570, and val rmse 1.109
train loss 1.014, val loss 1.014, val accuracy 0.582, and val rmse 1.058
train loss 0.979, val loss 0.990, val accuracy 0.599, and val rmse 0.995
train loss 0.948, val loss 0.961, val accuracy 0.610, and val rmse 0.950
train loss 0.923, val loss 0.952, val accuracy 0.612, and val rmse 0.935


In [None]:
# Continue training the current `model` for another 30 epochs.
train_model(model=model, epochs=30, lr=0.05)

train loss 1.189, val loss 1.014, val accuracy 0.586, and val rmse 1.033
train loss 0.946, val loss 0.964, val accuracy 0.606, and val rmse 0.950
train loss 0.912, val loss 0.951, val accuracy 0.612, and val rmse 0.941
train loss 0.895, val loss 0.949, val accuracy 0.615, and val rmse 0.913
train loss 0.886, val loss 0.947, val accuracy 0.617, and val rmse 0.901
train loss 0.872, val loss 0.938, val accuracy 0.621, and val rmse 0.890


## Predicting ratings using regression instead of classification

In [None]:
def train_model_regr(model, epochs=10, lr=0.001):
    """Train ``model`` as a regressor with Adam and mean-squared-error loss.

    Batches are read from the module-level ``train_dl``. After every epoch the
    model is scored on ``val_dl`` through ``validation_metrics_regr``; the
    figures are printed only on epochs where ``epoch % 5 == 1``.

    Args:
        model: module called as ``model(x, lengths)`` that returns one value
            per sample in a trailing dimension of size 1.
        epochs: number of passes over ``train_dl``.
        lr: learning rate handed to Adam.
    """
    # Frozen parameters (requires_grad == False) are left out of the optimizer.
    trainable = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.Adam(trainable, lr=lr)
    for epoch in range(epochs):
        model.train()
        running_loss, n_seen = 0.0, 0
        for tokens, targets, lengths in train_dl:
            tokens, targets = tokens.long(), targets.float()
            outputs = model(tokens, lengths)
            optimizer.zero_grad()
            # Targets get a trailing dimension to line up with the model output.
            batch_loss = F.mse_loss(outputs, targets.unsqueeze(-1))
            batch_loss.backward()
            optimizer.step()
            n_batch = targets.shape[0]
            running_loss += batch_loss.item()*n_batch
            n_seen += n_batch
        val_loss = validation_metrics_regr(model, val_dl)
        if epoch % 5 == 1:
            print("train mse %.3f val rmse %.3f" % (running_loss/n_seen, val_loss))

def validation_metrics_regr(model, valid_dl):
    """Return the root mean squared error of a regressor on ``valid_dl``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: module called as ``model(x, lengths)`` that returns one value
            per sample in a trailing dimension of size 1.
        valid_dl: iterable of ``(x, y, lengths)`` batches.

    Returns:
        The RMSE over all samples, as a Python float.

    Raises:
        ZeroDivisionError: if ``valid_dl`` yields no samples.
    """
    model.eval()
    total = 0
    sum_sq_err = 0.0
    # No autograd graph is needed for evaluation.
    with torch.no_grad():
        for x, y, l in valid_dl:
            x = x.long()
            y = y.float()
            y_hat = model(x, l)
            # Sum squared errors and take one square root at the end:
            # averaging per-batch RMSE values depends on the batching and is
            # not the RMSE of the whole set.
            sum_sq_err += F.mse_loss(y_hat, y.unsqueeze(-1), reduction="sum").item()
            total += y.shape[0]
    return float(np.sqrt(sum_sq_err/total))

In [None]:
class LSTM_regr(nn.Module):
    """LSTM regressor that outputs a single value per sequence.

    Token ids are embedded (id 0 is the padding index), passed through dropout
    and a single-layer batch-first LSTM; the final hidden state is projected
    to one output.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embeddings = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=0
        )
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, l):
        """Return one prediction per sample; ``l`` (lengths) is not used."""
        embedded = self.dropout(self.embeddings(x))
        _, (final_hidden, _) = self.lstm(embedded)
        # Final hidden state of the last LSTM layer.
        last_layer = final_hidden[-1]
        return self.linear(last_layer)

In [None]:
# LSTM regressor: 50-d embeddings, hidden state of size 50.
model = LSTM_regr(vocab_size=vocab_size, embedding_dim=50, hidden_dim=50)

In [None]:
# Train the current `model` as a regressor for 30 epochs.
train_model_regr(model=model, epochs=30, lr=0.05)

train mse 1.663 val rmse 1.313
train mse 1.215 val rmse 1.125
train mse 1.151 val rmse 1.109
train mse 1.114 val rmse 1.115
train mse 1.082 val rmse 1.121
train mse 1.043 val rmse 1.116


In [None]:
# Continue training the current `model` for another 30 epochs.
train_model_regr(model=model, epochs=30, lr=0.05)

train mse 1.214 val rmse 1.193
train mse 0.884 val rmse 1.032
train mse 0.631 val rmse 0.903
train mse 0.483 val rmse 0.837
train mse 0.416 val rmse 0.806
train mse 0.363 val rmse 0.799


In [None]:
# `import jovian` is commented out in the imports cell, so calling
# jovian.commit directly raises NameError. Import it here and skip the commit
# when the package is not installed.
try:
    import jovian
except ImportError:
    jovian = None

if jovian is not None:
    jovian.commit("lstm multiclass text classification, regression")
else:
    print("jovian is not installed; skipping notebook commit")

[jovian] Saving notebook..


<IPython.core.display.Javascript object>