<a href="https://colab.research.google.com/github/rhn19/NN_TextClassification/blob/master/IMDB_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Download the aclImdb_v1 sentiment dataset archive (saved by wget into the current working directory).
!wget "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

In [0]:
# Extract the gzip-compressed dataset archive in place.
!tar xzf aclImdb_v1.tar.gz

In [0]:
# Download the GloVe 6B word-vector archive.
!wget "http://nlp.stanford.edu/data/wordvecs/glove.6B.zip"

In [0]:
# Unpack the GloVe archive.
# NOTE(review): assumes the download above landed in /content (the working directory) - confirm when not on Colab.
!unzip /content/glove.6B.zip

In [0]:
#build a word index mapping on glove
#do not split this cell for future usability
!pip install bcolz
import bcolz
import numpy as np
import pickle

words = []
idx = 0
word2idx = {}
vectors = bcolz.carray(np.zeros(1), rootdir=f'/content/glove.6B.300.dat', mode='w')
#print(vectors)

with open(f'/content/glove.6B.300d.txt', 'rb') as f:
    for l in f:
        line = l.decode().split()
        word = line[0]
        words.append(word)
        word2idx[word] = idx
        idx += 1
        vect = np.array(line[1:]).astype(np.float)
        vectors.append(vect)
    
#print(len(vectors))
vectors = bcolz.carray(vectors[1:].reshape((400001, 300)), rootdir=f'/content/glove.6B.300.dat', mode='w')
vectors.flush()
#print(vectors)

pickle.dump(words, open(f'/content/glove.6B.300_words.pkl', 'wb'))
pickle.dump(word2idx, open(f'/content/glove.6B.300_idx.pkl', 'wb'))

vectors = bcolz.open(f'/content/glove.6B.300.dat')[:]
words = pickle.load(open(f'/content/glove.6B.300_words.pkl', 'rb'))
word2idx = pickle.load(open(f'/content/glove.6B.300_idx.pkl', 'rb'))

glove = {w: vectors[word2idx[w]] for w in words}

In [0]:
import os
import re
import time
from collections import Counter
from itertools import chain

import seaborn as sns

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on a runtime without CUDA.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
### x -> reviews, y -> sentiments

In [0]:
def data_from_file(train_or_test_dir):
    """
        Load data from dataset folder
        Reads every file under the "pos" and then the "neg" sub-directory,
        lower-casing the text. Files are visited in sorted name order so the
        result is reproducible (os.listdir order is arbitrary).
        @param train_or_test_dir : (str) path to the directory
        @returns reviews : (List[str]) list of reviews
        @returns labels : (List[int]) list of sentiments (0 for neg, 1 for pos)
    """
    reviews = []
    labels = []
    for sentiment, label in (("pos", 1), ("neg", 0)):
        sentiment_dir = os.path.join(train_or_test_dir, sentiment)
        for file_name in sorted(os.listdir(sentiment_dir)):
            # `with` closes the handle even if reading fails; the explicit
            # encoding avoids depending on the platform default.
            with open(os.path.join(sentiment_dir, file_name), encoding="utf-8") as review_file:
                reviews.append(review_file.read().lower())
            labels.append(label)
    return reviews, labels

In [0]:
# Locations of the train and test splits of the extracted dataset.
train_dir = "/content/aclImdb/train/"
test_dir = "/content/aclImdb/test/"
# x_* hold the lower-cased review texts, y_* the 0/1 sentiment labels.
x_train ,y_train = data_from_file(train_dir)
x_test ,y_test = data_from_file(test_dir)
# Sanity check: reviews and labels must have the same count within each split.
print(len(x_train), len(y_train))
print(len(x_test), len(y_test))
#print(x_train[:2])

#x_train, x_test = x_train[:10000], x_test[:10000]
#y_train, y_test = y_train[:10000], y_test[:10000]

In [0]:
def sent_to_wordlist(corpus):
    """
        Convert sentences to list of words
        Remove the <br> tags which were very frequent in this dataset
        Each '<br />' tag is replaced by a space (not by an empty string) so
        that the words on either side of a line break stay separate tokens
        instead of being glued together (e.g. "good.<br />the" -> "good.", "the").
        @param corpus : (List[str]) List of reviews
        @returns word_list : (List[List[str]]) List of words for every sentence
    """
    # str.split() with no argument already ignores leading/trailing whitespace.
    return [sent.replace('<br />', ' ').split() for sent in corpus]

In [0]:
def build_vocab(word_list, vocab_size=None):
    """
        Build a word -> index vocabulary from tokenised sentences.
        Words are ranked by frequency: the most frequent word gets index 1,
        the next one index 2, and so on. Index 0 is reserved for '<pad>',
        which is added to the mapping last.
        (Currently called on train & test together; change this later to just
        the train set with a <unk> token for the test set.)
        @param word_list : (List[List[str]]) List of words for every sentence
        @param vocab_size : (int) Max vocab size [Default : Max Possible]
        @returns word2idx : (Dict) Word to Index mapping Dictionary
    """
    frequencies = Counter(chain.from_iterable(word_list))
    limit = len(frequencies) if vocab_size is None else vocab_size

    word2idx = {}
    for rank, (token, _count) in enumerate(frequencies.most_common(limit), start=1):
        word2idx[token] = rank
    word2idx['<pad>'] = 0
    return word2idx

In [0]:
def words_to_int(word_list, word2idx):
    """
        Map every token of every sentence to its vocabulary index.
        A token that is missing from word2idx raises KeyError.
        @param word_list : (List[List[str]]) List of words for every sentence
        @param word2idx : (Dict) Word to Index mapping Dictionary
        @returns int_list : (List[List[int]]) List of integers for every sentence
    """
    return [[word2idx[token] for token in sentence] for sentence in word_list]

In [0]:
# Tokenise the raw reviews into lists of words.
x_train_word = sent_to_wordlist(x_train)
x_test_word = sent_to_wordlist(x_test)
#all_list = x_train_word + x_test_word
#print(len(all_list))
# The vocabulary is built over train AND test together, so every test word has an index.
# NOTE: this reassigns `word2idx`, the name previously used for the GloVe word -> row mapping.
word2idx = build_vocab(x_train_word + x_test_word)
print(len(word2idx))

# Replace every word by its vocabulary index.
x_train_int = words_to_int(x_train_word , word2idx)
x_test_int = words_to_int(x_test_word , word2idx)
print(len(x_train_int), len(x_test_int))

# Number of tokens in each review, used below to inspect and filter by length.
x_train_len = [len(s) for s in x_train_int]
x_test_len = [len(s) for s in x_test_int]
print(len(x_train_len), len(x_test_len))

In [0]:
#Build a weight matrix on the vocabulary using glove mapping
# Row i of the matrix must hold the vector of the word whose vocabulary index
# is i, because the embedding layer is looked up with those indices. The rows
# are therefore addressed through the index stored in word2idx, NOT through
# the dict's iteration position (build_vocab inserts '<pad>' last although its
# index is 0, so position and index differ by one for every word).
matrix_len = len(word2idx)
weights_matrix = np.zeros((matrix_len, 300))
words_found = 0

for word, i in word2idx.items():
    if word == '<pad>':
        # Padding carries no meaning: keep its row as all zeros.
        continue
    try:
        weights_matrix[i] = glove[word]
        words_found += 1
    except KeyError:
        # Word unknown to GloVe: fall back to a random vector.
        weights_matrix[i] = np.random.normal(scale=0.6, size=(300, ))

#print(len(weights_matrix))
#print(weights_matrix)

# float32 matches the dtype of the embedding parameters this matrix is loaded into.
weights_matrix = torch.tensor(weights_matrix, dtype=torch.float, device=DEVICE)
print(weights_matrix.size())

In [0]:
#analyze & remove outliers
# Box plot of the per-review token counts of train and test combined, to spot unusually long reviews.
sns.boxplot(x=x_train_len + x_test_len)

In [0]:
# Maximum review length (in tokens); reviews with LIMIT or more tokens are dropped.
LIMIT = 500

# Train split: keep only the reviews (and their labels) shorter than LIMIT.
x_train_int = [x_train_int[pos] for pos, n_tokens in enumerate(x_train_len) if n_tokens < LIMIT]
y_train = [y_train[pos] for pos, n_tokens in enumerate(x_train_len) if n_tokens < LIMIT]

# Test split: same filter.
x_test_int = [x_test_int[pos] for pos, n_tokens in enumerate(x_test_len) if n_tokens < LIMIT]
y_test = [y_test[pos] for pos, n_tokens in enumerate(x_test_len) if n_tokens < LIMIT]

print(len(x_train_int), len(x_test_int))
print(len(y_train), len(y_test))

# Recompute the lengths so they line up with the filtered reviews.
x_train_len = [len(seq) for seq in x_train_int]
x_test_len = [len(seq) for seq in x_test_int]
print(len(x_train_len), len(x_test_len))

In [0]:
def pad_sequences(int_list, LIMIT, pad_token=0):
    """
        Pad sequences to a fixed length
        Shorter sequences are right-padded with pad_token. Longer sequences
        are truncated to their first LIMIT items, so every returned sequence
        has exactly LIMIT items and the result can be turned into a
        rectangular tensor. The input lists are not modified.
        @param int_list : (List[List[int]]) List of integers for every sentence
        @param LIMIT : (int) Padding length
        @param pad_token : (int) Token for padding sequences [Default : 0]
        @returns padded_list : (List[List[int]]) List of padded sequences
    """
    padded_list = []
    for sent in int_list:
        kept = list(sent[:LIMIT])
        padded_list.append(kept + [pad_token] * (LIMIT - len(kept)))
    return padded_list

In [0]:
# Right-pad every remaining review with the pad index (0) up to LIMIT tokens.
x_train_padded = pad_sequences(x_train_int, LIMIT)
x_test_padded = pad_sequences(x_test_int, LIMIT)
print(len(x_train_padded), len(x_test_padded))
#print(x_train_padded[1])
#t_len = [len(s) for s in x_train_padded]
#print(t_len[:10])
#print(x_train_len[1])

In [0]:
#Sanity Check for padding
# The first and last sequence of each split should both report the padded length.
print(len(x_train_padded[0]), len(x_train_padded[-1]))
print(len(x_test_padded[0]), len(x_test_padded[-1]))

In [0]:
#validation split
# The test reviews are stored with all positive reviews first and all negative
# ones after them, so cutting the list in the middle would give a validation
# set that is almost entirely positive and a test set that is entirely
# negative. The indices are therefore shuffled (with a fixed seed, for
# reproducibility) before splitting.
VAL_SPLIT_RATIO = 0.5
SPLIT_SEED = 0

# This cell overwrites y_test; re-running it on its own would split the
# already-halved labels again. Fail loudly instead of silently misaligning
# reviews and labels.
assert len(x_test_padded) == len(y_test), \
    "x_test_padded / y_test out of sync - re-run the notebook from the filtering cell"

perm = torch.randperm(len(x_test_padded), generator=torch.Generator().manual_seed(SPLIT_SEED)).tolist()
split_len = int(len(perm) * VAL_SPLIT_RATIO)
val_idx, test_idx = perm[:split_len], perm[split_len:]

x_val = [x_test_padded[i] for i in val_idx]
x_test = [x_test_padded[i] for i in test_idx]
print(len(x_val), len(x_test))

# Labels are selected with the same index lists so they stay aligned with the reviews.
y_val, y_test = [y_test[i] for i in val_idx], [y_test[i] for i in test_idx]
print(len(y_val), len(y_test))

In [0]:
#tensors & batches
BATCH_SIZE = 50

# Each split becomes a dataset of (padded review, label) pairs whose tensors
# are created directly on DEVICE.
train_data = TensorDataset(
    torch.tensor(x_train_padded, device=DEVICE),
    torch.tensor(y_train, device=DEVICE),
)
val_data = TensorDataset(
    torch.tensor(x_val, device=DEVICE),
    torch.tensor(y_val, device=DEVICE),
)
test_data = TensorDataset(
    torch.tensor(x_test, device=DEVICE),
    torch.tensor(y_test, device=DEVICE),
)

# All three loaders serve shuffled mini-batches of BATCH_SIZE samples.
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True)

In [0]:
#Sanity check for batches
# Pull one batch from the validation loader and show its shapes and contents.
dataiter = iter(val_loader)
# Use the built-in next(): DataLoader iterators implement __next__, and the
# Python-2 style `.next()` method is not available in current PyTorch releases.
sample_x, sample_y = next(dataiter)
print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

In [0]:
class RNN(nn.Module):
    """
        LSTM classifier: embedding -> (bi)LSTM -> dropout -> linear -> sigmoid.
        The prediction is made from the final hidden state of the last LSTM layer.
    """
    def __init__(self, vocab_size, hidden_size, num_outputs, embedding_dim, seq_len, bidirectional=False, num_layers=1, dropout=0.3, weights_matrix=None, pad_idx=0):
        """
            @param vocab_size : (int) Number of rows of the embedding table
            @param hidden_size : (int) LSTM hidden size (per direction)
            @param num_outputs : (int) Size of the output layer
            @param embedding_dim : (int) Size of a word vector
            @param seq_len : (int) Padded sequence length (currently unused, kept for compatibility)
            @param bidirectional : (bool) Run the LSTM in both directions [Default : False]
            @param num_layers : (int) Number of stacked LSTM layers [Default : 1]
            @param dropout : (float) Dropout probability applied before the output layer [Default : 0.3]
            @param weights_matrix : (torch.Tensor[vocab_size, embedding_dim]) Pre-trained embeddings;
                                    when given they are loaded and frozen [Default : None]
            @param pad_idx : (int) Vocabulary index used for padding [Default : 0]
        """
        super(RNN, self).__init__()
        self.dirn = 2 if bidirectional else 1
        self.pad_idx = pad_idx

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if weights_matrix is not None:
            self.embedding.load_state_dict({'weight': weights_matrix})
            # Pre-trained vectors are kept fixed during training.
            self.embedding.weight.requires_grad = False

        self.LSTM = nn.LSTM(embedding_dim, hidden_size, num_layers=num_layers, bidirectional=bidirectional)
        self.dropout = nn.Dropout(p=dropout)
        self.fc = nn.Linear(hidden_size * self.dirn, num_outputs)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input:torch.Tensor)->torch.Tensor:
        """
            @param input : (torch.Tensor[batch_size, seq_len]) Right-padded word indices
            @returns output : (torch.Tensor[batch_size]) Probabilities in (0, 1)
                              (shape [batch_size, num_outputs] when num_outputs > 1)
        """
        # True (unpadded) length of every sequence = number of non-pad tokens.
        # Measuring the padded rows instead would give seq_len for every row,
        # which makes packing a no-op and lets the LSTM run over the padding.
        # clamp(min=1) keeps an all-padding row valid for packing, and the
        # lengths must live on the CPU for pack_padded_sequence.
        sent_len = (input != self.pad_idx).sum(dim=1).clamp(min=1).cpu()

        embed_inp = self.embedding(input)       #batch_size, seq_len, embedding_dim
        embed_inp = embed_inp.permute(1,0,2)    #seq_len, batch_size, embedding_dim
        packed_inp = pack_padded_sequence(embed_inp, sent_len, enforce_sorted=False)
        # hidden : num_layers * num_dir, batch_size, hidden_size
        _, (hidden, _) = self.LSTM(packed_inp)

        if self.dirn == 2:
            # Last layer: forward direction is hidden[-2], backward is hidden[-1].
            final_hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
        else:
            # Unidirectional: only the last layer's state exists / is wanted.
            final_hidden = hidden[-1,:,:]

        drop = self.dropout(final_hidden)
        fc = self.fc(drop)              #batch_size, num_outputs

        output = self.sigmoid(fc)
        # Only drop the trailing output dimension; a plain squeeze() would also
        # remove the batch dimension for a batch of one.
        return output.squeeze(-1)

In [0]:
#CONSTANTS & VARIABLES
# Embedding table size: one row per vocabulary entry (including '<pad>').
VOCAB_SIZE = len(word2idx)
print(VOCAB_SIZE)
# LSTM hidden size (per direction).
HIDDEN_SIZE = 256
# A single output unit: the model emits one sigmoid probability per review.
NUM_OUTPUTS = 1
# Width of a word vector; matches the 300 columns of the GloVe weight matrix.
EMBEDDING_DIM = 300
# Number of stacked LSTM layers.
NUM_LAYERS = 2
# Learning rate handed to the optimizer.
LEARNING_RATE = 1e-3

In [0]:
# Bidirectional LSTM initialised with the pre-trained embedding matrix.
# The sequence length is passed as LIMIT (the padding length used above)
# rather than a repeated literal, so the two cannot drift apart.
model = RNN(VOCAB_SIZE, HIDDEN_SIZE, NUM_OUTPUTS, EMBEDDING_DIM, LIMIT, bidirectional=True, num_layers=NUM_LAYERS, weights_matrix=weights_matrix)
print(model)
model =  model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Binary cross-entropy on probabilities: the model already applies a sigmoid.
criterion = nn.BCELoss()
criterion = criterion.to(DEVICE)

In [0]:
def accuracy(y_pred, y):
    """
        Fraction of predictions that match the labels after rounding.
        @param y_pred : (torch.Tensor) Predicted probabilities
        @param y : (torch.Tensor) Ground-truth labels, same shape as y_pred
        @returns (torch.Tensor) Scalar tensor: number of hits / len(y_pred)
    """
    hits = torch.eq(torch.round(y_pred), y).float()   # float so the division is not integer
    n_samples = len(hits)
    return hits.sum() / n_samples

In [0]:
def train(model, optimizer, criterion, train_loader, DEVICE):
    """
        Run one training epoch.
        @param model : (nn.Module) Model to train; called as model(x_batch)
        @param optimizer : (torch.optim.Optimizer) Optimizer over the model parameters
        @param criterion : Loss function called as criterion(preds, targets)
        @param train_loader : (DataLoader) Yields (x_batch, y_batch) pairs
        @param DEVICE : (torch.device) Device every batch is moved to before the forward pass
        @returns (float, float) Loss and accuracy, each averaged over the batches
    """
    epoch_loss = 0
    epoch_acc = 0
    size = len(train_loader)    # number of batches
    model.train()

    for x_batch, y_batch in train_loader:
        # No-op when the data already lives on DEVICE; makes the function
        # usable with loaders that serve CPU tensors.
        x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
        optimizer.zero_grad()
        preds = model(x_batch)
        # Labels are cast to the dtype of the predictions for the loss.
        y_batch = y_batch.type_as(preds)
        loss = criterion(preds, y_batch)
        acc = accuracy(preds, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss/size, epoch_acc/size

In [0]:
def evaluate(model, criterion, val_loader, DEVICE):
    """
        Evaluate the model on a data loader without updating any weights.
        @param model : (nn.Module) Model to evaluate; called as model(x_batch)
        @param criterion : Loss function called as criterion(preds, targets)
        @param val_loader : (DataLoader) Yields (x_batch, y_batch) pairs
        @param DEVICE : (torch.device) Device every batch is moved to before the forward pass
        @returns (float, float) Loss and accuracy, each averaged over the batches of val_loader
    """
    epoch_loss = 0
    epoch_acc = 0
    # Average over the batches of the loader actually being evaluated (not
    # over the size of the global training loader, which mis-scales both
    # metrics whenever the two loaders differ in length).
    size = len(val_loader)
    model.eval()

    with torch.no_grad():
        for x_batch, y_batch in val_loader:
            # No-op when the data already lives on DEVICE.
            x_batch, y_batch = x_batch.to(DEVICE), y_batch.to(DEVICE)
            preds = model(x_batch)
            # Labels are cast to the dtype of the predictions for the loss.
            y_batch = y_batch.type_as(preds)
            loss = criterion(preds, y_batch)
            acc = accuracy(preds, y_batch)
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss/size, epoch_acc/size

In [0]:
# Train for a fixed number of epochs and keep the weights of the epoch with
# the lowest validation loss.
NUM_EPOCHS = 10
# Best validation loss seen so far; starts at +inf so the first epoch always saves.
valid_loss = float('inf')
for epoch in range(NUM_EPOCHS):
    # The reported time covers both the training pass and the validation pass.
    start = time.time()
    train_loss, train_acc = train(model, optimizer, criterion, train_loader, DEVICE)
    eval_loss, eval_acc = evaluate(model, criterion, val_loader, DEVICE)
    end = time.time()

    # Checkpoint only when the validation loss improves.
    if eval_loss < valid_loss:
        valid_loss = eval_loss
        torch.save(model.state_dict(), 'imdb_best')

    print("EPOCH: ", epoch+1)
    print("TIME: ", end-start)
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {eval_loss:.3f} |  Val. Acc: {eval_acc*100:.2f}%')

In [0]:
# Restore the checkpoint saved by the training loop (lowest validation loss)
# and report loss / accuracy on the held-out test loader.
model.load_state_dict(torch.load('imdb_best'))
test_loss, test_acc = evaluate(model, criterion, test_loader, DEVICE)
print(f'\t Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')