In [None]:
import itertools
import os
import pickle as pkl
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [None]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [None]:
import contractions
import spacy
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import wordnet

In [None]:
# load data from disk
# Reads the IMDB reviews CSV; later cells use its 'sentiment' and 'review' columns.

df = pd.read_csv('~/Data/IMDB_Dataset.csv')
# Last expression of the cell: preview the first rows.
df.head()

In [None]:
# Binary target column: 0 for 'negative' reviews, 1 for any other sentiment value.
df['y'] = df['sentiment'].apply(lambda label: int(label != 'negative'))

In [None]:
# Build the stop-word set consumed by the tokenizer.
# `stops` is spaCy's own default set, so the edits below mutate it in place.
nlp = spacy.load('en')
stops = nlp.Defaults.stop_words

# Words that must survive stop-word filtering (negations and quantifiers).
retain_words = ['always', 'nobody', 'cannot', 'none', 'never', 'no', 'not']
stops.difference_update(retain_words)

# Extra tokens to filter out: 'br' (presumably left over from <br /> tags in the
# reviews), every single lowercase letter, and punctuation characters.
remove_chars = (
    ['br']
    + list('abcdefghijklmnopqrstuvwxyz')
    + list('~`!@#$%^&*()-_+=[]{}|:;<>,.?')
    + [",", '"']
)
stops.update(remove_chars)

In [None]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective, noun, verb or adverb. Any other tag
    falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns.
    return wordnet.NOUN

# regular expression based tokenizer

In [None]:
def regex_tokenizer(text, stops):
    """Turn raw text into a list of lemmatized, lowercase tokens.

    Steps: expand contractions, split on runs of word characters, lowercase,
    drop every token contained in ``stops``, then lemmatize each survivor with
    the module-level ``wordnet_lemmatizer`` using the POS from
    ``get_wordnet_pos``.

    Parameters
    ----------
    text : str
        Raw input text.
    stops : container of str
        Lowercase tokens to discard.

    Returns
    -------
    list of str
        Lemmatized tokens in their original order.
    """
    expanded = contractions.fix(text)
    raw_tokens = RegexpTokenizer(r'\w+').tokenize(expanded)

    lemmas = []
    for raw in raw_tokens:
        token = raw.lower()
        if token in stops:
            continue
        lemmas.append(wordnet_lemmatizer.lemmatize(token, get_wordnet_pos(token)))

    return lemmas

In [None]:
# Shared lemmatizer instance; regex_tokenizer looks this name up at call time.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
def text_preprocessing(df, x_col, y_col=None, max_seq_len=128, max_features=20000,
                       tokenizer=None):
    """Tokenize a text column, build a vocabulary and produce padded index sequences.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified (the function works on a copy).
    x_col : str
        Name of the column holding the raw text.
    y_col : str, optional
        Name of the target column; carried over to the result when given.
    max_seq_len : int, default 128
        Every sequence is truncated to, then padded up to, this many tokens.
    max_features : int or None, default 20000
        Keep only the ``max_features`` most frequent tokens in the vocabulary;
        all other tokens map to ``'<unk>'``. ``None`` keeps every token.
    tokenizer : callable, optional
        Maps one text to a list of string tokens. Defaults to
        ``regex_tokenizer`` with the module-level ``stops`` set.

    Returns
    -------
    data : pandas.DataFrame
        Copy of the selected columns plus ``parse_text`` (tokens),
        ``parse_text_short`` (truncated tokens), ``tokenized`` (indices) and
        ``tok_pad`` (indices padded to ``max_seq_len``).
    token2idx : dict
        Token -> index mapping, including ``'<pad>'`` and ``'<unk>'``.
    """
    from collections import Counter

    if tokenizer is None:
        # Resolved at call time so the notebook-level tokenizer and stop words are used.
        tokenizer = lambda text: regex_tokenizer(text, stops)

    columns = [x_col] if y_col is None else [x_col, y_col]
    # Copy so that adding columns never writes into (a view of) the caller's frame.
    data = df[columns].copy()

    print("Data Reading Completed ...")
    print("Train Samples : ", df.shape[0])

    # parse and tokenize text data (one text per row of the text column)
    data['parse_text'] = data[x_col].apply(tokenizer)
    print("Tokenization Completed ...")

    # build dictionary: rank by frequency, ties broken alphabetically so the
    # mapping is identical from run to run (set ordering is not).
    counts = Counter(itertools.chain.from_iterable(data['parse_text']))
    ranked = sorted(counts, key=lambda tok: (-counts[tok], tok))
    if max_features is not None:
        ranked = ranked[:max_features]
    token2idx = {tok: idx for idx, tok in enumerate(ranked)}

    # add support for padding and unknown tokens
    token2idx['<pad>'] = len(token2idx)
    token2idx['<unk>'] = len(token2idx)
    print("Dictionary Completed ...")

    # cut long sentences short
    data['parse_text_short'] = data['parse_text'].apply(lambda toks: toks[:max_seq_len])
    print("Sentence Normalization Completed ...")

    # convert tokens to indices; anything outside the vocabulary becomes <unk>
    unk_idx = token2idx['<unk>']
    data['tokenized'] = data['parse_text_short'].apply(
        lambda toks: [token2idx.get(tok, unk_idx) for tok in toks]
        )
    print("Index Conversion Completed ...")

    # add padding to make all samples of equal length
    pad_idx = token2idx['<pad>']
    data['tok_pad'] = data['tokenized'].apply(
        lambda ids: ids + [pad_idx] * (max_seq_len - len(ids))
        )
    print("Padding Completed ...")

    return data, token2idx

In [None]:
# design class to read and preprocess data
class text_dataset(Dataset):
    """Dataset over pre-tokenized index sequences stored in a DataFrame column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the sequences (and optionally the targets).
    x_col : str
        Column containing one list of token indices per row.
    y_col : str, optional
        Column containing the target of each row. When omitted the dataset
        yields sequences only.
    max_seq_len : int, optional
        When given, each sequence is truncated to this length. Shorter
        sequences are left untouched (no padding is added here).
    """

    def __init__(self, df, x_col, y_col=None, max_seq_len=None):
        sequences = df[x_col].tolist()
        if max_seq_len is not None:
            sequences = [seq[:max_seq_len] for seq in sequences]
        self.sequence = sequences

        # None marks an unlabeled dataset; __getitem__ branches on it.
        self.target = df[y_col].tolist() if y_col is not None else None

    def __getitem__(self, i):
        """Return ``(sequence, target)`` for labeled data, else just ``sequence``."""
        features = np.array(self.sequence[i])
        if self.target is not None:
            return features, self.target[i]
        return features

    def __len__(self):
        """Number of samples."""
        return len(self.sequence)

In [None]:
def collate(batch):
    """Stack a list of ``(sequence, target)`` samples into two tensors.

    Returns a ``LongTensor`` built from the first element of every sample and
    a ``FloatTensor`` built from the second element of every sample.
    """
    sequences, labels = [], []
    for sample in batch:
        sequences.append(sample[0])
        labels.append(sample[1])
    return torch.LongTensor(sequences), torch.FloatTensor(labels)

In [None]:
# Tokenize the 'review' column and build the vocabulary.
# Note: this rebinds `df` to the frame returned by text_preprocessing.
df, token2idx = text_preprocessing(df, x_col='review', y_col='y', max_seq_len=128)

In [None]:
# Random row-wise split: each row goes to train with probability `train_size`.
train_size = 0.8
# A dedicated seeded generator makes the split reproducible across kernel
# restarts without touching the global NumPy random state.
split_rng = np.random.RandomState(99)
msk = split_rng.rand(len(df)) < train_size
train = df[msk]
test = df[~msk]

In [None]:
# Wrap the train and test frames in datasets and batch them with DataLoaders.
t1 = time.time()
batch_size = 32
max_seq_len = 128

# Both splits use exactly the same dataset and loader settings.
dataset_kwargs = dict(x_col='tok_pad', y_col='y', max_seq_len=max_seq_len)
loader_kwargs = dict(batch_size=batch_size, shuffle=True,
                     num_workers=4, collate_fn=collate)

train_data = text_dataset(train, **dataset_kwargs)
train_data_loader = DataLoader(train_data, **loader_kwargs)

test_data = text_dataset(test, **dataset_kwargs)
test_data_loader = DataLoader(test_data, **loader_kwargs)

t2 = time.time()
print("Time Taken in Text Processing : ", t2-t1)

In [None]:
# load word embeddings
# The vocabulary (`token2idx`) was already returned by text_preprocessing;
# text_dataset has no vocab_dictionary() method, so the mapping is reused as is.
print("Length of Dictionary : ", len(token2idx))

In [None]:
# Embedding matrix dimensions: one row per vocabulary entry.
vocab_size = len(token2idx)
# Must equal the vector length of the embedding file loaded below (glove.6B.300d).
embed_dim = 300

In [None]:
# Seed NumPy's global generator so the random initial matrix is reproducible.
seed = 99
np.random.seed(seed)
# Start from uniform [0, 1) values; rows of words found in the embedding file
# are overwritten in a later cell, all other rows keep these random values.
embed_mat = np.random.rand(vocab_size, embed_dim)

In [None]:
# Counter of vocabulary words that have a pre-trained vector.
words_found = 0
# Live view of the vocabulary tokens, used for membership tests.
vocab = token2idx.keys()

In [None]:
# Overwrite the rows of `embed_mat` with pre-trained GloVe vectors for every
# vocabulary word that appears in the GloVe file.
t3 = time.time()

# The built-in open() does not expand '~', so resolve the home directory first.
glove_path = os.path.expanduser('~/Data/glove.6B/glove.6B.300d.txt')

# Reset here so that re-running this cell does not double count.
words_found = 0
with open(glove_path, 'r', encoding='utf-8') as embed_file:
    for line in embed_file:
        # Each line is "<word> <v1> <v2> ... <vN>" separated by single spaces.
        parts = line.rstrip().split(' ')
        word = parts[0]

        # check if word is in vocab; parse the vector only when it is needed
        if word in token2idx:
            embed_mat[token2idx[word]] = np.asarray(parts[1:], dtype=np.float64)
            words_found += 1

print("Words found : ", words_found)
t4 = time.time()
print("Time Taken in Embedding Generation : ", t4-t3)

In [None]:
# Persist the embedding matrix to disk and/or read it back.
path = '~/Data/'
# The built-in open() does not expand '~', so resolve the home directory here.
filename = os.path.expanduser(path + 'IMDB_Embed')

save = True
load = True
if save:
    # Open for writing only when saving, so an existing cache is never
    # truncated by a load-only run.
    with open(filename, 'wb') as fileObject:
        pkl.dump(embed_mat, fileObject)

if load:
    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook wrote itself.
    with open(filename, 'rb') as fileObject2:
        embed_mat = pkl.load(fileObject2)

In [None]:
# design GRU model

class GRU_Model(nn.Module):
    """GRU text classifier on top of a pre-loaded embedding layer.

    Pipeline: embedding -> (bi)GRU -> last time step -> fc1/BatchNorm/ReLU/dropout
    -> fc2/BatchNorm/ReLU/dropout -> fc3. The output is a single raw score per
    sample (no sigmoid is applied here).

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    embed_dim : int
        Embedding vector size.
    embed_mat : array-like of shape (vocab_size, embed_dim)
        Initial embedding weights.
    non_trainable : bool, default True
        Freeze the embedding weights when True.
    gru_layers : int, default 2
        Number of stacked GRU layers.
    bidirectional : bool, default True
        Use a bidirectional GRU.
    """

    def __init__(self, vocab_size, embed_dim, embed_mat, non_trainable=True,
                gru_layers=2, bidirectional=True):
        super(GRU_Model, self).__init__()

        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.embed_mat = embed_mat

        self.gru_layers = gru_layers
        self.bidirectional = bidirectional
        self.gru_hidden = 300
        self.fc1_size = 200
        self.fc2_size = 32
        self.output_size = 1

        # Define the word embedding layer
        self.encoder = nn.Embedding(self.vocab_size, self.embed_dim)

        # Load embedding weights into the layer
        embed_weights = torch.tensor(self.embed_mat, dtype=torch.float)
        self.encoder.load_state_dict({'weight': embed_weights})

        if non_trainable:
            self.encoder.weight.requires_grad = False

        # create a bidirectional GRU layer
        self.gru = nn.GRU(self.embed_dim, self.gru_hidden, self.gru_layers, batch_first=True, dropout=0.5,
                         bidirectional=self.bidirectional)

        self.batch_norm1 = nn.BatchNorm1d(self.fc1_size)
        self.batch_norm2 = nn.BatchNorm1d(self.fc2_size)

        self.num_directions = 2 if self.bidirectional else 1

        self.fc1 = nn.Linear(self.gru_hidden * self.num_directions, self.fc1_size)
        self.dropout1 = nn.Dropout(0.10)
        self.fc2 = nn.Linear(self.fc1_size, self.fc2_size)
        self.dropout2 = nn.Dropout(0.05)
        self.fc3 = nn.Linear(self.fc2_size, self.output_size)

    def forward(self, x):
        """Score a batch of index sequences.

        ``x`` is a batch-first integer tensor of token indices (the GRU is
        built with ``batch_first=True``). Returns one raw score per sample.
        """
        out, _ = self.gru(self.encoder(x))
        # Keep only the GRU output at the last time step.
        # NOTE(review): with a bidirectional GRU the backward half at the last
        # step has seen a single token; consider using the final hidden states.
        out = out[:, -1, :]
        # torch.relu is used directly: the notebook's `F` alias points at
        # torch.functional, which does not provide relu.
        out = torch.relu(self.batch_norm1(self.fc1(out)))
        out = self.dropout1(out)
        out = torch.relu(self.batch_norm2(self.fc2(out)))
        out = self.dropout2(out)
        out = self.fc3(out)
        return out
        
        
        

In [None]:
# create model
# Two-layer bidirectional GRU; embeddings start from embed_mat and stay frozen.
model = GRU_Model(vocab_size, embed_dim, embed_mat, non_trainable=True, gru_layers=2, bidirectional=True)

In [None]:
# Adam over all model parameters with a small learning rate.
optimizer = optim.Adam(model.parameters(), lr=0.0001)
# The model emits raw scores, so use the loss variant that applies the sigmoid itself.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Number of full passes over the training loader.
n_epochs=5

In [None]:
# Per-step training losses, appended to by the training loop and plotted afterwards.
running_loss = []

In [None]:
# Train for n_epochs, recording the loss of every optimisation step.
model.train()  # dropout and batch-norm must be in training mode
for n_epi in range(n_epochs):
    print("epoch : ", n_epi+1)

    t5 = time.time()

    # Steps are numbered from 1 within each epoch.
    for step, (inputs, labels) in enumerate(train_data_loader, 1):
        optimizer.zero_grad()
        out = model(inputs)
        # BCEWithLogitsLoss takes (input=logits, target=labels), in that order.
        loss = criterion(out.view(-1,1), labels.view(-1,1))
        loss.backward()
        optimizer.step()
        print("Step : ", step, " Loss : ", loss.item())
        running_loss.append(loss.item())

    t6 = time.time()
    print("Tiem Taken in Training Epoch : ", t6-t5)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Training loss recorded at every optimisation step, across all epochs.
fig, ax = plt.subplots()
ax.plot(running_loss)
ax.set(xlabel="Training step", ylabel="Loss", title="Training loss per step")
plt.show()