# 1. Import

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')
import random
import time

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

cuda = torch.cuda.is_available()
if cuda:
    print("Cuda Available")
else:
    print("No Cuda")

Cuda Available


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
df.head(3)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive


# 2. Preprocessing

In [3]:
df['sentiment'] = df.sentiment.apply(lambda x: int(x=='positive'))
df = df.sample(frac=1).reset_index(drop=True)
df['kfold'] = [1,2,3,4,5]*10000
df.head(3)

Unnamed: 0,review,sentiment,kfold
0,"Alright, I'm 12, so this is where you get to s...",1,1
1,"for my opinion, the middle of the film, specia...",1,2
2,"In post civil war America the President, (Van ...",1,3


In [4]:
def html_parse(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text()

def remove_stopwords(text):
    tokens = word_tokenize(text)
    stop = stopwords.words('english')
    return [token for token in tokens if token.lower() not in stop]

def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in text]

def erase_mark(text):
    return text.replace('!', '').replace('?', '').replace("'", "").replace('"', '').replace(',', '').replace('.', '')

In [5]:
t0 = time.time()
df.review = df.review.apply(html_parse).apply(erase_mark).apply(remove_stopwords).apply(lemmatize)
t1 = time.time()
print("Duration: {:.1f}M".format((t1-t0)/60))
df.head()

Duration: 3.0M


Unnamed: 0,review,sentiment,kfold
0,"[Alright, Im, 12, get, see, movie, pre-teens, ...",1,1
1,"[opinion, middle, film, specially, love, scene...",1,2
2,"[post, civil, war, America, President, (, Van,...",1,3
3,"[thought, get, heart, film, :, 1, ), never, se...",1,4
4,"[Vow, Cherish, wonderful, movie, based, novel,...",1,5


In [6]:
def int2char(text):
    return dict(enumerate(text))
def char2int(text):
    return {ch: i for i, ch in int2char(text).items()}
chars = []
for i, v in df.iterrows():
    chars += v['review']
chars = tuple(set(chars))
i2c = int2char(chars)
c2i = char2int(chars)

# 3. Pretrained Word Embeddings

In [7]:
import codecs
import tqdm

fasttext = {}
f = codecs.open('../input/fasttext/wiki.simple.vec', encoding='utf-8')
for line in f:
    values = line.rstrip().rsplit(' ')
    word = values[0]
    vec = np.asarray(values[1:], dtype=np.float32)
    fasttext[word]=vec
f.close()

In [8]:
def create_emb_matrix(dic, embedding_dict, dim):
    embedding_matrix = np.zeros((len(dic)+1, dim))
    for i, v in dic.items():
        if v in embedding_dict:
            embedding_matrix[i] = embedding_dict[v]
    return embedding_matrix
            
embedding_matrix = create_emb_matrix(i2c, fasttext, 300)

# 4. Model Building

In [9]:
class IMDB(Dataset):
    def __init__(self, review, sentiment):
        self.review = review
        self.sentiment = sentiment
        
    def __getitem__(self, index):
        return {'review': torch.LongTensor(self.review[index]), 'sentiment': torch.LongTensor([self.sentiment[index]])}
    
    def __len__(self):
        return len(self.sentiment)
    
class LSTM(nn.Module):
    def __init__(self, embedding_matrix):
        super(LSTM, self).__init__()
        num_words = embedding_matrix.shape[0]
        emb_dim = embedding_matrix.shape[1]
        self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=emb_dim)
        self.embedding.weight = nn.Parameter(torch.FloatTensor(embedding_matrix))
        self.embedding.weight.requires_grad = False
        
        self.lstm = nn.LSTM(emb_dim, 256, 2, bidirectional=True, batch_first=True)
        self.fc1 = nn.Linear(1024, 1)
#         self.fc2 = nn.Linear(32, 1)
        
    def forward(self, x):
        x = self.embedding(x)
        hidden, _ = self.lstm(x)
        avg_pool = torch.mean(hidden, 1)
        max_pool, max_idx = torch.max(hidden, 1)
        out = torch.cat((avg_pool, max_pool), 1)
        out = self.fc1(out)
        return F.sigmoid(out)

In [10]:
def accuracy(yhat, y):
    return torch.sum((yhat>0.5)==y).item()/len(y)

In [11]:
def char2idx(list_):
    return [c2i[ch] for ch in list_]
    
df['length'] = df.review.apply(len)
df = df[df['length']>4]
df.review = df.review.apply(char2idx)
df.review = df.review.apply(np.array)

df.head()

Unnamed: 0,review,sentiment,kfold,length
0,"[155296, 217473, 127720, 173521, 107818, 65254...",1,1,84
1,"[186398, 185476, 88682, 250, 190031, 225370, 1...",1,2,30
2,"[160240, 221405, 212517, 220262, 143263, 192, ...",1,3,107
3,"[166688, 173521, 5227, 88682, 207637, 172513, ...",1,4,171
4,"[53165, 206747, 8554, 65254, 161573, 176195, 3...",1,5,61


In [14]:
model = LSTM(embedding_matrix)
if cuda:
    model.cuda()

In [15]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
    
epochs=5
lr = 0.002
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss()


for fold in range(1,6):
    train, valid = df[df['kfold']!=fold].reset_index(drop=True), df[df['kfold']==fold].reset_index(drop=True)
    
    xtrain, xtest = train.review.values, valid.review.values
    
    xtrain = pad_sequences(xtrain, maxlen=1447)
    xtest = pad_sequences(xtest, maxlen=1447)
    
#     print(xtrain[0].shape, train.sentiment.values.shape)
#     print(xtrain[0], y[0])
    
    train_dset = IMDB(xtrain, train.sentiment.values)
    valid_dset = IMDB(xtest, valid.sentiment.values)
    trainloader = DataLoader(train_dset, batch_size=128, shuffle=True)
    validloader = DataLoader(valid_dset, batch_size=128, shuffle=True)
    
    for epoch in range(epochs):
        
        train_loss, valid_loss = 0, 0
        train_accuracy, valid_accuracy = 0,0
        
        for batch in trainloader:
            optimizer.zero_grad()
            x, y = batch['review'], batch['sentiment']
            if cuda:
                x, y = x.cuda(), y.cuda().float()
            output = model(x)
            train_accuracy += accuracy(output.cpu().detach(), y.cpu().detach())
            loss = criterion(output, y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        
        with torch.no_grad():
            for batch in validloader:
                x, y = batch['review'], batch['sentiment']
                if cuda:
                    x, y = x.cuda(), y.cuda().float()
                output = model(x)
                loss = criterion(output, y)
                valid_loss += loss.item()
                valid_accuracy += accuracy(output.cpu().detach(), y.cpu().detach())
        print("Epoch: {}/{}".format(epoch+1, epochs))
        print("Train Acc: {:.2f}%".format(100*train_accuracy/len(trainloader)))
        print("Valid Acc: {:.2f}%".format(100*valid_accuracy/len(validloader)))

Epoch: 1/5
Train Acc: 77.62%
Valid Acc: 84.46%
Epoch: 2/5
Train Acc: 84.84%
Valid Acc: 85.02%
Epoch: 3/5
Train Acc: 87.13%
Valid Acc: 86.64%
Epoch: 4/5
Train Acc: 89.00%
Valid Acc: 87.19%
Epoch: 5/5
Train Acc: 91.01%
Valid Acc: 87.64%
Epoch: 1/5
Train Acc: 91.93%
Valid Acc: 92.94%
Epoch: 2/5
Train Acc: 94.70%
Valid Acc: 92.88%
Epoch: 3/5
Train Acc: 97.10%
Valid Acc: 92.00%
Epoch: 4/5
Train Acc: 98.73%
Valid Acc: 91.64%
Epoch: 5/5
Train Acc: 99.54%
Valid Acc: 92.02%
Epoch: 1/5
Train Acc: 96.54%
Valid Acc: 99.01%
Epoch: 2/5
Train Acc: 99.26%
Valid Acc: 99.00%
Epoch: 3/5
Train Acc: 99.87%
Valid Acc: 98.59%
Epoch: 4/5
Train Acc: 99.98%
Valid Acc: 99.21%
Epoch: 5/5
Train Acc: 99.99%
Valid Acc: 99.44%
Epoch: 1/5
Train Acc: 98.34%
Valid Acc: 99.06%
Epoch: 2/5
Train Acc: 99.31%
Valid Acc: 99.14%
Epoch: 3/5
Train Acc: 99.74%
Valid Acc: 98.89%
Epoch: 4/5
Train Acc: 99.92%
Valid Acc: 99.36%
Epoch: 5/5
Train Acc: 99.98%
Valid Acc: 99.50%
Epoch: 1/5
Train Acc: 98.77%
Valid Acc: 99.24%
Epoch: 2/5
Tr