<a href="https://www.kaggle.com/siddhss20/review-classification-with-lstm-96?scriptVersionId=87717893" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
from torch.utils.data import DataLoader,TensorDataset
from sklearn import model_selection
from sklearn import metrics
import io
from string import punctuation
import tensorflow as tf
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv
/kaggle/input/glove6b100dtxt/glove.6B.100d.txt


In [2]:
if __name__ == '__main__':
    df = pd.read_csv("../input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv")
    df.sentiment = df.sentiment.apply(lambda x: 1 if x=='positive' else 0)
    df['review'] = df['review'].apply(lambda x : x.lower())
    df['review'] = df['review'].apply(lambda x : ''.join([c for c in x if c not in punctuation]))
    df = df.sample(frac=1).reset_index(drop=True)
    y = df.sentiment.values
    df['fold'] = -1
    kf =  model_selection.StratifiedKFold(n_splits=5)
    for f,(tr,ts) in enumerate(kf.split(X=df,y=y)):
        df.loc[ts,'fold'] = f

In [3]:
class IMDBdataset:

    def __init__(self, reviews, targets):
        self.reviews = reviews
        self.targets = targets

    def __len__(self):
        return len(self.reviews)

    def __getitem__(self, item):
        review = self.reviews[item, :]
        target = self.targets[item]

        return {
            "review": torch.tensor(review, dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float)
        }

In [4]:
#model architecture
class LSTM(nn.Module):
    def __init__(self,embedding_matrix,hidden_dim):
        super().__init__()
        number_of_words = embedding_matrix.shape[0]
        embed_dims = embedding_matrix.shape[1]
        self.hidden_dim = hidden_dim
        self.n_layers = 2
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings = number_of_words,embedding_dim = embed_dims)
        self.embedding.weights = nn.Parameter(torch.tensor(embedding_matrix,dtype = torch.float32))
        self.embedding.weights.requires_grad = False
        self.lstm = nn.LSTM(input_size=embed_dims,
                           hidden_size=self.hidden_dim,
                           num_layers=self.n_layers,
                           batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(128,1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self,x,hidden):
        batch_size = x.size(0)
        x = self.embedding(x)
        x,hidden = self.lstm(x,hidden)
        x = x.contiguous().view(-1,self.hidden_dim)
        out = self.dropout(x)
        out = self.fc(out)
        sig_out = self.sigmoid(out)
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1]
        sig_out = torch.unsqueeze(sig_out,1)
        return sig_out,hidden
        
    def init_hidden(self,batch_size):
        # initialise hidden and cell states
        weight = next(self.parameters()).data
        if (torch.cuda.is_available()):
            hidden = (weight.new(self.n_layers, batch_size,self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden

In [5]:
def train(data_loader, model, optimizer, device,batch_size):
    model.train()
    h = model.init_hidden(batch_size)
    for data in data_loader:
        reviews = data["review"]
        targets = data["target"]
        h = tuple([each.data for each in h])
        # move the data to device that we want to use
        reviews = reviews.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)
        optimizer.zero_grad()
        predictions,h = model(reviews,h)
        loss = nn.BCEWithLogitsLoss()(predictions, targets.view(-1, 1))
        loss.backward()
        optimizer.step()

In [6]:
def evaluate(data_loader, model, device,batch_size):
    final_predictions = []
    final_targets = []
    test_h = model.init_hidden(batch_size)
    model.eval()

    # disable the gradient calculation
    with torch.no_grad():
        for data in data_loader:
            reviews = data["review"]
            test_h = tuple([each.data for each in test_h])
            reviews = reviews.to(device, dtype=torch.long)
            predictions,test_h = model(reviews,test_h)
            predictions = predictions.cpu().numpy().tolist()
            targets = data["target"].cpu().numpy().tolist()
            final_predictions.extend(predictions)
            final_targets.extend(targets)

    return final_predictions, final_targets

In [7]:
df['len'] = df['review'].apply(lambda x : len(x))
print(df['len'].describe())
df.head()

count    50000.000000
mean      1256.852460
std        950.870961
min         30.000000
25%        672.000000
50%        931.000000
75%       1526.000000
max      13346.000000
Name: len, dtype: float64


Unnamed: 0,review,sentiment,fold,len
0,i caught this on cinemax very late at nightnot...,0,0,300
1,it starts off with a view of earth and jupiter...,1,0,1667
2,a may day 1938 when happen a huge rally celebr...,1,0,1915
3,if like me you like your films to be unique an...,1,0,3106
4,can someone please help me i missed the last v...,0,0,750


In [8]:
MAX_LEN =2000
batch_size = 50
EPOCHS = 5

In [9]:
def GLOVE(fname):

    f = open(fname,'r')
    gloveModel = {}
    for line in f:
        splitLines = line.split()
        word = splitLines[0]
        wordEmbedding = np.array([float(value) for value in splitLines[1:]])
        gloveModel[word] = wordEmbedding
    print(len(gloveModel)," words loaded!")
    f.close()
    return gloveModel

def create_embedding_matrix(word_index, embedding_dict):
    # 100 dimensional glove model
    embedding_matrix = np.zeros((len(word_index) + 1, 100))
    for word, i in word_index.items():
        if word in embedding_dict:
            embedding_matrix[i] = embedding_dict[word]
    return embedding_matrix

In [10]:
def run(df, fold, params=None):
    PATH = './model_vals'
    train_df = df[df.fold != fold].reset_index(drop=True)
    test_df = df[df.fold == fold].reset_index(drop=True)
    tokenizer = tf.keras.preprocessing.text.Tokenizer()
    tokenizer.fit_on_texts(df.review.values.tolist())
    xtrain = tokenizer.texts_to_sequences(train_df.review.values)
    xtest = tokenizer.texts_to_sequences(test_df.review.values)

    xtrain = tf.keras.preprocessing.sequence.pad_sequences(xtrain, maxlen=MAX_LEN)
    xtest = tf.keras.preprocessing.sequence.pad_sequences(xtest, maxlen=MAX_LEN)

    train_data = IMDBdataset(reviews = xtrain,targets = train_df.sentiment.values)
    test_data = IMDBdataset(reviews = xtest,targets = test_df.sentiment.values)

    train_data_loader = torch.utils.data.DataLoader(train_data,batch_size = batch_size,num_workers = 2)
    test_data_loader = torch.utils.data.DataLoader(test_data,batch_size = batch_size,num_workers = 1)

    # Loading the embeddings 
    embedding_dict = GLOVE("../input/glove6b100dtxt/glove.6B.100d.txt")
    embedding_matrix = create_embedding_matrix(tokenizer.word_index, embedding_dict)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(device)
    model = LSTM(embedding_matrix,128)
    try:
        checkpoint = torch.load(PATH)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    except:
        pass
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_accuracy = 0.0
    early_stopping_counter = 0

    for epoch in range(EPOCHS):
        
        train(train_data_loader, model, optimizer, device,batch_size)
        output, targets = evaluate(test_data_loader, model, device,batch_size)
        outputs = np.array(output) >= 0.5
        
        accuracy = metrics.accuracy_score(targets, outputs)

        print(f"Fold: {fold}, Epoch: {epoch}, Accuracy Score = {accuracy}")
        
        # simple early stopping
        if accuracy > best_accuracy:
            best_accuracy = accuracy

        else:
            early_stopping_counter += 1

        if early_stopping_counter > 2:
            break
    torch.save({
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, PATH)

In [11]:
# loading and preprocessing
if __name__ == '__main__':
    for f in range(5):
        run(df, f)

400000  words loaded!
cuda
Fold: 0, Epoch: 0, Accuracy Score = 0.5002
Fold: 0, Epoch: 1, Accuracy Score = 0.5238
Fold: 0, Epoch: 2, Accuracy Score = 0.8277
Fold: 0, Epoch: 3, Accuracy Score = 0.8725
Fold: 0, Epoch: 4, Accuracy Score = 0.8862
400000  words loaded!
cuda
Fold: 1, Epoch: 0, Accuracy Score = 0.9321
Fold: 1, Epoch: 1, Accuracy Score = 0.9211
Fold: 1, Epoch: 2, Accuracy Score = 0.9284
Fold: 1, Epoch: 3, Accuracy Score = 0.9285
400000  words loaded!
cuda
Fold: 2, Epoch: 0, Accuracy Score = 0.9444
Fold: 2, Epoch: 1, Accuracy Score = 0.939
Fold: 2, Epoch: 2, Accuracy Score = 0.9437
Fold: 2, Epoch: 3, Accuracy Score = 0.9394
400000  words loaded!
cuda
Fold: 3, Epoch: 0, Accuracy Score = 0.9612
Fold: 3, Epoch: 1, Accuracy Score = 0.9534
Fold: 3, Epoch: 2, Accuracy Score = 0.9548
Fold: 3, Epoch: 3, Accuracy Score = 0.9549
400000  words loaded!
cuda
Fold: 4, Epoch: 0, Accuracy Score = 0.9648
Fold: 4, Epoch: 1, Accuracy Score = 0.9693
Fold: 4, Epoch: 2, Accuracy Score = 0.9698
Fold: 