## IMDB Movie Sentiments using the basic Neural Networks Architecture.

- 1. Gathering and exploring the data.

In [None]:
# Load the IMDB reviews CSV into a pandas DataFrame and display it.
import pandas as pd

dataset_path = 'IMDB Dataset.csv'
raw_df = pd.read_csv(filepath_or_buffer=dataset_path)
raw_df

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, 'negative' -> 0.
# `Series.replace` relies on silent downcasting for this kind of substitution,
# which pandas has deprecated; without it the column would stay object-typed
# and could not be turned into a tensor later. Map the labels explicitly and
# cast to an integer dtype instead.
# Values that are already encoded pass through unchanged, so the cell stays
# safe to re-run; any unexpected label makes the cast fail loudly.
label_to_id = {'positive': 1, 'negative': 0}
raw_df['sentiment'] = (
    raw_df['sentiment']
    .map(lambda label: label_to_id.get(label, label))
    .astype('int64')
)

In [None]:
raw_df.info()

In [None]:
raw_df

In [None]:
# Let's gather some more information of our data.
raw_df.info()

In [None]:
# Plot the relative frequency of each sentiment label across the dataset.
sentiment_shares = raw_df['sentiment'].value_counts(normalize=True)
sentiment_shares.plot(kind='bar')

- It looks like we have balanced data with 50-50 Probability of both the Sentiments i.e., Positive and Negative

2. Now let's create the vocabulary from our data using text pre-processing techniques such as TF-IDF.

- Tokenization and Stemming of our data.

In [None]:
#Using the nltk library for implementing this task.
import nltk 
from nltk.tokenize import word_tokenize # For Tokening the words from the sentences.
from nltk.stem.snowball import SnowballStemmer #For stemming the tokens.
stemmer = SnowballStemmer(language='english') #instance of stemming class.

In [None]:
#Let's go with some of the stopwords, as this are one of the important factors in the sentiment analysis.
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
print(", ".join(english_stopwords))

In [None]:
# Keep only the first 100 entries of NLTK's English stop-word list.
# NOTE(review): this comment previously said "115", but the slice below takes
# 100. Presumably the cut-off is meant to leave negation words, which sit
# further down the list, out of the stop-word set so they remain available as
# features -- confirm against the full list printed above.
selected_stopwords = english_stopwords[:100]
print(", ".join(selected_stopwords))

## Here we are using the TF-IDF method for vectorisation of text..

In [None]:
# Now let's define the helper function for implementing the both tokenization and stemming..
def tokenize(text):
    """Split *text* into word tokens and return the stem of each one.

    Tokens come from NLTK's ``word_tokenize``; each token is then reduced
    with the module-level ``stemmer``.
    """
    return list(map(stemmer.stem, word_tokenize(text)))


In [None]:
# Build the TF-IDF vectorizer that defines the vocabulary for the model.
from sklearn.feature_extraction.text import TfidfVectorizer

# Stop words are matched against the tokens produced by `tokenize`, i.e.
# against stems. A raw stop word whose stem differs (e.g. "having" -> "have")
# would never match and would slip into the vocabulary, and scikit-learn warns
# about that inconsistency. Run the stop words through the same tokenizer so
# both sides agree.
stemmed_stopwords = sorted(
    {token for word in selected_stopwords for token in tokenize(word)}
)

vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             token_pattern=None,  # unused when a tokenizer is given
                             stop_words=stemmed_stopwords,
                             ngram_range=(1, 2),
                             max_features=5000)

In [None]:
# Learn the vocabulary and IDF weights from every review in the dataset.
# NOTE(review): this fit runs before the train/validation/test split further
# down, so the IDF statistics also see the held-out reviews. Consider fitting
# on the training reviews only, to avoid leaking information into evaluation.
vectorizer.fit(raw_df.review)

In [None]:
# Transforming the Sentiments into the vectors using the tf-idf vectorizer
inputs = vectorizer.transform(raw_df.review)

In [None]:
# Checking the shape of the inputs
inputs.shape

- Now let's split our data into Training, Validation and Test Sets..

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the data as the test set, then 15% of the remainder as the
# validation set; both splits share the same seed.
split_options = {'test_size': 0.15, 'random_state': 29}

train_val_inputs, test_inputs, train_val_targets, test_targets = train_test_split(
    inputs, raw_df.sentiment, **split_options
)
train_inputs, val_inputs, train_targets, val_targets = train_test_split(
    train_val_inputs, train_val_targets, **split_options
)

In [None]:
# Checking the dimensions of the training set, validation set and test set..
f"{train_inputs.shape}, {train_targets.shape} | {val_inputs.shape}, {val_targets.shape} | {test_inputs.shape}, {test_targets.shape}"

# Now let's convert the training, val and test into PyTorch Tensors as our Deep Learning model needs Tensors to Work on.

In [None]:
# Import PyTorch and convert the sparse TF-IDF matrices into dense float32
# tensors for the network.
import torch


def sparse_to_float_tensor(matrix):
    """Return the sparse matrix *matrix* as a dense ``torch.float32`` tensor.

    The matrix is cast to float32 while it is still sparse, and the resulting
    dense array is wrapped without copying. Densifying first and calling
    ``.float()`` afterwards would allocate a dense array in the matrix's
    original dtype, a tensor copy of it, and then the float32 tensor.
    """
    return torch.from_numpy(matrix.astype('float32').toarray())


train_input_tensors = sparse_to_float_tensor(train_inputs)
val_input_tensors = sparse_to_float_tensor(val_inputs)
test_input_tensors = sparse_to_float_tensor(test_inputs)

In [None]:
# Viewing the shape of the input tensors..
f"{train_input_tensors.shape} | {val_input_tensors.shape} | {test_input_tensors.shape}"

In [None]:
# Turn the target labels of each split into float32 tensors.
train_target_tensors = torch.tensor(train_targets.values, dtype=torch.float32)
val_target_tensors = torch.tensor(val_targets.values, dtype=torch.float32)
test_target_tensors = torch.tensor(test_targets.values, dtype=torch.float32)

In [None]:
# Viewing the shape of the target tensors..
f"{train_target_tensors.shape} | {val_target_tensors.shape} | {test_target_tensors.shape}"

## Creating the PyTorch Dataset and PyTorch Dataloader for Batching the data..

- 1. Tensor Datasets --

In [None]:
# Creating the train, val and test tensor datasets..
from torch.utils.data import TensorDataset, DataLoader
train_ds = TensorDataset(train_input_tensors, train_target_tensors) 
val_ds = TensorDataset(val_input_tensors, val_target_tensors)
test_ds = TensorDataset(test_input_tensors, test_target_tensors)

- Tensor dataloader for batching --

In [None]:
# Fixing the batch size..
BATCH_SIZE = 150

In [None]:
# Creating the train, val and test dataloaders..
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_dl = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [None]:
# Checking the length of train, val and test dataloaders..
f"{len(train_dl)} | {len(val_dl)} | {len(test_dl)}"

-  Working of Batches...

- We use a `for ... in` loop to iterate over the batches.

In [None]:
# Look at the first training batch only, to confirm the tensor shapes.
for batch_inputs, batch_targets in train_dl:
    print("batch_input_shape", batch_inputs.shape)
    print("batch_target_shape", batch_targets.shape)
    break

## Now It's time to make our Deep Neural Networks..

In [None]:
# importing the nn module for making the neural networks..
import torch.nn as nn
import torch.nn.functional as F # The loss function

In [None]:
# Making the custom class for making the Deep Neural Networks..
class IMDBNet(nn.Module):
    """Fully connected binary classifier over fixed-length feature vectors.

    The network is a stack of six linear layers with a ReLU after each of the
    first five. The last layer emits one raw score (logit) per sample; no
    sigmoid is applied inside the model.

    Args:
        in_features: Length of each input vector. Defaults to 5000, matching
            the ``max_features`` setting of the TF-IDF vectorizer.
    """

    def __init__(self, in_features=5000):
        super().__init__()
        # Only the input width is configurable; each hidden layer roughly
        # halves the width of the previous one.
        self.layer1 = nn.Linear(in_features, 2500)
        self.layer2 = nn.Linear(2500, 1250)
        self.layer3 = nn.Linear(1250, 625)
        self.layer4 = nn.Linear(625, 312)
        self.layer5 = nn.Linear(312, 156)
        self.layer6 = nn.Linear(156, 1)

    def forward(self, inputs):
        """Return the logits for *inputs*, one output column per sample."""
        hidden_layers = (self.layer1, self.layer2, self.layer3,
                         self.layer4, self.layer5)
        out = inputs
        for layer in hidden_layers:
            out = F.relu(layer(out))

        # The output layer is left un-activated so callers get raw logits.
        return self.layer6(out)

In [None]:
# Creating an instance of a model.
model = IMDBNet()

In [None]:
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Sanity-check the untrained model on a single training batch.
for batch in train_dl:
    bi, bt = batch
    print('input_shape', bi.shape)
    print('target_shape', bt.shape)

    # This forward pass is for inspection only, so no gradients are tracked.
    with torch.no_grad():
        bo = model(bi)
    print('bo_shape', bo.shape)

    # Convert outputs to the probabilities
    probs = torch.sigmoid(bo[:, 0])
    print("Probs: ", probs[:10])

    # Convert probabilities to prediction
    preds = (probs > 0.48).int()
    print("Predictions: ", preds[:10])
    print("Targets: ", bt[:10])

    # Check metrics for evaluation..
    print('Accuracy: ', accuracy_score(bt, preds))
    print("F1-Score: ", f1_score(bt, preds, average='weighted'))

    # The loss is computed from the probabilities. Feeding the hard 0/1
    # predictions into binary cross-entropy would only ever produce the
    # clamped extreme values and say nothing about model confidence.
    print("Loss: ", F.binary_cross_entropy(probs, bt))
    break

In [None]:
# Actual targets..
bt[:10]

In [None]:
# Outputs..
bo[:10]

- Now let's write the evaluation function for our model.

In [None]:
# Evaluation function..
def evaluate(model, dl, threshold=0.48):
    """Compute the loss, accuracy and weighted F1-score of *model* on *dl*.

    The metrics are computed over all samples yielded by *dl* rather than as
    an unweighted mean of per-batch values, so a shorter final batch does not
    skew the result. The forward passes run without gradient tracking and
    with the model in evaluation mode; the previous training mode is restored
    before returning.

    Args:
        model: ``nn.Module`` called on each batch of inputs. Column 0 of its
            output is treated as the logit of the positive class.
        dl: Iterable of ``(inputs, targets)`` batches, with the targets given
            as float tensors of 0/1 labels.
        threshold: Probability above which a sample is predicted positive.
            Defaults to 0.48.

    Returns:
        A ``(loss, accuracy, f1)`` tuple of Python floats. All three are NaN
        when *dl* yields no samples.
    """
    was_training = model.training
    model.eval()

    loss_sum = 0.0
    sample_count = 0
    pred_batches, target_batches = [], []

    try:
        with torch.no_grad():
            for inputs, targets in dl:
                logits = model(inputs)[:, 0]

                # Sum (not average) per batch so the final division weights
                # every sample equally. The logits-based loss is the
                # numerically stable form of sigmoid + binary cross-entropy.
                loss_sum += F.binary_cross_entropy_with_logits(
                    logits, targets, reduction='sum').item()
                sample_count += targets.numel()

                pred_batches.append((torch.sigmoid(logits) > threshold).int())
                target_batches.append(targets)
    finally:
        model.train(was_training)

    if sample_count == 0:
        nan = float('nan')
        return (nan, nan, nan)

    y_true = torch.cat(target_batches).cpu().numpy()
    y_pred = torch.cat(pred_batches).cpu().numpy()

    return (loss_sum / sample_count,
            float(accuracy_score(y_true, y_pred)),
            float(f1_score(y_true, y_pred, average='weighted')))

In [None]:
evaluate(model, train_dl)

- Training (fitting) the model batch by batch.

In [None]:
# Implementing the function..
def fit(epochs, lr, model, train_dl, val_dl):
    """Train *model* with Adam and evaluate it on *val_dl* after every epoch.

    Args:
        epochs: Number of passes over *train_dl*.
        lr: Learning rate handed to the Adam optimizer.
        model: ``nn.Module`` to train in place. Column 0 of its output is
            treated as the logit of the positive class.
        train_dl: Iterable of ``(inputs, targets)`` training batches.
        val_dl: Batches passed to ``evaluate`` at the end of each epoch.

    Returns:
        A list with one ``[loss, accuracy, f1]`` entry per epoch, holding the
        validation metrics returned by ``evaluate``. The list is empty when
        *epochs* is 0.
    """
    history = []

    # Adam with a small L2 penalty (weight decay) on the parameters.
    optimizer = torch.optim.Adam(model.parameters(), lr, weight_decay=1e-5)

    for epoch in range(epochs):
        model.train()

        for inputs, targets in train_dl:
            logits = model(inputs)[:, 0]

            # Same objective as sigmoid followed by binary cross-entropy, but
            # computed from the logits so it stays numerically stable when
            # the sigmoid saturates.
            loss = F.binary_cross_entropy_with_logits(logits, targets)

            optimizer.zero_grad()  # Clear gradients left from the last step.
            loss.backward()        # Back propagation.
            optimizer.step()       # Parameter update.

        # Evaluate inside the epoch loop so the history holds one entry per
        # epoch instead of a single entry after the final epoch.
        val_loss, val_acc, val_f1 = evaluate(model, val_dl)
        print('Epoch : {}; Loss: {}; Accuracy: {}; F1-Score: {};'.format(epoch+1, val_loss, val_acc, val_f1))
        history.append([val_loss, val_acc, val_f1])

    return history


In [None]:
fit(5, 0.001, model, train_dl, val_dl)

- Let's run the model again..

In [None]:
model = IMDBNet()

In [None]:
history = []

In [None]:
history.append(evaluate(model, val_dl))

In [None]:
history

In [None]:
history += fit(5, 0.001, model, train_dl, val_dl)

In [None]:
history

In [None]:
# Split the history rows into one series per metric for plotting.
losses, accs, f1s = [], [], []
for entry in history:
    losses.append(entry[0])
    accs.append(entry[1])
    f1s.append(entry[2])

In [None]:
import matplotlib.pyplot as plt
plt.title("Loss")
plt.plot(losses)

In [None]:
plt.title("Accuracy")
plt.plot(accs)

In [None]:
# Plot the F1-score series; the curve must come from `f1s`, not `accs`, to
# match the title.
plt.title("F1-Score")
plt.plot(f1s)

# Making prediction on an example data.

In [None]:
ex_df = raw_df.sample(10)
ex_df

In [None]:
def predict_df(df, threshold=0.48):
    """Predict a 0/1 sentiment label for every review in *df*.

    Uses the module-level ``vectorizer`` to turn the text into TF-IDF
    features and the module-level ``model`` to score them.

    Args:
        df: DataFrame with a ``review`` column of raw review text.
        threshold: Probability above which a review is predicted as 1.
            Defaults to 0.48.

    Returns:
        An int tensor of 0/1 predictions, one per row of *df*.
    """
    features = vectorizer.transform(df.review)
    input_tensors = torch.tensor(features.toarray()).float()

    # Inference only: skip gradient tracking so no autograd graph is built.
    with torch.no_grad():
        logits = model(input_tensors)[:, 0]

    return (torch.sigmoid(logits) > threshold).int()

In [None]:
ex_df.sentiment.values

In [None]:
predict_df(ex_df)

In [None]:
ex_df.review