Import the dataset '**imdb-reviews.csv**' from Google Drive and parse it with pandas.


In [None]:
# Mount Google Drive at /content/drive so files stored there can be read.
# The google.colab import means this cell only works inside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Reviews file on the mounted Drive; it is read with pandas below.
dataset_path = '/content/drive/MyDrive/imdb-reviews.csv'
# Optional path to a separate evaluation file. When left as None, the final
# evaluation uses the held-out test split of the main dataset instead.
testset_path = None

In [None]:
# imports
import pandas
import re
import nltk
from nltk import word_tokenize
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import wordnet
nltk.download('omw-1.4')
from textblob import Word 
from collections import Counter
import operator
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.scripts.glove2word2vec import glove2word2vec
import torch
import torch.nn as nn
from sklearn.metrics import classification_report
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc, roc_auc_score
import random


In [None]:
# sep='\t': the file is parsed as tab-separated despite its .csv extension.
df = pandas.read_csv(dataset_path, sep='\t', engine='python')

In [None]:
df.head()

In [None]:
df.info()

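Clean the review text. We strip HTML tags, numbers, punctuation and stopwords, convert uppercase to lowercase, and lemmatize the remaining words: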

    HTML
    Numbers
    Punctuation
    Uppercase
    Stopwords
    Lemmatization



In [None]:
# HTML

def remove_html(text):
  """Return `text` with every `<...>` span deleted.

  The pattern is non-greedy, so each tag is removed on its own and the text
  between two tags is kept. `.` does not match a newline, so a `<` whose
  closing `>` is on a later line is left untouched.
  """
  tag_pattern = re.compile('<.*?>')
  return tag_pattern.sub('', text)
  
def clean_html(df):
  """Strip HTML tags from the `review` column.

  The column is overwritten on the frame that was passed in; a preview of
  the result is printed and the same frame is returned.
  """
  stripped_reviews = df['review'].apply(remove_html)
  df['review'] = stripped_reviews
  print(df.head())
  return df

In [None]:
# Numbers

def clean_numbers(df):
  """Delete every run of digits from the `review` column.

  The column is overwritten on the frame that was passed in; a preview of
  the result is printed and the same frame is returned.
  """
  reviews = df['review']
  without_digits = reviews.str.replace(r'\d+', '', regex=True)
  df['review'] = without_digits
  print(df.head())
  return df

In [None]:
# Punctuation

def clean_punctuation(df):
  """Delete every character that is neither a word character nor whitespace.

  Underscores count as word characters for `\\w`, so they are kept. The
  `review` column is overwritten on the frame that was passed in; a preview
  is printed and the same frame is returned.
  """
  reviews = df['review']
  without_punctuation = reviews.str.replace(r'[^\w\s]+', '', regex=True)
  df['review'] = without_punctuation
  print(df.head())
  return df

In [None]:
# Uppercase

def clean_uppercase(df):
  """Lower-case the `review` column.

  The column is overwritten on the frame that was passed in; a preview of
  the result is printed and the same frame is returned.
  """
  lowered = df['review'].str.lower()
  df['review'] = lowered
  print(df.head())
  return df

Before we perform the other removals we need to tokenize the words

In [None]:
# Tokenization

def clean_tokenize(df):
  """Replace each review string with the token list from nltk's `word_tokenize`.

  The `review` column is overwritten on the frame that was passed in; a
  preview is printed and the same frame is returned.
  """
  token_lists = df['review'].apply(word_tokenize)
  df['review'] = token_lists
  print(df.head())
  return df

In [None]:
# Stopwords

pattern = stopwords.words('english')
# Set copy of `pattern`: every token of every review is tested for membership,
# and a hash lookup avoids scanning the whole stopword list each time.
_stopword_set = frozenset(pattern)

def clean_stopwords(df):
  """Drop English stopwords from each tokenized review.

  Expects the `review` column to hold lists of tokens. The column is
  overwritten on the frame that was passed in; a preview is printed and the
  same frame is returned.
  """
  df['review'] = df['review'].apply(lambda words: [w for w in words if w not in _stopword_set])
  print(df.head())
  return df

In [None]:
# Lemmatization

lemmatizer = nltk.stem.WordNetLemmatizer()

def clean_lemmatize(df):
  """Lemmatize every token of every review.

  Each token is passed through the lemmatizer once per part-of-speech tag,
  in the order 'n', 'a', 'v', 'r', 's', with each pass working on the output
  of the previous one. Expects the `review` column to hold lists of tokens.
  The column is overwritten on the frame that was passed in; a preview is
  printed and the same frame is returned.
  """
  def lemmatize_all_pos(token):
    # Order matters: each pass sees the result of the one before it.
    for pos_tag in ('n', 'a', 'v', 'r', 's'):
      token = lemmatizer.lemmatize(token, pos_tag)
    return token

  df['review'] = df['review'].apply(lambda tokens: [lemmatize_all_pos(t) for t in tokens])
  print(df.head())
  return df

Now we transform rating to 0 for negative (values in range [0, 4.0]) and 1 for positive (values in range [7.0, 10.0])

In [None]:
def clean_scale(df):
  """Binarize the `rating` column: 0 for ratings up to 4.0, 1 for anything else.

  The column is overwritten on the frame that was passed in; a preview is
  printed and the same frame is returned.
  """
  def to_label(score):
    if score <= 4.0:
      return 0
    return 1

  df['rating'] = df['rating'].apply(to_label)
  print(df.head())
  return df

Now we remove the 10 most frequent and the 10 least frequent words:

In [None]:
# rarewords

def clean_rarewords(df, n_common=10, n_rare=10):
  """Drop the most frequent and the least frequent tokens from every review.

  Token frequencies are counted over the whole `review` column, which must
  hold lists of tokens. The `n_common` highest-ranked and `n_rare`
  lowest-ranked tokens are then removed from each review. Tokens with equal
  counts are ranked in the order they were first seen.

  Args:
    df: frame with a `review` column of token lists; the column is
      overwritten on this frame.
    n_common: how many of the most frequent tokens to remove (default 10).
    n_rare: how many of the least frequent tokens to remove (default 10).

  Returns:
    The same frame that was passed in. A preview is printed first.
  """
  # Count straight from the column; no per-row copies or flattened list needed.
  token_counts = Counter()
  for tokens in df['review']:
    token_counts.update(tokens)

  ranked = [token for token, _ in token_counts.most_common()]

  # Guard both slices: ranked[-0:] would select the entire vocabulary.
  to_drop = set(ranked[:n_common]) if n_common > 0 else set()
  if n_rare > 0:
    to_drop.update(ranked[-n_rare:])

  df['review'] = df['review'].apply(lambda words: [w for w in words if w not in to_drop])
  print(df.head())
  return df

In [None]:
def transform(df):
  """Run the full cleaning pipeline on `df` and return the result.

  The steps run in a fixed order: the string-level cleaners first, then
  tokenization, then the token-level cleaners, with the rating binarization
  between lemmatization and rare-word removal. Every step overwrites columns
  on the frame it receives and prints a preview.
  """
  pipeline = (
      clean_html,
      clean_numbers,
      clean_punctuation,
      clean_uppercase,
      clean_tokenize,
      clean_stopwords,
      clean_lemmatize,
      clean_scale,
      clean_rarewords,
  )
  for step in pipeline:
    df = step(df)
  return df

In [None]:
# Not re-runnable as-is: the cleaning steps overwrite df['review'] and
# df['rating'] in place (reviews become token lists), so reload df from the
# file before executing this cell a second time.
df = transform(df)

Now we split the dataset into training, validation, and testing sets

In [None]:
# Re-join the token lists into space-separated strings; the embedding step
# later splits them on ' ' again.
X = df['review'].apply(lambda x: ' '.join(x))
Y = df['rating']

# 80% train, then the remaining 20% is halved into test and validation
# (10% each). The fixed random_state keeps the split the same across runs.
trainX, testX, trainY, testY = train_test_split(X, Y, train_size=0.80, random_state=13)
testX, valX, testY, valY = train_test_split(testX, testY, train_size=0.5, random_state=13)

# Sizes of the train / validation / test splits, in that order.
print(len(trainX))
print(len(valX))
print(len(testX))

Now we convert the reviews into numeric vectors using pre-trained GloVe word embeddings.

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip  
!unzip glove.6B.zip  

In [None]:
# word -> embedding vector (1-D float array), one entry per line of the file.
glove_dict = {}
# Pin the encoding so reading does not depend on the platform's default locale.
with open('glove.6B.300d.txt', "r", encoding="utf-8") as file:
    for line in file:
        # Each line is "<word> <coef> <coef> ..."; split off the word only.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, dtype=float, sep=" ")
        glove_dict[word] = coefs

print("Dictionary size: ", len(glove_dict))


In [None]:
print("Example:\n");
print("good -> ", glove_dict["good"])

In [None]:
def glove_dataframe(df, embeddings=None):
  """Embed each review as the average of its words' embedding vectors.

  Every review is split on single spaces. The vectors of the words found in
  the embedding table are summed and the sum is divided by the total number
  of words in the review, so words missing from the table pull the average
  towards zero.

  Args:
    df: iterable of review strings (the notebook passes a pandas Series).
    embeddings: mapping word -> 1-D numeric vector. Defaults to the
      module-level `glove_dict`.

  Returns:
    A tuple `(vectors, recognized_words, total_words)`: `vectors` is a
    float array of shape (number of reviews, embedding size);
    `recognized_words` counts the words found in the table and
    `total_words` counts all words, both summed over all reviews.
  """
  if embeddings is None:
    embeddings = glove_dict
  # Take the vector size from the table; fall back to 300 if it is empty.
  dim = len(next(iter(embeddings.values()))) if embeddings else 300

  recognized_words = 0
  total_words = 0
  review_vectors = []

  for row in df:
    # split(' ') always yields at least one element (even for ''), so the
    # division below can never be by zero.
    words = str(row).split(' ')
    cur_words = len(words)
    total_words += cur_words

    coefs = np.zeros(dim)
    for word in words:
      vector = embeddings.get(word)
      if vector is not None:
        recognized_words += 1
        coefs += vector

    review_vectors.append(coefs / cur_words)

  # Explicit reshape keeps the result 2-D even when there are no reviews.
  vectors = np.array(review_vectors, dtype=float).reshape(len(review_vectors), dim)
  return vectors, recognized_words, total_words
      

In [None]:
glove_trainX, recognized_words, total_words = glove_dataframe(trainX)


Recognized words:  3754726  Total words:  3922278  Ratio:  0.9572819672649414

Recognized words:  473874  Total words:  494881  Ratio:  0.9575514113493951

Recognized words:  469472  Total words:  489972  Ratio:  0.9581608744989509


In [None]:
print("Recognized words: ", recognized_words, " Total words: ", total_words, " Ratio: ", recognized_words / total_words)

In [None]:
glove_valX, recognized_words, total_words = glove_dataframe(valX)


In [None]:
print("Recognized words: ", recognized_words, " Total words: ", total_words, " Ratio: ", recognized_words / total_words)

In [None]:
glove_testX, recognized_words, total_words = glove_dataframe(testX)


In [None]:
print("Recognized words: ", recognized_words, " Total words: ", total_words, " Ratio: ", recognized_words / total_words)

Now we will convert datasets and labels to tensors

In [None]:
# Feature arrays become tensors as-is; they are cast to float32 later, where
# they are fed to the model. Labels are cast to float here.
# Not re-runnable as-is: after this cell trainY/valY/testY are tensors, which
# have no .to_numpy(), so re-create the splits before executing it again.
glove_trainX = torch.tensor(glove_trainX)
trainY = torch.squeeze(torch.from_numpy(trainY.to_numpy()).float())
glove_valX = torch.tensor(glove_valX)
valY = torch.squeeze(torch.from_numpy(valY.to_numpy()).float())
glove_testX = torch.tensor(glove_testX)
testY = torch.squeeze(torch.from_numpy(testY.to_numpy()).float())

Now we will build the Feed Forward Neural Network

In [None]:
class Net(nn.Module):
  """Feed-forward classifier: three ReLU hidden layers and a sigmoid output.

  Args:
    D_in: number of input features.
    H1, H2, H3: widths of the first, second and third hidden layers.
    D_out: number of output units.
  """

  def __init__(self, D_in, H1, H2, H3, D_out):
    super(Net, self).__init__()

    self.linear1 = nn.Linear(D_in, H1)
    self.linear2 = nn.Linear(H1, H2)
    self.linear3 = nn.Linear(H2, H3)
    self.linear4 = nn.Linear(H3, D_out)

    # Without a non-linearity between them, stacked Linear layers collapse
    # into a single affine map and the hidden sizes have no effect on what
    # the model can represent. ReLU has no parameters, so the set of
    # learnable weights is unchanged.
    self.relu = nn.ReLU()
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    """Return the sigmoid of the final layer, i.e. values between 0 and 1."""
    h1 = self.relu(self.linear1(x))
    h2 = self.relu(self.linear2(h1))
    h3 = self.relu(self.linear3(h2))
    h4 = self.linear4(h3)
    out = self.sigmoid(h4)
    return out

In [None]:
def get_classification_report(model):
  """Print sklearn's classification report for `model` on the test set.

  Reads the module-level `glove_testX` and `testY` tensors. The model
  outputs are rounded to the nearest integer to obtain predicted labels.
  """
  x_batch = torch.squeeze(glove_testX.to(torch.float32))
  # Inference only: no_grad avoids building an autograd graph for the whole
  # test set.
  with torch.no_grad():
    y_pred = torch.squeeze(model(x_batch))
  y_pred = torch.round(y_pred)

  print(classification_report(testY.detach().numpy(), y_pred.detach().numpy()))


In [None]:
def get_roc_curve(model):
  """Print the ROC-AUC score of `model` on the test set and plot its ROC curve.

  Reads the module-level `glove_testX` and `testY` tensors. The raw
  (unrounded) model outputs are used as scores.
  """
  x_batch = torch.squeeze(glove_testX.to(torch.float32))
  # Inference only: no_grad avoids building an autograd graph for the whole
  # test set.
  with torch.no_grad():
    y_pred = torch.squeeze(model(x_batch))
  y_pred = y_pred.detach().numpy()
  y_real = testY.detach().numpy()
  # The thresholds returned by roc_curve are not needed for the plot.
  false_positive_rate, true_positive_rate, _ = roc_curve(y_real, y_pred)
  print("Print roc_auc score: ", roc_auc_score(y_real, y_pred))

  plt.plot(false_positive_rate, true_positive_rate)
  plt.plot([0, 1], ls="--")           # diagonal reference line
  plt.plot([0, 0], [1, 0] , c=".7")   # left edge (x = 0)
  plt.plot([1, 1] , c=".7")           # top edge (y = 1)

  plt.ylabel('True Positive Rate')
  plt.xlabel('False Positive Rate')
  plt.show()


In [None]:
def train_and_test_hyperparameters(H1_, H2_, H3_, learning_rate_, batch_size_, epoch_, optimizer_, loss_func_):
  """Train a fresh `Net` and record per-epoch training and validation metrics.

  Reads the module-level tensors `glove_trainX`/`trainY` (training) and
  `glove_valX`/`valY` (validation).

  Args:
    H1_, H2_, H3_: widths of the three hidden layers.
    learning_rate_: learning rate handed to the optimizer.
    batch_size_: mini-batch size for the shuffled training loader.
    epoch_: number of passes over the training set.
    optimizer_: optimizer class, called as `optimizer_(params, lr=...)`.
    loss_func_: callable `loss(prediction, target)`. It receives the model's
      sigmoid outputs, not logits.

  Returns:
    `(model, avg_loss, avg_acc, val_loss, val_acc)`, where each list has one
    entry per epoch. Losses are Python floats; accuracies are 0-dim tensors.
    `avg_*` are unweighted means over the epoch's mini-batches; `val_*` come
    from one pass over the whole validation set after each epoch.
  """
  D_in = glove_trainX.shape[1]
  D_out = 1

  model = Net(D_in, H1_, H2_, H3_, D_out)
  loss_func = loss_func_
  optimizer = optimizer_(model.parameters(), lr=learning_rate_)

  dataset = torch.utils.data.TensorDataset(glove_trainX, trainY)
  dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size_, shuffle=True)

  # Validation is scored in a single pass each epoch, so cast it once up front.
  val_inputs = glove_valX.to(torch.float32)

  avg_loss = []
  avg_acc = []
  val_loss = []
  val_acc = []

  for epoch in range(epoch_):
    batch_losses = []
    batch_acc = []

    for x_batch, y_batch in dataloader:
      x_batch = x_batch.to(torch.float32)
      # reshape(-1) rather than squeeze(): a final batch holding one sample
      # keeps its batch dimension, so it still matches y_batch.
      y_pred = model(x_batch).reshape(-1)
      loss = loss_func(y_pred, y_batch)

      batch_losses.append(loss.item())
      predicted = (torch.round(y_pred) == y_batch).float()
      if len(predicted) > 0:
        batch_acc.append(predicted.sum() / len(predicted))
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

    avg_loss.append(sum(batch_losses)/len(dataloader))
    avg_acc.append(sum(batch_acc) / len(batch_acc))

    # Evaluation only: no gradients are needed for the validation pass.
    with torch.no_grad():
      y_pred = model(val_inputs).reshape(-1)
      loss = loss_func(y_pred, valY)

    val_loss.append(loss.item())
    predicted = (torch.round(y_pred) == valY).float()
    if len(predicted) > 0:
      val_acc.append(predicted.sum() / len(predicted))

  return model, avg_loss, avg_acc, val_loss, val_acc

In [None]:
def plot_loss_accuracy(epoch, avg_loss, val_loss, avg_acc, val_acc):
  """Print overall averages and plot per-epoch loss and accuracy curves.

  Args:
    epoch: either the number of epochs (int) or an iterable of epoch indices
      such as `range(100)`. It supplies the x-axis, so its length must match
      the metric lists.
    avg_loss, val_loss: per-epoch training / validation loss.
    avg_acc, val_acc: per-epoch training / validation accuracy.
  """
  print("Batches average loss: ", sum(avg_loss) / len(avg_loss))
  print("Batches average acc: ", float(sum(avg_acc) / len(avg_acc)))
  print("Validation average loss: ", sum(val_loss) / len(val_loss))
  print("Validation average acc: ", float(sum(val_acc) / len(val_acc)))

  # Use the caller's epochs for the x-axis instead of a fixed range(100).
  epochs = range(epoch) if isinstance(epoch, int) else list(epoch)

  plt.plot(epochs, avg_loss, marker="x")
  plt.plot(epochs, val_loss, marker="*")
  plt.legend(["Training loss", "Validation loss"])
  plt.xlabel("epoch")
  plt.ylabel("loss")
  plt.show()

  plt.plot(epochs, avg_acc, marker="x")
  plt.plot(epochs, val_acc, marker="*")
  plt.legend(["Training accuracy", "Validation accuracy"])
  plt.xlabel("epoch")
  plt.ylabel("accuracy")
  plt.show()


We will experiment to make it better. We will not use grid search because of the time complexity

Hidden layer sizes:

In [None]:
# for i in range(8):
#   h1 = random.randint(1, 600)
#   h2 = random.randint(1, h1)
#   h3 = random.randint(1, h2)

#   print("h1, h2, h3 = ", h1, h2, h3)
#   model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(h1, h2, h3, 0.005, 64, 100, torch.optim.SGD)
#   plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
#   get_roc_curve(model)
#   get_classification_report(model)



We have no significant improvement so we will stick with the old values.

Best hidden layer sizes:

In [None]:
best_h1 = 256
best_h2 = 128
best_h3 = 64


Now we will test learning rate:

In [None]:
# learning_rate = [1, 0.5, 0.1, 0.05, 0.01, 0.005, 0.001]

# for lr in learning_rate:

#   model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(best_h1, best_h2, best_h3, lr, 64, 100, torch.optim.SGD)
#   plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
#   get_roc_curve(model)
#   get_classification_report(model)

We can see that we have good results with learning rate 0.01, 0.005, and 0.001. We will use 0.005 for our best learning rate.

In [None]:
# NOTE(review): the markdown cell above says 0.005 was chosen, but 0.01 is
# what is assigned here and used by the final training run. Confirm which
# value is intended.
best_lr = 0.01

Now we will test batch size:

In [None]:
# batch_size = [2, 4, 8, 16, 32, 64, 128, 256]

# for batch in batch_size:

#   model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(best_h1, best_h2, best_h3, best_lr, batch, 100, torch.optim.SGD)
#   plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
#   get_roc_curve(model)
#   get_classification_report(model)

It seems that the batch size does not change the accuracy much. We will use 128 as the best batch size.

In [None]:
best_batch = 128

Now we will test different optimizers

In [None]:
# optimizers = [torch.optim.Adadelta, torch.optim.ASGD, torch.optim.Adamax, torch.optim.Rprop, torch.optim.SGD]

# for optimizer in optimizers:

#   model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(best_h1, best_h2, best_h3, best_lr, best_batch, 100, optimizer)
#   plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
#   get_roc_curve(model)
#   get_classification_report(model)

Best optimizer:

In [None]:
best_optimizer = torch.optim.Adadelta

In [None]:
# Candidate loss functions for the (commented-out) comparison in the next cell.
# NOTE(review): Net.forward already applies a sigmoid. BCEWithLogitsLoss and
# CrossEntropyLoss presumably expect raw logits, and SoftMarginLoss presumably
# expects -1/+1 targets. Confirm these entries are meaningful with this model
# and the 0/1 labels.
loss_func_ = [nn.L1Loss(), nn.MSELoss(), nn.CrossEntropyLoss(), nn.BCELoss(), nn.BCEWithLogitsLoss(), nn.SoftMarginLoss()]

In [None]:
# for loss_func__ in loss_func_:
#   model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(best_h1, best_h2, best_h3, best_lr, best_batch, 100, best_optimizer, loss_func__)
#   plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
#   get_roc_curve(model)
#   get_classification_report(model)

In [None]:
best_loss_func = nn.BCELoss()

In [None]:
# Default to the held-out test split produced earlier.
glove_testX_ = glove_testX
testY_ = testY

# If an external test file was configured, run it through the same cleaning
# and embedding steps and evaluate on it instead.
if testset_path is not None:
  df_test = pandas.read_csv(testset_path, sep='\t', engine='python')
  df_test = transform(df_test)
  X_test = df_test['review'].apply(lambda x: ' '.join(x))
  Y_test = df_test['rating']

  glove_testX__, recognized_words, total_words = glove_dataframe(X_test)
  testY__ = Y_test

  glove_testX_ = torch.tensor(glove_testX__)
  testY_ = torch.squeeze(torch.from_numpy(testY__.to_numpy()).float())

# The evaluation helpers read these two module-level names.
glove_testX = glove_testX_
testY = testY_



In [None]:
# Final run with the selected hyperparameters. The epoch count (100) matches
# the range(100) passed to plot_loss_accuracy.
model, avg_loss, avg_acc, val_loss, val_acc = train_and_test_hyperparameters(best_h1, best_h2, best_h3, best_lr, best_batch, 100, best_optimizer, best_loss_func)
plot_loss_accuracy(range(100), avg_loss, val_loss, avg_acc, val_acc)
# Both reports below are computed on the test set.
get_roc_curve(model)
get_classification_report(model)