Imports

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from gensim.scripts.glove2word2vec import glove2word2vec
!pip install contractions
import contractions
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove*.zip
import torch
import torch.nn as nn
!pip install torchmetrics
import torchmetrics
from torchmetrics import F1Score
from torchmetrics import Accuracy
from torchmetrics import Precision
from torchmetrics import Recall
from torchmetrics import ROC
from torchmetrics.classification import BinaryROC

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from collections import Counter
import string

Open dataset

In [None]:
# Load the reviews; the file is read as tab-separated (sep='\t').
df = pd.read_csv('drive/MyDrive/imdb-reviews.csv', sep='\t')

# Optional second dataframe. While it is None, every `if test_df is not None:`
# branch further down is skipped.
test_df = None

df

# Pre-processing

Remove unnecessary columns and add sentiment column

In [None]:
def remove_cols(df):
  """Return a new frame without 'url' and 'rating', plus a binary 'sentiment' column.

  'sentiment' is 1 where 'rating' is greater than 4.0 and 0 otherwise.
  The frame passed in is left untouched.
  """
  # drop the data that is of no use to the model (the url)
  trimmed = df.drop(columns=['url'])

  # label column: 0 for negative, 1 for positive
  is_positive = trimmed['rating'] > 4.0
  trimmed['sentiment'] = np.where(is_positive, 1, 0)

  # once the label exists the rating column is no longer needed
  return trimmed.drop(columns=['rating'])

# Replace df with its trimmed, labelled version.
df = remove_cols(df)

# test_df is only processed when one has been loaded.
if test_df is not None:
  test_df = remove_cols(test_df)

In [None]:
df

Clean the reviews: lowercase the text and strip HTML tags and punctuation. (Digits are kept by the current regex; stopwords are removed in a later step.)

In [None]:
def _normalize_markup(reviews):
  """Lower-case a Series of reviews, turn '<br />' into a space and drop any other tag."""
  lowered = reviews.str.lower()
  without_breaks = lowered.replace('<br />', ' ', regex=True)
  return without_breaks.replace('<[^<]+?>', '', regex=True)

df['review'] = _normalize_markup(df['review'])

if test_df is not None:
  test_df['review'] = _normalize_markup(test_df['review'])

# Matches any single character that is not an ASCII letter, a digit or whitespace.
_NON_ALNUM_OR_SPACE = re.compile(r'[^a-zA-Z0-9\s]')

def clean_review(review):
  """Return `review` with every character outside [a-zA-Z0-9] and whitespace removed."""
  return _NON_ALNUM_OR_SPACE.sub('', review)

def _strip_punctuation(text):
  """Return `text` without any character listed in string.punctuation."""
  return ''.join(ch for ch in text if ch not in string.punctuation)

# First drop ASCII punctuation, then whatever clean_review() still rejects.
df['review'] = df['review'].apply(_strip_punctuation)
df['review'] = df['review'].apply(clean_review)

if test_df is not None:
  test_df['review'] = test_df['review'].apply(_strip_punctuation)
  test_df['review'] = test_df['review'].apply(clean_review)

df

Expand Contractions

In [None]:
def expand_contractions(review):
  """Return `review` after passing it through `contractions.fix`."""
  # NOTE(review): the cleaning cell above has already removed string.punctuation
  # (apostrophes included), so only apostrophe-less forms such as "dont" reach this
  # call - confirm contractions.fix still expands them as intended.
  return contractions.fix(review)

# The function is passed directly; wrapping it in a lambda adds nothing.
df['review'] = df['review'].apply(expand_contractions)

if test_df is not None:
  test_df['review'] = test_df['review'].apply(expand_contractions)

df

Remove stopwords


In [None]:
# English stopword list from NLTK, plus a dict-based copy so that membership
# tests do not have to scan the list.
stop_words = stopwords.words('english')
stopwords_dict = Counter(stop_words)

def remove_stopwords(review):
  """Return `review` without the whitespace-separated tokens found in `stopwords_dict`."""
  kept_tokens = filter(lambda token: token not in stopwords_dict, review.split())
  return ' '.join(kept_tokens)

# The function is passed directly; wrapping it in a lambda adds nothing.
df['review'] = df['review'].apply(remove_stopwords)

if test_df is not None:
  test_df['review'] = test_df['review'].apply(remove_stopwords)

Remove duplicate words from review

In [None]:
def remove_dups(review):
  """Keep only the first occurrence of each whitespace-separated word, in original order.

  The comparison is case-sensitive and the result is joined with single spaces.
  """
  already_seen = set()
  unique_words = []
  for word in review.split():
    if word not in already_seen:
      already_seen.add(word)
      unique_words.append(word)
  return ' '.join(unique_words)

# The function is passed directly; wrapping it in a lambda adds nothing.
df['review'] = df['review'].apply(remove_dups)

if test_df is not None:
  test_df['review'] = test_df['review'].apply(remove_dups)

Tokenization

In [None]:
# Shared NLTK whitespace tokenizer instance; lemmatization() below calls tokenizer.tokenize().
tokenizer = nltk.tokenize.WhitespaceTokenizer()

Lemmatization

In [None]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatization(review):
    """Lemmatize every token of `review` as a verb (pos="v") and re-join with single spaces."""
    lemmas = []
    for token in tokenizer.tokenize(review):
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return " ".join(lemmas)

# Lemmatize every review in place (column is overwritten).
df['review'] = df['review'].apply(lemmatization)

# Same treatment for test_df when it has been loaded.
if test_df is not None:
  test_df['review'] = test_df['review'].apply(lemmatization)

df

Glove

In [None]:
# Input: one of the files unpacked from glove.6B.zip; output: the same data written by glove2word2vec.
glove_input_file = 'glove.6B.300d.txt'
w2v_output_file = 'glv_with_w2v_format.txt'

# Embedding size used downstream; it has to agree with the chosen GloVe file (the 300d one here).
dim = 300

glove2word2vec(glove_input_file, w2v_output_file)

Φτιάχνω ένα dictionary για κάθε λέξη το οποίο θα την αντιστοιχεί με το vector της (πχ. dict['the'] = vector_of['the'])

In [None]:
# Read the converted embeddings into a list of lines.
# - The encoding is given explicitly so decoding does not depend on the platform default.
# - The file object is iterated instead of calling str.splitlines(): splitlines() also
#   breaks on characters such as '\x85' or '\u2028', which would cut a record in two
#   should one of them occur inside a token.
with open(w2v_output_file, 'r', encoding='utf-8') as infile:
    w2v = [line.rstrip('\n') for line in infile]

words = []      # list of words
vectors = []    # vectors[i] is the embedding of words[i]
index = 0       # position the next word will take in `vectors`
dictionary = {} # word -> index of its vector in `vectors`

# w2v[0] is the word2vec header line ("<vocabulary size> <vector size>"), not an embedding
for word_vector_pair in w2v[1:]:
    # Fields are separated by single spaces; splitting on ' ' (rather than on any
    # whitespace) keeps tokens that contain other whitespace characters intact.
    word_vector_pair = word_vector_pair.split(' ')
    words.append(word_vector_pair[0])           # add word in list
    vectors.append(np.array(word_vector_pair[1:]).astype(float))        # add vector in list (strings -> floats, needed later)
    dictionary[word_vector_pair[0]] = index     # update dictionary
    index += 1
print(type(vectors[0][0]))

# word -> vector. words and vectors are parallel lists, so zipping them gives the same
# mapping as going through `dictionary` (for a repeated word the last vector wins).
word_vector = dict(zip(words, vectors))

Για κάθε λέξη σε ένα review θα έχουμε ένα vector dim θέσεων. Το input του νευρωνικού δικτύου θα είναι επίσης dim. Επειδή δε μπορούμε να βάλουμε όλα τα word vectors του review στο input, θα προσθέσουμε τα word vectors και θα τα διαιρέσουμε με το πλήθος των λέξεων του review. Αν κάποια λέξη υπάρχει στο review και όχι στο glove τότε δεν την μετράμε σε αυτή την πρόσθεση επομένως το word vector της θα είναι 0 και θα διαιρέσουμε το άθροισμα με το πλήθος των λέξεων στο review - 1.

In [None]:
def create_review_vector(df, word_vector, dim):
  """Build one mean word vector per review.

  Parameters
  ----------
  df : object with a 'review' column/key that iterates over strings of
       space-separated words (a pandas DataFrame in this notebook).
  word_vector : mapping from word to a 1-D vector of length `dim`.
  dim : length of every word vector.

  Returns
  -------
  list of 1-D numpy arrays of length `dim`, one per review, in input order.
  Each array is the mean of the vectors of the review's words that are present
  in `word_vector`; words that are missing are ignored. A review with no known
  word at all (including an empty review) gets an all-zero vector instead of
  the NaN vector that a 0/0 division would produce.
  """
  review_vectors = []  # mean vector of each review
  for review in df['review']:
    vector_sum = np.zeros(dim)   # running sum of the vectors of the known words
    words_in_glove = 0           # how many words of this review have a vector
    for word in review.split(' '):
      if word in word_vector:
        vector_sum += word_vector[word]
        words_in_glove += 1

    # Guard the division: with no known word the sum is already the zero vector.
    if words_in_glove > 0:
      vector_sum /= words_in_glove
    review_vectors.append(vector_sum)

  return review_vectors

review_vectors = create_review_vector(df, word_vector, dim)

print(len(review_vectors))
print(len(review_vectors[0]))

In [None]:
# Labels as a float numpy array.
y1 = df['sentiment'].apply(lambda x: float(x))  # must convert for y_pred = model(x_batch)
y1 = y1.to_numpy()

# Hold out 10% of the reviews; random_state fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(np.array(review_vectors), y1, test_size = 0.1, random_state=42)

# Features become tensors here; they are cast with .float() where they are fed to the model.
X_train = torch.tensor(X_train)
X_test = torch.tensor(X_test)

Y_train = torch.squeeze(torch.from_numpy(Y_train))
Y_test = torch.squeeze(torch.from_numpy(Y_test))

# Labels of the optional test_df stay a numpy array here; they are turned into a
# tensor in the validation cell at the end of the notebook.
if test_df is not None:
  y1_val = test_df['sentiment'].apply(lambda x: float(x))  # must convert for y_pred = model(x_batch)
  y1_val = y1_val.to_numpy()
  # y_test_val = torch.squeeze(torch.from_numpy(y1_val))
  # print(y_test_val.shape)

# Sanity check of the resulting shapes.
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

# Create a Neural Network

In [None]:
class Net(nn.Module):
    """Feed-forward binary classifier: D_in -> H1 -> H2 -> D_out with ReLU activations.

    forward() returns sigmoid outputs (values in (0, 1)), so it is meant to be
    paired with nn.BCELoss rather than nn.BCEWithLogitsLoss.

    Parameters
    ----------
    D_in : size of the input vector.
    H1, H2 : sizes of the first and second hidden layer.
    D_out : size of the output (1 for a single probability).
    """

    def __init__(self, D_in, H1, H2, D_out):
        super().__init__()

        # Hidden layers use Kaiming (He) uniform initialisation, configured for ReLU.
        self.linear1 = nn.Linear(D_in, H1)
        nn.init.kaiming_uniform_(self.linear1.weight, mode='fan_in', nonlinearity='relu')
        self.relu_1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.5)

        self.linear2 = nn.Linear(H1, H2)
        nn.init.kaiming_uniform_(self.linear2.weight, mode='fan_in', nonlinearity='relu')
        self.relu_2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.3)

        self.linear3 = nn.Linear(H2, D_out)

    def forward(self, x):
        """Map a (batch, D_in) float tensor to (batch, D_out) sigmoid outputs."""
        # Each layer consumes the *activated* output of the previous one. Feeding the
        # pre-activation values forward instead would bypass both ReLUs and collapse
        # the whole network into a single linear map followed by a sigmoid.
        hidden = self.relu_1(self.linear1(x))
        hidden = self.relu_2(self.linear2(hidden))

        # NOTE(review): dropout1/dropout2 are constructed but deliberately still not
        # applied here. Switching them on also requires model.train()/model.eval()
        # around every training and evaluation pass, otherwise evaluation would run
        # with dropout active.
        return torch.sigmoid(self.linear3(hidden))

In [None]:
from torch.optim.lr_scheduler import ExponentialLR

#Define layer sizes
# Input size is taken from the feature matrix (second dimension of X_train).
D_in = X_train.shape[1]
# H1 = 128
# H2 = 64
# H3 = 16
H1 = 64
H2 = 16
D_out = 1

#Define Hyperparameters
learning_rate = 1e-4

#Initialise model, loss, optimizer
#model = Net(D_in, H1, H2, H3, D_out)
model = Net(D_in, H1, H2, D_out)

# BCELoss takes probabilities, which matches the sigmoid applied at the end of Net.forward.
#loss_func = nn.MSELoss(reduction='sum')
#loss_func = nn.CrossEntropyLoss()  # + sigmoid(out) -> 1500 loss
loss_func = nn.BCELoss()            # + sigmoid(out)
#loss_func = nn.BCEWithLogitsLoss()

# Optimizers that were tried; Adamax is the one in use.
#optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)
#optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate,weight_decay=1e-4)
optimizer = torch.optim.Adamax(model.parameters(),lr=learning_rate,weight_decay=1e-4)
#optimizer = torch.optim.AdamW(model.parameters(),lr=learning_rate,weight_decay=1e-4)
#optimizer = torch.optim.RMSprop(model.parameters(),lr=learning_rate,weight_decay=1e-3)
#optimizer = torch.optim.Adagrad(model.parameters(),lr=learning_rate,weight_decay=1e-4)

#Initialise dataloader
# NOTE(review): shuffle=False means the training batches are identical and in the
# same order in every epoch - confirm this is intended.
dataset = torch.utils.data.TensorDataset(X_train, Y_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1000, shuffle=False)

#initialise test dataloader
test_dataset = torch.utils.data.TensorDataset(X_test, Y_test)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=1000, shuffle=False)

In [None]:
model

In [None]:
from sklearn import metrics

# Metric objects; calling one on a batch returns the value for that batch.
f1_score = F1Score(task="binary", num_classes=1)
acc = Accuracy(task="binary", num_classes=1)
rec = Recall(task="binary", num_classes=1)
prec = Precision(task="binary", num_classes=1)

# Per-epoch history for the training split.
train_mean_accuracy = []
train_mean_loss = []
train_mean_roc = []
train_mean_fpr = []
train_mean_tpr = []
train_mean_thresholds = []

# Per-epoch history for the held-out split. The fpr/tpr/thresholds lists receive one
# array per epoch: the ROC curve computed over the whole held-out split.
test_mean_accuracy = []
test_mean_loss = []
test_mean_fpr = []
test_mean_tpr = []
test_mean_thresholds = []

for epoch in range(100):
  batch_losses = []
  accuracy = []
  precision = []
  recall = []
  f1 = []
  test_batch_losses = []
  test_accuracy = []
  test_precision = []
  test_recall = []
  test_f1 = []
  test_scores = []   # predicted probabilities of every held-out batch of this epoch
  test_labels = []   # matching true labels

  # ---- training pass ----
  model.train()
  for x_batch, y_batch in dataloader:
    # squeeze(-1) removes only the output dimension, so a batch holding a single
    # sample keeps its batch dimension and still matches y_batch.
    y_pred = model(x_batch.float()).squeeze(-1)

    loss = loss_func(y_pred, y_batch.float())
    batch_losses.append(loss.item())

    # Hard 0/1 predictions for the metrics; detached because metrics need no gradients.
    y_label = torch.round(y_pred.detach())
    precision.append(prec(y_label, y_batch))
    accuracy.append(acc(y_label, y_batch))
    f1.append(f1_score(y_label, y_batch))
    recall.append(rec(y_label, y_batch))

    #Delete previously stored gradients
    optimizer.zero_grad()
    #Perform backpropagation starting from the loss calculated in this epoch
    loss.backward()
    #Update model's weights based on the gradients calculated during backprop
    optimizer.step()

  # ---- evaluation pass (no weight updates, so no gradient tracking) ----
  model.eval()
  with torch.no_grad():
    for test_x, test_y in test_dataloader:
      y_pred = model(test_x.float()).squeeze(-1)

      loss = loss_func(y_pred, test_y.float())
      test_batch_losses.append(loss.item())

      y_label = torch.round(y_pred)
      test_precision.append(prec(y_label, test_y))
      test_accuracy.append(acc(y_label, test_y))
      test_f1.append(f1_score(y_label, test_y))
      test_recall.append(rec(y_label, test_y))

      test_scores.append(y_pred)
      test_labels.append(test_y)

  # One ROC curve per epoch over the whole held-out split. roc_curve expects the true
  # labels first and the *unrounded* scores second; per-batch curves are not used
  # because their arrays have different lengths and cannot be averaged element-wise.
  fpr, tpr, thresholds = metrics.roc_curve(
      torch.cat(test_labels).numpy(), torch.cat(test_scores).numpy(), pos_label=1)

  train_mean_accuracy.append(sum(accuracy)/len(dataloader))
  test_mean_accuracy.append(sum(test_accuracy)/len(test_dataloader))
  train_mean_loss.append(sum(batch_losses)/len(dataloader))
  test_mean_loss.append(sum(test_batch_losses)/len(test_dataloader))

  test_mean_fpr.append(fpr)
  test_mean_tpr.append(tpr)
  test_mean_thresholds.append(thresholds)

  print(f"Epoch {epoch:3}: Accuracy = {sum(accuracy)/len(dataloader):.5f} -  Test Accuracy = {sum(test_accuracy)/len(test_dataloader):.5f}")
  print(f"Epoch {epoch:3}: Precision = {sum(precision)/len(dataloader):.5f} -  Test Precision = {sum(test_precision)/len(test_dataloader):.5f}")
  print(f"Epoch {epoch:3}: Recall = {sum(recall)/len(dataloader):.5f} -  Test Recall = {sum(test_recall)/len(test_dataloader):.5f}")
  print(f"Epoch {epoch:3}: F1 = {sum(f1)/len(dataloader):.5f} -  Test F1 = {sum(test_f1)/len(test_dataloader):.5f}")
  print(f"Epoch {epoch:3}: Test Fpr = {fpr}")
  print(f"Epoch {epoch:3}: Test Tpr = {tpr}")
  print(f"Epoch {epoch:3}: Loss = {sum(batch_losses)/len(dataloader):.5f} -  Test Loss = {sum(test_batch_losses)/len(test_dataloader):.5f}")
  print()

Optimization Learning Curves: Learning curves calculated on the metric by which the parameters of the model are being optimized e.g. loss

In [None]:
def plot_lc(train, test, title, ylabel, loc):
  """Draw the train and test series on one 8x8 figure and show it.

  train, test : per-epoch values, plotted in that order.
  title, ylabel : figure title and y-axis label; the x-axis is labelled "Epoch".
  loc : legend location passed to plt.legend.
  """
  figure(figsize=(8,8))
  for series in (train, test):
    plt.plot(series)
  plt.xlabel("Epoch")
  plt.ylabel(ylabel)
  plt.title(title)
  plt.legend(['train', 'test'], loc=loc)
  plt.show()

# Accuracy per epoch, train vs. test.
plot_lc(train_mean_accuracy, test_mean_accuracy, "Learning Curve Accuracy", "Accuracy", 'lower right')
print()
# Loss per epoch, train vs. test.
plot_lc(train_mean_loss, test_mean_loss, "Learning Curve Loss", "Loss", 'upper right')

ROC


In [None]:
# Score the whole held-out split in one pass; plotting needs no gradients.
with torch.no_grad():
  y_pred = model(X_test.float()).squeeze(-1)

# Bound to roc_display rather than `display`, so that IPython's built-in display()
# is not shadowed for the rest of the notebook session.
roc_display = metrics.RocCurveDisplay.from_predictions(Y_test, y_pred.numpy())

Accuracy on validation (given) set

In [None]:
# Metrics on the separately provided set; skipped while test_df is None.
if test_df is not None:
  val_review_vectors = create_review_vector(test_df, word_vector, dim)

  x_val = torch.tensor(np.array(val_review_vectors))

  # Inference only: no gradient tracking is needed.
  with torch.no_grad():
    y_pred = model(x_val.float())

  # as_tensor accepts a numpy array as well as a tensor, so re-running this cell
  # (when y1_val has already been converted) does not raise.
  y1_val = torch.squeeze(torch.as_tensor(y1_val))

  # Hard 0/1 predictions, computed once and shared by all four metrics.
  y_label = torch.squeeze(torch.round(y_pred))

  print(prec(y_label, y1_val))
  print(acc(y_label, y1_val))
  print(f1_score(y_label, y1_val))
  print(rec(y_label, y1_val))

