In [None]:
!pip install pandas --quiet
!pip install torchtext --quiet

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from sklearn import metrics

import torchtext
from torchtext.data import get_tokenizer

from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
def set_device():
  """Choose the torch device string for this notebook.

  Prints a short status message and returns "cuda" when
  `torch.cuda.is_available()` is true, otherwise "cpu".
  """
  if torch.cuda.is_available():
    print("GPU is enabled in this notebook.")
    return "cuda"

  # No GPU visible: tell the user how to enable one, then fall back to CPU.
  print("WARNING: For this notebook to perform best, "
        "if possible, in the menu under `Runtime` -> "
        "`Change runtime type.`  select `GPU` ")
  return "cpu"

In [None]:
# Set the device (check if gpu is available)
device = set_device()

In [None]:
df = pd.read_csv('/kaggle/input/english-contractions/df_processed_6labels.csv', delimiter=',')

# Let's have a look at it
df.head()

**Remove some classes: happiness, hate, fun, love, surprise, worry**

In [None]:
# Row labels of every sample whose sentiment is one of the classes to discard.
indexForRemove = df.index[
    df['sentiment'].isin(['happiness', 'hate', 'fun', 'love', 'surprise', 'worry'])
]
print(len(indexForRemove))

# Drop those rows from the working frame in place, then preview what is left.
df.drop(index=indexForRemove, inplace=True)
df.head()

In [None]:
# Distinct sentiment labels, in order of first appearance in the frame.
VOC_CLASSES = df['sentiment'].unique()
LEN_CLASSES = len(VOC_CLASSES)

# Integer code <-> label lookups; the code is the label's position in VOC_CLASSES.
encoding2label = {code: label for code, label in enumerate(VOC_CLASSES)}
label2encoding = {label: code for code, label in encoding2label.items()}

print(VOC_CLASSES, LEN_CLASSES)
print(encoding2label)
print(label2encoding)

In [None]:
X = df.content.values
y = [label2encoding[l] for l in df.sentiment.values]
print(len(X), len(y))

In [None]:
# Split the data into train and test
x_train_text, x_test_text, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Tokenize sentences
This part will change if BERT is used.

In [None]:
# Show a few (label, text) training pairs.
# The loop variables are deliberately NOT named `x`/`y`: `y` is the global
# label list that the train/test split cell passes to `stratify=`, and
# rebinding it here would break a re-run of that cell.
for text, label in zip(x_train_text[:5], y_train[:5]):
  print('{}: {}'.format(encoding2label[label], text))

In [None]:
# torchtext's "basic_english" tokenizer: maps a string to a list of tokens.
tokenizer = get_tokenizer("basic_english")

# Sanity check on a single training sentence.
print('Before Tokenize: ', x_train_text[1])
print('After Tokenize: ', tokenizer(x_train_text[1]))

# Tokenize both splits, with a progress bar over each.
x_train_token = list(map(tokenizer, tqdm(x_train_text)))
x_test_token = list(map(tokenizer, tqdm(x_test_text)))

In [None]:
# Frequency of every token in the training split only, so the vocabulary is
# built from training data alone.
words = Counter(token for sentence in x_train_token for token in sentence)

# Tokens ordered from most to least frequent; ties keep first-seen order
# because the sort is stable.
sorted_words = sorted(words, key=words.get, reverse=True)
print(f"Number of different Tokens in our Dataset: {len(sorted_words)}")
print(sorted_words[:100])

In [None]:
# Total number of token occurrences in the training split.
count_occurences = sum(words.values())

# Walk down the frequency-sorted vocabulary until 80% of all occurrences
# are covered; `counter` ends up as the number of words needed.
accumulated = 0
counter = 0

while accumulated < count_occurences * 0.8:
  accumulated += words[sorted_words[counter]]
  counter += 1

print(f"The {counter * 100 / len(words)}% most common words "
      f"account for the {accumulated * 100 / count_occurences}% of the occurrences")

# Plot the (up to) 100 most frequent tokens. The x range is sized from the
# actual slice so the plot also works when fewer than 100 distinct tokens exist.
top_words = sorted_words[:100]
plt.bar(range(len(top_words)), [words[w] for w in top_words])
plt.title("Frequency of the most common tokens")
plt.xlabel("Token rank")
plt.ylabel("Occurrences")
plt.show()

In [None]:
num_words_dict = 30000
# We reserve two numbers for special tokens.
most_used_words = sorted_words[:num_words_dict-2]

In [None]:
# Indexes reserved for the two special tokens
PAD_token = 0
UNK_token = 1

# word -> index lookup, seeded with the special tokens
word_to_idx = {'PAD': PAD_token, 'UNK': UNK_token}
# index -> word lookup (reverse direction, just in case)
idx_to_word = {PAD_token: 'PAD', UNK_token: 'UNK'}

# Regular words take the indexes after the special tokens, most frequent first
word_to_idx.update((word, idx) for idx, word in enumerate(most_used_words, start=2))
idx_to_word.update(enumerate(most_used_words, start=2))

In [None]:
# A function to convert list of tokens to list of indexes
def tokens_to_idx(sentences_tokens,word_to_idx):
  """Map tokenized sentences to lists of vocabulary indexes.

  Args:
    sentences_tokens: iterable of sentences, each an iterable of tokens.
    word_to_idx: dict from token to index; its 'UNK' entry is used for any
      token that is not a key of the dict.

  Returns:
    A list with one list of indexes per input sentence.
  """
  return [
      [word_to_idx[token] if token in word_to_idx else word_to_idx['UNK']
       for token in sentence]
      for sentence in sentences_tokens
  ]

In [None]:
x_train_idx = tokens_to_idx(x_train_token,word_to_idx)
x_test_idx = tokens_to_idx(x_test_token,word_to_idx)

In [None]:
some_number = 1
print('Before converting: ', x_train_token[some_number])
print('After converting: ', x_train_idx[some_number])

In [None]:
# Sentence lengths (in tokens) of the training split, used to pick max_length.
tweet_lens = np.asarray([len(sentence) for sentence in x_train_idx])
print('Max tweet word length: ',tweet_lens.max())
# The statistic computed here is the median, so it is labelled as such.
print('Median tweet word length: ',np.median(tweet_lens))
print('99% percent under: ',np.quantile(tweet_lens,0.99))

In [None]:
# We choose the max length
max_length = 40

# A function to make all the sequence have the same lenght
# Note that the output is a Numpy matrix
def padding(sentences, seq_len):
  """Turn variable-length index lists into a fixed-width integer matrix.

  Each row holds one sentence, right-aligned: shorter sentences are padded
  with zeros (the padding token index) on the left, longer ones keep only
  their last `seq_len` indexes. Empty sentences become all-zero rows.

  Args:
    sentences: sequence of index lists.
    seq_len: number of columns of the result.

  Returns:
    NumPy integer array of shape (len(sentences), seq_len).
  """
  features = np.zeros((len(sentences), seq_len), dtype=int)
  for row, tweet in zip(features, sentences):
    if len(tweet) == 0:
      # Nothing to copy; the row stays all padding.
      continue
    # Keep at most the last `seq_len` indexes and write them flush right.
    tail = np.array(tweet)[-seq_len:]
    row[-len(tail):] = tail
  return features

In [None]:
# We convert our list of tokens into a numpy matrix
# where all instances have the same lenght
x_train_pad = padding(x_train_idx,max_length)
x_test_pad = padding(x_test_idx,max_length)

# We convert our target list a numpy matrix
y_train_np = np.asarray(y_train)
y_test_np = np.asarray(y_test)

In [None]:
some_number = 3
print('Before padding: ', x_train_idx[some_number])
print('After padding: ', x_train_pad[some_number])

# Define dataloader and model

In [None]:
# Wrap the padded index matrices and the label vectors as tensor datasets
train_data = TensorDataset(torch.from_numpy(x_train_pad), torch.from_numpy(y_train_np))
test_data = TensorDataset(torch.from_numpy(x_test_pad), torch.from_numpy(y_test_np))

# Batch size (this is an important hyperparameter)
batch_size = 100

# Training batches are reshuffled every epoch.
# drop_last=True keeps every batch at exactly `batch_size` rows, which the
# training loop relies on when it builds the LSTM hidden state.
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, drop_last = True)

# Evaluation data is NOT shuffled: combined with drop_last=True, shuffling
# would score a different random subset of the test set on every epoch.
test_loader = DataLoader(test_data, shuffle=False, batch_size=batch_size, drop_last = True)

In [None]:
# Obtain one batch of training data.
# DataLoader iterators follow the standard iterator protocol; the old
# `.next()` method is gone from current PyTorch, so use the builtin `next()`.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('x size: ', sample_x.size(), 'Y size', sample_y.size()) # batch_size, seq_length

In [None]:
class SentimentRNN(nn.Module):
    """Bidirectional multi-layer LSTM classifier over token-index sequences.

    Pipeline: embedding -> stacked bidirectional LSTM -> dropout -> linear
    layer. The output is one raw score (logit) per class; no sigmoid or
    softmax is applied inside the module.

    Args:
        no_layers: number of stacked LSTM layers.
        vocab_size: number of rows of the embedding table.
        hidden_dim: hidden size of each LSTM direction.
        embedding_dim: size of each token embedding.
        output_dim: number of logits produced per sample.
        drop_prob: dropout probability, used both between LSTM layers and
            before the linear layer.
    """

    def __init__(self, no_layers, vocab_size, hidden_dim, embedding_dim, output_dim, drop_prob=0.55):
        super().__init__()

        self.hidden_dim = hidden_dim
        self.no_layers = no_layers

        # Embedding layer: token index -> dense vector
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Stacked bidirectional LSTM, batch dimension first
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=hidden_dim,
                            num_layers=no_layers,
                            batch_first=True,
                            dropout=drop_prob,
                            bidirectional=True)

        # Dropout applied to the sequence representation before classification
        self.dropout = nn.Dropout(drop_prob)

        # The LSTM output concatenates both directions of the top layer,
        # hence 2 * hidden_dim input features (independent of no_layers).
        self.fc = nn.Linear(2*hidden_dim, output_dim)

    def forward(self, x, hidden=None):
        """Compute class logits for a batch of index sequences.

        Args:
            x: integer tensor of token indexes, [batch_size x seq_len].
            hidden: optional (h0, c0) tuple as returned by `init_hidden`.
                When omitted, a fresh state is created for this batch.

        Returns:
            (logits, hidden): logits is [batch_size x output_dim]; hidden is
            the (h_n, c_n) tuple returned by the LSTM.
        """
        if hidden is None:
            hidden = self.init_hidden(x.size(0))

        embeds = self.embedding(x)
        # Shape: [batch_size x seq_len x embedding_dim]

        lstm_out, hidden = self.lstm(embeds, hidden)
        # Shape: [batch_size x seq_len x 2*hidden_dim]

        # Keep only the activation at the last time step.
        # NOTE(review): for the backward direction this is its output after
        # reading a single token; concatenating the final forward/backward
        # hidden states (or averaging over time) may work better - confirm.
        lstm_out = lstm_out[:,-1,:].contiguous()
        # Shape: [batch_size x 2*hidden_dim]

        # Alternative: average the activations across all time steps
        # lstm_out = torch.mean(lstm_out, 1).contiguous()

        out = self.dropout(lstm_out)
        out = self.fc(out)

        # Return the raw logits and the final LSTM state
        return out, hidden

    def init_hidden(self, batch_size):
        """Create a random initial (h0, c0) state for the LSTM.

        Both tensors have shape [2*no_layers x batch_size x hidden_dim]
        (the factor 2 is for the two directions) and are drawn from a
        standard normal distribution. They are placed on the same device
        as the model parameters, so no global `device` variable is needed.
        """
        target_device = next(self.parameters()).device
        state_shape = (2*self.no_layers, batch_size, self.hidden_dim)
        h0 = torch.randn(state_shape).to(target_device)
        c0 = torch.randn(state_shape).to(target_device)
        return (h0, c0)


In [None]:
# Build the classifier with the hyperparameters used for this experiment
model = SentimentRNN(no_layers=3,
                     vocab_size=num_words_dict, # embedding rows: PAD + UNK + the most used words kept above
                     hidden_dim=64,
                     embedding_dim=32,
                     output_dim=LEN_CLASSES,    # one logit per sentiment class left after the removal step
                     drop_prob=0.55
                    )


# Move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)
print(model)

In [None]:
# Count the trainable parameters: sum the element counts of every tensor
# that takes part in gradient updates.
params = sum(np.prod(p.size()) for p in model.parameters() if p.requires_grad)
print('Total Number of parameters: ',params)

In [None]:
# loss and optimization functions
lr = 0.0001

criterion = nn.CrossEntropyLoss().to(device)

# We choose an Adam optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

# function to predict accuracy
def acc(pred, labels):
    """Count the correct predictions in a batch.

    Args:
        pred: per-class scores, one row per sample; the predicted class is
            the argmax over dim 1.
        labels: tensor of true class indexes, one per sample.

    Returns:
        0-d float tensor with the number of samples whose predicted class
        equals its label (a count, not a ratio).
    """
    predicted_class = pred.argmax(dim=1)
    hits = (predicted_class == labels).float()
    return hits.sum()

In [None]:
# Number of training epochs
epochs = 25
# Maximum gradient norm allowed before clipping
clip = 5

# Best validation loss seen so far (assumed infinitely large at the start).
# `np.inf` is used because the `np.Inf` alias was removed in NumPy 2.0.
valid_loss_min = np.inf

# Per-epoch history of loss and accuracy
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []


def _run_epoch(loader, training):
  """Run one full pass over `loader` and return (mean loss, accuracy).

  Args:
    loader: DataLoader yielding (inputs, labels) batches.
    training: when True the model is put in train mode and its weights are
      updated; when False the model is put in eval mode and gradients are
      disabled.

  Returns:
    Tuple (mean batch loss, fraction of correctly classified samples).
    Accuracy is computed over the samples actually processed, so it stays
    correct when the loader drops its last partial batch.
  """
  model.train(training)
  losses = []
  n_correct = 0.0
  n_seen = 0

  # Gradients are only tracked while training; evaluation builds no graph.
  with torch.set_grad_enabled(training):
    for inputs, labels in loader:
      # Move batch inputs and labels to the selected device
      inputs, labels = inputs.to(device), labels.to(device)

      # Fresh hidden state sized for this batch (not the global batch_size)
      hidden = model.init_hidden(inputs.size(0))

      if training:
        model.zero_grad()

      output, _ = model(inputs, hidden)

      # `output` is already [batch x classes]; no squeeze, so a batch of
      # one sample keeps its batch dimension.
      loss = criterion(output, labels.long())

      if training:
        loss.backward()
        # `clip_grad_norm_` helps prevent exploding gradients in RNNs / LSTMs
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

      losses.append(loss.item())
      # Stored as a plain float so the history can be printed and plotted
      # without moving tensors off the GPU.
      n_correct += float(acc(output, labels))
      n_seen += inputs.size(0)

  return np.mean(losses), n_correct / max(n_seen, 1)


# Train for a number of epochs, evaluating on the held-out split after each
for epoch in range(epochs):
  epoch_train_loss, epoch_train_acc = _run_epoch(train_loader, training=True)
  epoch_val_loss, epoch_val_acc = _run_epoch(test_loader, training=False)

  epoch_tr_loss.append(epoch_train_loss)
  epoch_vl_loss.append(epoch_val_loss)
  epoch_tr_acc.append(epoch_train_acc)
  epoch_vl_acc.append(epoch_val_acc)

  print(f'Epoch {epoch+1}')
  print(f'train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}')
  print(f'train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}')
  if epoch_val_loss <= valid_loss_min:
    # The message no longer claims a checkpoint was written: saving is
    # disabled. Uncomment the next line to persist the best weights.
    print('Validation loss decreased ({:.6f} --> {:.6f}).'.format(valid_loss_min,epoch_val_loss))
    # torch.save(model.state_dict(), '../working/state_dict.pt')
    valid_loss_min = epoch_val_loss
  print(25*'==')

In [None]:
# The accuracy history may hold 0-d tensors (possibly on the GPU), which
# matplotlib cannot convert to NumPy arrays; turn everything into floats.
train_acc_curve = [float(v) for v in epoch_tr_acc]
val_acc_curve = [float(v) for v in epoch_vl_acc]
train_loss_curve = [float(v) for v in epoch_tr_loss]
val_loss_curve = [float(v) for v in epoch_vl_loss]

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(20, 6))

# Left panel: accuracy per epoch
ax_acc.plot(train_acc_curve, label='Train Acc')
ax_acc.plot(val_acc_curve, label='Validation Acc')
ax_acc.set_title("Accuracy")
ax_acc.set_xlabel("Epoch")
ax_acc.legend()
ax_acc.grid()

# Right panel: loss per epoch
ax_loss.plot(train_loss_curve, label='Train loss')
ax_loss.plot(val_loss_curve, label='Validation loss')
ax_loss.set_title("Loss")
ax_loss.set_xlabel("Epoch")
ax_loss.legend()
ax_loss.grid()

plt.show()

In [None]:
###Utility
from sklearn import metrics

def get_eval_report(labels, preds):
    """Build a dictionary of binary-classification metrics.

    Uses `sklearn.metrics` (imported in this notebook as `metrics`) for the
    Matthews correlation coefficient and the confusion matrix; the bare
    names `matthews_corrcoef` / `confusion_matrix` were never imported.

    Args:
        labels: true class labels.
        preds: predicted class labels, same length as `labels`.

    Returns:
        Dict with keys "mcc", "tp", "tn", "fp", "fn", "pricision" (original,
        misspelled key kept for existing callers), "precision", "recall",
        "F1" and "accuracy".

    Raises:
        ValueError: if the confusion matrix is not 2x2, i.e. the inputs are
            not a two-class problem.
    """
    matrix = metrics.confusion_matrix(labels, preds)
    if matrix.shape != (2, 2):
        # The tn/fp/fn/tp breakdown below only exists for two classes.
        raise ValueError(
            "get_eval_report expects a binary problem, but the confusion "
            "matrix has shape {}".format(matrix.shape))

    mcc = metrics.matthews_corrcoef(labels, preds)
    # For a binary problem sklearn lays the flattened matrix out as tn, fp, fn, tp.
    tn, fp, fn, tp = matrix.ravel()
    precision = (tp)/(tp+fp)
    recall = (tp)/(tp+fn)
    f1 = (2*(precision*recall))/(precision+recall)
    return {
        "mcc": mcc,
        "tp": tp,
        "tn": tn,
        "fp": fp,
        "fn": fn,
        "pricision" : precision,
        "precision" : precision,
        "recall" : recall,
        "F1" : f1,
        "accuracy": (tp+tn)/(tp+tn+fp+fn)
    }

def compute_metrics(labels, preds):
    """Check that both sequences have equal length, then delegate to `get_eval_report`.

    Raises:
        AssertionError: if `preds` and `labels` differ in length.
    """
    n_preds, n_labels = len(preds), len(labels)
    assert n_preds == n_labels
    report = get_eval_report(labels, preds)
    return report

def plot_graphs(history, string):
  """Plot a training metric together with its validation counterpart.

  Args:
    history: object whose `.history` mapping holds per-epoch value lists
      under the keys `string` and 'val_' + string.
    string: name of the metric to plot.
  """
  val_name = 'val_'+string
  plt.plot(history.history[string])
  plt.plot(history.history[val_name], '')
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, val_name])
  plt.show()

def class_balance(df, target):
  """Show a bar chart of how many rows hold each value of `df[target]`."""
  df[target].value_counts().plot(kind='bar')
  plt.show()

In [None]:
def predict_classes(model, features, batch_size=100):
  """Predict a class index for every row of a padded index matrix.

  Args:
    model: network exposing `init_hidden(batch_size)` and returning
      `(logits, hidden)` when called with `(inputs, hidden)`.
    features: NumPy integer matrix with one padded sentence per row.
    batch_size: number of rows sent through the model at once.

  Returns:
    NumPy array with one predicted class index per row of `features`.
  """
  model.eval()
  batch_predictions = []
  with torch.no_grad():
    for start in range(0, len(features), batch_size):
      inputs = torch.from_numpy(features[start:start + batch_size]).to(device)
      # Hidden state sized for this batch, so the last partial batch works too.
      hidden = model.init_hidden(inputs.size(0))
      logits, _ = model(inputs, hidden)
      batch_predictions.append(torch.argmax(logits, dim=1).cpu())
  return torch.cat(batch_predictions).numpy()


# The previous version of this step called Keras-style objects (`history`,
# `model_RNN`, `X_test_Glove`, `model.predict_classes`) that do not exist in
# this notebook. The loss/accuracy curves are drawn from the epoch_* lists
# right after training; here the trained PyTorch model is scored on the
# whole test split.
print("\n Evaluating Model ... \n")
predicted = predict_classes(model, x_test_pad, batch_size)

# Report per-class metrics using the readable sentiment names.
class_ids = list(range(LEN_CLASSES))
print(metrics.classification_report(
    y_test_np,
    predicted,
    labels=class_ids,
    target_names=[str(encoding2label[i]) for i in class_ids]))