# Import

In [2]:
import os
import re
import io
import time
import math
import pickle
from collections import Counter
from argparse import Namespace

import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import DataLoader

import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from google.colab import drive, files

import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

from mlxtend.plotting import plot_confusion_matrix

# Settings

In [3]:
DATA_FOLDER_PATH = '/content/Gdrive/MyDrive/CS 6795 Cog-Sci Final Project/data/'

args = Namespace(
    sample_size=300000, # the original dataset is too big to be processed efficiently 

    test_ratio=0.1,
    val_ratio=0.2,
    
    model_name='baseline_lstm_with_dropout',
    num_epoch=7,
    gradient_clip=1.0,
    learning_rate=1e-3,
    criterion='crossEntropy',
    optimizer='Adam',

    num_layers=2,
    dropout=0.3,
    embedding_dim=300,
    hidden_dim=300,
    output_dim=2, # output_dim is the number of different sentiments in the target dataset

    batch_size=128,
    batch_first=False,

    train_csv_path=DATA_FOLDER_PATH + 'training_1_6_million.csv',
    # pretrained_baseline_lstm_with_dropout_path=DATA_FOLDER_PATH + 'baseline_lstm_with_dropout.pt',

    load_pretrained_weight=True,
    save_pretrained_model=False,
    read_vocab=True,
    
    device=torch.device('cuda' if torch.cuda.is_available else 'cpu'),
)

# Load Data

In [4]:
drive.mount('/content/Gdrive')
large_data_df = pd.read_csv(args.train_csv_path, encoding='latin').sample(args.sample_size)
# rename the columns into standardized names
large_data_df.columns = ['sentiment_score', 'id', 'date', 'status', 'username', 'tweet']
# convert the sentiment score of 4 into 1
large_data_df.loc[large_data_df['sentiment_score'] == 4, 'sentiment_score'] = 1

Mounted at /content/Gdrive


In [5]:
large_data_df.head()

Unnamed: 0,sentiment_score,id,date,status,username,tweet
3885,0,1468696840,Tue Apr 07 03:13:26 PDT 2009,NO_QUERY,KatelynWelch,ooooooooooooh my headddd uncle johnny i never...
12255,0,1551735091,Sat Apr 18 09:50:17 PDT 2009,NO_QUERY,Gita,what a shame! you can't type persian &amp; rea...
133516,0,1836020829,Mon May 18 07:13:03 PDT 2009,NO_QUERY,FieFieSoMajor,@rugzdbewler hardly working lol. I think I got...
1185657,1,1982819007,Sun May 31 11:57:58 PDT 2009,NO_QUERY,inmollywood,@freshalicious heehee I felt dumb though b/c ...
1164625,1,1979774158,Sun May 31 04:27:08 PDT 2009,NO_QUERY,Ginababy127,@IrishLad585 you r right. Thanx


In [6]:
# download the glove embedding
!wget https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt

def read_GloVe(filename):
  embeddings = {}

  for line in open(filename).readlines():
    #print(line)
    fields = line.strip().split(" ")
    word = fields[0]
    embeddings[word] = [float(x) for x in fields[1:]]
    
  return embeddings

GloVe = read_GloVe("glove.840B.300d.conll_filtered.txt")

--2021-04-27 02:49:37--  https://raw.githubusercontent.com/aritter/aritter.github.io/master/files/glove.840B.300d.conll_filtered.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69798443 (67M) [text/plain]
Saving to: ‘glove.840B.300d.conll_filtered.txt’


2021-04-27 02:49:40 (136 MB/s) - ‘glove.840B.300d.conll_filtered.txt’ saved [69798443/69798443]



# Vocabulary

In [13]:
unk_token = '<unk>'
pad_token = '<pad>'
bos_token = '<bos>'
eos_token = '<eos>'

def build_vocab_from_counter(counter):
    return Vocab(counter, specials=[unk_token, pad_token, bos_token, eos_token])

def read_counter(filename):
    counter = Counter()
    with open(filename, 'rb') as f:
        counter = pickle.load(f)
    return counter

In [14]:
def build_vocab_from_df(data_df, GloVe, tokenizer, save_counter=True, save_filename='vocab_counter.pickle'):
  counter = Counter()

  for idx, row in data_df.iterrows():
    counter.update(tokenizer(row['tweet']))

  for word in GloVe.keys():
    counter.update(tokenizer(word))
  
  # save the counter content
  with open(save_filename, 'wb') as f:
    pickle.dump(counter, f)
    
  return Vocab(counter, specials=[unk_token, pad_token, bos_token, eos_token])

basic_english_tokenizer = get_tokenizer('basic_english')
if args.read_vocab == True:
  counter = read_counter(DATA_FOLDER_PATH + 'vocab_counter.pickle')
  large_data_vocab = build_vocab_from_counter(counter)
else:
  large_data_vocab = build_vocab_from_df(large_data_df, GloVe, basic_english_tokenizer)

PAD_IDX = large_data_vocab[pad_token]
BOS_IDX = large_data_vocab[bos_token]
EOS_IDX = large_data_vocab[eos_token]
UNK_IDX = large_data_vocab[unk_token]

In [15]:
print(PAD_IDX)
print(EOS_IDX)

1
3


# Dataset & DataLoader

In [16]:
def get_data_from_df(data_df):
  data = []

  for idx, row in data_df.iterrows():
    tokenized_text = torch.tensor([large_data_vocab[token] for token in basic_english_tokenizer(row['tweet'])])
    target_score = row['sentiment_score']
    data.append((tokenized_text, target_score))
    
  return data

train_data_df, test_data_df = train_test_split(large_data_df, test_size=args.test_ratio)
train_data_df, val_data_df = train_test_split(train_data_df, test_size=args.val_ratio)

train_data = get_data_from_df(train_data_df)
val_data = get_data_from_df(val_data_df)
test_data = get_data_from_df(test_data_df)

In [17]:
def generate_batch(raw_data):
  data_batch = []
  target_batch = []

  for data, target in raw_data:
    data_batch.append(torch.cat([torch.tensor([BOS_IDX]), data, torch.tensor([EOS_IDX])]))
    target_batch.append(target)
    
  data_batch = pad_sequence(data_batch, padding_value=PAD_IDX, batch_first=args.batch_first)
  target_batch = torch.tensor(target_batch)
  return data_batch, target_batch

train_dataloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True, collate_fn=generate_batch)
val_dataloader = DataLoader(val_data, batch_size=args.batch_size, shuffle=True, collate_fn=generate_batch)
test_dataloader = DataLoader(test_data, batch_size=args.batch_size, shuffle=True, collate_fn=generate_batch)

# Model Definition

In [18]:
class RNN(nn.Module):
  def __init__(self, input_dim=args.embedding_dim, hidden_dim=args.hidden_dim, 
              num_out=args.output_dim):
    super().__init__()
    self.num_words = len(large_data_vocab) # TODO: need to generalize this
    self.embedding = nn.Embedding(self.num_words, input_dim)
    self.rnn = nn.RNN(input_dim, hidden_dim, batch_first=args.batch_first)
    self.fc = nn.Linear(hidden_dim, num_out)

  def init_glove(self, GloVe):
    for word in GloVe.keys():
      self.embedding.weight.data[large_data_vocab[word]] = torch.as_tensor(GloVe[word])

  def forward(self, text):
    embedded = self.embedding(text)
    output, hidden = self.rnn(embedded)
    return self.fc(hidden.squeeze(0))

In [19]:
class BaselineGRU(nn.Module):
  def __init__(self, input_dim=args.embedding_dim, hidden_dim=args.hidden_dim, 
               num_out=args.output_dim, bidirectional=True, batch_first=args.batch_first):
    super().__init__()
    self.num_words = len(large_data_vocab) # TODO: need to generalize this
    self.input_dim = input_dim
    self.hidden_dim = hidden_dim

    self.embedding = nn.Embedding(self.num_words, input_dim)
    self.gru = nn.GRU(input_dim, hidden_dim, bidirectional=bidirectional, batch_first=batch_first)
    self.linear = nn.Linear(hidden_dim * 2 if bidirectional else hidden_dim, num_out)

  def init_glove(self, GloVe):
    for word in GloVe.keys():
      self.embedding.weight.data[large_data_vocab[word]] = torch.as_tensor(GloVe[word])

  def forward(self, x):
    if args.batch_first == True:
      lengths = torch.sum(x != PAD_IDX, dim=1).cpu()
      batch_size, max_sentence_length = x.shape
    else:
      lengths = torch.sum(x != PAD_IDX, dim=0).cpu()
      max_sentence_length, batch_size = x.shape

    embed_x = self.embedding(x)
    embed_x = pack_padded_sequence(embed_x, lengths, enforce_sorted=False, batch_first=args.batch_first)

    out, hidden = self.gru(embed_x)
    out, out_length = pad_packed_sequence(out, padding_value=PAD_IDX, batch_first=args.batch_first)
    hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)
    return self.linear(hidden)

In [20]:
class BaselineLSTMwithDropout(nn.Module):
  def __init__(self, embedding_dim=args.embedding_dim, hidden_dim=args.hidden_dim, 
                output_dim=args.output_dim, n_layers=args.num_layers, 
                bidirectional=True, dropout=args.dropout):
    super().__init__()
    self.vocab_size = len(large_data_vocab)
    self.embedding = nn.Embedding(self.vocab_size, embedding_dim, padding_idx=PAD_IDX)
    self.lstm = nn.LSTM(embedding_dim, 
                        hidden_dim, 
                        num_layers=n_layers, 
                        bidirectional=bidirectional, 
                        dropout=dropout)
    self.fc = nn.Linear(hidden_dim * 2, output_dim)
    self.dropout = nn.Dropout(dropout)


  def init_glove(self, GloVe):
    for word in GloVe.keys():
      self.embedding.weight.data[large_data_vocab[word]] = torch.as_tensor(GloVe[word])

  def forward(self, x):
    if args.batch_first == True:
      lengths = torch.sum(x != PAD_IDX, dim=1).cpu()
      batch_size, max_sentence_length = x.shape
    else:
      lengths = torch.sum(x != PAD_IDX, dim=0).cpu()
      max_sentence_length, batch_size = x.shape

    embedded = self.dropout(self.embedding(x))
    packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, lengths.to('cpu'), enforce_sorted=False)
    packed_output, (hidden, cell) = self.lstm(packed_embedded)
    output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output, padding_value=PAD_IDX)
    #output = [sent len, batch size, hid dim * num directions]    
    #hidden = [num layers * num directions, batch size, hid dim]
    #cell = [num layers * num directions, batch size, hid dim]
    hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
    return self.fc(hidden)

# Define Training Procedure

In [21]:
def get_pretrained_model_path():
  return DATA_FOLDER_PATH + args.model_name + '.pt'


def get_model():
  if args.model_name == 'baseline_lstm_with_dropout':
    return BaselineLSTMwithDropout()
  elif args.model_name == 'rnn':
    return RNN()
  elif args.model_name == 'baseline_gru':
    return BaselineGRU()


def get_criterion():
  if args.criterion == 'crossEntropy':
    return nn.CrossEntropyLoss()
  else:
    raise Exception('The loss function you specified is invalid')


def get_optimizer(model):
  if args.optimizer == 'Adam':
    return optim.Adam(model.parameters(), lr=args.learning_rate)
  else:
    raise Exception('The optimizer you specified is invalid')

In [22]:
def train(model, iterator, optimizer, criterion):
  model.train()
  epoch_loss = 0
  correct_pred_cnt = 0
  total_cnt = 0

  for (x, y) in tqdm(iterator):
    x, y = x.to(args.device), y.to(args.device)

    optimizer.zero_grad()

    output = model(x)
    correct_pred_cnt += torch.sum((torch.argmax(output, 1) == y)).item()
    total_cnt += [*y.size()][0] # This is used to convert torch.Size to int type

    loss = criterion(output, y)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), args.gradient_clip)
    optimizer.step()

    epoch_loss += loss.item()
  

  return epoch_loss / len(iterator), correct_pred_cnt * 1.0 / total_cnt


def evaluate(model, iterator, criterion):
  model.eval()
  epoch_loss = 0
  correct_pred_cnt = 0
  total_cnt = 0

  with torch.no_grad():
    for (x, y) in tqdm(iterator):
        x, y = x.to(args.device), y.to(args.device)

        output = model(x) 
        correct_pred_cnt += torch.sum((torch.argmax(output, 1) == y)).item()
        total_cnt += [*y.size()][0]

        loss = criterion(output, y)
        epoch_loss += loss.item()

  return epoch_loss / len(iterator), correct_pred_cnt * 1.0 / total_cnt


def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time / 60)
  elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
  return elapsed_mins, elapsed_secs

# Train

In [23]:
model = get_model().to(args.device)

if args.load_pretrained_weight == True:
  pretrained_model_path = get_pretrained_model_path()
  model.load_state_dict(torch.load(pretrained_model_path))
else:
  best_val_loss = float('inf')

  train_losses, val_losses = [], []
  train_accuracies, val_accuracies = [], []

  model.init_glove(GloVe)

  criterion = get_criterion()
  optimizer = get_optimizer(model)

  for epoch in range(args.num_epoch):
      start_time = time.time()

      train_loss, train_acc = train(model, train_dataloader, optimizer, criterion)
      val_loss, val_acc = evaluate(model, val_dataloader, criterion)

      end_time = time.time()
      epoch_mins, epoch_secs = epoch_time(start_time, end_time)

      print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
      print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f} | Train Accuarcy: {train_acc:.3f}')
      print(f'\t Val. Loss: {val_loss:.3f} |  Val. PPL: {math.exp(val_loss):7.3f} | Val. Accuracy: {val_acc:.3f}')

      train_losses.append(train_loss)
      val_losses.append(val_loss)

      train_accuracies.append(train_acc)
      val_accuracies.append(val_acc)

  test_loss, test_acc = evaluate(model, test_dataloader, criterion)
  print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} | Test Accuracy: {test_acc:.3f}')

In [None]:
# download and save the model state
if args.save_pretrained_model == True:
  pretrained_model_path = get_pretrained_model_path()
  torch.save(model.state_dict(), pretrained_model_path)

# Training Analysis & Inspection

In [25]:
def plot_result(train_accuracies, val_accuracies, train_losses, val_losses):
  epochs = np.arange(0, args.num_epoch)
  fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 6))
  ax1.set_title('Accuracies against Epochs')
  ax1.set_xlabel('Epochs')
  ax1.set_ylabel('Accuracy')
  ax1.plot(epochs, train_accuracies, label='training accuracy')
  ax1.plot(epochs, val_accuracies, label='validation accuracy')
  ax1.legend()

  ax2.set_title('Loss against Epochs')
  ax2.set_xlabel('Epochs')
  ax2.set_ylabel('Loss')
  ax2.plot(epochs, train_losses, label='training loss')
  ax2.plot(epochs, val_losses, label='validation loss')
  ax2.legend()

plot_result(train_accuracies, val_accuracies, train_losses, val_losses)

NameError: ignored

# Inference

In [29]:
def predict_sentiment(model, sentence):
    model.eval()
    tokenized_sentence = [token for token in basic_english_tokenizer(sentence)]
    indexed = [large_data_vocab[token] for token in tokenized_sentence]
    input = torch.LongTensor(indexed).to(args.device)
    input = input.unsqueeze(1)
    prediction = torch.sigmoid(model(input)).flatten().detach().cpu().numpy().tolist()
    formatted_prediction_str = 'Input: {} \n P(negative) = {:.2f} \n P(positive) = {:.2f} \n\n'. \
                                format(sentence, prediction[0], prediction[1])
    return formatted_prediction_str

print(predict_sentiment(model, 'this is a great movie'))
print(predict_sentiment(model, 'what an awful day'))
print(predict_sentiment(model, 'I work 40 hours a week for us to be this poor'))
print(predict_sentiment(model, 'Nice perfume.'))
print(predict_sentiment(model, 'Nice perfume. How long did you marinate in it?'))
print(predict_sentiment(model, 'I am hungry.'))
print(predict_sentiment(model, 'He wrote a meritorious theme about his visit.'))
print(predict_sentiment(model, 'She is friendly as a rattlesnack.'))
print(predict_sentiment(model, 'What is the weather today?'))

Input: this is a great movie 
 P(negative) = 0.20 
 P(positive) = 0.78 


Input: what an awful day 
 P(negative) = 0.89 
 P(positive) = 0.11 


Input: I work 40 hours a week for us to be this poor 
 P(negative) = 0.84 
 P(positive) = 0.20 


Input: Nice perfume. 
 P(negative) = 0.26 
 P(positive) = 0.73 


Input: Nice perfume. How long did you marinate in it? 
 P(negative) = 0.67 
 P(positive) = 0.34 


Input: I am hungry. 
 P(negative) = 0.60 
 P(positive) = 0.42 


Input: He wrote a meritorious theme about his visit. 
 P(negative) = 0.10 
 P(positive) = 0.89 


Input: She is friendly as a rattlesnack. 
 P(negative) = 0.52 
 P(positive) = 0.48 


Input: What is the weather today? 
 P(negative) = 0.63 
 P(positive) = 0.38 


