## Predicting Sarcasm using GloVe Word Embeddings + LSTMs

In [None]:
#importing libraries
#!pip install gensim
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

#for word embeddings
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim.downloader as api

#fetch the NLTK 'punkt' / 'punkt_tab' resources quietly
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

#seed torch, numpy and python's random with the same value for repeatable runs
torch.manual_seed(0)
np.random.seed(0)
random.seed(0)

#use the GPU when one is visible to torch, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
#loading datasets
#each split is read from its own csv and tagged with a .name attribute,
#which features() prints as the dataset title
train_df = pd.read_csv('train.csv')
print('train set has been loaded!')
train_df.name = 'train'

valid_df = pd.read_csv('valid.csv')
print('valid set has been loaded!')
valid_df.name = 'valid'

test_df = pd.read_csv('test.csv')
print('test set has been loaded!')
test_df.name = 'test'

#labels of every split as float32 tensors
train_labels, valid_labels, test_labels = (
    torch.tensor(split_df['label'].values, dtype=torch.float32)
    for split_df in (train_df, valid_df, test_df)
)


In [None]:
#looking at features of train_df, valid_df, and test_df:
def features(df):
  """Print a summary of one dataset split.

  Prints the split name, shape, columns, null counts, head, describe()
  output, the longest text (by character count) and the number of rows
  per label.

  Args:
    df: DataFrame with a 'text' column of strings and a 'label' column
      holding 0 (non-sarcasm) / 1 (sarcasm). A `name` attribute is used as
      the title when present.
  """
  #fall back to a placeholder when the caller never tagged the frame
  print(f"Dataset: {getattr(df, 'name', '<unnamed>')}")
  print(f"Size: {df.shape}")
  print(f"Columns: {df.columns}")
  print(f"Null Values: {df.isnull().sum()}")
  print(f"Head: {df.head()}\n")
  print(f"Description:\n{df.describe()}\n")

  #longest phrase in dataset
  #iterate over the values instead of df['text'][i]: label-based lookup breaks
  #on any index that is not 0..n-1 (e.g. after filtering or shuffling).
  #max() keeps the first phrase on ties, as the original strict '>' loop did.
  max_len_phrase = max(df['text'], key=len, default="")
  max_len = len(max_len_phrase)
  print(f"Longest Phrase in Dataset: \n{max_len_phrase}, \nchar length {max_len}")

  #count of sarcasm vs non-sarcasm
  #.get() reports 0 for a class that is absent instead of raising KeyError
  label_counts = df['label'].value_counts()
  sarcasm_count = label_counts.get(1, 0)
  non_sarcasm_count = label_counts.get(0, 0)
  print(f"Sarcasm Count: {sarcasm_count}")
  print(f"Non-Sarcasm Count: {non_sarcasm_count}")
  print("-----------------------------")

#print the same summary for every split, in train / valid / test order
for split_df in (train_df, valid_df, test_df):
  features(split_df)

Based on the information above, there do not seem to be any null values in any of the datasets.

In [None]:
#extract text from sarcasm labels
#pull the raw 'text' column out of every split
train_text, valid_text, test_text = (
    split_df['text'] for split_df in (train_df, valid_df, test_df)
)

#tokenizing them
def tokenize(series):
    """Tokenize every text in `series` with word_tokenize and lower-case the tokens.

    Returns a list with one list of lower-cased tokens per input text, in
    the same order as `series`.
    """
    return [[tok.lower() for tok in word_tokenize(doc)] for doc in series]


train_tokens = tokenize(train_text)
#show only a small preview: printing every tokenized training sentence floods
#the cell output and bloats the saved notebook
print(train_tokens[:5])
valid_tokens = tokenize(valid_text)
test_tokens = tokenize(test_text)

In [None]:
#getting vocab:
def build_vocab(tokens, min_freq=1):
  """Map every word in `tokens` to a unique integer index.

  Indices 0 and 1 are reserved for "<PAD>" and "<UNK>"; the remaining words
  get consecutive indices in order of first appearance.

  Args:
    tokens: iterable of token lists (one list per sentence).
    min_freq: minimum number of occurrences a word needs to enter the
      vocabulary. The default of 1 keeps every word.

  Returns:
    dict mapping word -> index, with contiguous indices 0..len(vocab)-1.
  """
  counter = Counter(word for sent in tokens for word in sent)
  vocab = {
    "<PAD>": 0,
    "<UNK>": 1
  }
  for word, count in counter.items():
    #a corpus token that equals a reserved symbol must not overwrite its index:
    #that would move <PAD>/<UNK> and hand the same index to the next word
    if word in vocab or count < min_freq:
      continue
    vocab[word] = len(vocab)

  return vocab

#the vocabulary is built from the training tokens only
train_vocab = build_vocab(train_tokens)


In [None]:
#converts tokens into numerical indices for model to interpret from
#also does padding/truncating
def numericalize(tokens, vocab, max_len=30):
  """Turn a token list into a list of exactly `max_len` vocabulary indices.

  Unknown tokens map to vocab["<UNK>"]; longer inputs are truncated and
  shorter ones are right-padded with vocab["<PAD>"].
  """
  ids = [vocab.get(tok, vocab["<UNK>"]) for tok in tokens]
  pad_id = vocab["<PAD>"]
  missing = max_len - len(ids)
  clipped = ids[:max_len]
  if missing > 0:
    clipped.extend([pad_id] * missing)
  return clipped

#every split is encoded with the training vocabulary and stacked into one tensor
train_sequences, valid_sequences, test_sequences = (
    torch.tensor([numericalize(sent, train_vocab) for sent in split_tokens])
    for split_tokens in (train_tokens, valid_tokens, test_tokens)
)

#print(train_sequences.max())
#print(len(train_vocab))
#print(train_embeddings.shape)


In [None]:
#uncomment only if you dont have GloVe already downloaded
#!wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
#!unzip -q glove.6B.zip
#for mac
#!curl -o glove.6B.zip https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
#!unzip -q glove.6B.zip

def load_glove_embeddings(glove_path, vocab, embed_dim=50):
    """Build an embedding matrix for `vocab` from a GloVe text file.

    Args:
        glove_path: path to a GloVe file with one "word v1 v2 ... vN" entry
            per line.
        vocab: dict mapping word -> row index (must contain "<PAD>").
        embed_dim: dimensionality of the vectors stored in the file.

    Returns:
        float32 tensor of shape (len(vocab), embed_dim). Rows of words found
        in the file hold the pretrained vector, the "<PAD>" row is all zeros,
        and every other row is drawn from N(0, 0.6^2).

    Raises:
        ValueError: if a vector for a vocabulary word does not have
            `embed_dim` components.
    """
    embeddings = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue  #tolerate blank lines instead of raising IndexError
            word = values[0]
            if word not in vocab:
                continue  #only keep vectors that are actually needed
            vector = np.asarray(values[1:], dtype="float32")
            if vector.shape[0] != embed_dim:
                raise ValueError(
                    f"Vector for '{word}' has {vector.shape[0]} dims, expected {embed_dim}"
                )
            embeddings[word] = vector

    embedding_matrix = np.zeros((len(vocab), embed_dim))

    for word, idx in vocab.items():
        if word == "<PAD>":
            #padding must stay a zero vector; a random row would inject noise
            #at every padded timestep
            continue
        if word in embeddings:
            embedding_matrix[idx] = embeddings[word]
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embed_dim,))

    return torch.tensor(embedding_matrix, dtype=torch.float32)

#50-dimensional GloVe vectors aligned with the indices of train_vocab
glove_embeddings = load_glove_embeddings(
    glove_path="glove.6B.50d.txt", vocab=train_vocab, embed_dim=50
)


In [None]:
class LSTMModelGloVE(nn.Module):
    """Two-layer bidirectional LSTM classifier on top of pretrained embeddings.

    The LSTM outputs are summarised with global average and global max
    pooling over the time dimension; the concatenation is passed through
    dropout and a linear layer that emits one logit per sequence.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        hidden_dim: hidden size of each LSTM direction.
        embed_matrix: (vocab_size, embed_dim) tensor used to initialise the
            embedding table. Row 0 (the padding index) is reset to zeros.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, embed_matrix):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        with torch.no_grad():
            self.embed.weight.copy_(embed_matrix)
            #copying the pretrained matrix overwrites the zero padding row that
            #nn.Embedding set up; restore it so padded positions embed to zeros
            self.embed.weight[self.embed.padding_idx].zero_()
        #freeze embedding weights (not needed for training and makes it more stable)
        self.embed.weight.requires_grad = False
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.4)
        self.drop1 = nn.Dropout(0.2)
        #two poolings are concatenated, each of width hidden_dim * 2 (bidirectional)
        self.fc = nn.Linear(hidden_dim * 4, 1)

    def forward(self, x):
        """Return one logit per sequence for a (batch, seq_len) tensor of token ids."""
        emb = self.embed(x)
        out, _ = self.lstm(emb)
        avg_pool = out.mean(dim=1) #global average pooling - gets the context of the entire sentence and helps avoid overfitting
        max_pool, _ = out.max(dim=1) #global max pooling - helps with feature extraction
        h = torch.cat([avg_pool, max_pool], dim=1)
        h = self.drop1(h)
        return self.fc(h).squeeze(1)


In [None]:
#training model (and validating)
#datasets and loaders; only the training split is shuffled
train_dataset = TensorDataset(train_sequences, train_labels)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
valid_dataset = TensorDataset(valid_sequences, valid_labels)
valid_loader = DataLoader(valid_dataset, shuffle=False, batch_size=64)

#model (50-d embeddings, 64 hidden units per direction), moved to the selected device
lstm = LSTMModelGloVE(len(train_vocab), 50, 64, glove_embeddings)
lstm.to(device)

#the model outputs raw logits, hence the sigmoid-fused BCE loss
loss_fn = nn.BCEWithLogitsLoss()
opt = optim.Adam(lstm.parameters(), lr=1e-3)

#per-epoch history of training loss, validation loss and validation accuracy, used for plotting
validation_losses, training_losses, validation_accuracies = [], [], []

#early stopping / checkpointing state
best_loss = float('inf')
best_validation_acc = float('-inf')
epochs_not_improving = 0

#loop
num_epochs = 30 #upper bound on epochs; early stopping may end training sooner
patience = 7    #epochs without a validation-loss improvement before stopping
for epoch in range(num_epochs):
  #training
  #unfreeze the embedded weights later in loop to avoid overfitting
  if epoch == 4:
    lstm.embed.weight.requires_grad = True
  lstm.train()

  total_loss = 0.0
  for batch_sequences, batch_labels in train_loader:
    batch_sequences = batch_sequences.to(device)
    batch_labels = batch_labels.to(device)

    opt.zero_grad()
    output = lstm(batch_sequences)
    loss = loss_fn(output, batch_labels)
    loss.backward()
    nn.utils.clip_grad_norm_(lstm.parameters(), 1.0) #gradient clipping to avoid exploding gradients
    opt.step()

    total_loss += loss.item()
  train_loss = total_loss / len(train_loader)

  #validating
  lstm.eval()
  validation_loss = 0.0
  preds = []
  labels = []
  with torch.no_grad():
    for batch_sequences, batch_labels in valid_loader:
      batch_sequences = batch_sequences.to(device)
      batch_labels = batch_labels.to(device)
      output = lstm(batch_sequences)
      loss = loss_fn(output, batch_labels)
      validation_loss += loss.item()
      preds.extend((torch.sigmoid(output) > 0.5).float().cpu().numpy())
      labels.extend(batch_labels.cpu().numpy())
  validation_loss /= len(valid_loader)
  validation_accuracy = accuracy_score(labels, preds)

  print(f"Epoch {epoch+1}/{num_epochs} : Loss {train_loss} | Validation Loss: {validation_loss} | Validation Accuracy {validation_accuracy * 100:.4f}%")

  #add information to data
  #recorded before the early-stopping check so that the epoch which triggers
  #the stop still appears in the plotted history
  training_losses.append(train_loss)
  validation_losses.append(validation_loss)
  validation_accuracies.append(validation_accuracy)

  #checkpoint on best validation accuracy (independent of early stopping)
  if validation_accuracy > best_validation_acc:
    best_validation_acc = validation_accuracy
    torch.save(lstm.state_dict(), 'model_acc.pth')
    print("Model_acc saved!")
  #checkpoint on best validation loss; this metric also drives early stopping
  if validation_loss < best_loss:
    best_loss = validation_loss
    torch.save(lstm.state_dict(), 'model.pth')
    print("Model saved!")
    epochs_not_improving = 0 #reset everytime model improves
  else:
    epochs_not_improving += 1
    if epochs_not_improving >= patience:
      print("Early stopping!")
      break

In [None]:
#plotting validation loss, validation accuracy, and training loss
#one figure with all three per-epoch curves on a shared axis
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(validation_losses, label='Validation Loss')
ax.plot(validation_accuracies, label='Validation Accuracy')
ax.plot(training_losses, label='Training Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Value')
ax.set_title('Validation Loss, Validation Accuracy, and Training Loss')
ax.legend()
plt.show()

Before implementing early stopping, we saw overfitting in the training loop, which gave a test accuracy of around 80%. Overfitting still occurs, as seen in the plot above, where the validation loss increases after a few epochs. With early stopping, however, we ended up with a higher test accuracy of close to 88%.

In [None]:
#confusion matrix and error analysis
from sklearn.metrics import confusion_matrix
import seaborn as sns

#testing the model
#labels are kept 1-D (no unsqueeze) so they have the same shape as the
#per-sample predictions collected below and as the other label tensors
test_labels = torch.tensor(test_df['label'].values, dtype=torch.float32)
test_dataset = TensorDataset(test_sequences, test_labels)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

#load most optimal model (lowest validation loss) from training loop
#map_location lets a checkpoint saved on GPU load on a CPU-only machine
lstm.load_state_dict(torch.load('model.pth', map_location=device))
lstm.to(device)
lstm.eval()
preds = []
labels = []
with torch.no_grad():
  for batch_sequences, batch_labels in test_loader:
    batch_sequences = batch_sequences.to(device)

    logits = lstm(batch_sequences)
    probs = torch.sigmoid(logits)

    #threshold the sigmoid probability at 0.5 to get the predicted class
    preds.extend((probs > 0.5).float().cpu().numpy())
    labels.extend(batch_labels.cpu().numpy())
test_accuracy = accuracy_score(labels, preds)
print(f"Test Accuracy with model.pth: {test_accuracy * 100:.4f}%")
print(classification_report(labels, preds, digits=4))
conf_matrix = confusion_matrix(labels, preds)
print(conf_matrix)
#fmt='d' prints the integer counts in full; the default float format
#switches to scientific notation for large counts
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt='d')
#sklearn's confusion_matrix puts true labels on rows and predictions on columns
heatmap_ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (model.pth)')
plt.show()

In [None]:
#getting weights of model, embedding, and vocabulary to be used in predict_sarcasm.py
#saves whatever weights `lstm` holds right now; when the cells are run in order
#that is the lowest-validation-loss checkpoint loaded from 'model.pth' above
torch.save(lstm.state_dict(), 'model_weights.pth')

#exporting embedding weights
#print(lstm.embed.weight)
#the embedding table is saved as-is (the nn.Parameter object, not a detached copy)
torch.save(lstm.embed.weight, 'embedding_matrix.pt')

#exporting vocabulary
#print(train_vocab)
#NOTE(review): despite the .pkl extension this file is written by torch.save,
#so the consumer presumably has to read it with torch.load - confirm in predict_sarcasm.py
torch.save(train_vocab, 'vocab.pkl')


In [None]:
#load most optimal model (best validation accuracy) from training loop
#map_location lets a checkpoint saved on GPU load on a CPU-only machine
lstm.load_state_dict(torch.load('model_acc.pth', map_location=device))
lstm.to(device)
lstm.eval()
preds = []
labels = []
with torch.no_grad():
  for batch_sequences, batch_labels in test_loader:
    batch_sequences = batch_sequences.to(device)

    logits = lstm(batch_sequences)
    probs = torch.sigmoid(logits)

    #threshold the sigmoid probability at 0.5 to get the predicted class
    preds.extend((probs > 0.5).float().cpu().numpy())
    labels.extend(batch_labels.cpu().numpy())
test_accuracy = accuracy_score(labels, preds)
print(f"Test Accuracy with model_acc.pth: {test_accuracy * 100:.4f}%")
print(classification_report(labels, preds, digits=4))
confusion_matrix_acc = confusion_matrix(labels, preds)
#fmt='d' prints the integer counts in full; the default float format
#switches to scientific notation for large counts
heatmap_ax = sns.heatmap(confusion_matrix_acc, annot=True, fmt='d')
#sklearn's confusion_matrix puts true labels on rows and predictions on columns
heatmap_ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix (model_acc.pth)')
plt.show()