### **Load all required libraries**

In [None]:
!pip install transformers --quiet
!pip install opendatasets --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### **Load the Sarcasm Dataset**

In [None]:
# Read the line-delimited JSON file (one record per line) into a dataframe.
df = pd.read_json(
    path_or_buf='/content/drive/MyDrive/Colab Notebooks/sarcasm.json',
    lines=True,
)

In [None]:
# preview the top five records of the dataframe
df.head(n=5)

In [None]:
# dataframe dimensions as a (rows, columns) tuple
df.shape

In [None]:
# count missing values in each column
df.isna().sum()

### **Data Cleaning**🧼🧽

In [None]:
# Count fully duplicated rows. The count is printed explicitly because a
# notebook only displays the value of a cell's *last* expression, so a bare
# `df.duplicated().sum()` in the middle of the cell would be discarded.
duplicate_count = df.duplicated().sum()
print(f'Duplicate rows: {duplicate_count}')

# drop the duplicates (in place, keeping the first occurrence of each row)
df.drop_duplicates(inplace=True)

# confirm above changes
df.shape

In [None]:
# remove the 'article_link' column from the dataframe (in place)
df.drop(columns='article_link', inplace=True)

# verify the new dimensions
df.shape

In [None]:
# preview the top five records once cleaning is done
df.head(5)

In [None]:
# bar chart of how many headlines fall into each class
# (bars follow value_counts() order; colours are assigned in that same order)
df['is_sarcastic'].value_counts().plot.bar(color=['black', 'green'])
plt.title('Sarcasm Class Distribution\n(0 for No, 1 for Yes)')
plt.xticks(rotation=0)

### **Data Preprocessing, Model Selection and Evaluation**🧹🎯📊

In [None]:
# features (headline text) and targets (sarcasm flag), both as numpy arrays
X = df['headline'].to_numpy()
y = df['is_sarcastic'].to_numpy()

In [None]:
# Split the dataset 70/15/15 into training, validation and testing sets.
# Both splits are stratified on the label so every subset keeps the class
# ratio, and both use a fixed random_state so the partition (and therefore
# every metric reported below) is reproducible across runs.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)
# Halve the 30% hold-out into validation and testing sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42
)

# check above changes
print(f'Training Set: {X_train.shape}')
print(f'Validation Set: {X_val.shape}')
print(f'Testing Set: {X_test.shape}')

In [None]:
# Fetch the tokenizer and the encoder weights for the same Hugging Face checkpoint
BERT_CHECKPOINT = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
class dataset(Dataset):
  """Map-style dataset pairing tokenized headlines with float labels.

  Every text in `X` is tokenized eagerly at construction time with the
  module-level `tokenizer`, padded/truncated to exactly 100 tokens and
  returned as PyTorch tensors. Labels in `y` are stored as one float32
  tensor.
  """

  def __init__(self, X, y):
    # Tokenize one text at a time; each call is made with return_tensors='pt'.
    self.X = []
    for text in X:
      encoded = tokenizer(text, max_length=100, truncation=True, padding='max_length', return_tensors='pt')
      self.X.append(encoded)
    self.y = torch.tensor(y, dtype=torch.float32)

  def __len__(self):
    """Return the number of samples."""
    return len(self.X)

  def __getitem__(self, index):
    """Return the (encoded text, label) pair stored at `index`."""
    sample = self.X[index]
    label = self.y[index]
    return sample, label

# Tokenize each split once up front and wrap it in a `dataset` instance
# (order: training, validation, testing).
training_data, validation_data, testing_data = (
    dataset(texts, targets)
    for texts, targets in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

In [None]:
batch_size = 32

# Only the training loader shuffles. The validation/testing metrics computed
# later (summed loss and correct-prediction counts) do not depend on sample
# order, so shuffling those loaders would only draw extra numbers from the
# global RNG between training epochs and make runs harder to reproduce.
training_dataloader = DataLoader(training_data, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, shuffle=False)
testing_dataloader = DataLoader(testing_data, batch_size=batch_size, shuffle=False)

In [None]:
# classifier: encoder + small dense head
class Model(nn.Module):
  """Binary classifier that puts a two-layer dense head on top of an encoder.

  `bert` is called as `bert(input_ids, attention_mask, return_dict=False)`;
  element 0 of its result is indexed with `[:, 0]`, i.e. the vector at the
  first sequence position is used (presumably the [CLS] hidden state for a
  Hugging Face BERT model). That vector must have 768 features.
  """

  def __init__(self, bert):
    super().__init__()

    self.bert = bert
    self.dropout = nn.Dropout(0.25)
    self.linear1 = nn.Linear(768, 384)
    self.linear2 = nn.Linear(384, 1)
    self.sigmoid = nn.Sigmoid()

  def forward(self, input_ids, attention_mask):
    """Return one probability in [0, 1] per sample, shaped (batch, 1)."""
    encoder_outputs = self.bert(input_ids, attention_mask, return_dict=False)
    first_token_state = encoder_outputs[0][:, 0]
    # 768 -> 384, dropout, 384 -> 1, then squash to a probability
    hidden = self.dropout(self.linear1(first_token_state))
    return self.sigmoid(self.linear2(hidden))

In [None]:
# Freeze every encoder weight so that only the dense head receives gradients.
for param in bert_model.parameters():
  param.requires_grad = False

# Wrap the frozen encoder in the classifier; the bare name on the last line
# makes the notebook display the module structure.
model = Model(bert=bert_model)
model

In [None]:
# training hyperparameters
epochs = 10
learning_rate = 1e-4

# binary cross-entropy applied to the probabilities the model outputs
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Per-epoch history, consumed by the plotting cell further down.
total_loss_train_plot = []
total_loss_validation_plot = []
total_acc_train_plot = []
total_acc_validation_plot = []

for epoch in range(epochs):
  total_loss_train = 0
  total_loss_validation = 0
  total_acc_train = 0
  total_acc_validation = 0

  # --- training pass ---
  # Enable dropout in the classification head, but put the frozen encoder back
  # in eval mode so its own dropout layers stay inactive.
  # NOTE(review): Hugging Face `from_pretrained` is documented to return the
  # model in eval mode, so this should match how the encoder ran before — confirm.
  model.train()
  model.bert.eval()

  for inputs, labels in training_dataloader:
    optimizer.zero_grad()

    # The default collate stacks the per-sample encodings with an extra
    # dimension at index 1, which is squeezed away before the forward pass.
    prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
    batch_loss = criterion(prediction, labels)
    total_loss_train += batch_loss.item()

    # number of correct predictions in this batch (probability rounded to 0/1)
    total_acc_train += (prediction.round() == labels).sum().item()

    batch_loss.backward()
    optimizer.step()

  # --- validation pass ---
  # eval() switches dropout off so validation predictions are deterministic.
  model.eval()
  with torch.no_grad():
    for inputs, labels in validation_dataloader:
      prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
      batch_loss = criterion(prediction, labels)
      total_loss_validation += batch_loss.item()

      total_acc_validation += (prediction.round() == labels).sum().item()

  # Average the loss over the number of batches of each loader, so training
  # and validation loss share a scale regardless of how large each split is.
  train_loss = round(total_loss_train / len(training_dataloader), 4)
  validation_loss = round(total_loss_validation / len(validation_dataloader), 4)

  # Accuracy as a percentage of the samples in each split.
  train_accuracy = round(total_acc_train / len(training_data) * 100, 4)
  validation_accuracy = round(total_acc_validation / len(validation_data) * 100, 4)

  total_loss_train_plot.append(train_loss)
  total_loss_validation_plot.append(validation_loss)

  total_acc_train_plot.append(train_accuracy)
  total_acc_validation_plot.append(validation_accuracy)

  print(f"""
          Epoch No: {epoch+1} | Train Loss: {train_loss} | Train Accuracy: {train_accuracy} | Validation Loss: {validation_loss} | Validation Accuracy: {validation_accuracy}
        """)


In [None]:
# Evaluate the trained model on the held-out testing set.
total_loss_test = 0
total_acc_test = 0

# eval() disables dropout so the reported metrics are deterministic.
model.eval()

# The whole loop runs under no_grad(): no gradients are needed for evaluation,
# so autograd bookkeeping is skipped for every forward pass.
with torch.no_grad():
  for inputs, labels in testing_dataloader:
    prediction = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1)).squeeze(1)
    batch_loss = criterion(prediction, labels)
    total_loss_test += batch_loss.item()

    # number of correct predictions in this batch (probability rounded to 0/1)
    total_acc_test += (prediction.round() == labels).sum().item()

# Report mean loss per batch and accuracy as a percentage of test samples.
print(f"Test Loss: {round(total_loss_test / len(testing_dataloader), 4)} | Test Accuracy: {round(total_acc_test / len(testing_data) * 100, 4)}")

In [None]:
# Side-by-side curves of the per-epoch history recorded by the training loop.
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))

# Left panel: loss
axs[0].plot(total_loss_train_plot, label='Training Loss')
axs[0].plot(total_loss_validation_plot, label='Validation Loss')
axs[0].set_title('Training and Validation Loss over Epochs')
axs[0].set_xlabel('Epochs')
axs[0].set_ylabel('Loss')
# Anchor the loss axis at zero and leave the upper bound to autoscaling.
# (Previously a y-limit was set on axs[1] here and then overwritten by the
# accuracy limits below, so the loss panel never received one.)
axs[0].set_ylim(bottom=0)
axs[0].legend()

# Right panel: accuracy in percent
axs[1].plot(total_acc_train_plot, label='Training Accuracy')
axs[1].plot(total_acc_validation_plot, label='Validation Accuracy')
axs[1].set_title('Training and Validation Accuracy over Epochs')
axs[1].set_xlabel('Epochs')
axs[1].set_ylabel('Accuracy')
axs[1].set_ylim([0, 100])
axs[1].legend()

plt.tight_layout()

plt.show()