In [1]:
# Install necessary libraries
!pip install transformers[torch]
!pip install accelerate -U
!pip install emoji
!pip install scikit-learn
!pip install gensim

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.32.1-py3-none-any.whl (314 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.1/314.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [2]:
# Import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import re
import emoji
import matplotlib.pyplot as plt
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from gensim.models import Word2Vec
from google.colab import files

In [13]:
# Read the CSV file
try:
    df = pd.read_csv("/content/train-balanced-sarcasm.csv")
except pd.errors.ParserError as e:
    print(f"Parser Error: {e}")
    problematic_row_index = int(str(e).split("row ")[-1].split(" ")[0]) - 1
    with open("/content/train-balanced-sarcasm.csv", "r") as f:
        for i, line in enumerate(f):
            if i == problematic_row_index:
                print(f"Problematic Row ({i+1}):\n{line}")
                break

In [65]:
# Tokenize using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)


In [66]:
# Convert to PyTorch Dataset
class SarcasmDatasetWithW2V(torch.utils.data.Dataset):
    def __init__(self, encodings, labels, w2v_vectors):
        self.encodings = encodings
        self.labels = labels
        self.w2v_vectors = w2v_vectors

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        item['w2v_vectors'] = torch.tensor(self.w2v_vectors[idx], dtype=torch.float32)
        return item

    def __len__(self):
        return len(self.labels)

In [67]:
train_dataset = SarcasmDatasetWithW2V(train_encodings, train_labels.tolist(), train_w2v_vectors)
val_dataset = SarcasmDatasetWithW2V(val_encodings, val_labels.tolist(), val_w2v_vectors)

In [68]:
# Load BERT model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# **Define training arguments with increased epochs and learning rate scheduler**
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,  # Increased epochs
    per_device_train_batch_size=32,  # Adjust batch size
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
    eval_strategy="epoch",
    learning_rate=2e-5,  # Adjust learning rate
)

In [73]:
from transformers import get_linear_schedule_with_warmup

In [74]:
# Define the trainer with a learning rate scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=500, num_training_steps=training_args.num_train_epochs * len(train_dataset))

In [75]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler)
)


In [76]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6804,0.63501,0.69209,0.660964,0.71912,0.611511
2,0.4971,0.462596,0.806497,0.794603,0.829421,0.76259
3,0.3043,0.33231,0.868644,0.873124,0.830091,0.920863
4,0.1852,0.196651,0.927966,0.928969,0.900135,0.959712
5,0.0869,0.205336,0.94209,0.942976,0.912517,0.97554


TrainOutput(global_step=885, training_loss=0.3948352714042879, metrics={'train_runtime': 351.1729, 'train_samples_per_second': 80.616, 'train_steps_per_second': 2.52, 'total_flos': 800150524899000.0, 'train_loss': 0.3948352714042879, 'epoch': 5.0})

In [77]:
# Evaluate the model
eval_results = trainer.evaluate()

In [78]:
# Make predictions
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=1)

In [79]:
# Compute confusion matrix
conf_matrix = confusion_matrix(val_labels, preds)
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[656  65]
 [ 17 678]]


In [80]:
# **Print classification report with zero_division=1**
class_report = classification_report(val_labels, preds, target_names=['Not Sarcastic', 'Sarcastic'], zero_division=1)
print("Classification Report:")
print(class_report)

Classification Report:
               precision    recall  f1-score   support

Not Sarcastic       0.97      0.91      0.94       721
    Sarcastic       0.91      0.98      0.94       695

     accuracy                           0.94      1416
    macro avg       0.94      0.94      0.94      1416
 weighted avg       0.94      0.94      0.94      1416



In [81]:
# Compute accuracy
accuracy = accuracy_score(val_labels, preds)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9421
