In [24]:
import pandas as pd
import numpy as np
import random
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import json
from sklearn.model_selection import train_test_split
import tqdm

In [9]:
# Load the dataset. The file holds one JSON object per line, so each line is
# parsed on its own instead of calling json.load on the whole file.
records = []
# Explicit UTF-8: without it the decoding depends on the platform's locale.
with open('./../data/Sarcasm_Headlines_Dataset.json', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # Tolerate blank/trailing lines instead of crashing inside json.loads.
            continue
        records.append(json.loads(line))

# Convert JSON data to a DataFrame
data = pd.DataFrame(records)

In [10]:
# Preview the freshly loaded frame; the bare expression is rendered as the cell output.
data

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...
...,...,...,...
28614,1,jews to celebrate rosh hashasha or something,https://www.theonion.com/jews-to-celebrate-ros...
28615,1,internal affairs investigator disappointed con...,https://local.theonion.com/internal-affairs-in...
28616,0,the most beautiful acceptance speech this week...,https://www.huffingtonpost.com/entry/andrew-ah...
28617,1,mars probe destroyed by orbiting spielberg-gat...,https://www.theonion.com/mars-probe-destroyed-...


In [11]:
# Class balance: number of rows per value of the is_sarcastic label.
data["is_sarcastic"].value_counts()

0    14985
1    13634
Name: is_sarcastic, dtype: int64

In [12]:
# Keep only the label and the headline text; the article link is not used below.
data = data.loc[:, ['is_sarcastic', 'headline']]

In [36]:
# Restrict the experiment to the first 1000 rows (positional slice).
data = data.iloc[:1000]

In [37]:
# Preview the reduced frame (label + headline columns only, first 1000 rows).
data

Unnamed: 0,is_sarcastic,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
995,0,"at lg forum hosted by h.a.p.a., green and espe..."
996,0,the best style moments from wimbledon 2015
997,1,battle of wits with unwieldy burrito nears thr...
998,0,look: 'thor's helmet' glows in brilliant neon ...


In [38]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
train_df, test_df = train_test_split(
    data,
    random_state=41,
    test_size=0.3,
)

In [39]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from torch.nn.functional import softmax
import torch
from sklearn.metrics import accuracy_score, classification_report

In [40]:
# Load pre-trained BERT model and tokenizer
# Seed torch before the model is built: the classification head is not part of the
# checkpoint and is randomly initialised, and the shuffled training DataLoader draws
# from torch's global RNG by default. Without a seed, results differ on every run.
SEED = 41  # same value as the random_state used for the train/test split
torch.manual_seed(SEED)

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
# binary classification (sarcasm or not)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# Tokenize and encode the headlines
def tokenize_headlines(df, tokenizer, max_length=128):
    """Encode the ``headline`` column of ``df`` with ``tokenizer``.

    Args:
        df: Table-like object whose ``"headline"`` entry is an iterable of strings.
        tokenizer: Callable invoked with the list of headlines and the encoding options.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer`` returns; PyTorch tensors are requested via
        ``return_tensors="pt"``.
    """
    headlines = list(df["headline"])
    encode_options = {
        "max_length": max_length,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(headlines, **encode_options)

# Encode the training split first, then the test split, with the same tokenizer.
train_tokenized, test_tokenized = (
    tokenize_headlines(train_df, tokenizer),
    tokenize_headlines(test_df, tokenizer),
)

In [42]:
# Create a custom dataset class
class HeadlineDataset(Dataset):
    """Pairs pre-tokenized headline tensors with their labels.

    Args:
        tokenized_inputs: Mapping (anything with ``.items()``) from field name to a
            container indexed by sample position, with one row per sample.
        labels: One label per sample. A pandas Series (positional values are used,
            the index is ignored), a list, a NumPy array or a tensor is accepted.

    Raises:
        ValueError: If a tokenized field does not have exactly one row per label.
    """

    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs = tokenized_inputs
        if isinstance(labels, torch.Tensor):
            # Copy so later changes to the caller's tensor do not leak in.
            self.labels = labels.detach().clone()
        else:
            # pandas objects expose to_numpy(); plain sequences/arrays are used as-is.
            raw = labels.to_numpy() if hasattr(labels, "to_numpy") else labels
            self.labels = torch.tensor(raw)

        # Fail at construction time rather than with an IndexError mid-epoch.
        n_labels = len(self.labels)
        for key, val in self.tokenized_inputs.items():
            if len(val) != n_labels:
                raise ValueError(
                    f"tokenized field '{key}' has {len(val)} rows but there are {n_labels} labels"
                )

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(inputs, label)`` where ``inputs`` maps each field name to row ``idx``."""
        return {key: val[idx] for key, val in self.tokenized_inputs.items()}, self.labels[idx]

# Wrap each split in a HeadlineDataset, then batch it in groups of 32.
# Only the training loader is shuffled; the test loader keeps its order.
train_dataset = HeadlineDataset(
    tokenized_inputs=train_tokenized,
    labels=train_df["is_sarcastic"],
)
test_dataset = HeadlineDataset(
    tokenized_inputs=test_tokenized,
    labels=test_df["is_sarcastic"],
)

train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=32)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=32)

In [43]:
# Training
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

# AdamW over all model parameters; a single pass over the training data.
num_epochs = 1
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [44]:
for epoch in range(num_epochs):
    print("Epoch: ", epoch)
    model.train()
    print("Length of train_dataloader: ", len(train_dataloader))
    # enumerate supplies the 1-based batch number echoed as progress
    for cnt, (inputs, labels) in enumerate(train_dataloader, start=1):
        print("Batch: ", cnt)
        # move the input tensors and the labels onto the training device
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
        labels = labels.to(device)

        # clear old gradients, run the forward pass with labels so the model
        # output carries a loss, then backpropagate and apply the update
        optimizer.zero_grad()
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        optimizer.step()

# Evaluation
# Put the model in evaluation mode and start with empty result buffers.
model.eval()
all_predictions, all_labels = [], []

Epoch:  0
Length of train_dataloader:  22
Batch:  1
Batch:  2
Batch:  3
Batch:  4
Batch:  5
Batch:  6
Batch:  7
Batch:  8
Batch:  9
Batch:  10
Batch:  11
Batch:  12
Batch:  13
Batch:  14
Batch:  15
Batch:  16
Batch:  17
Batch:  18
Batch:  19
Batch:  20
Batch:  21
Batch:  22


In [None]:
# Evaluate on the held-out split.
# Eval mode and the result buffers are (re)set here so the cell is self-contained:
# re-running it, for example after an interrupted run, starts clean instead of
# appending to predictions left over from a previous execution.
model.eval()
all_predictions = []
all_labels = []

with torch.no_grad():
    print("Total batches are: ", len(test_dataloader))
    for cnt, (inputs, labels) in enumerate(test_dataloader, start=1):
        print("Batch: ", cnt)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        outputs = model(**inputs)
        logits = outputs.logits
        probabilities = softmax(logits, dim=1)
        # predicted class = index of the highest probability in each row
        predictions = torch.argmax(probabilities, dim=1).cpu().numpy()

        all_predictions.extend(predictions)
        # labels are only compared on the host, so they are not moved to `device`
        all_labels.extend(labels.cpu().numpy())

Total batches are:  10
Batch:  1


In [None]:
# Score the collected predictions: overall accuracy first, then the per-class report.
accuracy = accuracy_score(y_true=all_labels, y_pred=all_predictions)
print(f"Accuracy: {accuracy:.4f}")

classification_rep = classification_report(
    y_true=all_labels,
    y_pred=all_predictions,
    target_names=["Not Sarcastic", "Sarcastic"],
)
print("Classification Report:\n", classification_rep)