In [1]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam
import torchvision
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
import os
from torch.utils.data import random_split
from torchvision import datasets, transforms
from torchvision.datasets import ImageFolder
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split

# Pick the compute device once; every later `.to(device)` call relies on this string.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Location of the JSON-lines dataset. The default is the original author's local
# path; set the SARCASM_DATA_PATH environment variable to run the notebook on
# another machine without editing this cell.
DATA_PATH = os.environ.get(
    'SARCASM_DATA_PATH',
    '/home/adityaraut/Documents/Machine-Learning-FCC/dl_fcc/archive/Sarcasm_Headlines_Dataset.json',
)

# lines=True: the file holds one JSON object per line rather than a single JSON array.
data_df = pd.read_json(DATA_PATH, lines=True)

In [3]:
# Drop incomplete rows first (while `article_link` is still present, matching the
# original order of operations), then discard the `article_link` column.
# Written without inplace=True and with errors='ignore' so the cell can be re-run:
# the previous in-place drop raised KeyError on a second execution because the
# column was already gone.
data_df = data_df.dropna().drop(columns=['article_link'], errors='ignore')

In [4]:
# Bare expression: the notebook renders the frame so the remaining columns can be inspected.
data_df

Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
26704,american politics in moral free-fall,0
26705,america's best 20 hikes,0
26706,reparations and obama,0
26707,israeli ban targeting boycott supporters raise...,0


In [5]:
# 70 / 15 / 15 train / validation / test split.
# A fixed seed makes the partition identical on every run (Restart & Run All);
# without it each execution trained and evaluated on different rows.
SPLIT_SEED = 42

# Step 1: keep 70% for training, set 30% aside as a hold-out pool.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    data_df['headline'].to_numpy(),
    data_df['is_sarcastic'].to_numpy(),
    test_size=0.3,
    random_state=SPLIT_SEED,
)

# Step 2: cut the hold-out pool in half -> validation and test sets.
x_val, x_test, y_val, y_test = train_test_split(
    x_holdout,
    y_holdout,
    test_size=0.5,
    random_state=SPLIT_SEED,
)

In [6]:
# Sanity check: print (features shape, labels shape) for train, validation and test, in that order.
for _features, _targets in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    print(_features.shape, _targets.shape)

(18696,) (18696,)
(4006,) (4006,)
(4007,) (4007,)


In [7]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'google-bert/bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(BERT_CHECKPOINT)

In [8]:
class dataset(Dataset):
    """Text-classification dataset that tokenizes every sample up front.

    Args:
        text: sequence of input strings, passed to ``tokenizer`` in one call.
        labels: sequence of integer class labels, one per entry in ``text``.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., truncation=True,
            padding='max_length', return_tensors='pt')``; its result must
            expose ``.items()`` yielding indexable per-field tensors.
        max_len: length every sequence is truncated / padded to (default 100).
    """

    def __init__(self, text, labels, tokenizer, max_len = 100):
        # Fixed-length encoding so samples can be batched without a custom collate function.
        encode_options = dict(
            max_length=max_len,
            truncation=True,
            padding='max_length',
            return_tensors='pt',
        )
        self.encodings = tokenizer(text, **encode_options)
        # Labels are stored as int64; callers cast as needed for their loss.
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with every encoding field at ``idx`` plus a ``'labels'`` entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        sample['labels'] = self.labels[idx]
        return sample
    
# Tokenize each split once; the numpy arrays are converted to plain Python lists first.
training_dataset = dataset(text=x_train.tolist(), labels=y_train.tolist(), tokenizer=tokenizer)
val_dataset = dataset(text=x_val.tolist(), labels=y_val.tolist(), tokenizer=tokenizer)
test_dataset = dataset(text=x_test.tolist(), labels=y_test.tolist(), tokenizer=tokenizer)

In [12]:
# Training hyperparameters: number of passes over the training set,
# samples per batch, and the Adam learning rate.
epochs, batch_size, lr = 10, 32, 1e-4

In [13]:
# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
class NN(nn.Module):
    """Binary classifier: an encoder followed by a two-layer head ending in a sigmoid.

    Args:
        bert: encoder module. It is called as
            ``bert(input_ids, attention_mask, return_dict=False)`` and must
            return a tuple whose first element can be indexed as
            ``[:, 0]`` to give a 768-feature vector per sample.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.dropout = nn.Dropout(0.25)
        self.linear1 = nn.Linear(768, 384)
        self.linear2 = nn.Linear(384, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask):
        """Return one probability per sample, shaped ``[batch, 1]``."""
        encoder_outputs = self.bert(input_ids, attention_mask, return_dict = False)
        # First tuple element, first sequence position. For a Hugging Face
        # BertModel this is the last hidden state of the [CLS] token (not the
        # pooler output).
        first_token = encoder_outputs[0][:, 0]
        # Note: there is no non-linearity between the two linear layers; only dropout.
        hidden = self.dropout(self.linear1(first_token))
        return self.sigmoid(self.linear2(hidden))
         
        

In [15]:
# Leave every encoder weight trainable (full fine-tuning, not a frozen feature extractor),
# then wrap the encoder in the classifier head and move the whole model to the chosen device.
bert_model.requires_grad_(True)
model = NN(bert_model).to(device)

In [16]:
# Bare expression: the notebook prints the module tree of the assembled model.
model


NN(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tru

In [18]:
# Binary cross-entropy on probabilities (the model already ends in a sigmoid).
criterion = nn.BCELoss()
# One optimizer over all parameters: the encoder and the classifier head share the same learning rate.
optimizer = Adam(model.parameters(), lr=lr)

In [None]:
# Per-epoch metric history: one value is appended to each list at the end of every epoch.
total_loss_training_plot = []
total_loss_val_plot = []
total_acc_training_plot = []
total_acc_val_plot = []

for epoch in range(epochs):
    
    # ---- Training pass ----
    # train() switches the model to training mode (dropout active).
    model.train()
    total_loss_train = 0  # running sum of per-batch losses
    total_acc_train = 0   # running count of correctly classified samples

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Labels are stored as int64; BCELoss needs float targets.
        labels = batch['labels'].to(device).float()

        # Gradients accumulate across backward() calls, so clear them before each step.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask).squeeze(1)  # [batch]
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss_train += loss.item()
        # Outputs are probabilities; threshold at 0.5 to get hard 0/1 predictions.
        preds = (outputs > 0.5).long()
        total_acc_train += (preds == labels.long()).sum().item()

    # ---- Validation pass ----
    # eval() disables dropout; no_grad() skips gradient tracking. No weights are updated here.
    model.eval()
    total_loss_val = 0
    total_acc_val = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device).float()

            outputs = model(input_ids, attention_mask).squeeze(1)
            loss = criterion(outputs, labels)

            total_loss_val += loss.item()
            preds = (outputs > 0.5).long()
            total_acc_val += (preds == labels.long()).sum().item()

    # Loss is averaged over the number of batches (len of a loader);
    # accuracy is averaged over the number of samples (len of a dataset).
    avg_train_loss = total_loss_train / len(train_loader)
    avg_train_acc = total_acc_train / len(training_dataset)
    avg_val_loss = total_loss_val / len(val_loader)
    avg_val_acc = total_acc_val / len(val_dataset)

    total_loss_training_plot.append(avg_train_loss)
    total_acc_training_plot.append(avg_train_acc)
    total_loss_val_plot.append(avg_val_loss)
    total_acc_val_plot.append(avg_val_acc)

    # One summary line per epoch.
    print(f"Epoch {epoch+1}/{epochs} | "
          f"Train Loss: {avg_train_loss:.4f}, Train Acc: {avg_train_acc:.4f} | "
          f"Val Loss: {avg_val_loss:.4f}, Val Acc: {avg_val_acc:.4f}")

KeyboardInterrupt: 