In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [3]:
# Custom Dataset
class SarcasmDataset(Dataset):
    def __init__(self, csv_file, img_dir, transform=None):
        # Try different encodings
        encodings = ['utf-8', 'iso-8859-1', 'cp1252']
        for encoding in encodings:
            try:
                self.data = pd.read_csv(csv_file, encoding=encoding)
                print(f"Successfully read the CSV file with {encoding} encoding.")
                break
            except UnicodeDecodeError:
                print(f"Failed to read with {encoding} encoding. Trying next...")
        else:
            raise ValueError("Failed to read the CSV file with any of the attempted encodings.")
        
        self.img_dir = img_dir
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        img_path = os.path.join(self.img_dir, self.data.iloc[idx, 1])
        image = Image.open(img_path).convert('RGB')  # Convert all images to RGB
        label = self.data.iloc[idx, 3]  # Image_label

        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(label, dtype=torch.float32)

In [4]:
# Data Transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # ViT typically expects 224x224 images
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

In [5]:
# Create Datasets
train_dataset = SarcasmDataset(csv_file='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/SarcNetTrain.csv', 
                               img_dir='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/Image', 
                               transform=transform)

val_dataset = SarcasmDataset(csv_file='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/SarcNetVal.csv', 
                             img_dir='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/Image', 
                             transform=transform)

test_dataset = SarcasmDataset(csv_file='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/SarcNetTest.csv', 
                              img_dir='/kaggle/input/sarcasm-detectiondata/SarcNet Image-Text/Image', 
                              transform=transform)

Failed to read with utf-8 encoding. Trying next...
Successfully read the CSV file with iso-8859-1 encoding.
Failed to read with utf-8 encoding. Trying next...
Successfully read the CSV file with iso-8859-1 encoding.
Failed to read with utf-8 encoding. Trying next...
Successfully read the CSV file with iso-8859-1 encoding.


In [6]:
# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=4, pin_memory=True)

In [7]:
# ViT Model
def get_vit_model():
    model = models.vit_b_16(pretrained=True)
    
    print("Original ViT model structure:")
    print(model)
    
    # Freeze all layers except the last few
    for param in list(model.parameters())[:-10]:
        param.requires_grad = False
    
    # Replace the classifier head
    model.heads = nn.Sequential(
        nn.Linear(model.hidden_dim, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 1)
    )
    
    print("\nModified ViT model structure:")
    print(model)
    
    print("\nClassifier structure:")
    print(model.heads)
    
    return model

model = get_vit_model().to(device)
print(f"Model moved to {device}")
print(f"Number of features in the first layer of the classifier: {model.heads[0].in_features}")
print(f"Number of features in the last layer of the classifier: {model.heads[-1].out_features}")

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:04<00:00, 72.6MB/s] 


Original ViT model structure:
VisionTransformer(
  (conv_proj): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (encoder): Encoder(
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): Sequential(
      (encoder_layer_0): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (self_attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.0, inplace=False)
        (ln_2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Dropout(p=0.0, inplace=False)
          (3): Linear(in_features=3072, out_features=768, bias=True)
          (4): Dropout(p=0.0, inplace=False)
        )
      )
      (encoder_layer_1): EncoderBlock(
        (ln_1): LayerNorm((768,), eps=1e-06, elementwis

In [8]:
# Loss and Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=3, verbose=True)



In [9]:
def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, num_epochs=20):
    best_f1 = 0.0
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)
            
            optimizer.zero_grad()
            outputs = model(images).squeeze()
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * images.size(0)
        
        train_loss = train_loss / len(train_loader.dataset)
        
        # Validation
        model.eval()
        val_loss = 0.0
        val_preds = []
        val_true = []
        with torch.no_grad():
            for images, labels in val_loader:
                images, labels = images.to(device), labels.to(device)
                outputs = model(images).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item() * images.size(0)
                val_preds.extend(torch.sigmoid(outputs).cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        val_loss = val_loss / len(val_loader.dataset)
        val_preds = (np.array(val_preds) > 0.5).astype(int)
        val_accuracy = accuracy_score(val_true, val_preds)
        val_precision = precision_score(val_true, val_preds)
        val_recall = recall_score(val_true, val_preds)
        val_f1 = f1_score(val_true, val_preds)
        
        # Learning rate scheduler step
        scheduler.step(val_f1)
            
        print(f'Epoch {epoch+1}/{num_epochs}:')
        print(f'Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        print(f'Val Accuracy: {val_accuracy:.4f}, Precision: {val_precision:.4f}, Recall: {val_recall:.4f}, F1: {val_f1:.4f}')
            
        # Save best model
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), 'best_sarcasm_detection_model_vit_gpu.pth')
            print("Saved best model!")

In [10]:
# Train the model
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler)

# Load best model and evaluate on test set
model.load_state_dict(torch.load('/kaggle/working/best_sarcasm_detection_model_vit_gpu.pth'))
model.eval()
test_preds = []
test_true = []
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images).squeeze()
        test_preds.extend(torch.sigmoid(outputs).cpu().numpy())
        test_true.extend(labels.cpu().numpy())

test_preds = (np.array(test_preds) > 0.5).astype(int)
test_accuracy = accuracy_score(test_true, test_preds)
test_precision = precision_score(test_true, test_preds)
test_recall = recall_score(test_true, test_preds)
test_f1 = f1_score(test_true, test_preds)

print("Test Set Results:")
print(f'Accuracy: {test_accuracy:.4f}')
print(f'Precision: {test_precision:.4f}')
print(f'Recall: {test_recall:.4f}')
print(f'F1 Score: {test_f1:.4f}')



Epoch 1/20:
Train Loss: 0.5775, Val Loss: 0.5546
Val Accuracy: 0.6870, Precision: 0.5660, Recall: 0.2679, F1: 0.3636
Saved best model!




Epoch 2/20:
Train Loss: 0.4194, Val Loss: 0.6169
Val Accuracy: 0.7332, Precision: 0.7473, Recall: 0.3036, F1: 0.4317
Saved best model!




Epoch 3/20:
Train Loss: 0.2817, Val Loss: 0.7212
Val Accuracy: 0.6960, Precision: 0.5543, Recall: 0.4554, F1: 0.5000
Saved best model!




Epoch 4/20:
Train Loss: 0.2104, Val Loss: 0.9037
Val Accuracy: 0.7347, Precision: 0.6643, Recall: 0.4152, F1: 0.5110
Saved best model!




Epoch 6/20:
Train Loss: 0.1657, Val Loss: 1.0677
Val Accuracy: 0.6945, Precision: 0.5459, Recall: 0.5045, F1: 0.5244
Saved best model!




Epoch 7/20:
Train Loss: 0.1565, Val Loss: 0.8677
Val Accuracy: 0.7064, Precision: 0.5754, Recall: 0.4598, F1: 0.5112




Epoch 8/20:
Train Loss: 0.1453, Val Loss: 0.9947
Val Accuracy: 0.7094, Precision: 0.5744, Recall: 0.5000, F1: 0.5346
Saved best model!




Epoch 9/20:
Train Loss: 0.1206, Val Loss: 1.1421
Val Accuracy: 0.7198, Precision: 0.5857, Recall: 0.5491, F1: 0.5668
Saved best model!




Epoch 10/20:
Train Loss: 0.1160, Val Loss: 1.3288
Val Accuracy: 0.7094, Precision: 0.5668, Recall: 0.5491, F1: 0.5578




Epoch 11/20:
Train Loss: 0.1214, Val Loss: 1.1005
Val Accuracy: 0.7124, Precision: 0.5847, Recall: 0.4777, F1: 0.5258




Epoch 12/20:
Train Loss: 0.0887, Val Loss: 1.3210
Val Accuracy: 0.7049, Precision: 0.5619, Recall: 0.5268, F1: 0.5438




Epoch 13/20:
Train Loss: 0.0734, Val Loss: 1.3493
Val Accuracy: 0.7198, Precision: 0.5891, Recall: 0.5312, F1: 0.5587




Epoch 14/20:
Train Loss: 0.0522, Val Loss: 1.4552
Val Accuracy: 0.7332, Precision: 0.6347, Recall: 0.4732, F1: 0.5422




Epoch 15/20:
Train Loss: 0.0393, Val Loss: 1.5349
Val Accuracy: 0.7228, Precision: 0.6067, Recall: 0.4821, F1: 0.5373




Epoch 16/20:
Train Loss: 0.0363, Val Loss: 1.6399
Val Accuracy: 0.7347, Precision: 0.6402, Recall: 0.4688, F1: 0.5412




Epoch 17/20:
Train Loss: 0.0338, Val Loss: 1.7214
Val Accuracy: 0.7347, Precision: 0.6402, Recall: 0.4688, F1: 0.5412




Epoch 18/20:
Train Loss: 0.0278, Val Loss: 1.7185
Val Accuracy: 0.7303, Precision: 0.6287, Recall: 0.4688, F1: 0.5371




Epoch 19/20:
Train Loss: 0.0306, Val Loss: 1.7233
Val Accuracy: 0.7303, Precision: 0.6287, Recall: 0.4688, F1: 0.5371




Epoch 20/20:
Train Loss: 0.0288, Val Loss: 1.7267
Val Accuracy: 0.7317, Precision: 0.6310, Recall: 0.4732, F1: 0.5408


  model.load_state_dict(torch.load('/kaggle/working/best_sarcasm_detection_model_vit_gpu.pth'))


Test Set Results:
Accuracy: 0.7162
Precision: 0.5762
Recall: 0.5475
F1 Score: 0.5615
