In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from PIL import Image, ImageFile
from transformers import DistilBertTokenizer, DistilBertModel, ViTImageProcessor, ViTModel
import torch.nn as nn
import os
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

In [2]:
ImageFile.LOAD_TRUNCATED_IMAGES = True

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.backends.cudnn.benchmark = True

In [4]:
device

device(type='cuda')

In [5]:
class MemeDataset(Dataset):
    def __init__(self, labels_path, image_dir, text_max_length=128):
        self.labels_df = pd.read_csv(labels_path)
        self.image_dir = image_dir
        
        # Data validation
        self.labels_df['text_corrected'] = self.labels_df['text_corrected'].astype(str)
        
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
        self.text_max_length = text_max_length
        
        self.image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
        
        self.label_maps = {
            'sentiment': ['very_negative', 'negative', 'neutral', 'positive', 'very_positive'],
            'humor': ['not_funny', 'funny', 'very_funny', 'hilarious'],
            'sarcasm': ['not_sarcastic', 'general', 'twisted_meaning', 'very_twisted'],
            'offensive': ['not_offensive', 'slight', 'very_offensive', 'hateful_offensive'],
            'motivational': ['not_motivational', 'motivational']
        }

    def __len__(self):
        return len(self.labels_df)

    def __getitem__(self, idx):
        row = self.labels_df.iloc[idx]
        
        # Text processing
        text = str(row['text_corrected'])
        inputs = self.tokenizer(
            text,
            max_length=self.text_max_length,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        
        # Image processing
        img_path = os.path.join(self.image_dir, row['image_name'])
        image = Image.open(img_path).convert('RGB')
        pixel_values = self.image_processor(images=image, return_tensors="pt").pixel_values
        
        # Label encoding
        labels = {
            'sentiment': torch.tensor(self.label_maps['sentiment'].index(row['overall_sentiment']), dtype=torch.long),
            'humor': torch.tensor(self.label_maps['humor'].index(row['humour']), dtype=torch.long),
            'sarcasm': torch.tensor(self.label_maps['sarcasm'].index(row['sarcasm']), dtype=torch.long),
            'offensive': torch.tensor(self.label_maps['offensive'].index(row['offensive']), dtype=torch.long),
            'motivational': torch.tensor(1 if row['motivational'] == 'motivational' else 0, dtype=torch.long)
        }
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'pixel_values': pixel_values.squeeze(),
            'labels': labels
        }

In [6]:
class MultimodalModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        # Text encoder
        self.text_model = DistilBertModel.from_pretrained('distilbert-base-uncased')
        
        # Image encoder
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
        
        # Multimodal fusion
        self.fusion = nn.Sequential(
            nn.Linear(768*2, 512),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        
        # Classification heads
        self.classifier = nn.ModuleDict({
            'sentiment': nn.Linear(512, 5),
            'humor': nn.Linear(512, 4),
            'sarcasm': nn.Linear(512, 4),
            'offensive': nn.Linear(512, 4),
            'motivational': nn.Linear(512, 2)
        })

    def forward(self, input_ids, attention_mask, pixel_values):
        text_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state[:,0,:]
        image_out = self.image_model(pixel_values=pixel_values).last_hidden_state[:,0,:]
        
        fused = torch.cat([text_out, image_out], dim=1)
        fused = self.fusion(fused)
        
        return {task: self.classifier[task](fused) for task in self.classifier}

In [7]:
def train_model(model, dataloader, optimizer, criterion, epochs=2):
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        
        for batch in progress_bar:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = {k: v.to(device) for k, v in batch['labels'].items()}
            
            optimizer.zero_grad()
            outputs = model(**inputs)
            
            loss = sum(criterion[task](outputs[task], labels[task]) for task in outputs)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
        
        print(f"Epoch {epoch+1} Avg Loss: {total_loss/len(dataloader):.4f}")
    
    return model

In [8]:
def evaluate_model(model, dataloader):
    model.eval()
    all_preds, all_labels = {task: [] for task in criterion}, {task: [] for task in criterion}

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            labels = {k: v.cpu().numpy() for k, v in batch['labels'].items()}

            outputs = model(**inputs)
            preds = {task: torch.argmax(outputs[task], dim=1).cpu().numpy() for task in outputs}

            for task in preds:
                all_preds[task].extend(preds[task])
                all_labels[task].extend(labels[task])

    # Calculate metrics
    metrics = {}
    for task in criterion:
        metrics[task] = {
            "Accuracy": accuracy_score(all_labels[task], all_preds[task]),
            "F1 Score": f1_score(all_labels[task], all_preds[task], average="macro"),
        }

    return metrics

In [9]:
dataset = MemeDataset(labels_path='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/labels.csv', image_dir='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/images')
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [10]:
dataset_df=dataset.labels_df
dataset_df.head()

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,humour,sarcasm,offensive,motivational,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,hilarious,general,not_offensive,not_motivational,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,not_funny,general,not_offensive,motivational,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,very_funny,not_sarcastic,not_offensive,not_motivational,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,very_funny,twisted_meaning,very_offensive,motivational,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,hilarious,very_twisted,very_offensive,not_motivational,neutral


In [11]:
from sklearn.model_selection import train_test_split

# Ensure labels_df is properly copied before splitting
train_df, val_df = train_test_split(dataset_df, test_size=0.2, random_state=42)

In [12]:
train_dataset = MemeDataset(labels_path='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/labels.csv', image_dir='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/images')
test_dataset = MemeDataset(labels_path='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/labels.csv', image_dir='D:/Multimodal Sentiment Analysis/Major Assignment 3/Multimodal_dataset_assignment3/Multimodal_dataset_assignment3/images')

train_dataset.labels_df=train_df
test_dataset.labels_df=val_df

In [13]:
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

In [14]:
model = MultimodalModel().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [15]:
criterion = {
    task: nn.CrossEntropyLoss()
    for task in ['sentiment', 'humor', 'sarcasm', 'offensive', 'motivational']
}

In [16]:
trained_model = train_model(model, train_loader, optimizer, criterion)

Epoch 1/2: 100%|██████████| 350/350 [12:28<00:00,  2.14s/it, loss=4.9401]


Epoch 1 Avg Loss: 5.6382


Epoch 2/2: 100%|██████████| 350/350 [07:48<00:00,  1.34s/it, loss=4.9566]

Epoch 2 Avg Loss: 5.5489





In [17]:
torch.save(trained_model.state_dict(), 'multimodal_sentiment_model.pth')
print("Model saved successfully!")

Model saved successfully!


In [18]:
# Load model for evaluation
model.load_state_dict(torch.load("multimodal_sentiment_model.pth", map_location=device))
model.to(device)

  model.load_state_dict(torch.load("multimodal_sentiment_model.pth", map_location=device))


MultimodalModel(
  (text_model): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): 

In [19]:
metrics = evaluate_model(model, test_loader)

Evaluating: 100%|██████████| 88/88 [00:30<00:00,  2.90it/s]


In [20]:
for task, scores in metrics.items():
    print(f"\nTask: {task}")
    print(f"  Accuracy: {scores['Accuracy']:.4f}")
    print(f"  Macro F1 Score: {scores['F1 Score']:.4f}")


Task: sentiment
  Accuracy: 0.4382
  Macro F1 Score: 0.1237

Task: humor
  Accuracy: 0.2866
  Macro F1 Score: 0.1627

Task: sarcasm
  Accuracy: 0.5068
  Macro F1 Score: 0.1682

Task: offensive
  Accuracy: 0.3974
  Macro F1 Score: 0.2061

Task: motivational
  Accuracy: 0.6505
  Macro F1 Score: 0.3941
