# In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertModel, ViTModel
from torchvision import transforms
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score

# Load dataset
class MemotionDataset(Dataset):
    """Meme dataset yielding a tokenized caption, an image tensor and labels.

    Every row of the CSV is read for the columns ``text``, ``image``,
    ``sentiment``, ``humor``, ``sarcasm``, ``offense`` and ``motivation``.

    Args:
        csv_path: Path of the annotation CSV, loaded with ``pandas.read_csv``.
        image_folder: Directory that values of the ``image`` column are
            relative to.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors='pt')``; its result
            must provide ``'input_ids'`` and ``'attention_mask'``.
        transform: Callable applied to the RGB ``PIL.Image`` of each sample.
        max_length: Length every caption is padded/truncated to.
    """

    def __init__(self, csv_path, image_folder, tokenizer, transform, max_length=128):
        self.data = pd.read_csv(csv_path)
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the annotation table."""
        return len(self.data)

    @staticmethod
    def _clean_text(text):
        """Return ``text`` as a ``str``, mapping missing values to ``""``.

        ``pandas.read_csv`` turns empty cells into NaN (a float), which a
        tokenizer cannot consume; an empty caption keeps the sample usable.
        """
        if pd.isna(text):
            return ""
        return str(text)

    def __getitem__(self, idx):
        """Build the sample at position ``idx``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension of the tokenizer output squeezed away), ``image`` (the
            transformed picture), ``sentiment`` (long tensor, stored label
            shifted by +1) and ``humor``/``sarcasm``/``offense``/``motivation``
            (float tensors).
        """
        # Fetch the row once instead of re-indexing the frame per column.
        row = self.data.iloc[idx]
        text = self._clean_text(row['text'])
        image_path = f"{self.image_folder}/{row['image']}"

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The context manager closes the underlying file handle as soon as
        # the converted copy has been produced and transformed.
        with Image.open(image_path) as raw_image:
            image = self.transform(raw_image.convert('RGB'))

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': image,
            'sentiment': torch.tensor(row['sentiment'] + 1, dtype=torch.long),  # Mapping [-1, 0, 1] to [0, 1, 2]
            'humor': torch.tensor(row['humor'], dtype=torch.float),
            'sarcasm': torch.tensor(row['sarcasm'], dtype=torch.float),
            'offense': torch.tensor(row['offense'], dtype=torch.float),
            'motivation': torch.tensor(row['motivation'], dtype=torch.float),
        }

# The image encoder used by this file, google/vit-base-patch16-224-in21k, was
# pretrained on inputs normalized with mean 0.5 / std 0.5 per channel (see its
# model card), not with the ImageNet statistics used by torchvision CNNs.
VIT_IMAGE_MEAN = [0.5, 0.5, 0.5]
VIT_IMAGE_STD = [0.5, 0.5, 0.5]

# Resize to the 224x224 input of the ViT checkpoint, convert to a tensor,
# then normalize each channel.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=VIT_IMAGE_MEAN, std=VIT_IMAGE_STD),
])

# Tokenizer matching the 'bert-base-uncased' text encoder.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Annotation table and image directory (Colab-style absolute paths).
csv_path = "/content/Multimodal_Sentiment_Analysis_FinalAssignment.csv"
image_folder = "/content/images"
dataset = MemotionDataset(csv_path, image_folder, tokenizer, transform)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

class MultimodalModel(nn.Module):
    """BERT + ViT late-fusion network with several task heads.

    The first-position embedding of each encoder is concatenated, projected
    to 256 features with a ReLU, and fed to: a 3-way sentiment head, four
    2-way heads (humor, sarcasm, offense, motivation) and a 4-output
    regression head squashed to the range [0, 3].
    """

    def __init__(self):
        super().__init__()
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')

        fused_size = self.text_model.config.hidden_size + self.image_model.config.hidden_size
        self.fusion_layer = nn.Linear(fused_size, 256)
        self.sentiment_classifier = nn.Linear(256, 3)
        self.humor_classifier = nn.Linear(256, 2)
        self.sarcasm_classifier = nn.Linear(256, 2)
        self.offense_classifier = nn.Linear(256, 2)
        self.motivation_classifier = nn.Linear(256, 2)
        self.regression_head = nn.Linear(256, 4)

    def forward(self, input_ids, attention_mask, image):
        """Run both encoders and every head.

        Args:
            input_ids: Token ids passed to the text encoder.
            attention_mask: Attention mask passed to the text encoder.
            image: Image batch passed to the image encoder.

        Returns:
            Tuple ``(sentiment_output, humor_output, sarcasm_output,
            offense_output, motivation_output, scale_output)``.
            ``sentiment_output`` holds raw logits of shape ``(batch, 3)``;
            use :meth:`predict_sentiment` to turn them into labels in
            ``{-1, 0, 1}``. ``scale_output`` has shape ``(batch, 4)`` with
            values in ``[0, 3]``.
        """
        # Index 0 along the sequence axis is the first token of each encoder.
        text_embeds = self.text_model(input_ids, attention_mask=attention_mask).last_hidden_state[:, 0, :]
        image_embeds = self.image_model(image).last_hidden_state[:, 0, :]

        fusion = torch.cat((text_embeds, image_embeds), dim=1)
        fusion = torch.relu(self.fusion_layer(fusion))

        # Keep the logits: argmax is not differentiable and produces integer
        # class ids, which a cross-entropy loss cannot take as predictions.
        sentiment_output = self.sentiment_classifier(fusion)

        humor_output = self.humor_classifier(fusion)
        sarcasm_output = self.sarcasm_classifier(fusion)
        offense_output = self.offense_classifier(fusion)
        motivation_output = self.motivation_classifier(fusion)
        scale_output = torch.sigmoid(self.regression_head(fusion)) * 3  # Scaling regression outputs to [0, 3]

        return sentiment_output, humor_output, sarcasm_output, offense_output, motivation_output, scale_output

    @staticmethod
    def predict_sentiment(sentiment_logits):
        """Map sentiment logits ``(batch, 3)`` to labels in ``{-1, 0, 1}``."""
        return torch.argmax(sentiment_logits, dim=1) - 1  # Mapping [0, 1, 2] back to [-1, 0, 1]

# Bind the target device to a module-level name so that code moving batches
# onto the model's device can refer to it (GPU when available, else CPU).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalModel().to(device)
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled = True
# Cross-entropy for the 3-class sentiment head, MSE for the regression head.
sentiment_criterion = nn.CrossEntropyLoss()
regression_criterion = nn.MSELoss()
optimizer = optim.AdamW(model.parameters(), lr=2e-5)

def train_model(model, dataloader, sentiment_criterion, regression_criterion, optimizer, epochs=3):
    """Train ``model`` on the summed sentiment and regression losses.

    Args:
        model: Module called as ``model(input_ids, attention_mask, images)``
            and returning six outputs; the first is scored with
            ``sentiment_criterion`` and the last (columns 0-3) with
            ``regression_criterion``. The four middle outputs are not part of
            the loss.
        dataloader: Iterable of batches, each a mapping with the keys
            ``input_ids``, ``attention_mask``, ``image``, ``sentiment``,
            ``humor``, ``sarcasm``, ``offense`` and ``motivation``.
        sentiment_criterion: Loss applied to the sentiment output and the
            ``sentiment`` targets.
        regression_criterion: Loss applied per column of the regression
            output against humor, sarcasm, offense and motivation.
        optimizer: Optimizer stepping the model parameters.
        epochs: Number of passes over ``dataloader``.

    Prints the mean batch loss after every epoch (0.0 when the loader yields
    no batches).
    """
    # Take the device from the model itself rather than from a global name,
    # so batches always land where the parameters live.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    # Order matches the columns of the regression output.
    regression_keys = ('humor', 'sarcasm', 'offense', 'motivation')

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        num_batches = 0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            sentiments = batch['sentiment'].to(device)

            optimizer.zero_grad()
            sentiment_output, _, _, _, _, scale_output = model(input_ids, attention_mask, images)

            loss = sentiment_criterion(sentiment_output, sentiments)
            for column, key in enumerate(regression_keys):
                loss = loss + regression_criterion(scale_output[:, column], batch[key].to(device))

            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # Counting batches avoids a ZeroDivisionError on an empty loader.
        mean_loss = total_loss / num_batches if num_batches else 0.0
        print(f"Epoch {epoch+1}, Loss: {mean_loss}")

# Run training with the objects configured above (default number of epochs),
# then write the learned weights (state_dict only) to the working directory.
train_model(
    model=model,
    dataloader=dataloader,
    sentiment_criterion=sentiment_criterion,
    regression_criterion=regression_criterion,
    optimizer=optimizer,
)
torch.save(model.state_dict(), "multimodal_sentiment_model.pth")
print("Model trained and saved successfully!")


# Output: KeyboardInterrupt (the cell was interrupted while running)