<a href="https://colab.research.google.com/github/shaswatgithub/Multimodal-Sentiment-Analysis-using-Transformers/blob/main/Multimodal_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch torchvision sentencepiece scikit-learn --quiet

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, ViTModel, ViTFeatureExtractor, AdamW
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from PIL import Image
import pandas as pd

In [None]:
# Load the sample table from a CSV file in the working directory.
# Later cells read its 'label', 'text' and 'image_path' columns.
data = pd.read_csv('multimodal_sentiment_data.csv')


In [None]:
# Learn the label vocabulary first, then map every raw label to its integer id.
# `le` is kept around because its `classes_` attribute is used later.
le = LabelEncoder().fit(data['label'])
data['label_enc'] = le.transform(data['label'])

In [None]:
# Text preprocessor: tokenizer loaded from the 'bert-base-uncased' checkpoint.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Image preprocessor loaded from the 'google/vit-base-patch16-224' checkpoint.
# NOTE(review): newer transformers releases appear to deprecate
# ViTFeatureExtractor in favour of ViTImageProcessor — confirm against the
# installed version before upgrading.
vit_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')



In [None]:
class MultiModalDataset(Dataset):
    """Dataset that pairs each row's text with its image.

    Every row of the wrapped dataframe is read for three columns:
    ``text``, ``image_path`` and ``label_enc``.
    """

    def __init__(self, dataframe, tokenizer, feature_extractor, max_len=128):
        """Store the data source and the two preprocessors.

        Args:
            dataframe: Table providing the ``text``, ``image_path`` and
                ``label_enc`` columns.
            tokenizer: Callable applied to the text. It is called with
                ``max_length``, ``padding``, ``truncation`` and
                ``return_tensors`` and must return a mapping containing
                ``input_ids`` and ``attention_mask``.
            feature_extractor: Callable applied to the image via
                ``images=...``; must return a mapping containing
                ``pixel_values``.
            max_len: Length every text is padded or truncated to.
        """
        self.df = dataframe
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Build the model inputs for the row at position ``idx``.

        Returns:
            A dict with the keys ``input_ids``, ``attention_mask``,
            ``pixel_values`` and ``labels``.
        """
        # One positional lookup instead of three; `iloc` ignores the index
        # labels, so a shuffled or split dataframe works unchanged.
        row = self.df.iloc[idx]
        text = str(row['text'])
        label = row['label_enc']

        text_encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Open inside a context manager so the file handle is released as
        # soon as the pixel data has been converted.
        with Image.open(row['image_path']) as raw_image:
            image = raw_image.convert('RGB')
        image_encoding = self.feature_extractor(images=image, return_tensors='pt')

        # The preprocessors were asked for 'pt' tensors and each call handled
        # a single sample; squeeze(0) drops that leading dimension so the
        # DataLoader can stack samples into a batch.
        return {
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            'pixel_values': image_encoding['pixel_values'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible between runs.
train_df, val_df = train_test_split(data, test_size=0.2, random_state=42)

# Both splits share the same tokenizer and image preprocessor.
train_dataset, val_dataset = (
    MultiModalDataset(frame, bert_tokenizer, vit_extractor)
    for frame in (train_df, val_df)
)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=8)

In [None]:
class MultiModalModel(nn.Module):
    """Classifier that fuses a BERT text encoder with a ViT image encoder.

    The pooled outputs of both encoders are concatenated, passed through a
    hidden layer with ReLU and dropout, and mapped to class logits.
    """

    def __init__(self, num_classes):
        """Load both pretrained encoders and build the fusion head.

        Args:
            num_classes: Size of the output logit vector.
        """
        super().__init__()
        self.text_model = BertModel.from_pretrained('bert-base-uncased')
        self.image_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

        # Fusion head: its input width is the sum of the two encoders'
        # hidden sizes because their pooled outputs are concatenated.
        fused_size = (
            self.text_model.config.hidden_size
            + self.image_model.config.hidden_size
        )
        self.fc = nn.Linear(fused_size, 256)
        self.classifier = nn.Linear(256, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, input_ids, attention_mask, pixel_values):
        """Return class logits for a batch of text/image pairs.

        Args:
            input_ids: Token ids forwarded to the text encoder.
            attention_mask: Attention mask forwarded to the text encoder.
            pixel_values: Image tensor forwarded to the image encoder.

        Returns:
            The output of the final linear layer (unnormalised logits).
        """
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = text_outputs.pooler_output

        image_features = self.image_model(pixel_values=pixel_values).pooler_output

        # Concatenate along dim 1 so each sample keeps one fused vector.
        combined = torch.cat((text_features, image_features), dim=1)
        x = self.dropout(torch.relu(self.fc(combined)))
        logits = self.classifier(x)
        return logits

In [None]:
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# One output logit per class seen by the label encoder.
model = MultiModalModel(num_classes=len(le.classes_)).to(device)

criterion = nn.CrossEntropyLoss()
# Use PyTorch's own AdamW: the optimizer shipped with transformers is
# deprecated. eps and weight_decay are set explicitly to what are believed to
# be that implementation's defaults so the training configuration does not
# silently change (torch defaults differ) — verify if exact parity matters.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=2e-5,
    eps=1e-6,
    weight_decay=0.0,
)


In [None]:
epochs = 3

# Fine-tune the model, reporting the mean batch loss after every epoch.
for epoch in range(epochs):
    model.train()  # switch dropout (and similar layers) to training mode
    total_loss = 0

    for batch in train_loader:
        # Move every tensor of the batch onto the same device as the model.
        input_ids, attention_mask, pixel_values, labels = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'pixel_values', 'labels')
        )

        # Clear gradients left over from the previous step before the
        # forward/backward pass.
        optimizer.zero_grad()
        loss = criterion(model(input_ids, attention_mask, pixel_values), labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Training Loss: {avg_loss:.4f}")

In [None]:
# Score the model on the validation split.
model.eval()  # disable dropout so predictions are deterministic
val_preds = []
val_labels = []
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        pixel_values = batch['pixel_values'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask, pixel_values)
        # The predicted class is the index of the largest logit per sample.
        preds = torch.argmax(outputs, dim=1)
        val_preds.extend(preds.tolist())
        val_labels.extend(labels.tolist())

acc = accuracy_score(val_labels, val_preds)
print(f"Validation Accuracy: {acc:.4f}")