In [None]:

import ssl
import os
import certifi

# Fix SSL certificate verification errors
ssl._create_default_https_context = ssl._create_unverified_context
os.environ['CURL_CA_BUNDLE'] = ''
os.environ['REQUESTS_CA_BUNDLE'] = ''

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score
from PIL import Image

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
df = pd.read_csv("../data/amazon_fashion_reviews.csv")

# Map ratings to sentiment classes
def map_sentiment(rating):
    if rating <= 2:
        return 0  # Negative
    elif rating == 3:
        return 1  # Neutral
    else:
        return 2  # Positive

df['sentiment'] = df['reviews.rating'].apply(map_sentiment)

# Split dataset
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42, stratify=df['sentiment'])
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42, stratify=test_df['sentiment'])


In [None]:

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

MAX_LEN = 128  # truncate/pad to 128 tokens

def tokenize_text(text):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoding['input_ids'].squeeze(), encoding['attention_mask'].squeeze()


In [None]:

image_transforms = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406],[0.229,0.224,0.225])
])


In [None]:
import torch
from torch.utils.data import Dataset
from PIL import Image
from io import BytesIO
import requests

class MultimodalDataset(Dataset):
    def __init__(self, df, transform=None, tokenizer=None, max_length=128):
        self.df = df.reset_index(drop=True)
        self.transform = transform
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # ----- Text -----
        text = str(row['reviews.text'])
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].squeeze(0)  # remove batch dim
        attention_mask = encoding['attention_mask'].squeeze(0)

        # ----- Image -----
        img = None
        url = row['reviews.sourceURLs']
        if isinstance(url, str) and url.strip() != "":
            try:
                response = requests.get(url, timeout=5)
                img = Image.open(BytesIO(response.content)).convert("RGB")
                if self.transform:
                    img = self.transform(img)
            except:
                img = torch.zeros(3, 224, 224)  # fallback blank image
        else:
            img = torch.zeros(3, 224, 224)

     
        label = torch.tensor(row['sentiment'], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'image': img,
            'label': label
        }


In [6]:
train_dataset = MultimodalDataset(train_df, transform=image_transforms, tokenizer=tokenizer)
val_dataset   = MultimodalDataset(val_df, transform=image_transforms, tokenizer=tokenizer)
test_dataset  = MultimodalDataset(test_df, transform=image_transforms, tokenizer=tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=8, shuffle=False)
test_loader  = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [None]:

class MultimodalSentimentModel(nn.Module):
    def __init__(self, num_classes=3):
        super().__init__()
        # Text Model
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.text_fc = nn.Linear(768, 256)

        # Image Model
        self.resnet = models.resnet50(pretrained=True)
        self.resnet.fc = nn.Linear(self.resnet.fc.in_features, 256)

        # Fusion
        self.fc1 = nn.Linear(256+256, 128)
        self.dropout = nn.Dropout(0.3)
        self.out = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask, images):
        # Text forward
        text_emb = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        text_feat = self.text_fc(text_emb)

        # Image forward
        img_feat = self.resnet(images)

        # Concatenate
        combined = torch.cat((text_feat, img_feat), dim=1)
        x = self.fc1(combined)
        x = self.dropout(x)
        out = self.out(x)
        return out


In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalSentimentModel().to(device)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

EPOCHS = 3  




In [None]:
train_loss_list, val_loss_list = [], []
train_acc_list, val_acc_list = [], []

for epoch in range(EPOCHS):
    model.train()
    total_loss, correct = 0, 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        correct += (outputs.argmax(1) == labels).sum().item()

    train_loss = total_loss / len(train_loader)
    train_acc = correct / len(train_dataset)
    train_loss_list.append(train_loss)
    train_acc_list.append(train_acc)

    # Validation
    model.eval()
    val_loss, val_correct = 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask, images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_correct += (outputs.argmax(1) == labels).sum().item()

    val_loss_list.append(val_loss/len(val_loader))
    val_acc_list.append(val_correct/len(val_dataset))

    print(f"Epoch {epoch+1}/{EPOCHS}: "
          f"Train Loss={train_loss:.4f}, Train Acc={train_acc:.4f} | "
          f"Val Loss={val_loss_list[-1]:.4f}, Val Acc={val_acc_list[-1]:.4f}")


Epoch 1/3: Train Loss=0.4814, Train Acc=0.8675 | Val Loss=1.5684, Val Acc=0.8750


In [None]:

torch.save(model.state_dict(), "../models/multimodal_model.pth")


In [None]:

plt.figure(figsize=(8,5))
plt.plot(train_loss_list, label="Train Loss")
plt.plot(val_loss_list, label="Val Loss")
plt.title("Loss vs Epoch")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

plt.figure(figsize=(8,5))
plt.plot(train_acc_list, label="Train Accuracy")
plt.plot(val_acc_list, label="Val Accuracy")
plt.title("Accuracy vs Epoch")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()


In [None]:

model.eval()
y_true, y_pred = [], []

with torch.no_grad():
    for input_ids, attention_mask, images, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        images = images.to(device)
        labels = labels.to(device)

        outputs = model(input_ids, attention_mask, images)
        preds = outputs.argmax(1)
        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

cm = confusion_matrix(y_true, y_pred, labels=[0,1,2])
disp = ConfusionMatrixDisplay(cm, display_labels=["Negative","Neutral","Positive"])
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.show()


In [None]:

for i in range(5):
    print("Review Text:", test_df.iloc[i]['reviews.text'])
    print("True Sentiment:", test_df.iloc[i]['sentiment'])
    print("Predicted Sentiment:", y_pred[i])
    print("-"*60)


In [None]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask, images)
        preds = torch.argmax(outputs, dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())


accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='weighted')
recall = recall_score(all_labels, all_preds, average='weighted')
f1 = f1_score(all_labels, all_preds, average='weighted')


print("Model Performance on Test Set:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")
