In [None]:
!pip install transformers
!pip install torch torchvision
!pip install kaggle


In [None]:
import os
os.environ['KAGGLE_CONFIG_DIR'] = '/content/'

!kaggle datasets download -d preethamaap/amazon-ml-challenge
!unzip amazon-ml-challenge.zip -d /content/amazon_dataset


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from PIL import Image
from torchvision import transforms
import pandas as pd
import os


In [None]:
class EcomDataset(Dataset):
    """Dataset pairing tokenized catalog text with a product image and a price.

    Each item is built from one row of ``df`` and uses these columns:

    * ``catalog_content`` - free text, tokenized with ``tokenizer``.
    * ``image_link`` - name of the image file inside ``img_dir``.
    * ``price`` - optional regression target; when the column is absent the
      item carries ``0.0`` instead.

    Args:
        df: pandas DataFrame with the columns listed above.
        img_dir: Directory containing the image files.
        tokenizer: Callable accepting ``(text, padding=..., truncation=...,
            max_length=..., return_tensors='pt')`` and returning a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every text is padded/truncated to.
        transform: Optional callable applied to the RGB PIL image. When it is
            ``None`` the item holds the PIL image itself.
    """

    def __init__(self, df, img_dir, tokenizer, max_len=128, transform=None):
        self.df = df
        self.img_dir = img_dir
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.df)

    def _image_path(self, image_link):
        """Map an ``image_link`` cell to a path inside ``img_dir``."""
        link = str(image_link)
        if '://' in link:
            # NOTE(review): a URL-valued link is reduced to its final path
            # component (query string dropped), on the assumption that the
            # image was saved under that name - confirm against the dataset.
            link = os.path.basename(link.split('?', 1)[0])
        return os.path.join(self.img_dir, link)

    def __getitem__(self, idx):
        """Build the sample at positional index ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs with their leading batch dimension squeezed away),
            ``'image'`` (the transformed image) and ``'price'`` (a scalar
            float tensor).
        """
        row = self.df.iloc[idx]

        # A missing description would otherwise be tokenized as the literal
        # string "nan"; feed the tokenizer an empty string instead.
        raw_text = row['catalog_content']
        text = '' if pd.isna(raw_text) else str(raw_text)
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )

        # Read the image from the dataset folder. The context manager closes
        # the file handle as soon as the pixel data has been converted.
        with Image.open(self._image_path(row['image_link'])) as raw_img:
            img = raw_img.convert('RGB')
        if self.transform:
            img = self.transform(img)

        # `in` on a row (a Series) tests the column labels, so this is False
        # for tables without a price column, such as an unlabeled test split.
        if 'price' in row:
            price = torch.tensor(float(row['price']), dtype=torch.float)
        else:
            price = torch.tensor(0.0)

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': img,
            'price': price
        }


In [None]:
# Preprocessing applied to every product image before it reaches the model:
# resize to 224x224, convert to a tensor, then normalize each RGB channel.
# The mean/std values are the ImageNet channel statistics conventionally
# paired with torchvision's pretrained backbones.
image_transform = transforms.Compose(
    [
        transforms.Resize(size=(224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)


In [None]:
from torch.utils.data import DataLoader

# Training table and the folder holding the extracted images.
df = pd.read_csv('/content/amazon_dataset/train.csv')  # Adjust if using test.csv
img_dir = '/content/amazon_dataset/images'  # Kaggle images folder

# Tokenizer for the DistilBERT checkpoint named here.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Wrap the table in the dataset and serve it in shuffled batches of 8.
dataset = EcomDataset(
    df=df,
    img_dir=img_dir,
    tokenizer=tokenizer,
    transform=image_transform,
)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:
import torch.nn as nn
from torchvision import models
from transformers import AutoModel

class MultiModalRegressor(nn.Module):
    """Regress one scalar per sample from a text sequence and an image.

    The text is encoded with a Hugging Face ``AutoModel`` and the image with a
    ResNet-50 whose classification layer is replaced by an identity. The two
    feature vectors are concatenated and passed through a small MLP head
    (``fc1`` -> ReLU -> dropout -> ``fc2``).

    Args:
        text_model_name: Checkpoint name or path handed to
            ``AutoModel.from_pretrained``.
    """

    def __init__(self, text_model_name='distilbert-base-uncased'):
        super().__init__()
        self.text_model = AutoModel.from_pretrained(text_model_name)
        self.text_hidden_size = self.text_model.config.hidden_size

        # Newer torchvision releases deprecate `pretrained=True` in favour of
        # the `weights=` enum; IMAGENET1K_V1 is the checkpoint that
        # `pretrained=True` loads. Fall back to the legacy flag on releases
        # that predate the enum.
        weights_enum = getattr(models, 'ResNet50_Weights', None)
        if weights_enum is not None:
            self.image_model = models.resnet50(weights=weights_enum.IMAGENET1K_V1)
        else:
            self.image_model = models.resnet50(pretrained=True)
        # Read the feature width from the layer being removed instead of
        # hard-coding it (2048 for ResNet-50).
        self.image_hidden_size = self.image_model.fc.in_features
        self.image_model.fc = nn.Identity()  # Remove classification layer

        self.fc1 = nn.Linear(self.text_hidden_size + self.image_hidden_size, 512)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(512,1)

    def forward(self, input_ids, attention_mask, image):
        """Predict one value per sample.

        Args:
            input_ids: Token ids forwarded to the text model.
            attention_mask: Attention mask forwarded to the text model.
            image: Image batch forwarded to the image backbone.

        Returns:
            Tensor of predictions with the trailing size-1 dimension removed.
        """
        text_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at the first sequence position (the CLS token)
        # as the text representation.
        text_feat = text_out.last_hidden_state[:,0,:]

        image_feat = self.image_model(image)

        combined = torch.cat([text_feat, image_feat], dim=1)
        x = self.fc1(combined)
        x = self.relu(x)
        x = self.dropout(x)
        out = self.fc2(x)
        return out.squeeze(1)


In [None]:
# Train on the GPU when PyTorch can see one, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MultiModalRegressor().to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

epochs = 3  # Increase as needed
model.train()

for epoch in range(epochs):
    total_loss = 0
    for batch in dataloader:
        # Move each tensor of the batch onto the training device.
        input_ids, attention_mask, images, prices = (
            batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'image', 'price')
        )

        # One optimization step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask, images)
        loss = criterion(outputs, prices)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # Report the mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(dataloader):.4f}")


In [None]:
# Persist only the model's state dict (its parameters and buffers), written to
# the current working directory.
state_dict = model.state_dict()
torch.save(state_dict, 'multi_modal_model.pth')
