In [None]:
# Download the dataset
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip

# Unzip the dataset
!unzip datasetb2d9982.zip

In [None]:
# Install necessary libraries
!pip install torch transformers pandas numpy

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertModel, DistilBertTokenizerFast
import pandas as pd
import numpy as np

In [None]:
# Where the extracted CSVs live (the file names inside the zip are assumed)
train_file_path, test_file_path = 'train.csv', 'test.csv'

# Parse each split into its own DataFrame
train_data = pd.read_csv(filepath_or_buffer=train_file_path)
test_data = pd.read_csv(filepath_or_buffer=test_file_path)

In [None]:
# Fill missing values for text columns in BOTH train and test data, so the
# string concatenation done by the dataset never encounters a NaN float.
for frame in (train_data, test_data):
    for text_col in ('TITLE', 'DESCRIPTION', 'BULLET_POINTS'):
        frame[text_col] = frame[text_col].fillna('')

# Handle missing categorical values: impute with the most frequent value seen
# in training, and reuse that same value for the test split so both splits
# are imputed consistently (the mode is never computed from test data).
product_type_mode = train_data['PRODUCT_TYPE_ID'].mode()[0]
train_data['PRODUCT_TYPE_ID'] = train_data['PRODUCT_TYPE_ID'].fillna(product_type_mode)
test_data['PRODUCT_TYPE_ID'] = test_data['PRODUCT_TYPE_ID'].fillna(product_type_mode)

In [None]:
# Fetch the fast DistilBERT tokenizer for the uncased base checkpoint
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

In [None]:
class ProductDataset(Dataset):
    """Dataset yielding tokenized product text, product type and (optionally) length.

    Args:
        data: pandas DataFrame with 'TITLE', 'DESCRIPTION', 'BULLET_POINTS'
            and 'PRODUCT_TYPE_ID' columns. 'PRODUCT_LENGTH' is optional so
            the same class can serve unlabeled data at inference time.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` that returns a mapping
            with 'input_ids' and 'attention_mask' tensors.
        max_len: length every sequence is padded/truncated to.
    """

    # Text columns, in the order they are joined into a single input string.
    TEXT_COLUMNS = ('TITLE', 'DESCRIPTION', 'BULLET_POINTS')

    def __init__(self, data, tokenizer, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.data)

    @staticmethod
    def _as_text(value):
        """Return ``value`` as a string, mapping missing values (None/NaN) to ''."""
        return '' if pd.isna(value) else str(value)

    def __getitem__(self, idx):
        """Build the model inputs for row ``idx``.

        Returns:
            dict with 'input_ids' and 'attention_mask' (the tokenizer output
            with its leading batch dimension removed), 'product_type' (long
            scalar tensor) and, only when the frame has a 'PRODUCT_LENGTH'
            column, 'length' (float scalar tensor).
        """
        product = self.data.iloc[idx]

        # Concatenate text data; missing cells become empty strings so a NaN
        # float never reaches the string join.
        text = ' '.join(self._as_text(product[col]) for col in self.TEXT_COLUMNS)

        # Tokenize text data
        encoding = self.tokenizer(
            text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            # squeeze(0) removes only the batch dimension added by
            # return_tensors='pt'; a bare squeeze() would also drop the
            # sequence dimension when max_len == 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'product_type': torch.tensor(product['PRODUCT_TYPE_ID'], dtype=torch.long),
        }

        # The target is absent from unlabeled frames; emit it only when present
        # instead of raising KeyError.
        if 'PRODUCT_LENGTH' in product.index:
            item['length'] = torch.tensor(product['PRODUCT_LENGTH'], dtype=torch.float)

        return item

# Wrap the training frame in a dataset and batch it (32 rows per batch,
# reshuffled on every pass)
train_dataset = ProductDataset(data=train_data, tokenizer=tokenizer)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)

In [None]:
class ProductLengthPredictor(nn.Module):
    """Regress a single value from encoder text features plus the product type.

    The first-token hidden state of the encoder is concatenated with the
    product type (used as one numeric feature) and passed through a small MLP.

    Args:
        bert_model: encoder module called as
            ``bert_model(input_ids=..., attention_mask=...)`` whose result
            exposes ``last_hidden_state``. Its ``config.hidden_size`` is used
            as the feature width when available; otherwise 768 is assumed.
    """

    def __init__(self, bert_model):
        super(ProductLengthPredictor, self).__init__()
        self.bert = bert_model
        # Read the feature width from the encoder instead of hard-coding it, so
        # encoders with another hidden size also work; 768 stays the fallback.
        hidden_size = getattr(getattr(bert_model, 'config', None), 'hidden_size', 768)
        self.fc1 = nn.Linear(hidden_size + 1, 128)  # encoder features + 1 for product_type
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, input_ids, attention_mask, product_type):
        """Return predictions of shape (batch, 1).

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: attention mask passed to the encoder.
            product_type: 1-D tensor with one value per batch row; any numeric
                dtype is accepted and cast to the encoder output dtype.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = bert_output.last_hidden_state[:, 0]  # Take [CLS] token output
        # Cast so a long-typed product_type (as produced by the dataset) can be
        # concatenated with the float features without relying on the caller.
        type_feature = product_type.unsqueeze(1).to(pooled_output.dtype)
        x = torch.cat((pooled_output, type_feature), dim=1)  # Concatenate with product type
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Pull the pre-trained DistilBERT weights (uncased base checkpoint)
bert_model = DistilBertModel.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

In [None]:
# Prefer CUDA when torch can see a GPU, otherwise stay on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# Build the regressor around the pre-trained encoder, then place it on the chosen device
model = ProductLengthPredictor(bert_model)
model = model.to(device)

In [None]:
# Mean-squared-error objective, optimized with AdamW over every model parameter
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-5)

In [None]:
# Training loop (training only: no validation split is evaluated here)
epochs = 5
for epoch in range(epochs):
    model.train()
    total_loss = 0
    
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        product_type = batch['product_type'].to(device, dtype=torch.float)
        length = batch['length'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(input_ids, attention_mask, product_type)
        # squeeze(-1) drops only the trailing feature dimension. A bare
        # squeeze() would also drop the batch dimension when the last batch
        # holds a single row, giving a 0-d prediction against a 1-d target.
        loss = criterion(outputs.squeeze(-1), length)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Report the mean per-batch loss for this epoch
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss / len(train_loader)}")

In [None]:
# Batch the test frame in its original order so predictions line up with rows
test_dataset = ProductDataset(test_data, tokenizer)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

# Switch to evaluation mode before running inference
model.eval()
predictions = []

# Gradients are not needed for inference
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        product_type = batch['product_type'].to(device, dtype=torch.float)

        outputs = model(input_ids, attention_mask, product_type)
        # Flatten the batch output to 1-D on the CPU and append each value
        batch_values = outputs.cpu().numpy().reshape(-1)
        predictions.extend(batch_values)

# Attach the predictions to the test frame and export the id/prediction pair
test_data['PREDICTED_PRODUCT_LENGTH'] = predictions
test_data.to_csv(
    'predictions.csv',
    columns=['PRODUCT_ID', 'PREDICTED_PRODUCT_LENGTH'],
    index=False,
)

In [None]:
# Offer predictions.csv as a browser download when running in Google Colab.
# google.colab exists only inside Colab, so the import is guarded to keep a
# full "Run All" from failing in other Jupyter environments.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of predictions.csv")
else:
    files.download('predictions.csv')