In [12]:
from google.colab import drive
# Mount Google Drive at /content/drive so that later cells can write
# their output files underneath that path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the MovieLens 100K ratings file (tab-separated; column names are
# supplied explicitly) and discard the timestamp column right away.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'https://files.grouplens.org/datasets/movielens/ml-100k/u.data',
    sep='\t',
    names=column_names,
).drop(columns='timestamp')

# Hold out 20% of the ratings for testing; the fixed random_state keeps the
# split the same on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [None]:
import os
import torch
from torch import nn, optim
import pandas as pd
import numpy as np

# Step 1: Remap user_id and item_id to a zero-based index.
# The mappings are built from the training split only, so every training id
# gets an index in 0..n-1 and ids that appear only in the test split stay
# unmapped (NaN after .map()).
# NOTE: this cell rebinds train_df/test_df to the remapped frames, so it is
# not idempotent -- re-run the split cell before re-running this one,
# otherwise the mappings are rebuilt from already-remapped ids.
user_mapping = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}
item_mapping = {item_id: idx for idx, item_id in enumerate(train_df['item_id'].unique())}

# .assign() returns new frames instead of writing columns into the frames
# handed back by train_test_split, which can raise SettingWithCopyWarning on
# pandas versions without Copy-on-Write.
train_df = train_df.assign(
    user_id=train_df['user_id'].map(user_mapping),
    item_id=train_df['item_id'].map(item_mapping),
)
test_df = test_df.assign(
    user_id=test_df['user_id'].map(user_mapping),
    item_id=test_df['item_id'].map(item_mapping),
)

# Remove any rows in test_df with unmapped (NaN) user or item IDs, then
# restore integer dtype (the NaNs forced those columns to float).
test_df = test_df.dropna().astype({'user_id': 'int', 'item_id': 'int'})

class MatrixFactorization(nn.Module):
    """Dot-product matrix-factorization model.

    Every user and every item owns a learnable vector of size ``latent_dim``;
    the score for a (user, item) pair is the dot product of the two vectors.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        latent_dim: Size of each embedding vector (default 10).
    """

    def __init__(self, num_users, num_items, latent_dim=10):
        super().__init__()
        # The user table is created before the item table; keep this order so
        # parameter registration and random initialisation are unchanged.
        self.user_factors = nn.Embedding(num_users, latent_dim)
        self.item_factors = nn.Embedding(num_items, latent_dim)

    def forward(self, user, item):
        """Return the predicted score for each (user, item) index pair.

        Args:
            user: Integer index tensor into the user embedding table.
            item: Integer index tensor into the item embedding table,
                paired element-wise with ``user``.

        Returns:
            The element-wise product of the looked-up embeddings summed over
            dimension 1 (one score per pair for 1-D index tensors).
        """
        user_vecs = self.user_factors(user)
        item_vecs = self.item_factors(item)
        return (user_vecs * item_vecs).sum(dim=1)

def matrix_factorization(train_data, test_data, output_dir, latent_dim=10,
                         epochs=20, lr=0.01, weight_decay=1e-5, batch_size=1):
    """Train a matrix-factorization model and save test-set predictions as CSV.

    The model is fitted with SGD on an MSE loss over ``train_data``, then used
    to score every row of ``test_data``. Predictions are written to
    ``matrix_factorization_predictions.csv`` inside ``output_dir`` with the
    columns ``user_id``, ``item_id`` and ``predicted_rating``; the ids in the
    file are translated back to their original values through the
    module-level ``user_mapping`` and ``item_mapping`` dictionaries.

    Args:
        train_data: DataFrame with ``user_id``, ``item_id`` and ``rating``
            columns. The ids must be zero-based contiguous indices, because
            the embedding tables are sized by the number of distinct ids.
        test_data: DataFrame with ``user_id`` and ``item_id`` columns using
            the same index space as ``train_data``.
        output_dir: Directory the CSV file is written to (with or without a
            trailing separator).
        latent_dim: Size of the user/item embedding vectors.
        epochs: Number of full passes over ``train_data``.
        lr: SGD learning rate.
        weight_decay: L2 penalty passed to the optimizer.
        batch_size: Number of ratings per optimizer step. The default of 1
            performs one update per rating, in row order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a test id has no entry in the module-level mappings.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    num_users = train_data['user_id'].nunique()
    num_items = train_data['item_id'].nunique()
    model = MatrixFactorization(num_users, num_items, latent_dim=latent_dim)
    optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    # Convert the training columns to tensors once; iterating the frame with
    # iterrows() inside the training loop is far slower than slicing tensors.
    train_users = torch.tensor(train_data['user_id'].to_numpy(), dtype=torch.long)
    train_items = torch.tensor(train_data['item_id'].to_numpy(), dtype=torch.long)
    train_ratings = torch.tensor(train_data['rating'].to_numpy(), dtype=torch.float32)
    num_samples = len(train_ratings)

    # Training: rows are visited in frame order, batch_size rows per step.
    for _ in range(epochs):
        for start in range(0, num_samples, batch_size):
            batch = slice(start, start + batch_size)
            prediction = model(train_users[batch], train_items[batch])
            loss = criterion(prediction, train_ratings[batch])

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

    # Prediction: one batched forward pass; no gradients are needed here.
    test_users = torch.tensor(test_data['user_id'].to_numpy(), dtype=torch.long)
    test_items = torch.tensor(test_data['item_id'].to_numpy(), dtype=torch.long)
    with torch.no_grad():
        predicted_ratings = model(test_users, test_items).tolist()

    # Invert the id mappings once instead of searching them for every row.
    index_to_user = {idx: user_id for user_id, idx in user_mapping.items()}
    index_to_item = {idx: item_id for item_id, idx in item_mapping.items()}
    predictions = pd.DataFrame({
        'user_id': [index_to_user[idx] for idx in test_users.tolist()],
        'item_id': [index_to_item[idx] for idx in test_items.tolist()],
        'predicted_rating': predicted_ratings,
    })

    # Save predictions; os.path.join works whether or not output_dir ends
    # with a path separator.
    predictions.to_csv(
        os.path.join(output_dir, 'matrix_factorization_predictions.csv'), index=False
    )

# Define output directory and run the function
# The directory lives under the Google Drive mount point; it is created if
# it does not exist yet (exist_ok=True makes re-running this cell safe).
output_dir = '/content/drive/MyDrive/submission/'
os.makedirs(output_dir, exist_ok=True)
# Trains on train_df and writes the predictions for test_df into output_dir.
matrix_factorization(train_df, test_df, output_dir)