In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.metrics import mean_squared_error
%load_ext autoreload
%autoreload 2

In [2]:
DATA_PATH = 'Data/'
books = pd.read_csv(DATA_PATH + 'books.csv')
sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
test = pd.read_csv(DATA_PATH + 'test.csv')
train = pd.read_csv(DATA_PATH + 'train.csv')

In [3]:
sample_submission.head()

Unnamed: 0,id,rating
0,0,3.500829
1,1,1.276242
2,2,2.945291
3,3,3.264039
4,4,1.0


In [4]:
print(f"Books df shape: {books.shape}")
print(f"Train df shape: {train.shape}")

Books df shape: (16599, 2)
Train df shape: (100523, 3)


In [5]:
test = test.merge(sample_submission, on='id', how='left')
#test = test.merge(books, on='book_id', how='left')
print(f"Test ratings df shape: {test.shape}")
display(test.head())

Test ratings df shape: (29367, 4)


Unnamed: 0,id,book_id,user_id,rating
0,0,3786,40484,3.500829
1,1,1985,47039,1.276242
2,2,2290,60111,2.945291
3,3,118657,64447,3.264039
4,4,1560,2953,1.0


In [6]:
train.head()

Unnamed: 0,book_id,user_id,rating
0,7260,20145,3.5
1,243238,85182,4.0
2,9135,45973,1.0
3,18671,63554,3.0
4,243293,81002,5.0


# Simple ML models:

## Linear Regression

In [7]:
X_test = test[['book_id', 'user_id']].to_numpy()
y_test = test['rating'].to_numpy()

X_train = train[['book_id', 'user_id']].to_numpy()
y_train = train['rating'].to_numpy()

In [8]:
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [9]:
## Mean squared error
mse = 0
for i in range(len(y_pred)):
    mse += (y_pred[i] - y_test[i])**2
mse /= len(y_pred)
print(f"Mean squared error: {mse}")

Mean squared error: 1.1677357358426437


In [10]:
result_lr = pd.DataFrame({'id': test['id'], 'rating': y_pred})
result_lr.to_csv('submission_lr.csv', index=False) # Kaggle: 0.90800 (good start)

# Matrix Factorization (Collaborative Filtering) --> to be optimized

In [11]:
import torch

In [12]:
train_df = train.copy()

book_mapping = {book_id: idx for idx, book_id in enumerate(train_df['book_id'].unique())}
user_mapping = {user_id: idx for idx, user_id in enumerate(train_df['user_id'].unique())}

train_df['book_idx'] = train_df['book_id'].map(book_mapping)
train_df['user_idx'] = train_df['user_id'].map(user_mapping)

test_df = test.copy()
test_df['book_idx'] = test_df['book_id'].map(book_mapping)
test_df['user_idx'] = test_df['user_id'].map(user_mapping)

In [13]:
import torch.nn as nn

class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_books, embedding_dim):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.book_embedding = nn.Embedding(num_books, embedding_dim)
    
    def forward(self, user_idx, book_idx):
        # Compute dot product of user and book latent factors
        user_factors = self.user_embedding(user_idx)
        book_factors = self.book_embedding(book_idx)
        return (user_factors * book_factors).sum(1)

In [14]:
embedding_dim = 50
num_users = len(user_mapping)
num_books = len(book_mapping)
learning_rate = 0.01
epochs = 10

model = MatrixFactorization(num_users, num_books, embedding_dim)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

train_users = torch.tensor(train_df['user_idx'].values, dtype=torch.long)
train_books = torch.tensor(train_df['book_idx'].values, dtype=torch.long)
train_ratings = torch.tensor(train_df['rating'].values, dtype=torch.float32)

for epoch in range(epochs):
    model.train()
    
    optimizer.zero_grad()
    
    predictions = model(train_users, train_books)
    loss = criterion(predictions, train_ratings)

    loss.backward()
    optimizer.step()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")


Epoch 1, Loss: 56.6036
Epoch 2, Loss: 56.5995
Epoch 3, Loss: 56.5954
Epoch 4, Loss: 56.5913
Epoch 5, Loss: 56.5872
Epoch 6, Loss: 56.5831
Epoch 7, Loss: 56.5790
Epoch 8, Loss: 56.5749
Epoch 9, Loss: 56.5708
Epoch 10, Loss: 56.5667


In [15]:
test_users = torch.tensor(test_df['user_idx'].values, dtype=torch.long)
test_books = torch.tensor(test_df['book_idx'].values, dtype=torch.long)

model.eval()
with torch.no_grad():
    test_predictions = model(test_users, test_books)

test_df['predicted_rating'] = test_predictions.numpy()

In [16]:
# Replace values that are <1 or >5 with 1 and 5, respectively, in test_predictions
test_predictions = test_predictions.clamp(1, 5)

In [17]:
test_users = torch.tensor(test_df['user_idx'].values, dtype=torch.long)
test_books = torch.tensor(test_df['book_idx'].values, dtype=torch.long)
test_ratings = torch.tensor(test_df['rating'].values, dtype=torch.float32)

model.eval()  
with torch.no_grad():  
    test_predictions = model(test_users, test_books)
    
criterion = nn.MSELoss()
mse = criterion(test_predictions, test_ratings)
print(f"Test MSE: {mse.item():.4f}")

Test MSE: 57.5657


In [18]:
result = pd.DataFrame({'id': test["id"], 'rating': test_predictions})
result.to_csv('submission.csv', index=False) # Kaggle: 1.97917 (Not good at all)