# Implementation of a Neural Collaborative Filtering (NCF)

Paper: https://arxiv.org/pdf/1708.05031


In [None]:
from helpers_ncf import *
import numpy as np 
from torch.utils.data import DataLoader
import torch.optim as optim

%load_ext autoreload
%autoreload 2

In [None]:
# Check if a GPU is available, if so use it
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [None]:
# Load the data
test, train_data = load_data()

Unnamed: 0,book_id,user_id,rating
0,0,0,3.5
1,1,1,4.0
2,2,2,1.0
3,3,3,3.0
4,4,4,5.0


Unnamed: 0,id,book_id,user_id
0,0,3786,40484
1,1,1985,47039
2,2,2290,60111
3,3,118657,64447
4,4,1560,2953


Unnamed: 0,id,book_id,user_id
0,0,123,4104
1,1,1478,5851
2,2,424,8720
3,3,14224,574
4,4,104,73


Let us check if there is users or books present in the test set but not in the train set (= cold start):


In [None]:
users_tr = train_data['user_id'].unique()
print(f"Number of unique users in train: {len(users_tr)}")
users_te = test['user_id'].unique()
print(f"Number of unique users in test: {len(users_te)}")

# Find intersection of users in train and test
inter_users = list(set(users_tr) & set(users_te))

print(f"Number of overlapping users: {len(inter_users)}") # all users in test are in train

Number of unique users in train: 18905
Number of unique users in test: 6519
Number of overlapping users: 6519


In [None]:
books_tr = train_data['book_id'].unique()
print(f"Number of unique books in train: {len(books_tr)}")
books_te = test['book_id'].unique()
print(f"Number of unique books in test: {len(books_te)}")

# Find intersection of users in train and test
inter_books = list(set(books_tr) & set(books_te))

print(f"Number of overlapping users: {len(inter_books)}") # all books in test are in train

Number of unique books in train: 15712
Number of unique books in test: 9371
Number of overlapping users: 9371


We are all good so we can keep going with those datasets.


In [None]:
# Remap user and book IDs in the training data
train, user_mapping, book_mapping = remap_ids(train_data, 'user_id', 'book_id')
display(train.head(), test.head())
# Use this mapping for the testing data 
test['user_id'] = test['user_id'].map(user_mapping).fillna(-1).astype(int)
test['book_id'] = test['book_id'].map(book_mapping).fillna(-1).astype(int)

In [None]:
# We need to reshape our columns for the neural network to work
user_ids_train = np.array(train['user_id']).reshape(-1, 1)
book_ids_train = np.array(train['book_id']).reshape(-1, 1)
ratings_train = np.array(train['rating']).reshape(-1,1)

user_ids_test = np.array(test['user_id']).reshape(-1, 1)
book_ids_test = np.array(test['book_id']).reshape(-1, 1)

In [None]:
# Change the ratings to values between 0 and 1 for the neural network to work
min_rating = train['rating'].min()
max_rating = train['rating'].max()

train['rating'] = (train['rating'] - min_rating) / (max_rating - min_rating)
display(train.head())

In [None]:
#  Hyper-parameters fine-tuned
embedding_dim = 16
hidden_dims = [64, 32]
batch_size = 2
epochs = 10
learning_rate = 0.001

In [31]:
data = TrainSet(user_ids_train, book_ids_train, train['rating'])
dataloader = DataLoader(data, batch_size = batch_size, shuffle=True)

num_users = len(user_mapping)
num_books = len(book_mapping)

In [None]:
model = NeuralCollaborativeFiltering(num_users, num_books, embedding_dim, hidden_dims).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        user = batch['user_id'].to(device)
        book = batch['book_id'].to(device)
        rating = batch['rating'].to(device)
        optimizer.zero_grad()
        predictions = model(user, book).squeeze()
        loss = criterion(predictions, rating.squeeze())
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")

Epoch 1/10, Loss: 31444.5318
Epoch 2/10, Loss: 29530.1396
Epoch 3/10, Loss: 28490.2395
Epoch 4/10, Loss: 27973.2530
Epoch 5/10, Loss: 27550.6402
Epoch 6/10, Loss: 27239.4893
Epoch 7/10, Loss: 26939.4874
Epoch 8/10, Loss: 26644.5865
Epoch 9/10, Loss: 26318.8834
Epoch 10/10, Loss: 26011.0113


In [34]:
test_data = TestSet(user_ids_test, book_ids_test)
test_loader = DataLoader(test_data, batch_size=1, shuffle=False)

In [None]:
model.eval()
predictions = []
with torch.no_grad():
    for batch in test_loader:
        user = batch['user_id'].to(device)
        book = batch['book_id'].to(device)
        preds = model(user, book).squeeze()
        predictions.append(preds.tolist())
print(predictions)

[0.25948837399482727, 0.21061907708644867, 0.08420722186565399, 0.44519224762916565, 0.3641352355480194, 0.1040249615907669, 0.3295806050300598, 0.2164134532213211, 0.4319446086883545, 0.3705073595046997, 0.31160086393356323, 0.3543943464756012, 0.024826059117913246, 0.3079931139945984, 0.42375442385673523, 0.21959376335144043, 0.2007642537355423, 0.24715480208396912, 0.196157768368721, 0.7129548192024231, 0.2648475468158722, 0.7091618776321411, 0.4228413701057434, 0.3219766318798065, 0.062061309814453125, 0.23683381080627441, 0.6385611295700073, 0.5013186931610107, 0.1979992687702179, 0.49587640166282654, 0.16462215781211853, 0.09136646240949631, 0.2156210094690323, 0.3747619390487671, 0.3044148087501526, 0.27535298466682434, 0.3467695415019989, 0.6385611295700073, 0.19313566386699677, 0.06254376471042633, 0.426972359418869, 0.03618578240275383, 0.163033589720726, 0.41683125495910645, 0.03444671258330345, 0.5625030398368835, 0.32301798462867737, 0.24591098725795746, 0.3791537880897522

In [37]:
# Change back the predictions so that they are between 1 and 5 and not 0 and 1
MIN = 1
MAX = 5

new_pred_ = [(pred * (MAX - MIN)) + 1 for pred in predictions]
print(new_pred_)

[2.037953495979309, 1.8424763083457947, 1.336828887462616, 2.7807689905166626, 2.4565409421920776, 1.4160998463630676, 2.3183224201202393, 1.8656538128852844, 2.727778434753418, 2.482029438018799, 2.246403455734253, 2.417577385902405, 1.099304236471653, 2.2319724559783936, 2.695017695426941, 1.8783750534057617, 1.8030570149421692, 1.9886192083358765, 1.784631073474884, 3.8518192768096924, 2.0593901872634888, 3.8366475105285645, 2.6913654804229736, 2.287906527519226, 1.2482452392578125, 1.9473352432250977, 3.5542445182800293, 3.005274772644043, 1.7919970750808716, 2.983505606651306, 1.6584886312484741, 1.3654658496379852, 1.8624840378761292, 2.4990477561950684, 2.2176592350006104, 2.1014119386672974, 2.3870781660079956, 3.5542445182800293, 1.772542655467987, 1.2501750588417053, 2.707889437675476, 1.1447431296110153, 1.652134358882904, 2.667325019836426, 1.1377868503332138, 3.250012159347534, 2.2920719385147095, 1.9836439490318298, 2.516615152359009, 1.3364989757537842, 1.906910240650177

In [None]:
# Save the dataset for the submission on Kaggle
result = pd.DataFrame(({
    "id": test["id"],
    "rating": new_pred_
}))
display(result.head())
result.to_csv('submission_ncf.csv', index=False)

Unnamed: 0,id,rating
0,0,2.037953
1,1,1.842476
2,2,1.336829
3,3,2.780769
4,4,2.456541
