# Neural Network for Collaborative Filtering

In [None]:
# Mount Google Drive at /content/drive. The submission CSV is later written
# under the relative path drive/MyDrive/... (presumably resolved from /content,
# the usual Colab working directory -- TODO confirm).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists
cil-collaborative-filtering-2022.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  cil-collaborative-filtering-2022.zip


In [None]:
#!pip install scikeras

In [None]:
# keras-hypetune provides the KerasGridSearch used for hyperparameter tuning below.
# NOTE(review): the version is not pinned, so results may differ between runs.
!pip install --upgrade keras-hypetune

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Imports

In [None]:
import pandas as pd
import numpy as np

from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout, Flatten, Lambda
from keras.models import Model

from keras.callbacks import *

%tensorflow_version 2.x
import tensorflow as tf

#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

#from scikeras.wrappers import KerasRegressor

from kerashypetune import *

In [None]:
# Record the TensorFlow version this run was executed with.
print(f'TF VERSION = {tf.__version__}')

TF VERSION = 2.8.2


## Functions

In [None]:
def extract_users_items_predictions(data_pd):
    """Split a ratings frame into zero-based user/movie indices and ratings.

    Parameters
    ----------
    data_pd : pandas.DataFrame
        Needs an ``Id`` column of strings containing ``r<number>_c<number>``
        and a ``Prediction`` column.

    Returns
    -------
    users : numpy.ndarray
        1-D integer array: the number after ``r`` minus one.
    movies : numpy.ndarray
        1-D integer array: the number after ``c`` minus one.
    predictions : numpy.ndarray
        ``data_pd.Prediction.values``, unchanged.
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape sequence.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column indexing (instead of split + squeeze) keeps both results 1-D even
    # when the frame has a single row.
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

def submission_from_separate_predictions(sub_pd, sub_pred, sub_user_data, sub_movie_data, filename):
    """Write predictions to ``drive/MyDrive/cil_data/sub/submission_<filename>.csv``.

    Parameters
    ----------
    sub_pd : pandas.DataFrame
        Submission template with an ``Id`` column; it is copied, not modified.
    sub_pred : array-like
        Predictions in the row order of ``sub_pd``. For a 2-D input the first
        column is used (as before); a 1-D input is used as is.
    sub_user_data, sub_movie_data
        Not used: the ids written to the file come from ``sub_pd['Id']``.
        Kept in the signature so existing calls keep working.
    filename
        Suffix of the output file name.

    Raises
    ------
    ValueError
        Raised by pandas if the number of predictions differs from ``len(sub_pd)``.
    """
    from pathlib import Path  # local import: the function stays self-contained

    # One vectorised slice instead of a Python loop over every prediction.
    preds = np.asarray(sub_pred)
    if preds.ndim > 1:
        preds = preds[:, 0]

    submission = sub_pd.copy()
    submission['Prediction'] = preds
    submission = submission.set_index("Id")

    # Preview up to three rows; sample(3) alone fails on frames with fewer rows.
    print(submission.sample(min(3, len(submission))))

    out_path = Path('drive/MyDrive/cil_data/sub') / ('submission_' + str(filename) + '.csv')
    # Create the target folder on first use instead of failing in to_csv.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    submission.to_csv(out_path)

## Input

In [None]:
# Parse the training ratings into zero-based user/movie indices plus the rating.
data_pd = pd.read_csv('data_train.csv')
user_idx, movie_idx, ratings = extract_users_items_predictions(data_pd)

df = pd.DataFrame({'User': user_idx, 'Rating': ratings, 'Movie': movie_idx})

# The separate arrays are no longer needed once they live in the frame.
del user_idx, movie_idx, ratings

df.sample(3)

Unnamed: 0,User,Rating,Movie
51099,3758,5,42
476228,1784,2,410
998571,9206,5,788


In [None]:
# 'Id' stays a regular column here: submission_from_separate_predictions
# sets it as the index itself right before writing the CSV.
sub_pd = pd.read_csv('sampleSubmission.csv')
user_idx, movie_idx, sub_ratings = extract_users_items_predictions(sub_pd)

df_sub = pd.DataFrame({'User': user_idx, 'Rating': sub_ratings, 'Movie': movie_idx})

del user_idx, movie_idx, sub_ratings

# Zero-based ids of the (user, movie) pairs a prediction is requested for.
sub_user_data, sub_movie_data = df_sub['User'], df_sub['Movie']

df_sub.sample(3)

Unnamed: 0,User,Rating,Movie
613933,4566,3,521
311041,9001,3,255
1078677,5627,3,875


## Pre-processing

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split

# Fraction of the ratings used for training; the remainder is held out.
train_size = 0.85

df = df.reset_index(drop=True)
print(df.shape)

# Fixed random_state so the split is the same on every run.
df_train, df_test = train_test_split(df, train_size=train_size, random_state=42)
print(df_train.shape, df_test.shape, sep='\n')

(1176952, 3)
(1000409, 3)
(176543, 3)


### Mapping to users/movies

In [None]:
# Dense zero-based index for every distinct user / movie id in df, numbered in
# order of first appearance (so it is NOT necessarily the identity mapping).
user_id_mapping = {raw_id: idx for idx, raw_id in enumerate(df['User'].unique())}
movie_id_mapping = {raw_id: idx for idx, raw_id in enumerate(df['Movie'].unique())}

# Translate both splits into embedding indices.
train_user_data = df_train['User'].map(user_id_mapping)
test_user_data = df_test['User'].map(user_id_mapping)

train_movie_data = df_train['Movie'].map(movie_id_mapping)
test_movie_data = df_test['Movie'].map(movie_id_mapping)

# Vocabulary sizes, used as input_dim of the embedding layers.
users = len(user_id_mapping)
movies = len(movie_id_mapping)

print(f"Number of users: {users}")
print(f"Number of movies: {movies}")

Number of users: 10000
Number of movies: 1000


In [None]:
# The former bare `<series>.to_numpy()` statements discarded their result and
# therefore did nothing; the conversion now happens explicitly where the
# arrays are actually built.

# --- training split ---
print("Train user data shape: " + str(train_user_data.shape))
print("Train movie data shape: " + str(train_movie_data.shape))

# Double brackets keep the ratings 2-D: one column, one row per rating.
train_ratings_data = df_train[['Rating']].to_numpy()
print("Train ratings data shape: " + str(train_ratings_data.shape))

# Model input: column 0 = user index, column 1 = movie index.
train_data = np.stack([train_user_data.to_numpy(), train_movie_data.to_numpy()], axis=-1)
print("Train data shape: " + str(train_data.shape))

# --- held-out split ---
print("Test user data shape: " + str(test_user_data.shape))
print("Test movie data shape: " + str(test_movie_data.shape))

test_ratings_data = df_test[['Rating']].to_numpy()
print("Test ratings data shape: " + str(test_ratings_data.shape))

test_data = np.stack([test_user_data.to_numpy(), test_movie_data.to_numpy()], axis=-1)
print("Test data shape: " + str(test_data.shape))

Train user data shape: (1000409,)
Train movie data shape: (1000409,)
Train ratings data shape: (1000409, 1)
Train data shape: (1000409, 2)
Test user data shape: (176543,)
Test movie data shape: (176543,)
Test ratings data shape: (176543, 1)
Test data shape: (176543, 2)


In [None]:
# The model is trained on indices produced by user_id_mapping / movie_id_mapping
# (numbered by first appearance in df), so the submission ids must go through
# the same mappings. Feeding the raw ids would look up the wrong embedding rows.
# sub_user_data / sub_movie_data themselves are left untouched.
sub_user_idx = sub_user_data.map(user_id_mapping)
sub_movie_idx = sub_movie_data.map(movie_id_mapping)

# An id that never occurred in df has no embedding row; fail loudly instead of
# passing NaN to the model.
assert sub_user_idx.notna().all() and sub_movie_idx.notna().all(), \
    "submission contains users or movies that are missing from the training data"

# Same layout as train_data: column 0 = user index, column 1 = movie index.
sub_data = np.stack([sub_user_idx.to_numpy(), sub_movie_idx.to_numpy()], axis=-1)

## Model

Hyperparameters are tuned with a grid search (`KerasGridSearch`) from the `keras-hypetune` package.

In [None]:
def get_model(param):
    """Build and compile the rating-prediction network for one hyperparameter set.

    The model takes a ``(batch, 2)`` input whose column 0 is the user index and
    column 1 the movie index. Each index is embedded, the two embeddings are
    concatenated and passed through three ReLU layers of ``max_units``,
    ``max_units // 2`` and ``max_units // 4`` units (each followed by 20 %
    dropout), a 12-unit ReLU layer and a single linear output unit.

    Parameters
    ----------
    param : dict
        Must contain ``'user_embedding_size'``, ``'movie_embedding_size'``,
        ``'max_units'`` and ``'lr'``.

    Returns
    -------
    keras.models.Model
        Compiled with mean squared error loss, the Adam optimizer and a root
        mean squared error metric.

    Notes
    -----
    Reads the module-level ``users`` and ``movies`` for the embedding
    vocabulary sizes.
    """
    combined_input = Input((2,))

    # Column 0 of the input holds the user index.
    user_id_input = Lambda(lambda x: tf.expand_dims(x[:, 0], -1))(combined_input)
    u = Embedding(input_dim=users, output_dim=param['user_embedding_size'], name="users_embedding")(user_id_input)
    u = Flatten()(u)

    # Column 1 of the input holds the movie index.
    movie_id_input = Lambda(lambda x: tf.expand_dims(x[:, 1], -1))(combined_input)
    i = Embedding(input_dim=movies, output_dim=param['movie_embedding_size'], name="items_embedding")(movie_id_input)
    i = Flatten()(i)

    x = Concatenate(name="concatenated_vector")([i, u])

    for n in [1, 2, 4]:
        # Floor division: `units` has to be an integer, `/` would yield a float.
        x = Dense(units=int(param['max_units'] // n), activation="relu", kernel_initializer="glorot_normal")(x)
        x = Dropout(0.2)(x)

    x = Dense(units=12, activation="relu", kernel_initializer="glorot_normal")(x)
    # Linear output: the predicted rating.
    y = Dense(units=1)(x)

    model = Model(inputs=combined_input, outputs=y)
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate=param['lr']),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    return model

In [None]:
# Search space: 4 * 4 * 3 * 2 * 1 * 3 = 288 combinations.
param_grid = dict(
    user_embedding_size=[40, 80, 120, 160],
    movie_embedding_size=[20, 40, 60, 80],
    max_units=[800, 1600, 2400],
    lr=[1e-2, 1e-3],
    epochs=[100],
    batch_size=[128, 256, 512],
)

# Lower validation loss is better; the model of each trial is kept.
kgs = KerasGridSearch(
    get_model,
    param_grid,
    monitor='val_loss',
    greater_is_better=False,
    store_model=True,
    tuner_verbose=1,
)

# Stop a trial once val_loss has not improved by at least 0.001 for 5 epochs
# and roll back to the weights of its best epoch.
es = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Runs every combination in param_grid; the held-out split serves as validation
# data and each trial is cut short by the early-stopping callback.
kgs.search(
    train_data,
    train_ratings_data,
    validation_data=(test_data, test_ratings_data),
    callbacks=[es],
)


288 trials detected for ('user_embedding_size', 'movie_embedding_size', 'max_units', 'lr', 'epochs', 'batch_size')

***** (1/288) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.01, 'epochs': 100, 'batch_size': 128})
Restoring model weights from the end of the best epoch: 2.
Epoch 7: early stopping
SCORE: 1.08748 at epoch 2

***** (2/288) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.01, 'epochs': 100, 'batch_size': 256})
Restoring model weights from the end of the best epoch: 2.
Epoch 7: early stopping
SCORE: 1.06465 at epoch 2

***** (3/288) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.01, 'epochs': 100, 'batch_size': 512})
Restoring model weights from the end of the best epoch: 1.
Epoch 6: early stopping
SCORE: 1.02403 at epoch 1

***** (4/288) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.

<kerashypetune.kerashypetune.KerasGridSearch at 0x7fe71ab03450>

In [None]:
# Winning hyperparameters and the monitored val_loss (mean squared error) they reached.
print(kgs.best_params)
print(f'Best score MSE: {kgs.best_score}')

{'user_embedding_size': 40, 'movie_embedding_size': 80, 'max_units': 1600, 'lr': 0.001, 'epochs': 4, 'batch_size': 128, 'steps_per_epoch': 7816}
Best score: 0.9879


In [None]:
# Best params: {'user_embedding_size': 40, 'movie_embedding_size': 80, 'max_units': 1600, 'lr': 0.001, 'epochs': 4, 'batch_size': 128, 'steps_per_epoch': 7816}

In [None]:
# Model of the best-scoring trial (available because the search was created
# with store_model=True). The trailing comment looks like the accessor for a
# cross-validated search variant -- TODO confirm or remove.
best_model = kgs.best_model #kgs.folds_best_models['fold 1']

## Test Set Evaluation

In [None]:
# NOTE(review): test_data / test_ratings_data were also passed as validation_data
# to kgs.search and drove model selection and early stopping, so this score is
# not an independent hold-out estimate.
evaluation = best_model.evaluate(x=test_data, y=test_ratings_data, verbose=1)



## Prediction

In [None]:
# Predict one rating per row of sub_data; the result is indexed as
# sub_pred[i, 0] when the submission file is built.
sub_pred = best_model.predict(sub_data)

## Submission

In [38]:
# Writes drive/MyDrive/cil_data/sub/submission_neural_nets.csv and prints a
# three-row sample of it.
submission_from_separate_predictions(sub_pd, sub_pred, sub_user_data, sub_movie_data, 'neural_nets')

            Prediction
Id                    
r1005_c460    3.065896
r9850_c440    3.993016
r3372_c637    3.342212
