# Neural Network for Collaborative Filtering

In [None]:
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
from numpy.random import seed
seed(42)

# Line magic selecting the TensorFlow 2.x runtime (presumably Colab-specific —
# confirm); it is issued before TensorFlow is imported.
%tensorflow_version 2.x
import tensorflow as tf
# Seed TensorFlow's global random generator with the same value.
tf.random.set_seed(42)

In [None]:
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Downloading cil-collaborative-filtering-2022.zip to /content
 80% 5.00M/6.25M [00:00<00:00, 10.9MB/s]
100% 6.25M/6.25M [00:00<00:00, 13.5MB/s]
Archive:  cil-collaborative-filtering-2022.zip
  inflating: data_train.csv          
  inflating: sampleSubmission.csv    


In [None]:
!pip install --upgrade keras-hypetune

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keras-hypetune
  Downloading keras_hypetune-0.2.1-py3-none-any.whl (12 kB)
Collecting hyperopt==0.2.5
  Downloading hyperopt-0.2.5-py2.py3-none-any.whl (965 kB)
[K     |████████████████████████████████| 965 kB 12.0 MB/s 
Installing collected packages: hyperopt, keras-hypetune
  Attempting uninstall: hyperopt
    Found existing installation: hyperopt 0.1.2
    Uninstalling hyperopt-0.1.2:
      Successfully uninstalled hyperopt-0.1.2
Successfully installed hyperopt-0.2.5 keras-hypetune-0.2.1


## Imports

In [47]:
import math

import pandas as pd
import numpy as np

from keras.layers import Input, Embedding, Reshape, Dot, Concatenate, Dense, Dropout, Flatten, Lambda
from keras.models import Model

from keras.callbacks import *

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from kerashypetune import *

In [None]:
# Report the TensorFlow release the notebook is running against.
print(f'TF VERSION = {tf.__version__}')

TF VERSION = 2.8.2


## Functions

In [40]:
def extract_users_items_predictions(data_pd):
    """Split ``r<row>_c<col>`` ids into zero-based user and movie index arrays.

    Args:
        data_pd: DataFrame with a string column ``Id`` whose values contain
            ``r<number>_c<number>`` and a column ``Prediction``.

    Returns:
        Tuple ``(users, movies, predictions)`` of 1-D arrays with one entry per
        row of ``data_pd``. ``users`` and ``movies`` hold the numbers parsed
        from ``Id`` minus one; ``predictions`` is the ``Prediction`` column.

    Raises:
        ValueError: If any ``Id`` does not contain the ``r<number>_c<number>``
            pattern.
    """
    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    id_parts = data_pd.Id.str.extract(r'r(\d+)_c(\d+)')
    if id_parts.isna().values.any():
        raise ValueError("every Id must look like 'r<number>_c<number>'")

    # Ids in the file are 1-based; shift them to 0-based indices.
    indices = id_parts.to_numpy().astype(int) - 1

    # Column slicing always yields 1-D arrays, including for a single-row frame
    # (squeezing a (1, 1) array would collapse it to a 0-d scalar array).
    users, movies = indices[:, 0], indices[:, 1]
    predictions = data_pd.Prediction.to_numpy()
    return users, movies, predictions

def submission_from_separate_predictions(sub_pd, sub_pred, sub_user_data, sub_movie_data, filename):
    """Write a submission CSV that pairs the ids of ``sub_pd`` with new predictions.

    Args:
        sub_pd: Submission template with an ``Id`` column; it is not modified.
        sub_pred: Predictions in the row order of ``sub_pd``, either of shape
            ``(n, k)`` (the first column is used) or ``(n,)``.
        sub_user_data: Unused; kept so existing callers keep working.
        sub_movie_data: Unused; kept so existing callers keep working.
        filename: Output path without extension; ``'.csv'`` is appended.

    Raises:
        ValueError: If ``sub_pred`` is not 1-D/2-D or its length differs from
            the number of rows in ``sub_pd``.
    """
    predictions = np.asarray(sub_pred)
    if predictions.ndim == 2:
        # Model output is one column per row; take that column in one step
        # instead of looping over the rows in Python.
        predictions = predictions[:, 0]
    elif predictions.ndim != 1:
        raise ValueError("sub_pred must have shape (n,) or (n, k)")
    if len(predictions) != len(sub_pd):
        raise ValueError(
            "sub_pred has %d rows but sub_pd has %d" % (len(predictions), len(sub_pd))
        )

    temp_pd = sub_pd.copy()
    # tolist() hands pandas plain Python numbers, as the element-wise list did.
    temp_pd['Prediction'] = predictions.tolist()
    temp_pd = temp_pd.set_index("Id")

    # Preview a few rows; cap the sample size so tiny frames do not raise.
    print(temp_pd.sample(min(3, len(temp_pd))))
    temp_pd.to_csv(str(filename) + '.csv')

## Input

In [None]:
# Read the training file and split every Id into its user and movie index.
data_pd = pd.read_csv('data_train.csv')
train_users, train_movies, train_ratings = extract_users_items_predictions(data_pd)

# One row per entry of data_train.csv, with columns User, Rating, Movie.
d = dict(User=train_users, Rating=train_ratings, Movie=train_movies)
df = pd.DataFrame(d)

# The intermediate arrays are no longer needed once they live in `df`.
del train_users, train_movies, train_ratings

df.sample(3)

Unnamed: 0,User,Rating,Movie
904556,5061,2,705
171326,9043,4,147
190893,1735,4,168


In [None]:
# Read the submission template and split every Id into its user and movie index.
sub_pd = pd.read_csv('sampleSubmission.csv')
query_users, query_movies, query_ratings = extract_users_items_predictions(sub_pd)

# One row per entry of sampleSubmission.csv, with columns User, Rating, Movie.
d = dict(User=query_users, Rating=query_ratings, Movie=query_movies)
df_sub = pd.DataFrame(d)

# The intermediate arrays are no longer needed once they live in `df_sub`.
del query_users, query_movies, query_ratings

df_sub.sample(3)

Unnamed: 0,User,Rating,Movie
1158113,8460,3,981
1095471,30,3,897
478282,8580,3,410


## Pre-processing

In [None]:
df = df.reset_index(drop=True)

# Raw id -> position in order of first appearance. The mappings are only used
# to count the distinct ids; the raw ids themselves are what goes into X.
user_id_mapping = {user_id: idx for idx, user_id in enumerate(df['User'].unique())}
movie_id_mapping = {movie_id: idx for idx, movie_id in enumerate(df['Movie'].unique())}

users = len(user_id_mapping)
movies = len(movie_id_mapping)

# `users` / `movies` become the Embedding input_dim while the raw ids are used
# as lookup indices, so every raw id must lie in [0, count). Fail loudly here
# rather than with an out-of-range embedding lookup during training.
assert df['User'].min() >= 0 and df['User'].max() < users, \
    "user ids are not dense in [0, users); remap them with user_id_mapping"
assert df['Movie'].min() >= 0 and df['Movie'].max() < movies, \
    "movie ids are not dense in [0, movies); remap them with movie_id_mapping"

print("Number of users: " + str(users))
print("Number of movies: " + str(movies))

user_data = df['User'].to_numpy()
movie_data = df['Movie'].to_numpy()
# Double brackets keep the ratings two-dimensional: shape (n, 1).
ratings_data = df[['Rating']].to_numpy()

# Each row of X is one (user, movie) pair; y holds the matching rating.
X = np.stack([user_data, movie_data], axis=-1)
y = ratings_data

Number of users: 10000
Number of movies: 1000


In [None]:
# Pull the user and movie index columns of the submission frame out as arrays.
sub_user_data, sub_movie_data = (df_sub[column].to_numpy() for column in ('User', 'Movie'))

# Pair them up row-wise: each row of X_sub is one (user, movie) query.
X_sub = np.column_stack((sub_user_data, sub_movie_data))

### Train-test split

In [None]:
# Hold out 10% of the rated pairs; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.9, random_state=42)

# Report the shape of the full data and of every split.
for label, array in (
    ("X", X),
    ("y", y),
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {array.shape}")

X shape: (1176952, 2)
y shape: (1176952, 1)
X_train shape: (1059256, 2)
X_test shape: (117696, 2)
y_train shape: (1059256, 1)
y_test shape: (117696, 1)


## Model

In [None]:
def get_model(param):
    """Build and compile the embedding + dense-layer rating regressor.

    Each input row is a ``(user, movie)`` index pair. Both indices are looked
    up in their own embedding table, the two embeddings are concatenated and
    passed through three dense layers of ``max_units``, ``max_units // 2`` and
    ``max_units // 4`` units (each followed by dropout), a 100-unit ReLU layer
    and a single linear output unit.

    Reads the module-level ``users`` and ``movies`` counts for the sizes of the
    embedding tables.

    Args:
        param: Mapping providing ``user_embedding_size``,
            ``movie_embedding_size``, ``max_units``, ``activation`` and ``lr``.
            Other keys are not read.

    Returns:
        A Keras ``Model`` compiled with mean-squared-error loss, the Adam
        optimizer at learning rate ``param['lr']`` and an RMSE metric.
    """
    combined_input = Input((2,))

    # Column 0 of the input carries the user index.
    user_id_input = Lambda(lambda x: tf.expand_dims(x[:,0],-1))(combined_input)
    user_vector = Embedding(input_dim=users, output_dim=param['user_embedding_size'])(user_id_input)
    user_vector = Flatten()(user_vector)

    # Column 1 of the input carries the movie index.
    movie_id_input = Lambda(lambda x: tf.expand_dims(x[:,1],-1))(combined_input)
    movie_vector = Embedding(input_dim=movies, output_dim=param['movie_embedding_size'])(movie_id_input)
    movie_vector = Flatten()(movie_vector)

    # Movie embedding first, then user embedding.
    x = Concatenate(name="concatenated_vector")([movie_vector, user_vector])

    for divisor in [1, 2, 4]:
        # Integer division: `units` must be an int, whereas "/" always yields a
        # float (e.g. 800 / 1 == 800.0), which newer Keras releases reject.
        x = Dense(units=int(param['max_units'] // divisor), activation=param['activation'])(x)
        x = Dropout(0.1)(x)

    x = Dense(units=100, activation="relu")(x)
    rating = Dense(units=1)(x)

    model = Model(inputs=combined_input, outputs=rating)
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate=param['lr']),
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )

    return model

## Grid Search

### Define grid

In [None]:
# Search space for the grid search: one list of candidate values per hyper-parameter.
param_grid = dict(
    user_embedding_size=[40, 80, 120],
    movie_embedding_size=[40, 80, 120],
    max_units=[800, 1200],
    lr=[1e-2, 1e-3, 1e-4],
    epochs=[100],
    batch_size=[128, 256],
    activation=["relu", "sigmoid"],
)

### Perform grid search

In [None]:
# Grid search over `param_grid`, building each candidate with `get_model`.
# Trials are compared on val_loss, where lower is better.
kgs = KerasGridSearch(
    get_model,
    param_grid,
    monitor='val_loss',
    greater_is_better=False,
    store_model=True,
    tuner_verbose=1,
)

In [None]:
# Early-stopping callback watching val_loss with a patience of 5 epochs and a
# minimum improvement of 0.001; the best weights are restored when it stops.
es = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
    verbose=1,
)

In [None]:
# Run every trial on the training split, scoring on the held-out split and
# using the shared early-stopping callback.
kgs.search(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    callbacks=[es],
)


216 trials detected for ('user_embedding_size', 'movie_embedding_size', 'max_units', 'lr', 'epochs', 'batch_size', 'activation')

***** (1/216) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.0001, 'epochs': 100, 'batch_size': 128, 'activation': 'sigmoid'})
Restoring model weights from the end of the best epoch: 4.
Epoch 9: early stopping
SCORE: 1.00862 at epoch 4

***** (2/216) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.0001, 'epochs': 100, 'batch_size': 128, 'activation': 'relu'})
Restoring model weights from the end of the best epoch: 3.
Epoch 8: early stopping
SCORE: 1.00178 at epoch 3

***** (3/216) *****
Search({'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.0001, 'epochs': 100, 'batch_size': 256, 'activation': 'sigmoid'})
Restoring model weights from the end of the best epoch: 4.
Epoch 9: early stopping
SCORE: 1.00689 at epoch 4

***** (4/216) ***

<kerashypetune.kerashypetune.KerasGridSearch at 0x7fbe0bfbd190>

### Best Model Found

In [None]:
# Hyper-parameter combination the tuner reports as best for the monitored score.
best_params = kgs.best_params
print(best_params)

{'user_embedding_size': 40, 'movie_embedding_size': 40, 'max_units': 800, 'lr': 0.001, 'epochs': 3, 'batch_size': 128, 'activation': 'relu', 'steps_per_epoch': 8276}


In [50]:
# Model the tuner kept for the best trial, evaluated once more on the held-out split.
best_model = kgs.best_model
evaluation = best_model.evaluate(X_test, y_test, verbose=1)

# The search monitors val_loss and the model's loss is mean squared error,
# so the square root of the best score is the validation RMSE.
best_score_rmse = math.sqrt(kgs.best_score)
print('Validation RMSE: ' + str(best_score_rmse))

# Rebind `best_model` to a freshly built (untrained) network using the best
# hyper-parameters; later cells fit it on the full data set.
best_model = get_model(best_params)

Validation RMSE: 0.9950175877842562


#### Training
Best Model Found

In [51]:
# Train on all rated pairs with the batch size and epoch count from `best_params`.
best_model.fit(
    x=X,
    y=y,
    epochs=best_params['epochs'],
    batch_size=best_params['batch_size'],
    verbose="auto",
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba3a23ce90>

#### Prediction
Best Model Found

In [52]:
# Predict a rating for every (user, movie) pair of the submission set.
sub_pred = best_model.predict(X_sub)

#### Submission
Best Model Found

In [53]:
# Write the predictions of the grid-search model to a CSV submission file.
submission_from_separate_predictions(
    sub_pd=sub_pd,
    sub_pred=sub_pred,
    sub_user_data=sub_user_data,
    sub_movie_data=sub_movie_data,
    filename='seedset_fulldata_neural_nets_best_model',
)

            Prediction
Id                    
r3092_c369    4.202448
r3996_c315    3.683816
r8787_c296    4.745072


## Best Model (hard-coded hyperparameters)

### Training

In [54]:
# Fixed hyper-parameters, so this section can be run without repeating the grid search.
best_params_static = dict(
    user_embedding_size=40,
    movie_embedding_size=40,
    max_units=800,
    lr=0.001,
    epochs=3,
    batch_size=128,
    activation='relu',
)

# Fresh, untrained network built from those settings.
best_model_static = get_model(best_params_static)

In [55]:
# Train on all rated pairs with the batch size and epoch count from `best_params_static`.
best_model_static.fit(
    x=X,
    y=y,
    epochs=best_params_static['epochs'],
    batch_size=best_params_static['batch_size'],
    verbose="auto",
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fba3bcd4e10>

### Prediction

In [56]:
# Predict a rating for every (user, movie) pair of the submission set.
sub_pred = best_model_static.predict(X_sub)

### Submission

In [57]:
# Write the predictions of the statically configured model to a CSV submission file.
submission_from_separate_predictions(
    sub_pd=sub_pd,
    sub_pred=sub_pred,
    sub_user_data=sub_user_data,
    sub_movie_data=sub_movie_data,
    filename='seedset_fulldata_neural_nets_best_model_static',
)

            Prediction
Id                    
r7244_c61     3.730112
r7489_c253    4.053943
r3104_c457    4.327858
