

*   Kernel Net is based on https://proceedings.mlr.press/v80/muller18a.html
*   this notebook is based on https://github.com/lorenzMuller/kernelNet_MovieLens
* This notebook was made to be run on Google Colab.



# Imports

In [21]:
%tensorflow_version 1.x
import tensorflow as tf
# Stop right away when TensorFlow does not report the expected GPU device name.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print("GPU found")
else:
    print(
        '\n\nThis error most likely means that this notebook is not '
        'configured to use a GPU.  Change this in Notebook Settings via the '
        'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
    raise SystemError('GPU device not found')

import pandas as pd
import numpy as np
import itertools
import os
import sys
from time import time
from IPython.display import display
from tqdm import tqdm

GPU found


# Set Seed

In [2]:
# Seed NumPy's global RNG from the current wall-clock time (whole seconds), so
# every run uses a different seed. The same `seed` value is later passed to
# loadData, which re-seeds NumPy with it before shuffling the ratings.
seed = int(time())
np.random.seed(seed)

# Download and preprocess Kaggle Data

In [3]:
# Directory that holds the downloaded data and every file this notebook writes.
DATA_DIR = '/content/kernelNet'
# exist_ok=True makes re-running this cell a no-op when the directory is already there
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
#download training data and sample predictions 
!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

os.rename("data_train.csv", os.path.join(DATA_DIR,"data_train.csv"))
os.rename("sampleSubmission.csv", os.path.join(DATA_DIR,"sampleSubmission.csv"))

!rm cil-collaborative-filtering-2022.zip

In [5]:
def extract_users_items_predictions(data_pd):
    """
    Split a ratings table into zero-based user ids, movie ids and rating values.

    :param data_pd: DataFrame with an ``Id`` column of the form ``r<number>_c<number>``
        and a ``Prediction`` column
    :return: (users, movies, predictions) as 1-D arrays with one entry per row;
        users come from the ``r`` number and movies from the ``c`` number,
        each reduced by one so that ids start at zero
    """
    # raw string: '\d' inside a plain literal is an invalid escape sequence
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # column slicing (rather than np.squeeze) keeps the results 1-D even when
    # the table has a single row
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

In [6]:
# Parse the training CSV into zero-based (user, movie, rating) triples and write
# them, one triple per line and separated by '::', to ratings.dat, the file
# that is later read back through loadData.
x_train_pd = pd.read_csv(os.path.join(DATA_DIR, "data_train.csv"))
users,movies,predictions = extract_users_items_predictions(x_train_pd)
ratings_dict = {'userID': users,'movieID': movies,'rating': predictions}
df_train = pd.DataFrame(ratings_dict) 
# fmt='%s' writes each value through str(); the columns are userID, movieID, rating
np.savetxt(os.path.join(DATA_DIR, "ratings.dat"), df_train.values, delimiter='::', fmt='%s',encoding='utf-8')

In [7]:
# Load the entries to predict: sampleSubmission.csv lists the (user, movie)
# pairs that need a rating. to_predict_pd, pred_users and pred_movies are
# reused when the final submission file is assembled.
to_predict_pd = pd.read_csv(os.path.join(DATA_DIR, "sampleSubmission.csv"))
pred_users,pred_movies,pred_predictions = extract_users_items_predictions(to_predict_pd)
to_predict_dict = {'userID': pred_users,'movieID': pred_movies,'rating': pred_predictions}
df_predict = pd.DataFrame(to_predict_dict) 
# same '::'-separated layout as ratings.dat (userID, movieID, rating)
np.savetxt(os.path.join(DATA_DIR, "to_predict.dat"), df_predict.values, delimiter='::', fmt='%s',encoding='utf-8')

# Define data loading function for model 

In [8]:
def _read_ratings(path, delimiter):
    '''
    parses the ratings file into an int32 array with one row per rating
    '''
    if delimiter is not None and len(delimiter) > 1:
        # np.loadtxt only accepts single-character delimiters since NumPy 1.23,
        # so multi-character separators such as '::' are split by hand
        with open(path) as handle:
            rows = [[float(token) for token in line.split(delimiter)]
                    for line in handle if line.strip()]
        return np.array(rows).astype('int32')
    # ndmin=2 keeps a file that holds a single rating two-dimensional
    return np.loadtxt(path, skiprows=0, delimiter=delimiter, ndmin=2).astype('int32')


def loadData(path='./', valfrac=0.1, delimiter='::', seed=1234,
             transpose=False, shuffle_data = True):
    '''
    loads kaggle data
    :param path: path to the ratings file (columns: user id, movie id, rating)
    :param valfrac: fraction of data to use for validation; any value <= 0 keeps
                    every rating in the training matrix
    :param delimiter: delimiter used in data file (may be longer than one character)
    :param seed: random seed for validation splitting (re-seeds numpy's global RNG)
    :param transpose: flag to transpose output matrices (swapping users with movies)
    :param shuffle_data: shuffle the ratings before splitting; if False the first
                         lines of the file become the validation ratings
    :return: train ratings (n_u, n_m), valid ratings (n_u, n_m), both float32 with
             zeros at positions that hold no rating
    '''
    np.random.seed(seed)

    tic = time()
    print('reading data...')
    data = _read_ratings(path, delimiter)
    print('data read in', time() - tic, 'seconds')

    # the inverse returned by np.unique maps every raw user/movie id to its
    # position among the sorted unique ids (contiguous from zero)
    user_ids, u_idx = np.unique(data[:, 0], return_inverse=True)
    movie_ids, m_idx = np.unique(data[:, 1], return_inverse=True)
    n_u = user_ids.shape[0]  # number of users
    n_m = movie_ids.shape[0]  # number of movies
    n_r = data.shape[0]  # number of ratings

    # shuffle indices
    idx = np.arange(n_r)
    if shuffle_data:
        np.random.shuffle(idx)

    # the first ratings of the shuffled order are validation data, the rest are
    # training data. Strict '<' so that valfrac=0 selects no validation rating
    # (the previous '<=' always moved at least one rating to validation).
    is_valid = np.arange(n_r) < valfrac * n_r
    valid_rows = idx[is_valid]
    train_rows = idx[~is_valid]

    trainRatings = np.zeros((n_u, n_m), dtype='float32')
    validRatings = np.zeros((n_u, n_m), dtype='float32')

    # one vectorised assignment per matrix instead of a Python loop over all
    # ratings; assumes every (user, movie) pair occurs at most once in the file
    validRatings[u_idx[valid_rows], m_idx[valid_rows]] = data[valid_rows, 2]
    trainRatings[u_idx[train_rows], m_idx[train_rows]] = data[train_rows, 2]

    if transpose:
        trainRatings = trainRatings.T
        validRatings = validRatings.T

    print('loaded dense data matrix')

    return trainRatings, validRatings

# Define model

In [9]:
# define network functions
def kernel(u, v):
    """
    Finite-support kernel giving the connection strength of every input/output pair
    :param u: input vectors [n_in, 1, n_dim]
    :param v: output vectors [1, n_hid, n_dim]
    :return: input to output connection matrix
    """
    distance = tf.norm(u - v, ord=2, axis=2)
    squared_distance = distance**2
    # pairs whose embeddings are at Euclidean distance >= 1 are clamped to zero
    return tf.maximum(0., 1. - squared_distance)


def kernel_layer(x, n_hid, n_dim, lambda_s,
                 lambda_2, activation=tf.nn.sigmoid, name=''):
    """
    Fully connected layer whose weight matrix is multiplied element-wise by a kernel mask
    :param x: input [batch, channels]
    :param n_hid: number of hidden units
    :param n_dim: number of dimensions to embed for kernelization
    :param lambda_s: scale of the L2 penalty on the kernel matrix
    :param lambda_2: scale of the L2 penalty on the weight matrix W
    :param activation: output activation
    :param name: layer name for scoping
    :return: layer output, regularization term
    """
    n_in = x.get_shape().as_list()[1]

    # trainable parameters, all created inside the layer's own variable scope
    with tf.variable_scope(name):
        W = tf.get_variable('W', [x.shape[1], n_hid])
        u = tf.get_variable('u', initializer=tf.random.truncated_normal([n_in, 1, n_dim], 0., 1e-3))
        v = tf.get_variable('v', initializer=tf.random.truncated_normal([1, n_hid, n_dim], 0., 1e-3))
        b = tf.get_variable('b', [n_hid])

    # connection strength of every input/hidden pair: it shrinks as the
    # embeddings u and v move apart and eventually reaches zero
    connectivity = kernel(u, v)

    # L2 penalties on the kernel matrix and on the raw weights
    l2 = tf.contrib.layers.l2_regularizer
    apply_penalty = tf.contrib.layers.apply_regularization
    kernel_penalty = apply_penalty(l2(lambda_s), [connectivity])
    weight_penalty = apply_penalty(l2(lambda_2), [W])

    # forward pass with the masked weights
    output = activation(tf.matmul(x, W * connectivity) + b)
    return output, kernel_penalty + weight_penalty

# Hyperparameter optimization

**Perform hyperparameter optimization**

In [None]:
# Grid search: for every hyper-parameter combination a fresh network is built,
# trained with L-BFGS on the training split and scored on the validation split
# after each epoch. Every score is appended to log_df.csv.

# load data
tr, vr = loadData(os.path.join(DATA_DIR, "ratings.dat"), delimiter='::',
                  seed=seed, transpose=True, valfrac=0.1)

tm = np.greater(tr, 1e-12).astype('float32')  # masks indicating non-zero entries
vm = np.greater(vr, 1e-12).astype('float32')

n_m = tr.shape[0]  # number of movies
n_u = tr.shape[1]  # number of users (may be switched depending on 'transpose' in loadData)

#create logging dictionary (one list entry per trained epoch)
logging_dict = {"n_dim": [], "n_hid": [], "lambda_2":[], "lambda_s":[], "n_layers": [], "output_every": [], "n_epoch":[], "train_rmse" : [], "rmse": []}

#define parameter grid 

param_grid= {
    "n_dim": [5],
    "n_hid": [200,500,1000],
    "lambda_2":[60.0],
    "lambda_s":[0.013],
    "n_layers": [2,5],
    "output_every": [50,100],
    "n_epoch":[5]
}

# expand the grid into one dict per combination of values
keys, values = zip(*param_grid.items())
permutations_dicts = [dict(zip(keys, v)) for v in itertools.product(*values)]

for params_dict in tqdm(permutations_dicts):

  #reset tensorflow graph
  tf.reset_default_graph()

  # Set hyper-parameters
  n_dim = int(params_dict["n_dim"])
  n_hid = int(params_dict["n_hid"])
  lambda_2 = float(params_dict["lambda_2"])
  lambda_s = float(params_dict["lambda_s"])
  n_layers = int(params_dict["n_layers"])
  output_every = int(params_dict["output_every"])  # passed to L-BFGS as 'maxiter': iterations per optimizer.minimize call
  n_epoch = int(params_dict["n_epoch"])
  verbose_bfgs = False
  use_gpu = True
  if not use_gpu:
      os.environ['CUDA_VISIBLE_DEVICES'] = ''

  
  # Input placeholders
  R = tf.placeholder("float", [None, n_u])
  # Instantiate network: n_layers sigmoid kernel layers followed by a linear output layer
  y = R
  reg_losses = None
  for i in range(n_layers):
      y, reg_loss = kernel_layer(y, n_hid, n_dim, lambda_s, lambda_2, activation=tf.nn.sigmoid, name=str(i))
      reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss
  prediction, reg_loss = kernel_layer(y, n_u, n_dim, lambda_s, lambda_2, activation=tf.identity, name='out')
  reg_losses = reg_losses + reg_loss

  # Compute loss (symbolic): squared error on the observed training entries only
  # (tm zeroes out every other position) plus the regularization terms
  diff = tm*(R - prediction)
  sqE = tf.nn.l2_loss(diff)
  loss = sqE + reg_losses

  # Instantiate L-BFGS Optimizer
  optimizer = tf.contrib.opt.ScipyOptimizerInterface(loss, options={'maxiter': output_every,
                                                                    'disp': verbose_bfgs,
                                                                    'maxcor': 10},
                                                    method='L-BFGS-B')
    

  # Training and validation loop
  init = tf.global_variables_initializer()

  with tf.Session() as sess:
      sess.run(init)
      print("num epochs to run: ", n_epoch)

      for i in range(n_epoch):
          optimizer.minimize(sess, feed_dict={R: tr}) #do maxiter optimization steps
          pre = sess.run(prediction, feed_dict={R: tr}) #predict ratings

          # both errors use the same predictions (network fed with the training
          # matrix), clipped to [1, 5]; only the mask and the target differ
          error = (vm * (np.clip(pre, 1., 5.) - vr) ** 2).sum() / vm.sum() #compute validation error
          error_train = (tm * (np.clip(pre, 1., 5.) - tr) ** 2).sum() / tm.sum() #compute train error

          print('.-^-._' * 12)
          print('epoch:', i+1, 'validation rmse:', np.sqrt(error), 'train rmse:', np.sqrt(error_train))
          print('.-^-._' * 12)

          #log_hyperparameters ("n_epoch" records the epochs trained so far, not the configured total)
          logging_dict["n_dim"].append(n_dim);
          logging_dict["n_hid"].append(n_hid);
          logging_dict["lambda_2"].append(lambda_2);
          logging_dict["lambda_s"].append(lambda_s);
          logging_dict["n_layers"].append(n_layers);
          logging_dict["output_every"].append(output_every);
          logging_dict["n_epoch"].append(i+1);
          #log train and val rmse
          logging_dict["train_rmse"].append(np.sqrt(error_train))
          logging_dict["rmse"].append(np.sqrt(error))
          
          #export current log to csv after every epoch, sorted by ascending validation rmse
          log_df = pd.DataFrame.from_dict(logging_dict)
          log_df.sort_values("rmse", inplace = True)
          log_df.to_csv(os.path.join(DATA_DIR,"log_df.csv"),index = False)

  print("finished training, with specific hyperparameter setting")


In [None]:
# Reload the tuning log from disk and show it as a table.
log_df = pd.read_csv(os.path.join(DATA_DIR, "log_df.csv"))
# plain string: the message has no placeholders, so no f-string is needed
print("Table with the results of the parameter tuning:")
display(log_df)

# Predictions using optimal hyper-parameters

**Train again on full data set with optimal hyper-parameters**

In [None]:
#load data
# valfrac = -1 sends every rating to the training matrix, so vr (and vm) stay all-zero
tr, vr = loadData(os.path.join(DATA_DIR, "ratings.dat"), delimiter='::',
                  seed=seed, transpose=True, valfrac = -1)

tm = np.greater(tr, 1e-12).astype('float32')  # masks indicating non-zero entries
vm = np.greater(vr, 1e-12).astype('float32')

n_m = tr.shape[0]  # number of movies
n_u = tr.shape[1]  # number of users (may be switched depending on 'transpose' in loadData)

#reset tensorflow graph
tf.reset_default_graph()

#setup models with best performing hyperparamters
log_df = pd.read_csv(os.path.join(DATA_DIR, "log_df.csv"))
# sort explicitly instead of relying on the row order of the CSV file
best_params = log_df.sort_values("rmse").iloc[[0]]

# pandas returns numpy scalars; cast to plain Python numbers exactly like the
# hyper-parameter search does before building the graph
n_dim = int(best_params["n_dim"].values[0])
n_hid = int(best_params["n_hid"].values[0])
lambda_2 = float(best_params["lambda_2"].values[0])
lambda_s = float(best_params["lambda_s"].values[0])
n_layers = int(best_params["n_layers"].values[0])
output_every = int(best_params["output_every"].values[0])  # passed to L-BFGS as 'maxiter'
n_epoch = int(best_params["n_epoch"].values[0])  # epoch count at which the best validation rmse was logged
verbose_bfgs = False
use_gpu = True
if not use_gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = ''



# Input placeholders
R = tf.placeholder("float", [None, n_u])
# Instantiate network: n_layers sigmoid kernel layers followed by a linear output layer
y = R
reg_losses = None
for i in range(n_layers):
    y, reg_loss = kernel_layer(y, n_hid, n_dim, lambda_s, lambda_2, activation=tf.nn.sigmoid, name=str(i))
    reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss
prediction, reg_loss = kernel_layer(y, n_u, n_dim, lambda_s, lambda_2, activation=tf.identity, name='out')
reg_losses = reg_losses + reg_loss

# Compute loss (symbolic): squared error on observed entries plus regularization
diff = tm*(R - prediction)
sqE = tf.nn.l2_loss(diff)
loss = sqE + reg_losses

# Instantiate L-BFGS Optimizer
optimizer = tf.contrib.opt.ScipyOptimizerInterface(loss, options={'maxiter': output_every,
                                                                  'disp': verbose_bfgs,
                                                                  'maxcor': 10},
                                                  method='L-BFGS-B')


init = tf.global_variables_initializer()
saver = tf.train.Saver()
checkpoint_path = os.path.join(DATA_DIR, "checkpoints/model")
# make sure the checkpoint directory exists before the saver writes into it
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
with tf.Session() as sess:
    sess.run(init)
    print("num epochs to run: ", n_epoch)

    for i in tqdm(range(n_epoch)):
        optimizer.minimize(sess, feed_dict={R: tr}) #do maxiter optimization steps
        pre = sess.run(prediction, feed_dict={R: tr}) #predict ratings
        error_train = (tm * (np.clip(pre, 1., 5.) - tr) ** 2).sum() / tm.sum() #compute train error

        print('.-^-._' * 12)
        print('epoch:', i+1, 'train rmse:', np.sqrt(error_train))
        print('.-^-._' * 12)

    saver.save(sess, checkpoint_path)
    print("saved model checkpoint")
    
print("finished training on whole training data")



#make raw predictions of our data
# The network reconstructs the full rating matrix from the OBSERVED ratings, so
# it is fed the training matrix `tr`. Feeding the matrix built from
# to_predict.dat (as this cell used to do) would use the placeholder values of
# sampleSubmission.csv as input instead of any real rating. The entries that
# are needed for the submission are picked out of `pre` afterwards.
with tf.Session() as sess:
    saver.restore(sess, checkpoint_path)
    pre = sess.run(prediction, feed_dict={R: tr}) #predict ratings
    # written with the default index, which becomes the first CSV column
    pd.DataFrame(pre).to_csv(os.path.join(DATA_DIR, "raw_predictions.csv"))

print("finished raw predictions")

**convert raw predictions into correct format**

In [25]:
# The 0'th column of raw_predictions.csv is just the saved index, which is why
# user columns are addressed with an offset of one below.
raw_predictions = pd.read_csv(os.path.join(DATA_DIR, "raw_predictions.csv"))
#convert raw predictions to correct format
output = to_predict_pd.to_numpy()
final_predictions = raw_predictions.to_numpy()
# Rows are indexed by movie and columns by user (+1 for the index column).
# One vectorised lookup replaces the per-row Python loop, and no longer
# shadows the builtin `id` or rebinds the global `prediction` (the network's
# output tensor). Assumes the zero-based ids equal the matrix positions,
# i.e. ids are contiguous - same assumption as the previous loop.
predicted_ratings = final_predictions[pred_movies, pred_users + 1]
# clip to the valid rating range, consistent with how the rmse is computed during training
output[:, 1] = np.clip(predicted_ratings, 1., 5.)

In [26]:
# Write the submission file with the two columns of sampleSubmission.csv.
submission_df = pd.DataFrame(output, columns = ['Id', 'Prediction'])
# exist_ok=True makes re-running this cell a no-op when the directory is already there
os.makedirs(os.path.join(DATA_DIR,"final_predictions"), exist_ok=True)
submission_df.to_csv(os.path.join(DATA_DIR,"final_predictions/kernelNet.csv"),index = False)