* GLocal-K is based on the paper https://arxiv.org/pdf/2108.12184.pdf
* This notebook is based on https://github.com/usydnlp/Glocal_K
* This notebook was made to be run on Google Colab



In [None]:
# Colab setup: install the pinned TensorFlow 1.x GPU build; the model code in
# this notebook relies on TF1-only APIs such as tf.contrib and tf.placeholder.
!pip install tensorflow-gpu==1.15.5
import tensorflow as tf
print(tf.VERSION)
# Abort early when TensorFlow does not report a GPU as '/device:GPU:0'.
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  print(
      '\n\nThis error most likely means that this notebook is not '
      'configured to use a GPU.  Change this in Notebook Settings via the '
      'command palette (cmd/ctrl-shift-P) or the Edit menu.\n\n')
  raise SystemError('GPU device not found')
else:
  print("GPU found")
  
from time import time
from scipy.sparse import csc_matrix
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import h5py
import itertools

# Download and preprocess Kaggle Data

In [None]:
# Working directory for the downloaded data and every artifact this notebook writes.
DATA_DIR = '/content/GLocal_K'
# exist_ok=True makes the cell idempotent: re-running it on an existing
# directory is a no-op instead of an exception that has to be swallowed.
os.makedirs(DATA_DIR, exist_ok=True)

In [None]:
#download training data and sample predictions
#in case you can't download the data via this cell please put "data_train.csv"
#and "sampleSubmissions.csv" in the DATA_DIR path defined above

!pip install kaggle

!mkdir ~/.kaggle

import json

kaggle_username = "yuvalnis" #@param {type:"string"}
kaggle_api_key = "1800d5a286834f0416c338c7bd7f6dee" #@param {type:"string"}

assert len(kaggle_username) > 0 and len(kaggle_api_key) > 0

api_token = {"username": kaggle_username,"key": kaggle_api_key}

with open('kaggle.json', 'w') as file:
    json.dump(api_token, file)

!mv kaggle.json ~/.kaggle/kaggle.json

!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c cil-collaborative-filtering-2022

!unzip -n cil-collaborative-filtering-2022.zip

os.rename("data_train.csv", os.path.join(DATA_DIR,"data_train.csv"))
os.rename("sampleSubmission.csv", os.path.join(DATA_DIR,"sampleSubmission.csv"))

!rm cil-collaborative-filtering-2022.zip

In [None]:
def extract_users_items_predictions(data_pd):
    """Split a ratings frame into zero-based user ids, movie ids and ratings.

    Args:
        data_pd: DataFrame with an ``Id`` column of the form ``r<number>_c<number>``
            and a ``Prediction`` column.

    Returns:
        Tuple ``(users, movies, predictions)`` of 1-D arrays with one entry per
        row of ``data_pd``. ``users`` is the ``r`` number minus one, ``movies``
        the ``c`` number minus one, ``predictions`` the ``Prediction`` values.
    """
    # Raw string: "\d" is an invalid escape sequence in a normal string literal.
    ids = data_pd.Id.str.extract(r'r(\d+)_c(\d+)').values.astype(int) - 1
    # Column slicing (instead of split + squeeze) keeps both arrays 1-D even
    # for a single-row frame, where squeeze would collapse them to 0-d scalars.
    users, movies = ids[:, 0], ids[:, 1]
    predictions = data_pd.Prediction.values
    return users, movies, predictions

In [None]:
# Parse the Kaggle training CSV and persist it as "userID::movieID::rating" rows.
x_train_pd = pd.read_csv(os.path.join(DATA_DIR, "data_train.csv"))
users, movies, predictions = extract_users_items_predictions(x_train_pd)
ratings_dict = {
    'userID': users,
    'movieID': movies,
    'rating': predictions,
}
df_train = pd.DataFrame(ratings_dict)
np.savetxt(
    os.path.join(DATA_DIR, "ratings.dat"),
    df_train.values,
    delimiter='::',
    fmt='%s',
    encoding='utf-8',
)

In [None]:
#load entries to predict
# The sample submission lists every (user, movie) pair that has to be predicted;
# it is stored in the same "userID::movieID::rating" layout as the training file.
to_predict_pd = pd.read_csv(os.path.join(DATA_DIR, "sampleSubmission.csv"))
pred_users, pred_movies, pred_predictions = extract_users_items_predictions(to_predict_pd)
to_predict_dict = {
    'userID': pred_users,
    'movieID': pred_movies,
    'rating': pred_predictions,
}
df_predict = pd.DataFrame(to_predict_dict)
np.savetxt(
    os.path.join(DATA_DIR, "to_predict.dat"),
    df_predict.values,
    delimiter='::',
    fmt='%s',
    encoding='utf-8',
)

# Data Loader Function

In [None]:
def load_data(path='./', delimiter='::', frac=0.1, seed=1234, shuffle_data = True):
    """Load a ratings file into dense train/test matrices of shape (movies, users).

    Args:
        path: Text file with one ``user<delimiter>movie<delimiter>rating``
            triple per line; all values are cast to int32.
        delimiter: Column separator handed to ``np.loadtxt``.
        frac: Fraction of ratings held out. The first ``int(frac * n_r)``
            entries of the (optionally shuffled) rating order form the test
            split, the rest the training split.
        seed: Seed set on NumPy's global RNG before the optional shuffle.
        shuffle_data: Shuffle the rating order before splitting when True.

    Returns:
        Tuple ``(n_m, n_u, train_r, train_m, test_r, test_m)``: the number of
        distinct movies and users, the float32 rating matrices (0 where no
        rating was written) and float32 masks that are 1.0 where the matching
        rating matrix is greater than 1e-12.
    """
    tic = time()
    print('reading data...')
    # ndmin=2 keeps a single-line file two-dimensional so the column slicing
    # below also works for one rating.
    data = np.loadtxt(path, skiprows=0, delimiter=delimiter, ndmin=2).astype('int32')
    print('taken', time() - tic, 'seconds')

    # np.unique sorts the ids; the inverse array is each rating's position in
    # that sorted list, i.e. its dense row/column index. One call per column
    # replaces the repeated np.unique calls and the id -> index dictionaries.
    user_ids, user_pos = np.unique(data[:, 0], return_inverse=True)
    movie_ids, movie_pos = np.unique(data[:, 1], return_inverse=True)
    n_u = user_ids.size  # num of users
    n_m = movie_ids.size  # num of movies
    n_r = data.shape[0]  # num of ratings

    np.random.seed(seed)
    idx = np.arange(n_r)
    if shuffle_data:
        np.random.shuffle(idx)

    n_test = int(frac * n_r)
    test_idx, train_idx = idx[:n_test], idx[n_test:]

    train_r = np.zeros((n_m, n_u), dtype='float32')
    test_r = np.zeros((n_m, n_u), dtype='float32')

    # Vectorised scatter instead of a Python loop over every rating.
    # NOTE(review): assumes each (user, movie) pair occurs at most once in the
    # file; with duplicates the surviving rating is whichever NumPy writes last.
    test_r[movie_pos[test_idx], user_pos[test_idx]] = data[test_idx, 2]
    train_r[movie_pos[train_idx], user_pos[train_idx]] = data[train_idx, 2]

    train_m = np.greater(train_r, 1e-12).astype('float32')  # masks indicating non-zero entries
    test_m = np.greater(test_r, 1e-12).astype('float32')

    print('data matrix loaded')
    print('num of users: {}'.format(n_u))
    print('num of movies: {}'.format(n_m))
    print('num of training ratings: {}'.format(n_r - n_test))
    print('num of test ratings: {}'.format(n_test))

    return n_m, n_u, train_r, train_m, test_r, test_m

# Load Data

In [None]:
# Insert the path of a data directory by yourself (e.g., '/content/.../data')
# By default this is the DATA_DIR working directory defined earlier in the notebook.
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._
data_path = DATA_DIR
# .-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._.-^-._

# Network Function

In [None]:
def local_kernel(u, v):
    """Return ``max(0, 1 - ||u - v||^2)``, with the 2-norm taken along axis 2.

    ``u`` and ``v`` are combined with ``u - v``, so they must broadcast
    against each other; the result drops axis 2.
    """
    squared_distance = tf.norm(u - v, ord=2, axis=2) ** 2
    return tf.maximum(0., 1. - squared_distance)

In [None]:
def kernel_layer(x, n_hid, n_dim, lambda_s, lambda_2, activation=tf.nn.sigmoid, name=''):
    """Dense layer whose weight matrix is multiplied element-wise by a local kernel.

    Args:
        x: 2-D input tensor; its second dimension is the layer's input width.
        n_hid: Number of output units.
        n_dim: Size of the embedding vectors ``u`` and ``v`` that feed the kernel.
        lambda_s: L2 regularisation scale applied to the kernel matrix.
        lambda_2: L2 regularisation scale applied to the weight matrix ``W``.
        activation: Function applied to the affine output.
        name: Variable scope name. The scope is opened with ``AUTO_REUSE``, so
            calling the layer again with the same name reuses its variables.

    Returns:
        Tuple ``(y, reg_term)``: the activated output and the sum of both
        regularisation terms.
    """
    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        W = tf.get_variable('W', [x.shape[1], n_hid])
        n_in = x.get_shape().as_list()[1]
        # One embedding per input unit (u) and per hidden unit (v); the
        # singleton axes make u - v broadcast to [n_in, n_hid, n_dim].
        u_init = tf.random.truncated_normal([n_in, 1, n_dim], 0., 1e-3)
        u = tf.get_variable('u', initializer=u_init)
        v_init = tf.random.truncated_normal([1, n_hid, n_dim], 0., 1e-3)
        v = tf.get_variable('v', initializer=v_init)
        b = tf.get_variable('b', [n_hid])

    kernel = local_kernel(u, v)

    kernel_penalty = tf.contrib.layers.apply_regularization(
        tf.contrib.layers.l2_regularizer(lambda_s), [kernel])
    weight_penalty = tf.contrib.layers.apply_regularization(
        tf.contrib.layers.l2_regularizer(lambda_2), [W])

    # Local kernelised weight matrix
    masked_weights = W * kernel
    output = activation(tf.matmul(x, masked_weights) + b)

    return output, kernel_penalty + weight_penalty

In [None]:
def global_kernel(input, gk_size, dot_scale):
    """Build a ``[gk_size, gk_size, 1, 1]`` convolution kernel from ``input``.

    ``input`` is averaged over axis 1, multiplied with the trainable
    ``conv_kernel`` matrix, scaled by ``dot_scale`` and reshaped into the
    filter layout that ``global_conv`` passes to ``tf.nn.conv2d``.
    """
    # Item (axis=1) based average pooling, flattened to a single row vector.
    pooled = tf.reshape(tf.reduce_mean(input, axis=1), [1, -1])
    n_kernel = pooled.shape[1].value

    kernel_init = tf.random.truncated_normal([n_kernel, gk_size**2], stddev=0.1)
    conv_kernel = tf.get_variable('conv_kernel', initializer=kernel_init)

    # Scaled dot product
    scaled = tf.matmul(pooled, conv_kernel) * dot_scale
    return tf.reshape(scaled, [gk_size, gk_size, 1, 1])

In [None]:
def global_conv(input, W):
    """Convolve the 2-D ``input`` with filter ``W`` and apply ReLU.

    The input is reshaped to a single-image, single-channel batch for
    ``tf.nn.conv2d`` (stride 1, 'SAME' padding) and the result is reshaped
    back to two dimensions.
    """
    n_rows, n_cols = input.shape[0], input.shape[1]
    batched = tf.reshape(input, [1, n_rows, n_cols, 1])
    convolved = tf.nn.conv2d(batched, W, strides=[1,1,1,1], padding='SAME')
    activated = tf.nn.relu(convolved)

    return tf.reshape(activated, [activated.shape[1], activated.shape[2]])

# Evaluation code

In [None]:
def dcg_k(score_label, k):
    """Discounted cumulative gain over the first ``k`` entries of ``score_label``.

    Each entry is indexable with its label at position 1; an entry at rank
    ``r`` (0-based) contributes ``(2**label - 1) / log2(2 + r)``.
    """
    total = 0.
    for rank, entry in enumerate(score_label):
        if rank >= k:
            break
        gain = 2**entry[1] - 1
        total += gain / np.log2(2 + rank)
    return total

In [None]:
def ndcg_k(y_hat, y, k):
    """Normalised DCG at ``k`` of predictions ``y_hat`` against labels ``y``.

    The DCG of the items ordered by descending predicted score is divided by
    the DCG of the same items ordered by descending true label.
    """
    pairs = np.stack([y_hat, y], axis=1).tolist()
    by_score = sorted(pairs, key=lambda pair: pair[0], reverse=True)
    # The ideal ordering is derived from the score-sorted list, so items with
    # equal labels keep their score order.
    by_label = sorted(by_score, key=lambda pair: pair[1], reverse=True)
    ideal = dcg_k(by_label, k)
    achieved = dcg_k(by_score, k)
    return achieved / ideal

In [None]:
def call_ndcg(y_hat, y):
    """Average per-user NDCG over all users with at least two rated items.

    Args:
        y_hat: 2-D array of predicted ratings. It is transposed internally, so
            each column of the input is treated as one user.
        y: 2-D array of true ratings with the same layout; zero entries count
            as "not rated" and are ignored.

    Returns:
        Mean NDCG over the users that have two or more non-zero entries in
        ``y``, or ``0.0`` when no user qualifies (previously this case raised
        ``ZeroDivisionError``, e.g. for an empty test split).
    """
    ndcg_sum, num = 0, 0
    y_hat, y = y_hat.T, y.T
    n_users = y.shape[0]

    for i in range(n_users):
        rated = np.where(y[i])  # computed once, shared by labels and predictions
        y_i = y[i][rated]

        if y_i.shape[0] < 2:
            continue

        ndcg_sum += ndcg_k(y_hat[i][rated], y_i, y_i.shape[0])  # user-wise calculation
        num += 1

    if num == 0:
        return 0.0
    return ndcg_sum / num

# Hyper-parameter optimization

In [None]:
use_gpu = True

#create logging dictionary
# One result list per tuned hyper-parameter plus the validation RMSE.
logging_dict = {
    column: []
    for column in (
        "n_dim", "n_hid", "lambda_2", "lambda_s", "n_layers", "gk_size",
        "iter_p", "iter_f", "epoch_p", "epoch_f", "dot_scale", "rmse",
    )
}

#define parameter grid
# Every key maps to the list of candidate values that should be tried.
param_grid = dict(
    n_dim=[5],
    n_hid=[500],
    lambda_2=[20.0],
    lambda_s=[0.006],
    n_layers=[2],
    gk_size=[3],
    iter_p=[5],
    iter_f=[5],
    epoch_p=[30],
    epoch_f=[60],
    dot_scale=[1.0],
)

# Cartesian product of all candidate lists -> one settings dict per combination.
keys = tuple(param_grid.keys())
values = tuple(param_grid.values())
permutations_dicts = [
    dict(zip(keys, combination)) for combination in itertools.product(*values)
]

# Hold out 10% of the ratings as the validation split used during tuning.
path = os.path.join(DATA_DIR, "ratings.dat")
n_m, n_u, train_r, train_m, test_r, test_m = load_data(
    path=path, delimiter='::', frac=0.1, seed=1234)

# Train and validate one model per hyper-parameter combination. Every
# iteration builds a fresh graph, runs pre-training and fine-tuning in one
# session, and appends the best validation RMSE to the log written to disk.
for params_dict in tqdm(permutations_dicts):

  #reset tensorflow graph
  tf.reset_default_graph()

  #set hyperparameters
  n_hid = int(params_dict["n_hid"])
  n_dim = int(params_dict["n_dim"])
  n_layers = int(params_dict["n_layers"])
  gk_size = int(params_dict["gk_size"])
  #advanced hyper params 
  lambda_2 = float(params_dict["lambda_2"])
  lambda_s = float(params_dict["lambda_s"])
  iter_p = int(params_dict["iter_p"])
  iter_f = int(params_dict["iter_f"])
  epoch_p = int(params_dict["epoch_p"])
  epoch_f = int(params_dict["epoch_f"])
  dot_scale = float(params_dict["dot_scale"])



  #input placeholders
  R = tf.placeholder("float", [n_m, n_u])
  #build model pre-training 
  # n_layers hidden kernel layers followed by a linear output layer of width
  # n_u; the regularisation terms of all layers are summed into reg_losses.
  y = R
  reg_losses = None

  for i in range(n_layers):
      y, reg_loss = kernel_layer(y, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))
      reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss

  pred_p, reg_loss = kernel_layer(y, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2,  activation=tf.identity, name='out')
  reg_losses = reg_losses + reg_loss

  # L2 loss
  # train_m zeroes the error on entries that have no training rating.
  diff = train_m * (train_r - pred_p)
  sqE = tf.nn.l2_loss(diff)
  loss_p = sqE + reg_losses

  optimizer_p = tf.contrib.opt.ScipyOptimizerInterface(loss_p, options={'disp': True, 'maxiter': iter_p, 'maxcor': 10}, method='L-BFGS-B') 

  #build model fine-tuning
  # kernel_layer opens its variable scope with AUTO_REUSE, so the layers
  # named '0'..'out' below share their variables with the pre-training graph.

  y = R
  reg_losses = None

  # First pass over R: its output only feeds the global kernel, so the
  # regularisation terms returned here are discarded.
  for i in range(n_layers):
      y, _ = kernel_layer(y, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))

  y_dash, _ = kernel_layer(y, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, activation=tf.identity, name='out')

  gk = global_kernel(y_dash, gk_size, dot_scale)  # Global kernel
  y_hat = global_conv(train_r, gk)  # Global kernel-based rating matrix

  # Second pass: the convolved rating matrix goes through the same layers.
  for i in range(n_layers):
      y_hat, reg_loss = kernel_layer(y_hat, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))
      reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss

  pred_f, reg_loss = kernel_layer(y_hat, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, activation=tf.identity, name='out')
  reg_losses = reg_losses + reg_loss

  # L2 loss
  diff = train_m * (train_r - pred_f)
  sqE = tf.nn.l2_loss(diff)
  loss_f = sqE + reg_losses

  optimizer_f = tf.contrib.opt.ScipyOptimizerInterface(loss_f, options={'disp': True, 'maxiter': iter_f, 'maxcor': 10}, method='L-BFGS-B')




  # Best validation metrics seen during fine-tuning and the epoch they occurred in.
  best_rmse_ep, best_mae_ep, best_ndcg_ep = 0, 0, 0
  best_rmse, best_mae, best_ndcg = float("inf"), float("inf"), 0

  time_cumulative = 0

  init = tf.global_variables_initializer()

  with tf.Session() as sess:
      sess.run(init)
      # Pre-training: each epoch is one optimizer_p.minimize call (L-BFGS-B, maxiter=iter_p).
      for i in tqdm(range(epoch_p)):
          tic = time()
          optimizer_p.minimize(sess, feed_dict={R: train_r})
          pre = sess.run(pred_p, feed_dict={R: train_r})

          t = time() - tic
          time_cumulative += t
          
          # Predictions are clipped to the [1, 5] rating range before scoring.
          error = (test_m * (np.clip(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()  # test error
          test_rmse = np.sqrt(error)

          error_train = (train_m * (np.clip(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()  # train error
          train_rmse = np.sqrt(error_train)

          print('.-^-._' * 12)
          print('PRE-TRAINING')
          print('Epoch:', i+1, 'test rmse:', test_rmse, 'train rmse:', train_rmse)
          print('Time:', t, 'seconds')
          print('Time cumulative:', time_cumulative, 'seconds')
          print('.-^-._' * 12)

      # Fine-tuning: same scheme with optimizer_f; RMSE, MAE and NDCG are tracked.
      for i in tqdm(range(epoch_f)):
          tic = time()
          optimizer_f.minimize(sess, feed_dict={R: train_r})
          pre = sess.run(pred_f, feed_dict={R: train_r})

          t = time() - tic
          time_cumulative += t
          
          error = (test_m * (np.clip(pre, 1., 5.) - test_r) ** 2).sum() / test_m.sum()  # test error
          test_rmse = np.sqrt(error)

          error_train = (train_m * (np.clip(pre, 1., 5.) - train_r) ** 2).sum() / train_m.sum()  # train error
          train_rmse = np.sqrt(error_train)

          test_mae = (test_m * np.abs(np.clip(pre, 1., 5.) - test_r)).sum() / test_m.sum()
          train_mae = (train_m * np.abs(np.clip(pre, 1., 5.) - train_r)).sum() / train_m.sum()

          test_ndcg = call_ndcg(np.clip(pre, 1., 5.), test_r)
          train_ndcg = call_ndcg(np.clip(pre, 1., 5.), train_r)

          if test_rmse < best_rmse:
              best_rmse = test_rmse
              best_rmse_ep = i+1

          if test_mae < best_mae:
              best_mae = test_mae
              best_mae_ep = i+1

          if best_ndcg < test_ndcg:
              best_ndcg = test_ndcg
              best_ndcg_ep = i+1

          print('.-^-._' * 12)
          print('FINE-TUNING')
          print('Epoch:', i+1, 'test rmse:', test_rmse, 'test mae:', test_mae, 'test ndcg:', test_ndcg)
          print('Epoch:', i+1, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
          print('Time:', t, 'seconds')
          print('Time cumulative:', time_cumulative, 'seconds')
          print('.-^-._' * 12)

      # Final result
      print('Epoch:', best_rmse_ep, ' best rmse:', best_rmse)
      print('Epoch:', best_mae_ep, ' best mae:', best_mae)
      print('Epoch:', best_ndcg_ep, ' best ndcg:', best_ndcg)

      #log_hyperparameters
      logging_dict["n_dim"].append(n_dim);
      logging_dict["n_hid"].append(n_hid);
      logging_dict["lambda_2"].append(lambda_2);
      logging_dict["lambda_s"].append(lambda_s);
      logging_dict["n_layers"].append(n_layers);
      logging_dict["gk_size"].append(gk_size);
      logging_dict["iter_p"].append(iter_p)
      logging_dict["iter_f"].append(iter_f)
      logging_dict["epoch_p"].append(epoch_p)
      # The logged "epoch_f" is the fine-tuning epoch with the best validation
      # RMSE, not the configured epoch_f budget.
      logging_dict["epoch_f"].append(best_rmse_ep)
      logging_dict["dot_scale"].append(dot_scale)
      #val rmse
      logging_dict["rmse"].append(best_rmse)
      
      #import current log to csv
      # The full log is rewritten after every combination, sorted by ascending
      # RMSE so the best configuration is the first row.
      log_df = pd.DataFrame.from_dict(logging_dict)
      log_df.sort_values("rmse", inplace = True)
      log_df.to_csv(os.path.join(DATA_DIR,"log_df.csv"),index = False)

In [None]:
# Reload the persisted tuning log from disk and render it as a rich table.
log_path = os.path.join(DATA_DIR, "log_df.csv")
log_df = pd.read_csv(log_path)
print("Table with the results of the parameter tuning:")
display(log_df)

# Predictions using optimal hyper-parameters

**Train again on full data set with optimal hyper-parameters**

In [None]:
# Reload all ratings as training data: frac = 0.0 leaves the test split empty.
path = os.path.join(DATA_DIR, "ratings.dat")
n_m, n_u, train_r, train_m, test_r, test_m = load_data(path=path, delimiter='::', frac = 0.0, seed=1234)

tf.reset_default_graph()

#setup models with best performing hyperparamters (log_df was sorted by ascending validation rmse)
log_df = pd.read_csv(os.path.join(DATA_DIR, "log_df.csv"))
best_params = log_df.iloc[[0]]

n_hid = int(best_params["n_hid"].values[0])
n_dim = int(best_params["n_dim"].values[0])
n_layers = int(best_params["n_layers"].values[0])
gk_size = int(best_params["gk_size"].values[0])
#advanced hyper params 
lambda_2 = float(best_params["lambda_2"].values[0])
lambda_s = float(best_params["lambda_s"].values[0])
iter_p = int(best_params["iter_p"].values[0])
iter_f = int(best_params["iter_f"].values[0])
epoch_p = int(best_params["epoch_p"].values[0])
# The tuning loop stores the best fine-tuning epoch in the "epoch_f" column,
# so fine-tuning below runs for exactly that many epochs.
epoch_f = int(best_params["epoch_f"].values[0])
dot_scale = float(best_params["dot_scale"].values[0])


#input placeholders
R = tf.placeholder("float", [n_m, n_u])
#build model pre-training 
# n_layers hidden kernel layers followed by a linear output layer of width n_u;
# the regularisation terms of all layers are summed into reg_losses.
y = R
reg_losses = None

for i in range(n_layers):
    y, reg_loss = kernel_layer(y, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))
    reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss

pred_p, reg_loss = kernel_layer(y, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2,  activation=tf.identity, name='out')
reg_losses = reg_losses + reg_loss

# L2 loss
# train_m zeroes the error on entries that have no training rating.
diff = train_m * (train_r - pred_p)
sqE = tf.nn.l2_loss(diff)
loss_p = sqE + reg_losses

optimizer_p = tf.contrib.opt.ScipyOptimizerInterface(loss_p, options={'disp': True, 'maxiter': iter_p, 'maxcor': 10}, method='L-BFGS-B') 

#build model fine-tuning
# kernel_layer opens its variable scope with AUTO_REUSE, so the layers named
# '0'..'out' below share their variables with the pre-training graph.

y = R
reg_losses = None

# First pass over R: its output only feeds the global kernel, so the
# regularisation terms returned here are discarded.
for i in range(n_layers):
    y, _ = kernel_layer(y, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))

y_dash, _ = kernel_layer(y, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, activation=tf.identity, name='out')

gk = global_kernel(y_dash, gk_size, dot_scale)  # Global kernel
y_hat = global_conv(train_r, gk)  # Global kernel-based rating matrix

# Second pass: the convolved rating matrix goes through the same layers.
for i in range(n_layers):
    y_hat, reg_loss = kernel_layer(y_hat, n_hid = n_hid, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, name=str(i))
    reg_losses = reg_loss if reg_losses is None else reg_losses + reg_loss

pred_f, reg_loss = kernel_layer(y_hat, n_hid = n_u, n_dim = n_dim,lambda_s = lambda_s, lambda_2 = lambda_2, activation=tf.identity, name='out')
reg_losses = reg_losses + reg_loss

# L2 loss
diff = train_m * (train_r - pred_f)
sqE = tf.nn.l2_loss(diff)
loss_f = sqE + reg_losses

optimizer_f = tf.contrib.opt.ScipyOptimizerInterface(loss_f, options={'disp': True, 'maxiter': iter_f, 'maxcor': 10}, method='L-BFGS-B')



init = tf.global_variables_initializer()

# Train on the complete rating matrix (pre-training, then fine-tuning) and
# store the dense matrix of predicted ratings as raw_predictions.csv.
with tf.Session() as sess:
    sess.run(init)
    for i in tqdm(range(epoch_p)):
        optimizer_p.minimize(sess, feed_dict={R: train_r})
        pre = sess.run(pred_p, feed_dict={R: train_r})

        # Predictions are clipped to the [1, 5] rating range before scoring.
        clipped = np.clip(pre, 1., 5.)
        error_train = (train_m * (clipped - train_r) ** 2).sum() / train_m.sum()  # train error
        train_rmse = np.sqrt(error_train)

        print('.-^-._' * 12)
        print('PRE-TRAINING')
        print('Epoch:', i+1, 'train rmse:', train_rmse)
        print('.-^-._' * 12)

    for i in tqdm(range(epoch_f)):
        optimizer_f.minimize(sess, feed_dict={R: train_r})
        pre = sess.run(pred_f, feed_dict={R: train_r})

        clipped = np.clip(pre, 1., 5.)
        error_train = (train_m * (clipped - train_r) ** 2).sum() / train_m.sum()  # train error
        train_rmse = np.sqrt(error_train)

        train_mae = (train_m * np.abs(clipped - train_r)).sum() / train_m.sum()

        train_ndcg = call_ndcg(clipped, train_r)

        print('.-^-._' * 12)
        print('FINE-TUNING')
        print('Epoch:', i+1, 'train rmse:', train_rmse, 'train mae:', train_mae, 'train ndcg:', train_ndcg)
        print('.-^-._' * 12)

    print("finished training on whole training data")

    #make raw predictions of our data
    # The model reconstructs the whole (movies x users) matrix from the observed
    # training ratings, so the predictions are read from the fine-tuned output
    # pred_f fed with train_r - the same path the validation RMSE was measured
    # on. Previously the sampleSubmission placeholder ratings were fed into the
    # pre-training output pred_p, a path that was never validated.
    # NOTE(review): the conversion cell indexes this matrix by raw movie/user
    # id, which assumes every id 0..n-1 occurs in ratings.dat so that the dense
    # indices from load_data equal the raw ids - confirm for other data sets.
    pre = sess.run(pred_f, feed_dict={R: train_r})
    pd.DataFrame(pre).to_csv(os.path.join(DATA_DIR, "raw_predictions.csv"))

    print("finished raw predictions")

In [None]:
# raw_predictions.csv was written with the DataFrame index as its first column;
# index_col=0 consumes it so the remaining columns line up with the user ids.
raw_predictions = pd.read_csv(os.path.join(DATA_DIR, "raw_predictions.csv"), index_col=0)
#convert raw predictions to correct format
output = to_predict_pd.to_numpy()
# Clip to the valid rating range [1, 5], exactly as done when the validation
# RMSE was computed during hyper-parameter tuning.
final_predictions = np.clip(raw_predictions.to_numpy(), 1., 5.)
# Rows are movies and columns are users; one vectorised lookup replaces the
# per-row Python loop (whose loop variable also shadowed the builtin `id`).
output[:, 1] = final_predictions[pred_movies, pred_users]

In [None]:
# Assemble the submission table and write it below DATA_DIR/final_predictions.
submission_df = pd.DataFrame(output, columns = ['Id', 'Prediction'])
# exist_ok=True makes the cell re-runnable without swallowing an exception.
os.makedirs(os.path.join(DATA_DIR,"final_predictions"), exist_ok=True)
submission_df.to_csv(os.path.join(DATA_DIR,"final_predictions/GLocal_K.csv"),index = False)