#### 1. Imports

In [1]:
import os
import sys
import shutil
import numpy as np
import pandas as pd
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (map, ndcg_at_k, precision_at_k, recall_at_k)
from recommenders.utils.constants import SEED as DEFAULT_SEED
from recommenders.utils.notebook_utils import store_metadata

print("System version: {}".format(sys.version))
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow version: {}".format(tf.__version__))

from sklearn.model_selection import train_test_split

ModuleNotFoundError: No module named 'numpy'

### 2. Setting Up Parameters

In [3]:
# top k items to recommend
TOP_K = 10

# Select  data size: 100k, 1m, 10m, or 20m
DATA_SIZE = '10000'

# Model parameters
EPOCHS = 10 # ! Changed from 100 because it was taking forever
BATCH_SIZE = 256

SEED = DEFAULT_SEED  # Set None for non-deterministic results


### 3. Loading Dataframe


In [4]:
# * Load & Set Up Dataframe
CF_DF = pd.read_csv('spotify_songs.csv')
CF_DF = CF_DF.loc[0:10000]

CF_DF['userID'] = pd.factorize(CF_DF.track_id)[0]
CF_DF['itemID'] = pd.factorize(CF_DF.playlist_id)[0]

CF_DF.rename(columns={'track_popularity': 'rating'}, inplace=True)

CF_DF.sort_values('userID')


Unnamed: 0,track_id,track_name,track_artist,rating,track_album_id,track_album_name,track_album_release_date,playlist_name,playlist_id,playlist_genre,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,userID,itemID
0,6f807x0ima9a1j3VPbc7VN,I Don't Care (with Justin Bieber) - Loud Luxur...,Ed Sheeran,66,2oCs0DGTsRO98Gh5ZSl2Cx,I Don't Care (with Justin Bieber) [Loud Luxury...,2019-06-14,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0583,0.1020,0.000000,0.0653,0.518,122.036,194754,0,0
1,0r7CVbZTWZgbTCYdfa2P31,Memories - Dillon Francis Remix,Maroon 5,67,63rPSO264uRjW1X5E6cWv6,Memories (Dillon Francis Remix),2019-12-13,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0373,0.0724,0.004210,0.3570,0.693,99.972,162600,1,0
2,1z1Hg7Vb0AhHDiEmnDE79l,All the Time - Don Diablo Remix,Zara Larsson,70,1HoSmj2eLcsrR0vE9gThr4,All the Time (Don Diablo Remix),2019-07-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,0,0.0742,0.0794,0.000023,0.1100,0.613,124.008,176616,2,0
3,75FpbthrwQmzHlBJLuGdC7,Call You Mine - Keanu Silva Remix,The Chainsmokers,60,1nqYsOef1yKKuGOVchbsk6,Call You Mine - The Remixes,2019-07-19,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.1020,0.0287,0.000009,0.2040,0.277,121.956,169093,3,0
4,1e8PAfcKUYoKkxPhrHqw4x,Someone You Loved - Future Humans Remix,Lewis Capaldi,69,7m7vv9wlQ4i0LFuJiE2zsQ,Someone You Loved (Future Humans Remix),2019-03-05,Pop Remix,37i9dQZF1DXcZDD7cfEKhW,pop,...,1,0.0359,0.0803,0.000000,0.0833,0.725,123.976,189052,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9996,7KksdXBhdufqWDxGxyt4I7,no14 - feat. Dani Faiv,tha Supreme,72,42wflBbrb9OchJfd3qiGRO,23 6451,2019-11-15,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,...,0,0.0987,0.2140,0.000002,0.1550,0.644,90.059,176440,9361,140
9997,2985JpMNKG1QT5i4RSjehz,MAI (feat. Lele Blade & Fred De Palma),Giaime,63,1MvD0ifvf4I8QAWYm3pxZe,MAI (feat. Lele Blade & Fred De Palma),2019-11-08,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,...,0,0.0531,0.1060,0.000028,0.1690,0.197,100.003,170400,9362,140
9998,2TxDcppMX95JQv9WB5rtuB,NO SPIE,FSK SATELLITE,68,12qonhRlBJqknWeMEvjmxY,FSK TRAPSHIT,2019-07-11,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,...,0,0.2680,0.4840,0.002010,0.0747,0.443,100.015,141598,9363,140
9999,34UHhwlly3SHryWuVfMOJm,Fuori E Dentro (feat. tha Supreme),Gemitaiz,71,7rdX1715VhDFVwTVbFLcXM,Scatola Nera,2019-09-19,Zona Trap,37i9dQZF1DWXU2naFUn37x,rap,...,1,0.1190,0.1110,0.000000,0.1180,0.538,123.947,170000,9364,140


### 4. Training & Testing Split

In [5]:
#train, test = python_chrono_split(CF_DF, 0.75)

train, test = train_test_split(CF_DF, test_size=0.25, random_state=SEED)


In [6]:
test = test[test["userID"].isin(train["userID"].unique())]
test = test[test["itemID"].isin(train["itemID"].unique())]


In [7]:
leave_one_out_test = test.groupby("userID").last().reset_index()


In [18]:
train_file = "./splits/train.csv"
test_file = "./splits/test.csv"
leave_one_out_test_file = "./splits/leave_one_out_test.csv"
train = train.sort_values(by='userID')
train.to_csv(train_file, index=False)
test.to_csv(test_file, index=False)

In [19]:
leave_one_out_test.to_csv(leave_one_out_test_file, index=False)

### 5. NCF Dataset 

In [20]:
data = NCFDataset(train_file=train_file, test_file=leave_one_out_test_file, seed=SEED, overwrite_test_file_full=True)


INFO:recommenders.models.ncf.dataset:Indexing ./splits/train.csv ...
INFO:recommenders.models.ncf.dataset:Indexing ./splits/leave_one_out_test.csv ...
INFO:recommenders.models.ncf.dataset:Creating full leave-one-out test file ./splits/leave_one_out_test_full.csv ...
100%|██████████| 224/224 [00:00<00:00, 872.80it/s]
INFO:recommenders.models.ncf.dataset:Indexing ./splits/leave_one_out_test_full.csv ...


### 6. Train NCF 

In [11]:
model = NCF(
        n_users=data.n_users, 
        n_items=data.n_items,
        model_type="NeuMF",
        n_factors=4,
        layer_sizes=[16,8,4],
        n_epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        learning_rate=1e-3,
        verbose=10,
        seed=SEED
    )

2024-03-23 22:13:31.409115: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:388] MLIR V1 optimization pass is not enabled


In [12]:
with Timer() as train_time:
        model.fit(data)
        
print("Took {} seconds for training.".format(train_time.interval))


INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [14.47s]: train_loss = 0.336337 


Took 135.517088667 seconds for training.


### 7. Prediction

In [13]:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
        for (_, row) in test.iterrows()]


predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'prediction'])
predictions.head()

Unnamed: 0,userID,itemID,prediction
0,133,25,0.298322
1,1533,43,0.161377
2,321,4,0.23616
3,325,48,0.114047
4,6476,136,0.046168


In [14]:
with Timer() as test_time:
    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 4.390577082999982 seconds for prediction.


### 8. Generic Evaluation

In [15]:
eval_map = map(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall, sep='\n')

MAP:	0.029262
NDCG:	0.047443
Precision@K:	0.011161
Recall@K:	0.104911


### 9. Leave-One-Out Evaluation

In [16]:
k = TOP_K

ndcgs = []
hit_ratio = []

for b in data.test_loader():
    user_input, item_input, labels = b
    output = model.predict(user_input, item_input, is_list=True)

    output = np.squeeze(output)
    rank = sum(output >= output[0])
    if rank <= k:
        ndcgs.append(1 / np.log(rank + 1))
        hit_ratio.append(1)
    else:
        ndcgs.append(0)
        hit_ratio.append(0)

eval_ndcg = np.mean(ndcgs)
eval_hr = np.mean(hit_ratio)

print("HR:\t%f" % eval_hr)
print("NDCG:\t%f" % eval_ndcg)

HR:	0.129464
NDCG:	0.082759


## 10. Pre-Training

### a. Storing Parameters of GMF & MLP

In [17]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="GMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

model.save(dir_name=".pretrain/GMF")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [12.44s]: train_loss = 0.461282 


Took 130.759875166 seconds for training.


In [21]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="MLP",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

model.save(dir_name=".pretrain/MLP")

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [12.84s]: train_loss = 0.452138 


Took 138.01384050000001 seconds for training.


### b. Loading pre-trained GMF and MLP model

In [22]:
model = NCF(
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[16,8,4],
    n_epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=1e-3,
    verbose=10,
    seed=SEED
)

model.load(gmf_dir=".pretrain/GMF", mlp_dir=".pretrain/MLP", alpha=0.5)
with Timer() as train_time:
    model.fit(data)

print("Took {} seconds for training.".format(train_time.interval))

INFO:recommenders.models.ncf.ncf_singlenode:Epoch 10 [12.72s]: train_loss = 0.302816 


Took 136.03880037499994 seconds for training.


### c. Compare with not-pre-trained NeuMF

In [23]:
with Timer() as test_time:

    users, items, preds = [], [], []
    item = list(train.itemID.unique())
    for user in train.userID.unique():
        user = [user] * len(item) 
        users.extend(user)
        items.extend(item)
        preds.extend(list(model.predict(user, item, is_list=True)))

    all_predictions = pd.DataFrame(data={"userID": users, "itemID":items, "prediction":preds})

    merged = pd.merge(train, all_predictions, on=["userID", "itemID"], how="outer")
    all_predictions = merged[merged.rating.isnull()].drop('rating', axis=1)

print("Took {} seconds for prediction.".format(test_time.interval))

Took 3.921901165999998 seconds for prediction.


In [24]:
eval_map2 = map(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_ndcg2 = ndcg_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_precision2 = precision_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)
eval_recall2 = recall_at_k(test, all_predictions, col_prediction='prediction', k=TOP_K)

print("MAP:\t%f" % eval_map2,
      "NDCG:\t%f" % eval_ndcg2,
      "Precision@K:\t%f" % eval_precision2,
      "Recall@K:\t%f" % eval_recall2, sep='\n')

MAP:	0.057864
NDCG:	0.075369
Precision@K:	0.014286
Recall@K:	0.127232
