In [None]:
pip install ray

Collecting ray
  Downloading ray-2.12.0-cp310-cp310-manylinux2014_x86_64.whl (65.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.3/65.3 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ray
Successfully installed ray-2.12.0


In [None]:
pip install "ray[tune]"

Collecting tensorboardX>=1.9 (from ray[tune])
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tensorboardX
Successfully installed tensorboardX-2.6.2.2


In [None]:
pip install kmeans-pytorch

Collecting kmeans-pytorch
  Downloading kmeans_pytorch-0.3-py3-none-any.whl (4.4 kB)
Installing collected packages: kmeans-pytorch
Successfully installed kmeans-pytorch-0.3


In [None]:
! pip install recbole

Collecting recbole
  Downloading recbole-1.2.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog==4.7.2 (from recbole)
  Downloading colorlog-4.7.2-py2.py3-none-any.whl (10 kB)
Collecting colorama==0.4.4 (from recbole)
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting thop>=0.1.1.post2207130030 (from recbole)
  Downloading thop-0.1.1.post2209072238-py3-none-any.whl (15 kB)
Collecting texttable>=0.9.0 (from recbole)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->recbole)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->recbole)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.

In [1]:
import os
from datetime import datetime
import time

import numpy as np
import pandas as pd
from recbole.quick_start import run_recbole
from tqdm import tqdm

In [2]:
# RecBole configuration for the `hm` dataset, kept as YAML text and written to
# hm/config.yaml (the file that run() passes to run_recbole via config_file_list).
# This variant loads cluster_0..cluster_19 as interaction features, all four item
# side columns (actorID, directorID, genreID, countryID), and reports top-12
# ranking metrics with MAP@12 as the validation metric.
# Note: "\\t" is an escaped backslash in this non-raw Python string, so the file
# receives the two characters \t inside the double quotes.
cfg_str = """
data_path: ./
dataset: hm
field_separator: "\\t"
USER_ID_FIELD: userID
ITEM_ID_FIELD: movieID
RATING_FIELD: ~
TIMESTAMP_FIELD: timestamp
show_progress: false

load_col:
    inter: [userID, movieID, cluster_0, cluster_1, cluster_2 ,cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, timestamp]
    user: [userID, age, gender]
    item: [movieID, actorID, directorID, genreID, countryID]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
train_neg_sample_args:
  uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: None
    order: TO
    mode: uni50
#metrics: ['AUC', 'Logloss']
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 12
valid_metric: MAP@12
"""


# Mode "w" truncates any existing hm/config.yaml, so whichever config cell was
# executed last determines what the next run() call trains with.
with open("hm/config.yaml", "w") as f:
    f.write(cfg_str)

In [4]:
# Preview the first rows of the tab-separated interaction file.
df = pd.read_csv('hm/hm.inter', delimiter='\t')
df.head()

Unnamed: 0,userID:token,movieID:token,cluster_0:float,cluster_1:float,cluster_2:float,cluster_3:float,cluster_4:float,cluster_5:float,cluster_6:float,cluster_7:float,...,cluster_11:float,cluster_12:float,cluster_13:float,cluster_14:float,cluster_15:float,cluster_16:float,cluster_17:float,cluster_18:float,cluster_19:float,timestamp:float
0,1,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1188270000000.0
1,5,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1188270000000.0
2,13,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1188270000000.0
3,29,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1188270000000.0
4,34,2,1,1,1,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1188270000000.0


In [5]:
def run(model_name):
    """Train and evaluate one RecBole model on the ``hm`` dataset.

    Delegates to ``run_recbole`` with ``hm/config.yaml`` as the only config
    file. For the models listed in ``no_neg_sampling_models`` a ``config_dict``
    override is added that switches training-time negative sampling off; every
    other model runs with the YAML settings untouched.

    Args:
        model_name: Name of the RecBole model, e.g. ``"DeepFM"`` or ``"SASRec"``.

    Returns:
        The value returned by ``run_recbole``, unchanged.
    """
    no_neg_sampling_models = frozenset({
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    })

    recbole_kwargs = {
        "model": model_name,
        "dataset": 'hm',
        "config_file_list": ['hm/config.yaml'],
    }
    if model_name in no_neg_sampling_models:
        recbole_kwargs["config_dict"] = {
            # Pre-1.1 RecBole spelling; kept so older releases behave as before.
            "neg_sampling": None,
            # RecBole >= 1.1 (this notebook installs 1.2.0) reads this key instead.
            # Without it the `train_neg_sample_args` block from the YAML stays
            # active and negative sampling is not actually disabled.
            "train_neg_sample_args": None,
        }
    return run_recbole(**recbole_kwargs)


In [6]:
%%time
#model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list = ["DeepFM","FFM"] # Context-aware
#model_list += ["GRU4Rec", "SHAN"] # Sequential
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)



running DeepFM...
It took 24.92 mins
{'best_valid_score': 0.1345, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@12', 0.3422), ('mrr@12', 0.285), ('ndcg@12', 0.2207), ('hit@12', 0.6361), ('precision@12', 0.0611), ('map@12', 0.1345)]), 'test_result': OrderedDict([('recall@12', 0.1915), ('mrr@12', 0.1538), ('ndcg@12', 0.1274), ('hit@12', 0.2833), ('precision@12', 0.0245), ('map@12', 0.0929)])}
running FFM...




It took 31.18 mins
{'best_valid_score': 0.2426, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@12', 0.4057), ('mrr@12', 0.4269), ('ndcg@12', 0.335), ('hit@12', 0.7122), ('precision@12', 0.0848), ('map@12', 0.2426)]), 'test_result': OrderedDict([('recall@12', 0.649), ('mrr@12', 0.425), ('ndcg@12', 0.4291), ('hit@12', 0.8896), ('precision@12', 0.1215), ('map@12', 0.3003)])}
CPU times: user 55min 13s, sys: 17.9 s, total: 55min 31s
Wall time: 56min 6s


In [7]:
# RecBole configuration for the `hm` dataset, kept as YAML text and written to
# hm/config.yaml (the file that run() passes to run_recbole via config_file_list).
# This variant loads cluster_0..cluster_19 as interaction features, all four item
# side columns (actorID, directorID, genreID, countryID), and reports AUC and
# Logloss. `valid_metric` is commented out, so this file does not set it.
# Note: "\\t" is an escaped backslash in this non-raw Python string, so the file
# receives the two characters \t inside the double quotes.
cfg_str = """
data_path: ./
dataset: hm
field_separator: "\\t"
USER_ID_FIELD: userID
ITEM_ID_FIELD: movieID
RATING_FIELD: ~
TIMESTAMP_FIELD: timestamp
show_progress: false

load_col:
    inter: [userID, movieID, cluster_0, cluster_1, cluster_2 ,cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, cluster_17, cluster_18, cluster_19, timestamp]
    user: [userID, age, gender]
    item: [movieID, actorID, directorID, genreID, countryID]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
train_neg_sample_args:
  uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: None
    order: TO
    mode: uni50
metrics: ['AUC', 'Logloss']
#metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 12
#valid_metric: MAP@12
"""


# Mode "w" truncates any existing hm/config.yaml, so whichever config cell was
# executed last determines what the next run() call trains with.
with open("hm/config.yaml", "w") as f:
    f.write(cfg_str)

In [8]:
def run(model_name):
    """Train and evaluate one RecBole model on the ``hm`` dataset.

    Delegates to ``run_recbole`` with ``hm/config.yaml`` as the only config
    file. For the models listed in ``no_neg_sampling_models`` a ``config_dict``
    override is added that switches training-time negative sampling off; every
    other model runs with the YAML settings untouched.

    Args:
        model_name: Name of the RecBole model, e.g. ``"DeepFM"`` or ``"SASRec"``.

    Returns:
        The value returned by ``run_recbole``, unchanged.
    """
    no_neg_sampling_models = frozenset({
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    })

    recbole_kwargs = {
        "model": model_name,
        "dataset": 'hm',
        "config_file_list": ['hm/config.yaml'],
    }
    if model_name in no_neg_sampling_models:
        recbole_kwargs["config_dict"] = {
            # Pre-1.1 RecBole spelling; kept so older releases behave as before.
            "neg_sampling": None,
            # RecBole >= 1.1 (this notebook installs 1.2.0) reads this key instead.
            # Without it the `train_neg_sample_args` block from the YAML stays
            # active and negative sampling is not actually disabled.
            "train_neg_sample_args": None,
        }
    return run_recbole(**recbole_kwargs)


In [None]:
%%time
#model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list = ["DeepFM","FFM"] # Context-aware
#model_list += ["GRU4Rec", "SHAN"] # Sequential
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running DeepFM...




In [None]:
# Preview the first rows of the tab-separated item feature file.
df = pd.read_csv('hm/hm.item', delimiter='\t')
df.head()

Unnamed: 0,movieID:token,actorID:token,directorID:token,genreID:token,countryID:token
0,1343,1008767-jessica_lange,martin_scorsese,Thriller,USA
1,1343,1008767-jessica_lange,martin_scorsese,Thriller,USA
2,2080,alan_reed_sr,clyde_geronimi,Animation,USA
3,2953,abdoulaye_ngom,chris_columbus,Children,USA
4,2953,abdoulaye_ngom,chris_columbus,Children,USA


In [None]:
# RecBole configuration for the `hm` dataset, kept as YAML text and written to
# hm/config.yaml (the file that run() passes to run_recbole via config_file_list).
# This variant loads a reduced feature set: cluster_0..cluster_16 as interaction
# features and only genreID and countryID as item side columns. It reports AUC
# and Logloss; `valid_metric` is commented out, so this file does not set it.
# Note: "\\t" is an escaped backslash in this non-raw Python string, so the file
# receives the two characters \t inside the double quotes.
cfg_str = """
data_path: ./
dataset: hm
field_separator: "\\t"
USER_ID_FIELD: userID
ITEM_ID_FIELD: movieID
RATING_FIELD: ~
TIMESTAMP_FIELD: timestamp
show_progress: false

load_col:
    inter: [userID, movieID, cluster_0, cluster_1, cluster_2 ,cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, timestamp]
    user: [userID, age, gender]
    item: [movieID, genreID, countryID]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
train_neg_sample_args:
  uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: None
    order: TO
    mode: uni50
metrics: ['AUC', 'Logloss']
#metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 12
#valid_metric: MAP@12
"""


# Mode "w" truncates any existing hm/config.yaml, so whichever config cell was
# executed last determines what the next run() call trains with.
with open("hm/config.yaml", "w") as f:
    f.write(cfg_str)

In [None]:
def run(model_name):
    """Train and evaluate one RecBole model on the ``hm`` dataset.

    Delegates to ``run_recbole`` with ``hm/config.yaml`` as the only config
    file. For the models listed in ``no_neg_sampling_models`` a ``config_dict``
    override is added that switches training-time negative sampling off; every
    other model runs with the YAML settings untouched.

    Args:
        model_name: Name of the RecBole model, e.g. ``"DeepFM"`` or ``"SASRec"``.

    Returns:
        The value returned by ``run_recbole``, unchanged.
    """
    no_neg_sampling_models = frozenset({
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    })

    recbole_kwargs = {
        "model": model_name,
        "dataset": 'hm',
        "config_file_list": ['hm/config.yaml'],
    }
    if model_name in no_neg_sampling_models:
        recbole_kwargs["config_dict"] = {
            # Pre-1.1 RecBole spelling; kept so older releases behave as before.
            "neg_sampling": None,
            # RecBole >= 1.1 (this notebook installs 1.2.0) reads this key instead.
            # Without it the `train_neg_sample_args` block from the YAML stays
            # active and negative sampling is not actually disabled.
            "train_neg_sample_args": None,
        }
    return run_recbole(**recbole_kwargs)


In [None]:
%%time
#model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list = ["DeepFM","FFM"] # Context-aware
#model_list += ["GRU4Rec", "SHAN"] # Sequential
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running DeepFM...




It took 23.95 mins
{'best_valid_score': 0.4793, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('auc', 0.4793), ('logloss', 0.6291)]), 'test_result': OrderedDict([('auc', 0.4449), ('logloss', 0.6511)])}
running FFM...




It took 25.76 mins
{'best_valid_score': 0.5609, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('auc', 0.5609), ('logloss', 0.2317)]), 'test_result': OrderedDict([('auc', 0.6749), ('logloss', 0.2148)])}
CPU times: user 48min 24s, sys: 1min 2s, total: 49min 27s
Wall time: 49min 42s


In [None]:
# RecBole configuration for the `hm` dataset, kept as YAML text and written to
# hm/config.yaml (the file that run() passes to run_recbole via config_file_list).
# This variant loads a reduced feature set: cluster_0..cluster_16 as interaction
# features and only genreID and countryID as item side columns. It reports
# top-12 ranking metrics with MAP@12 as the validation metric.
# Note: "\\t" is an escaped backslash in this non-raw Python string, so the file
# receives the two characters \t inside the double quotes.
cfg_str = """
data_path: ./
dataset: hm
field_separator: "\\t"
USER_ID_FIELD: userID
ITEM_ID_FIELD: movieID
RATING_FIELD: ~
TIMESTAMP_FIELD: timestamp
show_progress: false

load_col:
    inter: [userID, movieID, cluster_0, cluster_1, cluster_2 ,cluster_3, cluster_4, cluster_5, cluster_6, cluster_7, cluster_8, cluster_9, cluster_10, cluster_11, cluster_12, cluster_13, cluster_14, cluster_15, cluster_16, timestamp]
    user: [userID, age, gender]
    item: [movieID, genreID, countryID]

epochs: 5
learning_rate: 0.01
user_inter_num_interval: "[0,inf)"
item_inter_num_interval: "[0,inf)"
filter_inter_by_user_or_item: false
train_neg_sample_args:
  uniform: 1
eval_args:
    split: {'RS': [4, 1, 1]}
    group_by: None
    order: TO
    mode: uni50
#metrics: ['AUC', 'Logloss']
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 12
valid_metric: MAP@12
"""


# Mode "w" truncates any existing hm/config.yaml, so whichever config cell was
# executed last determines what the next run() call trains with.
with open("hm/config.yaml", "w") as f:
    f.write(cfg_str)

In [None]:
def run(model_name):
    """Train and evaluate one RecBole model on the ``hm`` dataset.

    Delegates to ``run_recbole`` with ``hm/config.yaml`` as the only config
    file. For the models listed in ``no_neg_sampling_models`` a ``config_dict``
    override is added that switches training-time negative sampling off; every
    other model runs with the YAML settings untouched.

    Args:
        model_name: Name of the RecBole model, e.g. ``"DeepFM"`` or ``"SASRec"``.

    Returns:
        The value returned by ``run_recbole``, unchanged.
    """
    no_neg_sampling_models = frozenset({
        "MultiVAE",
        "MultiDAE",
        "MacridVAE",
        "RecVAE",
        "GRU4Rec",
        "NARM",
        "STAMP",
        "NextItNet",
        "TransRec",
        "SASRec",
        "BERT4Rec",
        "SRGNN",
        "GCSAN",
        "GRU4RecF",
        "FOSSIL",
        "SHAN",
        "RepeatNet",
        "HRM",
        "NPE",
    })

    recbole_kwargs = {
        "model": model_name,
        "dataset": 'hm',
        "config_file_list": ['hm/config.yaml'],
    }
    if model_name in no_neg_sampling_models:
        recbole_kwargs["config_dict"] = {
            # Pre-1.1 RecBole spelling; kept so older releases behave as before.
            "neg_sampling": None,
            # RecBole >= 1.1 (this notebook installs 1.2.0) reads this key instead.
            # Without it the `train_neg_sample_args` block from the YAML stays
            # active and negative sampling is not actually disabled.
            "train_neg_sample_args": None,
        }
    return run_recbole(**recbole_kwargs)


In [None]:
%%time
#model_list = ["Pop", "ItemKNN", "BPR", "NeuMF", "RecVAE", "LightGCN"] # General
model_list = ["DeepFM","FFM"] # Context-aware
#model_list += ["GRU4Rec", "SHAN"] # Sequential
for model_name in model_list:
    print(f"running {model_name}...")
    start = time.time()
    result = run(model_name)
    t = time.time() - start
    print(f"It took {t/60:.2f} mins")
    print(result)

running DeepFM...




It took 19.19 mins
{'best_valid_score': 0.0899, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@12', 0.1985), ('mrr@12', 0.2213), ('ndcg@12', 0.1451), ('hit@12', 0.4185), ('precision@12', 0.0394), ('map@12', 0.0899)]), 'test_result': OrderedDict([('recall@12', 0.2212), ('mrr@12', 0.1533), ('ndcg@12', 0.1332), ('hit@12', 0.289), ('precision@12', 0.0246), ('map@12', 0.0946)])}
running FFM...




It took 19.92 mins
{'best_valid_score': 0.0941, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@12', 0.2279), ('mrr@12', 0.2282), ('ndcg@12', 0.1529), ('hit@12', 0.4338), ('precision@12', 0.037), ('map@12', 0.0941)]), 'test_result': OrderedDict([('recall@12', 0.3208), ('mrr@12', 0.2227), ('ndcg@12', 0.2005), ('hit@12', 0.4985), ('precision@12', 0.0494), ('map@12', 0.1353)])}
CPU times: user 38min 48s, sys: 6.29 s, total: 38min 54s
Wall time: 39min 6s
