In [2]:
import sys
import os
import torch
import cornac
# import papermill as pm
# import scrapbook as sb
import pandas as pd
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k
from recommenders.models.cornac.cornac_utils import predict_ranking
from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED

print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

PyTorch version: 1.12.0+cu113
Cornac version: 1.10.0


In [16]:
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

data = {'userID': [], 'itemID': [],	'value': []}	
data = pd.DataFrame(data)

train_csv = 'data/train.csv'
train_df = pd.read_csv(train_csv)

for i, _ in enumerate(tqdm(range(train_df.shape[0]))):

    user_id = train_df.iloc[i]['user_id']
    course_seq = train_df.iloc[i]['course_id']
    course_list = course_seq.split(' ')
    #print(course_seq)

    for course_id in course_list:

        row = {'userID': user_id, 'itemID': course_id, 'value': 1.0}
        data = data.append(row, ignore_index=True)
        
    # if i == 3:
    #     break

data

100%|██████████| 59737/59737 [03:06<00:00, 321.15it/s]


Unnamed: 0,userID,itemID,value
0,5bdecbfffec014002166796a,5f194354cad0d086f3ee24cf,1.0
1,5fedf958af850a915c86362c,5bfd47782d018e0020e4b0e4,1.0
2,5fedf958af850a915c86362c,5fc4a352d375951a03cc0d45,1.0
3,5fedf958af850a915c86362c,6090dda489d06d6a564c9a78,1.0
4,5fedf958af850a915c86362c,5fd9b1ce0fb8aa8b32928d5b,1.0
...,...,...,...
139603,60e66f29be3e3b0006c4db75,55ae208a7b4d9910001198f2,1.0
139604,60e2b05ee742c300072ff5b3,600538ff0cf6c91168243a88,1.0
139605,60da0995947dfc0fb61ef296,5edf464fb54d0f59f0e7d96a,1.0
139606,5fd5ea567642a9ec30c89643,5a2170d5a6d4a5001ec3148d,1.0


In [17]:
train_set = cornac.data.Dataset.from_uir(data.itertuples(index=False), seed=42)

print('Number of users: {}'.format(train_set.num_users))
print('Number of items: {}'.format(train_set.num_items))

Number of users: 59737
Number of items: 664


In [18]:
# Model parameters
LATENT_DIM = 250
ENCODER_DIMS = [100]
ACT_FUNC = "tanh"
LIKELIHOOD = "pois"
NUM_EPOCHS = 500
BATCH_SIZE = 128
LEARNING_RATE = 0.001

bivae = cornac.models.BiVAECF(
    k=LATENT_DIM,
    encoder_structure=ENCODER_DIMS,
    act_fn=ACT_FUNC,
    likelihood=LIKELIHOOD,
    n_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    seed=SEED,
    use_gpu=torch.cuda.is_available(),
    verbose=True
)

with Timer() as t:
    bivae.fit(train_set)
print("Took {} seconds for training.".format(t))

  0%|          | 0/500 [00:00<?, ?it/s]

Took 592.1513 seconds for training.


In [19]:
with Timer() as t:
    all_predictions = predict_ranking(bivae, data, usercol='userID', itemcol='itemID', remove_seen=False)
print("Took {} seconds for prediction.".format(t))
all_predictions

Took 5.3015 seconds for prediction.


Unnamed: 0,userID,itemID,prediction
0,5bdecbfffec014002166796a,5f194354cad0d086f3ee24cf,2.529816e-03
1,5bdecbfffec014002166796a,5bfd47782d018e0020e4b0e4,2.427331e-03
2,5bdecbfffec014002166796a,5fc4a352d375951a03cc0d45,4.960350e-03
3,5bdecbfffec014002166796a,6090dda489d06d6a564c9a78,6.467618e-04
4,5bdecbfffec014002166796a,5fd9b1ce0fb8aa8b32928d5b,1.150339e-02
...,...,...,...
39665363,5fd5ea567642a9ec30c89643,55ae34247b4d99100011990d,4.823733e-07
39665364,5fd5ea567642a9ec30c89643,5d8dfd62a7e8fe0020d77b7e,1.486876e-07
39665365,5fd5ea567642a9ec30c89643,586a2519f108e00800c26e61,8.343290e-08
39665366,5fd5ea567642a9ec30c89643,5f55fb39b34335d28416bd0c,1.823020e-07


In [20]:
import csv
from tqdm import tqdm

pred_csv = 'output.csv'
test_seen_csv = 'data/test_seen.csv'
test_seen_df = pd.read_csv(test_seen_csv)
user_id_list = test_seen_df['user_id'].values

with open(pred_csv, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['user_id', 'course_id'])

    for i, id in enumerate(tqdm(user_id_list)):
        temp = all_predictions.loc[all_predictions['userID'] == id]
        temp.sort_values(by=['prediction'], inplace=True, ascending=False)
        temp = temp['itemID'].values

        rec_list = []
        for j, t in enumerate(temp):
            if len(rec_list) > 50:
                break
            
            rec_list.append(t)

        rec_sequence = ' '.join(rec_list)        
        writer.writerow([id, rec_sequence])

print('Done !')

100%|██████████| 7205/7205 [1:55:19<00:00,  1.04it/s]
