# Model setup and training: GMF / MLP / NeuMF on MovieLens-1M

In [1]:
import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator

In [2]:
# Configuration dict handed to GMFEngine when `config = gmf_config` is selected.
gmf_config = {'alias': 'gmf_factor8neg4-implict',
              'num_epoch': 1,
              'batch_size': 1024,
              # Alternative optimizer settings, kept for reference:
              # 'optimizer': 'sgd',
              # 'sgd_lr': 1e-3,
              # 'sgd_momentum': 0.9,
              # 'optimizer': 'rmsprop',
              # 'rmsprop_lr': 1e-3,
              # 'rmsprop_alpha': 0.99,
              # 'rmsprop_momentum': 0,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              # Kept equal to mlp_config / neumf_config (6040 users, 3706 items), which
              # reference GMF checkpoints as 'pretrain_mf'; the ratings loaded in this
              # notebook contain 6040 users. The previous 7985 / 4498 did not match.
              # NOTE(review): presumably these size the embedding tables - confirm in gmf.py.
              'num_users': 6040,
              'num_items': 3706,
              'latent_dim': 8,
              'num_negative': 4,
              'l2_regularization': 0,  # alternative: 0.01
              'use_cuda': False,
              'device_id': 0,
              # Template with four placeholders: two unformatted, then HR and NDCG as
              # 4-decimal floats.
              'model_dir': 'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [None]:
6040
3703
----
6040
1878

In [3]:
# Configuration dict for MLPEngine (used when `config = mlp_config` is selected).
mlp_config = dict(
    alias='mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
    num_epoch=200,
    batch_size=256,  # alternative: 1024
    optimizer='adam',
    adam_lr=1e-3,
    num_users=6040,
    num_items=3706,
    latent_dim=8,
    num_negative=4,
    # First entry is the width of the concatenated user + item latent vectors.
    layers=[16, 64, 32, 16, 8],
    # The MLP model is sensitive to hyper-parameters, this one included.
    l2_regularization=1e-7,
    use_cuda=False,
    device_id=7,
    pretrain=True,
    pretrain_mf='checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
    model_dir='checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
)

In [4]:
# Configuration dict for NeuMFEngine (used when `config = neumf_config` is selected).
neumf_config = dict(
    alias='pretrain_neumf_factor8neg4',
    num_epoch=200,
    batch_size=1024,
    optimizer='adam',
    adam_lr=1e-3,
    num_users=6040,
    num_items=3706,
    latent_dim_mf=8,
    latent_dim_mlp=8,
    num_negative=4,
    # First entry is the width of the concatenated user + item latent vectors.
    layers=[16, 32, 16, 8],
    l2_regularization=0.01,
    use_cuda=False,
    device_id=7,
    pretrain=True,
    pretrain_mf='checkpoints/{}'.format('gmf_factor8neg4_Epoch100_HR0.6391_NDCG0.2852.model'),
    pretrain_mlp='checkpoints/{}'.format('mlp_factor8neg4_Epoch100_HR0.5606_NDCG0.2463.model'),
    model_dir='checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
)

In [5]:
# Load Data
# The ratings file has no header row and uses '::' between fields, so column
# names are supplied here.
ml1m_dir = 'data/ml-1m/ratings.dat'
ml1m_rating = pd.read_csv(
    ml1m_dir,
    sep='::',
    header=None,
    names=['uid', 'mid', 'rating', 'timestamp'],
    engine='python',  # multi-character separators need the python parser
)

In [14]:
# Smallest and largest rating value present in the loaded data, as a (min, max) tuple.
rating_range = (
    ml1m_rating['rating'].min(),
    ml1m_rating['rating'].max(),
)

print(rating_range)

(1, 5)


In [7]:
# print(type(ml1m_rating['timestamp'][0]))

In [8]:
# Reindex
# Replace the raw ids ('uid', 'mid') with contiguous 0-based ids ('userId', 'itemId'),
# numbered in order of first appearance in the ratings frame.

# This cell overwrites ml1m_rating and drops 'uid' / 'mid', so it can only run once per
# load. Fail with a clear message instead of an opaque KeyError on a second run.
if not {'uid', 'mid'}.issubset(ml1m_rating.columns):
    raise RuntimeError(
        "ml1m_rating no longer has 'uid'/'mid' columns (already reindexed?); "
        "re-run the cell that reads ratings.dat before running this cell again"
    )

# drop_duplicates() keeps the first occurrence of each id and returns a new frame,
# so the argument-less .reindex() that used to follow it (a no-op) is dropped.
user_id = ml1m_rating[['uid']].drop_duplicates()

print("initial user_id")
print(user_id.head())

# .assign builds a new frame rather than setting a column on a filtered result.
user_id = user_id.assign(userId=np.arange(len(user_id)))

print("final user_id")
print(user_id.head())

# Left merge: every rating row is kept and gains its new userId.
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')

print("initial ml1m_rating ")
print(ml1m_rating.head())

item_id = ml1m_rating[['mid']].drop_duplicates()

# These two labels previously said "user_id" although they print the item lookup.
print("initial item_id")
print(item_id.head())

item_id = item_id.assign(itemId=np.arange(len(item_id)))

print("final item_id")
print(item_id.head())

ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')

print("final ml1m_rating ")
print(ml1m_rating.head())

# Keep only the new ids plus the rating payload; the raw 'uid' / 'mid' columns are dropped.
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]

print("final final ml1m_rating ")
print(ml1m_rating.head())

print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(), ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(), ml1m_rating.itemId.max()))

initial user_id
     uid
0      1
53     2
182    3
233    4
254    5
final user_id
     uid  userId
0      1       0
53     2       1
182    3       2
233    4       3
254    5       4
initial ml1m_rating 
   uid   mid  rating  timestamp  userId
0    1  1193       5  978300760       0
1    1   661       3  978302109       0
2    1   914       3  978301968       0
3    1  3408       4  978300275       0
4    1  2355       5  978824291       0
initial user_id
    mid
0  1193
1   661
2   914
3  3408
4  2355
final user_id
    mid  itemId
0  1193       0
1   661       1
2   914       2
3  3408       3
4  2355       4
final ml1m_rating 
   uid   mid  rating  timestamp  userId  itemId
0    1  1193       5  978300760       0       0
1    1   661       3  978302109       0       1
2    1   914       3  978301968       0       2
3    1  3408       4  978300275       0       3
4    1  2355       5  978824291       0       4
final final ml1m_rating 
   userId  itemId  rating  timestamp
0       0 

In [9]:
# Number of rating rows in the frame.
print(ml1m_rating.shape[0])

1000209


In [15]:
# Preview the first five rows of the ratings frame.
print(ml1m_rating.head(n=5))

   userId  itemId  rating  timestamp
0       0       0       5  978300760
1       0       1       3  978302109
2       0       2       3  978301968
3       0       3       4  978300275
4       0       4       5  978824291


In [17]:
# Show every rating equal to 0 (an empty Series means there are none).
print(ml1m_rating.loc[ml1m_rating['rating'] == 0, 'rating'])

Series([], Name: rating, dtype: int64)


In [19]:
# Rank each rating within its user by timestamp, newest first (rank 1 = latest);
# equal timestamps are ranked in row order because of method='first'.
print(
    ml1m_rating
    .groupby(['userId'])['timestamp']
    .rank(method='first', ascending=False)
)

0           42.0
1           23.0
2           28.0
3           47.0
4            4.0
           ...  
1000204    161.0
1000205    293.0
1000206    305.0
1000207    234.0
1000208    246.0
Name: timestamp, Length: 1000209, dtype: float64


In [10]:
# DataLoader for training
# Build the project-local SampleGenerator (imported from `data`) from the ratings frame.
# NOTE(review): the warning printed when this cell ran quotes a line that sets every
# rating > 0 to 1.0 through chained indexing, so the generator appears to binarize
# ratings into implicit feedback - confirm in data.py.
sample_generator = SampleGenerator(ratings=ml1m_rating)
# Evaluation data exposed by the generator; passed to engine.evaluate() in the training loop.
evaluate_data = sample_generator.evaluate_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['rating'][ratings['rating'] > 0] = 1.0


(6040, 2)    userId                                   interacted_items
0       0  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...
1       1  {0, 18, 20, 42, 47, 48, 52, 53, 54, 55, 56, 57...
2       2  {128, 4, 5, 22, 166, 168, 41, 44, 175, 176, 17...
3       3  {139, 26, 156, 43, 44, 48, 63, 64, 208, 209, 2...
4       4  {3, 4, 9, 18, 27, 38, 39, 43, 48, 51, 59, 62, ...
-------
(6040, 3)    userId                                   interacted_items  \
0       0  {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...   
1       1  {0, 18, 20, 42, 47, 48, 52, 53, 54, 55, 56, 57...   
2       2  {128, 4, 5, 22, 166, 168, 41, 44, 175, 176, 17...   
3       3  {139, 26, 156, 43, 44, 48, 63, 64, 208, 209, 2...   
4       4  {3, 4, 9, 18, 27, 38, 39, 43, 48, 51, 59, 62, ...   

                                      negative_items  
0  {53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 6...  
1  {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...  
2  {0, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1...  
3

In [11]:
# Specify the exact model
# Exactly one (config, engine) pair is active at a time. To switch models, comment out
# the active pair and uncomment one of the alternatives below.
# NOTE(review): mlp_config / neumf_config set 'pretrain': True and point at files under
# checkpoints/; presumably those files must exist before switching - confirm.
config = gmf_config
engine = GMFEngine(config)
# config = mlp_config
# engine = MLPEngine(config)
# config = neumf_config
# engine = NeuMFEngine(config)

In [13]:
# Per epoch: build a train loader, train, evaluate, then save a checkpoint named
# from the alias, epoch and the two evaluation metrics.
for epoch in range(config['num_epoch']):
    print(f'Epoch {epoch} starts !')
    print('-' * 80)
    # The loader is rebuilt on every epoch.
    # NOTE(review): presumably so negative samples are redrawn - confirm in data.py.
    train_loader = sample_generator.instance_a_train_loader(
        config['num_negative'],
        config['batch_size'],
    )
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)
    engine.save(config['alias'], epoch, hit_ratio, ndcg)

Epoch 0 starts !
--------------------------------------------------------------------------------
[Training Epoch 0] Batch 0, Loss 0.8320139646530151
[Training Epoch 0] Batch 1, Loss 0.8574685454368591
[Training Epoch 0] Batch 2, Loss 0.865487813949585
[Training Epoch 0] Batch 3, Loss 0.8466443419456482
[Training Epoch 0] Batch 4, Loss 0.8294466733932495
[Training Epoch 0] Batch 5, Loss 0.8464298844337463
[Training Epoch 0] Batch 6, Loss 0.8484600186347961
[Training Epoch 0] Batch 7, Loss 0.8535138964653015
[Training Epoch 0] Batch 8, Loss 0.8483507633209229
[Training Epoch 0] Batch 9, Loss 0.8167732954025269
[Training Epoch 0] Batch 10, Loss 0.8496021032333374
[Training Epoch 0] Batch 11, Loss 0.8444838523864746
[Training Epoch 0] Batch 12, Loss 0.8398794531822205
[Training Epoch 0] Batch 13, Loss 0.8328506946563721
[Training Epoch 0] Batch 14, Loss 0.859897255897522
[Training Epoch 0] Batch 15, Loss 0.8263131380081177
[Training Epoch 0] Batch 16, Loss 0.8426458835601807
[Training Epo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_in_top_k['ndcg'] = test_in_top_k['rank'].apply(lambda x: math.log(2) / math.log(1 + x)) # the rank starts from 1
