In [1]:
import pandas as pd
import numpy as np
from model.gmf import GMFEngine
from model.mlp import MLPEngine
from model.neumf import NeuMFEngine
from data import SampleGenerator
import os

In [2]:
# Confirm that PyTorch can see a CUDA device; the model configs further down set 'use_cuda': True.
# NOTE(review): this import sits outside the first import cell — consider moving it there.
import torch
torch.cuda.is_available()

True

In [3]:
# Show the contents of the 'data' folder so the input CSV name can be checked before loading.
os.listdir('data')

['cleaned_data.csv']

In [4]:
# Load Data
# Column names are supplied explicitly and no `header` argument is given, so a
# header line in the CSV (if any) comes through as the first data row.
data_dir = 'data/cleaned_data.csv'
tdc_record = pd.read_csv(
    data_dir,
    names=['uid', 'mid', 'timestamp'],
    engine='python',
)

In [5]:
# Reindex
# Skip row 0 — presumably the CSV's own header line, which was read as data
# because the load cell passes explicit `names` (TODO confirm against the file).
tdc_record = tdc_record.iloc[1:, :]

# Give every distinct uid a dense integer userId, numbered by first appearance.
user_id = (
    tdc_record[['uid']]
    .drop_duplicates()
    .assign(userId=lambda frame: np.arange(len(frame)))
)
tdc_record = tdc_record.merge(user_id, on=['uid'], how='left')

# Same mapping for mid -> itemId.
item_id = (
    tdc_record[['mid']]
    .drop_duplicates()
    .assign(itemId=lambda frame: np.arange(len(frame)))
)
tdc_record = tdc_record.merge(item_id, on=['mid'], how='left')

# Every row gets rating 1; keep only the four model columns and pin the dtypes.
tdc_record = (
    tdc_record
    .assign(rating=1.0)
    [['userId', 'itemId', 'rating', 'timestamp']]
    .astype({'rating': 'int32', 'timestamp': 'float64'})
)

print('Range of userId is [{}, {}]'.format(tdc_record.userId.min(), tdc_record.userId.max()))
print('Range of itemId is [{}, {}]'.format(tdc_record.itemId.min(), tdc_record.itemId.max()))
print(tdc_record.dtypes)

Range of userId is [0, 11410]
Range of itemId is [0, 13202]
userId         int32
itemId         int32
rating         int32
timestamp    float64
dtype: object


In [6]:
# Distinct id counts; the model configs pass these on as 'num_items' / 'num_users'.
# They are taken before the de-duplication below, which removes repeated rows
# only and therefore cannot change the set of distinct ids.
num_itemid = tdc_record['itemId'].nunique(dropna=False)
num_userid = tdc_record['userId'].nunique(dropna=False)

# Collapse rows that are identical in every column.
tdc_record = tdc_record.drop_duplicates()

In [7]:
# Double-check for "useless" rows: interactions belonging to users who have
# exactly one recorded item.
# NOTE(review): presumably such users cannot be split into train/test by the
# leave-one-out step of SampleGenerator — confirm against data.py.

# Number of (non-null) itemId entries per user.
interactions_per_user = tdc_record.groupby('userId')['itemId'].count()
user_id_drop = interactions_per_user[interactions_per_user == 1].index

# Vectorised membership test instead of walking the frame row by row.
drop_index = tdc_record.index[tdc_record['userId'].isin(user_id_drop)]

if len(drop_index) == 0:
    print('No useless data')
else:
    print('found {} useless datapoints'.format(len(drop_index)))
    # Rebind rather than mutate in place so re-running the cell stays predictable.
    tdc_record = tdc_record.drop(index=drop_index)
    print('data cleaned!')

No useless data


In [8]:
# DataLoader for training
# Build the sample generator from the cleaned interaction table. Both names
# below are read by train_model as notebook-level globals, so this cell must
# run before any training cell.
sample_generator = SampleGenerator(ratings=tdc_record)
evaluate_data = sample_generator.evaluate_data

start
begin preprocess_ratings
begin setting pools
creating negative items
0
1


100%|██████████████████████████████████████████████████████████████████████████| 11411/11411 [00:03<00:00, 2975.65it/s]


2


100%|██████████████████████████████████████████████████████████████████████████| 11411/11411 [00:02<00:00, 4649.78it/s]


split_loo
Done!
Begin the loop...


In [9]:
# Training Engine
def train_model(model, config, save_every=20):
    """Train and evaluate one engine for ``config['num_epoch']`` epochs, saving checkpoints.

    Depends on two notebook-level globals created in the data-loader cell:
    ``sample_generator`` (provides the per-epoch train loader) and
    ``evaluate_data`` (passed to ``engine.evaluate``).

    Args:
        model: Engine class such as GMFEngine; instantiated as ``model(config)``.
        config: Settings dict. This function itself reads 'num_epoch',
            'num_negative', 'batch_size' and 'alias'; the whole dict is handed
            to the engine constructor.
        save_every: Interval, in epochs, between periodic checkpoints.
            Defaults to 20, the value that used to be hard-coded.

    Raises:
        ValueError: If ``save_every`` is not a positive integer.

    Checkpoints are written via ``engine.save`` in three situations:
      * every ``save_every``-th epoch (including epoch 0),
      * the final epoch,
      * whenever the hit ratio beats the best seen so far (this call passes
        ``backup=False``; what that flag does is defined by the engine).
    An epoch can therefore trigger two save calls.
    """
    if not isinstance(save_every, int) or save_every <= 0:
        raise ValueError('save_every must be a positive integer, got {!r}'.format(save_every))

    engine = model(config)
    num_epoch = config['num_epoch']
    best_hit = 0
    for epoch in range(num_epoch):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 70)
        # A fresh train loader is requested from the sample generator every epoch.
        train_loader = sample_generator.instance_a_train_loader(config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio, ndcg = engine.evaluate(evaluate_data, epoch_id=epoch)

        is_periodic = epoch % save_every == 0
        is_last = epoch == num_epoch - 1
        if is_periodic or is_last:
            engine.save(config['alias'], epoch, hit_ratio, ndcg)

        if hit_ratio > best_hit:
            best_hit = hit_ratio
            engine.save(config['alias'], epoch, hit_ratio, ndcg, backup=False)

In [10]:
# Setup configuration for GMF
# 'alias' is handed to engine.save by train_model, and the checkpoint-lookup
# cell matches file names on it, so keep its exact spelling in sync.
gmf_config = {
    'alias': 'gmf_factor8neg4-implict',
    'num_epoch': 200,
    'batch_size': 4,
    # Alternative optimizer settings, kept for reference:
    # 'optimizer': 'sgd',
    # 'sgd_lr': 1e-3,
    # 'sgd_momentum': 0.9,
    # 'optimizer': 'rmsprop',
    # 'rmsprop_lr': 1e-3,
    # 'rmsprop_alpha': 0.99,
    # 'rmsprop_momentum': 0,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': num_userid,
    'num_items': num_itemid,
    'latent_dim': 8,
    'num_negative': 4,
    'l2_regularization': 0,  # 0.01
    'use_cuda': True,
    'device_id': 0,
    'model_dir': 'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
}

In [17]:
# NOTE(review): looks like a leftover scratch calculation — no other cell reads its value.
12%5

2

In [None]:
# Train GMF Model
# Runs gmf_config['num_epoch'] epochs; train_model passes gmf_config['alias'] to
# engine.save for every checkpoint it writes.
train_model(GMFEngine, gmf_config)

Epoch 0 starts !
--------------------------------------------------------------------------------
[Training Stage Epoch 0] Loss 0.5723008513450623


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  test_in_top_k['ndcg'] = test_in_top_k['rank'].apply(lambda x: math.log(2) / math.log(1 + x)) # the rank starts from 1


[Evaluating Epoch 0] HR = 0.0970, NDCG = 0.0436
Epoch 1 starts !
--------------------------------------------------------------------------------
[Training Stage Epoch 1] Loss 0.925452709197998
Epoch 2 starts !
--------------------------------------------------------------------------------
[Training Stage Epoch 2] Loss 0.5774462223052979
Epoch 3 starts !
--------------------------------------------------------------------------------
[Training Stage Epoch 3] Loss 0.2143295854330063
Epoch 4 starts !
--------------------------------------------------------------------------------


In [None]:
# Find the GMF checkpoint written for the final training epoch.
# train_model saves under config['alias'] at epoch num_epoch - 1, so the prefix
# is derived from gmf_config instead of repeating the alias and epoch by hand.
gmf_prefix = '{}_Epoch{}'.format(gmf_config['alias'], gmf_config['num_epoch'] - 1)

# Sorted so the choice does not depend on the directory listing order.
gmf_matches = sorted(
    name for name in os.listdir('checkpoints/')
    if name.startswith(gmf_prefix)
    # Skip longer epoch numbers that merely share the prefix (e.g. Epoch1990).
    and not name[len(gmf_prefix):len(gmf_prefix) + 1].isdigit()
)

if not gmf_matches:
    # Stop here instead of passing a placeholder file name on to the pretrain configs.
    raise FileNotFoundError(
        "No checkpoint starting with '{}' found in 'checkpoints/'; "
        "run the GMF training cell to completion first.".format(gmf_prefix)
    )

gmf_model = gmf_matches[0]
print(gmf_model)

In [None]:
# Configuration for the MLP model, warm-started from the GMF checkpoint.
# NOTE(review): the alias mentions 'bz256' while 'batch_size' is 4 — confirm which is intended.
mlp_config = {
    'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
    'num_epoch': 200,
    'batch_size': 4,  # 1024,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': num_userid,
    'num_items': num_itemid,
    'latent_dim': 8,
    'num_negative': 4,
    # layers[0] is the concat of latent user vector & latent item vector
    'layers': [16, 64, 32, 16, 8],
    # MLP model is sensitive to hyper params
    'l2_regularization': 0.0000001,
    'use_cuda': True,
    'device_id': 0,
    'pretrain': True,
    'pretrain_mf': os.path.join('checkpoints', gmf_model),
    'model_dir': 'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
}

In [None]:
# Train the MLP model; mlp_config enables 'pretrain' and points 'pretrain_mf'
# at the GMF checkpoint located in the lookup cell above.
train_model(MLPEngine, mlp_config)
# TODO: add a learning rate scheduler

In [None]:
# Find the MLP checkpoint written for the final training epoch.
# train_model saves under config['alias'] at epoch num_epoch - 1, so the prefix
# is derived from mlp_config instead of repeating the alias and epoch by hand.
mlp_prefix = '{}_Epoch{}'.format(mlp_config['alias'], mlp_config['num_epoch'] - 1)

# Sorted so the choice does not depend on the directory listing order.
mlp_matches = sorted(
    name for name in os.listdir('checkpoints/')
    if name.startswith(mlp_prefix)
    # Skip longer epoch numbers that merely share the prefix (e.g. Epoch1990).
    and not name[len(mlp_prefix):len(mlp_prefix) + 1].isdigit()
)

if not mlp_matches:
    # Stop here instead of passing a placeholder file name on to the NeuMF config.
    raise FileNotFoundError(
        "No checkpoint starting with '{}' found in 'checkpoints/'; "
        "run the MLP training cell to completion first.".format(mlp_prefix)
    )

mlp_model = mlp_matches[0]
print(mlp_model)

In [None]:
# Configuration for NeuMF, warm-started from both the GMF and the MLP checkpoints.
neumf_config = {
    'alias': 'pretrain_neumf_factor8neg4',
    'num_epoch': 200,
    'batch_size': 4,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'num_users': num_userid,
    'num_items': num_itemid,
    'latent_dim_mf': 8,
    'latent_dim_mlp': 8,
    'num_negative': 4,
    # layers[0] is the concat of latent user vector & latent item vector
    'layers': [16, 64, 32, 16, 8],
    'l2_regularization': 0.0000001,
    'use_cuda': True,
    'device_id': 0,
    'pretrain': True,
    'pretrain_mf': 'checkpoints/{}'.format(gmf_model),
    'pretrain_mlp': 'checkpoints/{}'.format(mlp_model),
    'model_dir': 'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model',
}

In [None]:
# Train the NeuMF model; neumf_config points 'pretrain_mf' and 'pretrain_mlp'
# at the GMF and MLP checkpoints located in the lookup cells above.
train_model(NeuMFEngine, neumf_config)

In [None]:
# Completion marker: reached only after the cells above have finished.
print('Done!')

In [34]:
#for item in os.listdir('checkpoints'):
#    os.remove(os.path.join('checkpoints', item))

In [15]:
#for item in os.listdir('checkpoints'):
#    if item[:3]=='pre':
#        os.remove(os.path.join('checkpoints', item))