In [1]:
import pandas as pd
import numpy as np
from gmf import GMFEngine
from mlp import MLPEngine
from neumf import NeuMFEngine
from data import SampleGenerator
import os

In [2]:
# Report whether PyTorch can see a CUDA device in this environment.
import torch
torch.cuda.is_available()

False

In [3]:
# Show which files are present in the local 'data' directory.
os.listdir('data')

['cleaned_data.csv']

In [4]:
# Read the raw interaction log into a three-column frame (uid, mid, timestamp).
# NOTE(review): `names=` is passed without `header=0`, so if the CSV starts with
# a header line it is loaded as the first data row. The following cell's
# `iloc[1:, :]` appears to compensate for that; confirm before changing either.
data_dir = 'data/cleaned_data.csv'
tdc_record = pd.read_csv(
    data_dir,
    engine='python',
    names=['uid', 'mid', 'timestamp'],
)

In [5]:
# Reindex: replace raw uid / mid values with dense integer ids.
# The first row is discarded (presumably the CSV header read in as data; confirm).
tdc_record = tdc_record.iloc[1:, :]

# Each distinct uid gets a consecutive userId in order of first appearance.
user_id = tdc_record[['uid']].drop_duplicates()
user_id = user_id.assign(userId=np.arange(len(user_id)))
tdc_record = tdc_record.merge(user_id, on=['uid'], how='left')

# Same scheme for items: distinct mid -> consecutive itemId.
item_id = tdc_record[['mid']].drop_duplicates()
item_id = item_id.assign(itemId=np.arange(len(item_id)))

# Attach itemId, mark every row with a constant rating of 1, keep only the four
# working columns, and pin the rating/timestamp dtypes.
tdc_record = (
    tdc_record
    .merge(item_id, on=['mid'], how='left')
    .assign(rating=1.0)
    .loc[:, ['userId', 'itemId', 'rating', 'timestamp']]
    .astype({'rating': 'int32', 'timestamp': 'float64'})
)

user_lo, user_hi = tdc_record.userId.min(), tdc_record.userId.max()
item_lo, item_hi = tdc_record.itemId.min(), tdc_record.itemId.max()
print('Range of userId is [{}, {}]'.format(user_lo, user_hi))
print('Range of itemId is [{}, {}]'.format(item_lo, item_hi))
print(tdc_record.dtypes)

Range of userId is [0, 11410]
Range of itemId is [0, 13202]
userId         int64
itemId         int64
rating         int32
timestamp    float64
dtype: object


In [6]:
# Number of distinct items and users; used later to size the model configs.
num_itemid = tdc_record['itemId'].nunique()
num_userid = tdc_record['userId'].nunique()

# Remove fully duplicated rows (does not change the distinct counts above).
tdc_record = tdc_record.drop_duplicates()

# Per-user row counts, displayed as the cell output.
tdc_record.groupby('userId').count()

Unnamed: 0_level_0,itemId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,2,2
1,2,2,2
2,148,148,148
3,38,38,38
4,11,11,11
5,3,3,3
6,3,3,3
7,4,4,4
8,474,474,474
9,72,72,72


In [7]:
# Double-check for "useless" rows: rows belonging to a user who has exactly one
# interaction. (Presumably such users cannot be split into train and test parts
# by the sample generator; confirm against data.py.)

df = tdc_record.groupby('userId').count()

# userIds whose per-user row count is exactly 1.
user_id_drop = df[df['itemId'] == 1].index

# Vectorised membership test instead of a Python-level iterrows() scan: collects
# the index labels of every row whose userId is in user_id_drop.
drop_index = tdc_record.index[tdc_record['userId'].isin(user_id_drop)].tolist()

if len(drop_index) == 0:
    print('No useless data')
else:
    print('found {} useless datapoints'.format(len(drop_index)))
    tdc_record.drop(drop_index, inplace=True)
    print('data cleaned!')

No useless data


In [9]:
# DataLoader for training
# Build the project's SampleGenerator from the cleaned interaction frame; the
# training function defined below asks it for a train loader each epoch.
sample_generator = SampleGenerator(ratings=tdc_record)
# Evaluation data exposed by the generator; passed to engine.evaluate() during training.
evaluate_data = sample_generator.evaluate_data

start
begin preprocess_ratings
begin setting pools
creating negative items
0


  4%|▍         | 476/11411 [00:00<00:04, 2462.18it/s]

1


100%|██████████| 11411/11411 [00:05<00:00, 2189.18it/s]
  2%|▏         | 176/11411 [00:00<00:07, 1567.41it/s]

2


100%|██████████| 11411/11411 [00:03<00:00, 3475.34it/s]


split_loo
Done!
Begin the loop...


In [None]:
# Training Engine

def train_model(model, config):
    """Train an engine for ``config['num_epoch']`` epochs, saving after each one.

    Args:
        model: Callable invoked as ``model(config)`` to build the engine. The
            result must provide ``train_an_epoch``, ``evaluate`` and ``save``.
        config: Mapping read here for ``'num_epoch'``, ``'num_negative'``,
            ``'batch_size'`` and ``'alias'``, and passed whole to ``model``.

    Uses the notebook-level ``sample_generator`` and ``evaluate_data``.
    Returns nothing.
    """
    trainer = model(config)
    separator = '-' * 80
    for epoch_idx in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch_idx))
        print(separator)
        # A new loader is requested every epoch (presumably so negatives are
        # re-sampled each time; confirm in data.py).
        loader = sample_generator.instance_a_train_loader(
            config['num_negative'],
            config['batch_size'],
        )
        trainer.train_an_epoch(loader, epoch_id=epoch_idx)
        hr, ndcg_score = trainer.evaluate(evaluate_data, epoch_id=epoch_idx)
        # One checkpoint per epoch, tagged with that epoch's metrics.
        trainer.save(config['alias'], epoch_idx, hr, ndcg_score)

In [None]:
#setup configuration for GMF
# 'alias' and 'model_dir' determine the checkpoint file names; the checkpoint
# lookup cell further down matches files by this alias, so keep them in sync.
gmf_config = {'alias': 'gmf_factor8neg4-implict',
              'num_epoch': 200,
              'batch_size': 4,
              # Alternative optimizer settings, kept for reference:
              # 'optimizer': 'sgd',
              # 'sgd_lr': 1e-3,
              # 'sgd_momentum': 0.9,
              # 'optimizer': 'rmsprop',
              # 'rmsprop_lr': 1e-3,
              # 'rmsprop_alpha': 0.99,
              # 'rmsprop_momentum': 0,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': num_userid,
              'num_items': num_itemid,
              'latent_dim': 8,
              'num_negative': 4,
              'l2_regularization': 0, # 0.01
              # Follow the runtime CUDA check instead of hard-coding True: the
              # check earlier in this notebook reported no CUDA device.
              'use_cuda': torch.cuda.is_available(),
              'device_id': 0,
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [None]:
# Train GMF Model
# Runs the full epoch loop with the settings in gmf_config.
train_model(GMFEngine, gmf_config)

In [None]:
#find file name
# Locate the checkpoint saved for the last GMF epoch. The prefix is derived from
# gmf_config so it stays in sync with the alias and epoch count used for
# training (epochs are numbered from 0, so the last one is num_epoch - 1).
gmf_prefix = '{}_Epoch{}'.format(gmf_config['alias'], gmf_config['num_epoch'] - 1)
gmf_candidates = [name for name in os.listdir('checkpoints/') if name.startswith(gmf_prefix)]
if not gmf_candidates:
    # Fail here rather than carrying a placeholder name into the pretrain paths.
    raise FileNotFoundError(
        "no checkpoint starting with '{}' found in 'checkpoints/'".format(gmf_prefix))
# os.listdir returns entries in arbitrary order; if several runs left a matching
# file, take the most recently modified one.
gmf_model = max(gmf_candidates,
                key=lambda name: os.path.getmtime(os.path.join('checkpoints', name)))
print (gmf_model)

In [None]:
# Configuration for the MLP model, initialised from the trained GMF checkpoint.
# NOTE(review): the alias says "bz256" but batch_size is 4. The alias is left
# unchanged because the MLP checkpoint lookup cell matches files by it.
mlp_config = {'alias': 'mlp_factor8neg4_bz256_166432168_pretrain_reg_0.0000001',
              'num_epoch': 200,
              'batch_size': 4,  # 1024,
              'optimizer': 'adam',
              'adam_lr': 1e-3,
              'num_users': num_userid,
              'num_items': num_itemid,
              'latent_dim': 8,
              'num_negative': 4,
              'layers': [16,64,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
              'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
              # Follow the runtime CUDA check instead of hard-coding True: the
              # check earlier in this notebook reported no CUDA device.
              'use_cuda': torch.cuda.is_available(),
              'device_id': 0,
              'pretrain': True,
              'pretrain_mf': os.path.join('checkpoints',gmf_model),
              'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'}

In [None]:
# Train the MLP model with the settings in mlp_config.
train_model(MLPEngine, mlp_config)
# TODO: add a learning rate scheduler

In [None]:
# Locate the checkpoint saved for the last MLP epoch. The prefix is derived from
# mlp_config so it stays in sync with the alias and epoch count used for
# training (epochs are numbered from 0, so the last one is num_epoch - 1).
mlp_prefix = '{}_Epoch{}'.format(mlp_config['alias'], mlp_config['num_epoch'] - 1)
mlp_candidates = [name for name in os.listdir('checkpoints/') if name.startswith(mlp_prefix)]
if not mlp_candidates:
    # Fail here rather than carrying a placeholder name into the pretrain paths.
    raise FileNotFoundError(
        "no checkpoint starting with '{}' found in 'checkpoints/'".format(mlp_prefix))
# os.listdir returns entries in arbitrary order; if several runs left a matching
# file, take the most recently modified one.
mlp_model = max(mlp_candidates,
                key=lambda name: os.path.getmtime(os.path.join('checkpoints', name)))
print (mlp_model)

In [None]:
# Configuration for the NeuMF model, initialised from the GMF and MLP checkpoints
# located by the lookup cells above.
neumf_config = {'alias': 'pretrain_neumf_factor8neg4',
                'num_epoch': 200,
                'batch_size': 4,
                'optimizer': 'adam',
                'adam_lr': 1e-3,
                'num_users': num_userid,
                'num_items': num_itemid,
                'latent_dim_mf': 8,
                'latent_dim_mlp': 8,
                'num_negative': 4,
                'layers': [16,64,32,16,8],  # layers[0] is the concat of latent user vector & latent item vector
                'l2_regularization': 0.0000001,
                # Follow the runtime CUDA check instead of hard-coding True: the
                # check earlier in this notebook reported no CUDA device.
                'use_cuda': torch.cuda.is_available(),
                'device_id': 0,
                'pretrain': True,
                'pretrain_mf': 'checkpoints/{}'.format(gmf_model),
                'pretrain_mlp': 'checkpoints/{}'.format(mlp_model),
                'model_dir':'checkpoints/{}_Epoch{}_HR{:.4f}_NDCG{:.4f}.model'
                }

In [None]:
# Train the NeuMF model with the settings in neumf_config.
train_model(NeuMFEngine, neumf_config)

In [None]:
# Marker printed once every cell above has finished running.
print('Done!')

In [34]:
#for item in os.listdir('checkpoints'):
#    os.remove(os.path.join('checkpoints', item))

In [15]:
#for item in os.listdir('checkpoints'):
#    if item[:3]=='pre':
#        os.remove(os.path.join('checkpoints', item))