In [1]:
# IMPORTS
from itertools import chain, zip_longest
from datetime import datetime
import pandas as pd
import numpy as np
import gc
import random
import os
import json

# Run a garbage-collection pass, then take the reference time that the status
# message below reports elapsed time against.
gc.collect()
start = datetime.now()
elapsed = datetime.now() - start
print('{} - Initialized environment'.format(elapsed))

0:00:00 - Initialized environment


## Quick Guide
Here we will train a Bayesian Personalized Ranking (BPR-MF) model on the MovieLens 1M dataset.

To start, we assume that the data files (`ratings.dat`, etc.) have been extracted to the `datasets/ml-1m` folder.

The following block parses the raw ratings file into a pandas DataFrame and caches the result so that later runs can load it directly.

In [2]:
def get_interactions(filename):
    """Parse a '::'-delimited ratings file into an interactions DataFrame.

    The C parser only supports single-character separators, so the file is
    split on ':' instead. Every '::' then produces an empty field between two
    real fields; those fields are read under placeholder names and dropped.

    Args:
        filename: Path to a file whose lines look like
            ``user_id::item_id::rating::timestamp``.

    Returns:
        DataFrame with the columns user_id, item_id, rating and timestamp.
    """
    columns = ['user_id', 'item_id', 'rating', 'timestamp']
    # One placeholder ('0', '1', '2') for the empty field after every column but the last
    placeholders = [str(position) for position in range(len(columns) - 1)]

    # Interleave: user_id, '0', item_id, '1', rating, '2', timestamp
    read_names = []
    for position, column in enumerate(columns):
        read_names.append(column)
        if position < len(placeholders):
            read_names.append(placeholders[position])

    # Read the actual file in, then discard the empty placeholder columns
    raw_df = pd.read_csv(filename, sep=':', header=None, names=read_names)
    interactions_df = raw_df.drop(columns=placeholders)
    return interactions_df

# Locations of the raw MovieLens 1M ratings file and of the cached, already-parsed DataFrame.
movie_dir = os.path.join('datasets', 'ml-1m')
ratings_file = os.path.join(movie_dir, 'ratings.dat')
# The cache is a pickle: pd.read_msgpack / DataFrame.to_msgpack were deprecated in
# pandas 0.25 and removed in 1.0, so a msgpack cache cannot be read or written there.
# NOTE: unpickling can execute arbitrary code; only load a cache file that this
# notebook wrote itself.
interactions_loc = os.path.join(movie_dir, 'interactions.pkl')

start = datetime.now()
try:
    # Fast path: reuse the DataFrame serialized by a previous run
    df = pd.read_pickle(interactions_loc)
    print('{} - Retrieved interactions df.'.format(datetime.now() - start))
except Exception as e:
    # Deliberately best-effort: a missing or unreadable cache is not fatal,
    # we report it and rebuild the DataFrame from the raw ratings file.
    print('Error unpickling {}, reconstructing from ratings.dat: {}'.format(interactions_loc, e))
    df = get_interactions(ratings_file)
    print('{} - Processed interactions from ratings.dat'.format(datetime.now() - start))
    # Write the cache so the next run takes the fast path
    df.to_pickle(interactions_loc)
    print('{} - Serialized interactions to {}'.format(datetime.now() - start, interactions_loc))

0:00:00.092786 - Retrieved interactions df.


In [3]:
# Preview the first five interactions (five rows is the default for head())
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


## Train/Test/Validation Splits
Perform the following operations to prepare our MovieLens interactions data for the recommender system model:
- K-core (preserve only users and items with at least 5 interactions)
- Create temporal columns
- Map user/item IDs to contiguous integer series

In [4]:
'''
INITIALIZE data and train/validation/test splits
'''
from recsys_models.data import process_temporal_columns, kcore, map_user_items, print_basic_stats
from recsys_models.data.sampling import train_test_validation_split, get_user_interactions_df

user_col = 'user_id'
item_col = 'item_id'

# Get temporal columns. The timestamps are seconds since the Unix epoch, so convert the whole
# column in one vectorized call. This yields the same naive UTC datetimes as calling
# datetime.utcfromtimestamp on every row, without the per-row Python overhead.
df['date'] = pd.to_datetime(df['timestamp'], unit='s')
df = process_temporal_columns(df)

# K-core filtering of users and items, with k = cores
cores = 5
df = kcore(df, user_col, item_col, cores)

# User/item ix -> id mappings (the 'u' and 'i' columns used below come from the processed df)
df = map_user_items(df, user_col, item_col)

# Get stats
gc.collect()
print_basic_stats(df, user_col, item_col)
n_users = df[user_col].nunique()
n_items = df[item_col].nunique()

# Create train, validation, and test DFs by holding out the latest interaction per user for test and second-to-last for validation
start = datetime.now()
# NOTE(review): eval_size is forwarded to train_test_validation_split; presumably it caps the
# number of evaluation samples - confirm against the recsys_models documentation.
eval_size = 2000000
train_df, validation_df, test_df, all_int_by_user_df = train_test_validation_split(df, eval_size)
# Mapping of user index -> set of item indices that user has in the training split
train_items_by_user = train_df.groupby(['u'])['i'].agg(lambda x: set(x)).to_dict()
print('{} - Generated train/validation/test splits and user : items dictionary mappings'.format(
    datetime.now() - start
))

0:00:00.434865 - Added proper temporal columns to df
Removing 0/6040 users (0.00 %) and 290/3706 items (7.83 %) from 1000209 total interactions (95.53164% Sparsity)
Removing 0/6040 users (0.00 %) and 0/3416 items (0.00 %) from 999611 total interactions (95.15520% Sparsity)
0:00:00.363996 - Done: 5-core decomposition after 2 iterations
0:00:00.450793 - Mapped u-i indices
0:00:00.845737 - Created "prior" column
6040 Users interacted with 3416 items 993571 times (95.1845% sparsity, 164.499 actions/user, 290.858 actions/item)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 993571 entries, 289422 to 856074
Data columns (total 12 columns):
user_id        993571 non-null int64
item_id        993571 non-null int64
rating         993571 non-null int64
timestamp      993571 non-null int64
date           993571 non-null datetime64[ns]
year           993571 non-null int32
month          993571 non-null int32
day_of_week    993571 non-null int32
day_of_year    993571 non-null int32
u            

In [5]:
# Show the first three rows of the train, validation and test splits, in that order
for split_df in (train_df, validation_df, test_df):
    display(split_df.head(3))

Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior
289422,6040,2384,4,956703954,2000-04-25 23:05:54,2000,4,1,115,6039,323,669
138498,6040,593,5,956703954,2000-04-25 23:05:54,2000,4,1,115,6039,128,323
45032,6040,1961,4,956703977,2000-04-25 23:06:17,2000,4,1,115,6039,41,128


Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior,j
0,2469,3702,4,974160232,2000-11-14 00:03:52,2000,11,1,318,2468,211,217,1586
1,4614,492,4,964143639,2000-07-21 01:40:39,2000,7,4,202,4613,1801,305,2045
2,2260,1923,3,974585842,2000-11-18 22:17:22,2000,11,5,322,2259,290,733,2681


Unnamed: 0,user_id,item_id,rating,timestamp,date,year,month,day_of_week,day_of_year,u,i,prior,j
0,5632,2336,5,959015166,2000-05-22 17:06:06,2000,5,0,142,5631,435,42,27
1,5633,1287,5,959014384,2000-05-22 16:53:04,2000,5,0,142,5632,6,9,1119
2,5903,1617,5,957465792,2000-05-04 18:43:12,2000,5,3,124,5902,246,905,1965


## Training Models
Now we can compute the PopRec baseline (which prefers whichever item is more popular in the training set), then initialize a BPR-MF model with a few parameters, train it, and compare the two:

In [6]:
'''
PopRec Baseline - Pick the more popular item based on training interactions
'''
from recsys_models.models import pop_rec
from recsys_models.data.sampling import sample_unobserved

start = datetime.now()

# Training AUC is scored on a sample drawn from the training interactions.
# len(test_df) is passed as the last argument - presumably the number of
# samples to draw, so the sample matches the test set in size (TODO confirm).
train_sample = sample_unobserved(train_df, train_items_by_user, n_items, len(test_df))
pop_auc_tr = pop_rec(train_df, train_sample)

# Validation and test AUC are scored directly on the held-out frames
pop_auc_v = pop_rec(train_df, validation_df)
pop_auc_t = pop_rec(train_df, test_df)

pop_rec_report = '{} - PopRec:\nTraining AUC:\t\t{:.5f}\nValidation AUC:\t\t{:.5f}\nTesting AUC:\t\t{:.5f}'
print(pop_rec_report.format(datetime.now() - start, pop_auc_tr, pop_auc_v, pop_auc_t))

0:00:06.907555 - PopRec:
Training AUC:		0.84690
Validation AUC:		0.80072
Testing AUC:		0.79488


In [7]:
'''
Run BPR
'''
import tensorflow as tf
from recsys_models.models.bpr import BPR_MF
from recsys_models.pipeline import train_model

# Set training parameters
max_epochs = 200
n_iterations = 1000
batch_size = 512
stopping_threshold = 1e-5

# Get the validation and testing matrices (one row per sample, columns u, i, j)
start = datetime.now()
validation_data = validation_df[['u', 'i', 'j']].values
test_data = test_df[['u', 'i', 'j']].values
print('{} - Generated u-i-j matrices for validation and testing'.format(
    datetime.now() - start
))

# Initialize the graph
tf.reset_default_graph()
model = BPR_MF(n_users, n_items, k=5, lambda_emb=1e-4, lambda_bias=1e-4,
               opt_type=tf.contrib.opt.LazyAdamOptimizer, opt_args={'learning_rate': 0.007})
print('\n=== BEGIN Optimization for {} ==='.format(model.model_id))
print('    {} Max epochs, with early stoppage at {} Validation AUC change'.format(max_epochs, stopping_threshold))
print('    {} Iterations per epoch with {}-sized batches'.format(n_iterations, batch_size))

# Open the session as a context manager so that it is closed even when
# training or saving raises; previously an exception leaked the session.
with tf.Session() as session:
    # Initialize graph weights
    session.run(tf.global_variables_initializer())

    # Train the model!
    model, train_auc, validation_auc, test_auc = train_model(
        session, model, train_df, validation_data, test_data,
        n_iterations=n_iterations, batch_size=batch_size,
        min_epochs=10, max_epochs=max_epochs,
        stopping_threshold=stopping_threshold,
        sample_columns=['u', 'i'], column_order=['u', 'i', 'j'],
        n_items=n_items, items_by_user=train_items_by_user
    )

    # Save model
    suffix = '_ml-1m'
    full_model_id = '{}{}'.format(model.model_id, suffix)
    model_folder = os.path.join('tf_models', full_model_id)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(model_folder, exist_ok=True)
    model.save(session, 'tf_models', suffix=suffix)
    print('{} - Saved model to {}'.format(
        datetime.now() - start, model_folder
    ))

# Cleanup (the session itself is closed by the with-block above)
gc.collect()
print()

0:00:00.075769 - Generated u-i-j matrices for validation and testing

=== BEGIN Optimization for bpr-mf_5k_0.0001l2_0.0001l2bias ===
    200 Max epochs, with early stoppage at 1e-05 Validation AUC change
    1000 Iterations per epoch with 512-sized batches
0:00:00.267288 - Prior: 0.50051 Validation AUC, 0.50301 Testing AUC
[0:00:03.625273 - Epoch 1] 0.54022 Loss, 0.85301 Training AUC, 0.79636 Validation AUC (0.29585 Change)
[0:00:06.588379 - Epoch 2] 0.44859 Loss, 0.85319 Training AUC, 0.79956 Validation AUC (0.00320 Change)
[0:00:09.493608 - Epoch 3] 0.43666 Loss, 0.86849 Training AUC, 0.81308 Validation AUC (0.01352 Change)
[0:00:12.403825 - Epoch 4] 0.42238 Loss, 0.88661 Training AUC, 0.82824 Validation AUC (0.01516 Change)
[0:00:15.316034 - Epoch 5] 0.41697 Loss, 0.89285 Training AUC, 0.83717 Validation AUC (0.00893 Change)
[0:00:18.308031 - Epoch 6] 0.41347 Loss, 0.89770 Training AUC, 0.84078 Validation AUC (0.00361 Change)
[0:00:21.266120 - Epoch 7] 0.41205 Loss, 0.89975 Training

## Loading Weights from Pretrained Model
We can load weights from another model and initialize the weight matrices.

This lets us evaluate existing models using the RecSysModels framework:

In [8]:
'''
Retrieve pretrained weights and evaluate with model
'''
start = datetime.now()

# Start from a fresh default graph so the reloaded model does not collide with earlier graph state
tf.reset_default_graph()

# Open the session as a context manager so that it is closed even when
# loading or evaluation raises; previously an exception leaked the session.
with tf.Session() as session:
    # Retrieve the model we just trained
    model2 = BPR_MF.load(model_folder)
    session.run(tf.global_variables_initializer())

    # Evaluate on the test data:
    test_auc_2 = model2.evaluate_auc(session, test_data)
    print('{} - Pretrained model from {} evaluated on test data, with AUC: {:.5f}'.format(
        datetime.now() - start,
        model_folder,
        test_auc_2
    ))

# Cleanup (the session itself is closed by the with-block above)
gc.collect()
print()

0:00:00.392998 - Pretrained model from tf_models\bpr-mf_5k_0.0001l2_0.0001l2bias_ml-1m evaluated on test data, with AUC: 0.84519

