## Application of WALS on Movielens

In [20]:
#%load_ext watermark
%load_ext autoreload 
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import os
import sys
import numpy as np
import pandas as pd
from pybpr import *
import matplotlib.pyplot as plt
from functools import partial
import itertools
import scipy.sparse as ss
import pathos.multiprocessing as mp
import tqdm

In [22]:
%%time
#df = load_movielens_data('ml-1m')
df = load_movielens_data('ml-100k')
df.head()

CPU times: user 21.5 ms, sys: 2.22 ms, total: 23.8 ms
Wall time: 26 ms


Unnamed: 0,user_id,item_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [23]:
%%time
dfshort = df[df['rating'] > 0]
cf = UserItemInteractions(
    name='MovieLens-1m',
    users = dfshort['user_id'], 
    items = dfshort['item_id'],
    min_num_rating_per_user=30,
    min_num_rating_per_item=10
)
cf.print_memory_usage()

--- Memory usage for MovieLens-1m:
Sparse User-Item matrix = 0.09 MB
User-Item dataframe df = 2.85 MB
Item dataframe df_item = 0.03 MB
Item dataframe df_user = 0.02 MB
---
CPU times: user 30.3 ms, sys: 1.53 ms, total: 31.8 ms
Wall time: 30.9 ms


In [24]:
%%time
cf.generate_train_test(user_test_ratio = 0.2)

CPU times: user 1.26 s, sys: 1.8 ms, total: 1.26 s
Wall time: 1.26 s


In [25]:
wals = MF_WALS(
    num_features=10, 
    reg_lambda=0., 
    ncores=1, 
    weighting_strategy='same', 
    num_iters=10, 
    initial_std=0.1, 
    seed=None
)

In [26]:
%%time
wals.fit(cf.R_train)

CPU times: user 6min 10s, sys: 9min 55s, total: 16min 5s
Wall time: 26.8 s


In [27]:
als = MF_ALS(
    num_features=10, 
    reg_lambda=0., 
    num_iters=10, 
    initial_std=0.1, 
    seed=None
)

In [28]:
als.fit(cf.R_train)

In [29]:
als_ndcg_score = cf.get_ndcg_metric(
    user_mat = als.user_mat,
    item_mat=als.item_mat,
    num_items = 10,
    test=True,
    truncate=True
)
wals_ndcg_score = cf.get_ndcg_metric(
    user_mat = wals.user_mat,
    item_mat=wals.item_mat,
    num_items = 10,
    test=True,
    truncate=True
)

In [30]:
als_ndcg_score, wals_ndcg_score

(0.44989681124501735, 0.448821480128063)

## ALS = WALS when weighting negative and positive examples same

## Effect of num of features on ndcg

In [13]:
list_of_num_features = np.arange(1,50,50)
list_of_wgt_strategies = ['same', 'uniform', 'user-oriented', 'item-oriented']
list_of_ndcg_num_items = [5]
list_of_num_iters = [2]
list_of_reg_lambda = [0.]
iter_list = list(itertools.product(list_of_num_features, 
                                   list_of_wgt_strategies,
                                   list_of_ndcg_num_items,
                                   list_of_num_iters,
                                   list_of_reg_lambda))
columns =['num_features', 'wgt_strategy', 'ndcg_num_items', 'num_iters','reg_lambda']
df = pd.DataFrame(list(iter_list), columns=columns) 
# df['ndcg_test'] = np.nan
# df['ndcg_train'] = np.nan
# df['wall_time'] = np.nan
print(df.shape)

(4, 7)


In [14]:
# for idx, irow in df.iterrows():
#     print(irow)

In [15]:
def get_ndcg_wals(irow, icf):
    idx, idict = irow
    print(idx, end="-", flush=True)
    wals = MF_WALS(
        num_features=idict['num_features'], 
        reg_lambda=idict['reg_lambda'],  
        weighting_strategy=idict['wgt_strategy'], 
        num_iters= idict['num_iters'], 
        initial_std=0.1, 
        seed=None
    )
    wals.fit(icf.R_train)
    wals_ndcg_func = partial(
        icf.get_ndcg_metric, 
        user_mat = wals.user_mat, 
        item_mat= wals.item_mat, 
        num_items = idict['ndcg_num_items']
    )
    out_dict = {}
    out_dict['ndcg_test'] = wals_ndcg_func(test=True)
    out_dict['ndcg_train'] = wals_ndcg_func(test=False)
    return out_dict

In [16]:
%%time
results = []
for irow in df.iterrows():
    results.append(get_ndcg_wals(irow, icf=cf))

0-1-2-3-CPU times: user 3min 42s, sys: 6min 7s, total: 9min 50s
Wall time: 19.4 s


In [None]:
%%time
with mp.ProcessPool(4) as p:
    results = tqdm.tqdm(p.imap(lambda irow: get_ndcg_wals(irow, icf=cf), df.iterrows()), total=df.shape[0])

In [17]:
df = pd.concat([df,pd.DataFrame(results)], axis=1)

In [18]:
df

Unnamed: 0,num_features,wgt_strategy,ndcg_num_items,num_iters,reg_lambda,ndcg_test,ndcg_train,ndcg_test.1,ndcg_train.1
0,1,same,5,2,0.0,,,0.311646,0.491332
1,1,uniform,5,2,0.0,,,0.311225,0.49512
2,1,user-oriented,5,2,0.0,,,0.262617,0.415504
3,1,item-oriented,5,2,0.0,,,0.313189,0.498886


In [19]:
df.to_csv(os.path.join(os.path.curdir, 'output', 'wals_results.csv'))