In [1]:
from lightfm.data import Dataset
import os
import dask
from distributed import Client

import dask.dataframe as dd

from lightfm import LightFM

from lightfm.evaluation import precision_at_k


import numpy as np

In [2]:
# Open the training split (Parquet, read through the pyarrow engine) as a dask DataFrame.
train_data = dd.read_parquet('~/als_train_set.parquet', engine='pyarrow')

# Empty LightFM Dataset; the user/item ids are registered by dataset.fit(...) in a later cell.
dataset = Dataset()

# Last expression of the cell: display the dask DataFrame summary (columns, dtypes, partitions).
train_data

Unnamed: 0_level_0,user_id,rmsid_int,ratings
npartitions=25,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,int32,int32,int64
,...,...,...
...,...,...,...
,...,...,...
,...,...,...


In [3]:
%time dataset.fit(train_data['user_id'], train_data['rmsid_int'])

CPU times: user 1min 2s, sys: 2.74 s, total: 1min 4s
Wall time: 1min 17s


In [4]:
# Dimensions of the interaction matrix as known to the fitted Dataset: (users, items).
num_users, num_items = dataset.interactions_shape()

shape_summary = 'Num users: {}, num_items {}.'.format(num_users, num_items)
print(shape_summary)

Num users: 7909, num_items 20284117.


In [5]:
%time (interactions, weights) = dataset.build_interactions([(x[1]['user_id'], x[1]['rmsid_int'], x[1]['ratings']) for x in train_data.iterrows()])
print(repr(interactions))

CPU times: user 26min 46s, sys: 10.8 s, total: 26min 56s
Wall time: 27min 9s
<7909x20284117 sparse matrix of type '<class 'numpy.int32'>'
	with 50182469 stored elements in COOrdinate format>


In [6]:
# Display the weights matrix, i.e. the second value returned by dataset.build_interactions(...)
# for the training split.
weights

<7909x20284117 sparse matrix of type '<class 'numpy.float32'>'
	with 50182469 stored elements in COOrdinate format>

In [7]:
# Show the repr string of the training interactions matrix (the same text that the
# build_interactions cell already prints).
repr(interactions)

"<7909x20284117 sparse matrix of type '<class 'numpy.int32'>'\n\twith 50182469 stored elements in COOrdinate format>"

In [8]:
model = LightFM(loss='warp', no_components=25,  item_alpha=10e-6, user_alpha=10e-6, learning_rate=0.0001)
%time model.fit(interactions)
model

CPU times: user 2min 12s, sys: 3.05 s, total: 2min 15s
Wall time: 2min 16s


<lightfm.lightfm.LightFM at 0x15384f78fdf0>

In [9]:
import pickle

# Serialise the trained model to disk using the highest pickle protocol available.
with open('savemodel.pickle', mode='wb') as model_file:
    pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
val_data = dd.read_parquet('~/als_val_set.parquet', engine='pyarrow')

%time (interactions_val, _) = dataset.build_interactions([(x[1]['user_id'], x[1]['rmsid_int'], x[1]['ratings']) for x in val_data.iterrows()])

CPU times: user 7min 1s, sys: 3.29 s, total: 7min 4s
Wall time: 7min 6s


In [11]:
# Reload the model that was pickled to 'savemodel.pickle'.
# pickle.load can execute arbitrary code, so only load files produced by a trusted source.
with open('savemodel.pickle', mode='rb') as model_file:
    model_new = pickle.load(model_file)

In [None]:
%time np.mean(precision_at_k(model, test_interactions=interactions_val, k=100, num_threads=14))

In [None]:
%time np.mean(precision_at_k(model_new, test_interactions=interactions_val, k=100, num_threads=14))

In [None]:
test_data = dd.read_parquet('~/als_test_set.parquet', engine='pyarrow')

%time (interactions_test, _) = dataset.build_interactions([(x[1]['user_id'], x[1]['rmsid_int'], x[1]['ratings']) for x in test_data.iterrows()])

In [None]:
%time np.mean(precision_at_k(model, test_interactions=interactions_test, k=100, num_threads=14))