In [2]:
from lightfm import LightFM
import scipy.sparse as sp
from scipy.sparse import vstack
import pickle
import numpy as np
import pandas as pd
import random
from scipy.sparse import csr_matrix



In [3]:
class recommender:
    """Serve item scores from a pickled model and a pickled item-feature matrix.

    Both artefacts are loaded once, at construction time, and kept on the
    instance as ``model`` and ``item_to_property_matrix_sparse``.
    """

    def __init__(self,file_model,item_prop,n_items):
        """Load the model and the item-feature matrix from disk.

        Parameters
        ----------
        file_model : path of the pickle file holding the trained model.
        item_prop : path of the pickle file holding the item-feature matrix.
        n_items : number of item indices to score in ``get_predictions``.

        Raises
        ------
        OSError
            If either file cannot be opened.
        """
        self.n_items=n_items
        self.file_model=file_model
        self.item_prop=item_prop
        # NOTE(review): pickle.load can execute arbitrary code embedded in the
        # file -- only point this class at pickles from a trusted source.
        # The `with` blocks guarantee both handles are closed, even if
        # unpickling raises.
        with open(self.file_model,'rb') as store_model:
            self.model=pickle.load(store_model)
        with open(self.item_prop,'rb') as store_item_prop:
            self.item_to_property_matrix_sparse=pickle.load(store_item_prop)

    def get_predictions(self,user_id,num_threads=4):
        """Score every item index in ``[0, n_items)`` for one user.

        Parameters
        ----------
        user_id : user identifier forwarded unchanged to ``model.predict``.
        num_threads : thread count forwarded to ``model.predict`` (default 4).

        Returns
        -------
        Whatever ``model.predict`` returns for the full range of item indices.
        """
        pid_array = np.arange(self.n_items, dtype=np.int32)
        predictions = self.model.predict(
            user_id,
            pid_array,
            item_features=self.item_to_property_matrix_sparse,
            num_threads=num_threads,
        )
        return predictions
        
        
        
        
        

In [4]:
def _select_active_visitors(events):
    """Return the visitor ids whose count of tracked events is not exactly one."""
    tracked_events = {'view', 'addtocart', 'transaction'}
    activity = {}
    for visitor, event in zip(events['visitorid'], events['event']):
        # Untracked event types still register the visitor, with zero activity.
        activity[visitor] = activity.get(visitor, 0) + (event in tracked_events)
    return {visitor for visitor, count in activity.items() if count != 1}


def _build_interaction_matrix(events, active_visitors, action_weights):
    """Build the visitor x item CSR matrix and the itemid -> column index map."""
    view_weight, cart_weight, purchase_weight = action_weights
    weight_by_event = {
        'view': view_weight,
        'addtocart': cart_weight,
        'transaction': purchase_weight,
    }
    n_users = len(active_visitors)
    n_items = len(set(events['itemid']))
    interactions = sp.dok_matrix((n_users, n_items), dtype=np.int8)

    visitor_index = {}
    item_index = {}
    for visitor, item, event in zip(events['visitorid'], events['itemid'], events['event']):
        # Every item gets a column, in order of first appearance, even when the
        # row's visitor is filtered out.
        item_col = item_index.setdefault(item, len(item_index))
        if visitor not in active_visitors:
            continue
        visitor_row = visitor_index.setdefault(visitor, len(visitor_index))
        weight = weight_by_event.get(event, 0)
        # Keep only the strongest interaction seen for each (visitor, item) pair.
        if weight > interactions[visitor_row, item_col]:
            interactions[visitor_row, item_col] = weight
    return interactions.tocsr(), item_index


def _build_item_feature_matrix(items, item_index, max_distinct_values):
    """Build the one-hot item x property CSR matrix, rows ordered by item index."""
    interacted_items = set(item_index)
    items_with_property = set(items.itemid)
    item_properties = items[items.itemid.isin(interacted_items)]

    # Items without any property row get a placeholder (property 888, value 0)
    # so that every item index has a row in the pivot below.
    missing = [item for item in item_index if item not in items_with_property]
    placeholder_rows = pd.DataFrame(
        {
            'itemid': missing,
            'timestamp': [0] * len(missing),
            'property': [888] * len(missing),
            'value': [0] * len(missing),
        },
        columns=item_properties.columns.values,
    )
    item_properties = pd.concat([item_properties, placeholder_rows])
    item_properties['itemid'] = item_properties['itemid'].map(item_index)

    # Newest timestamp first, so drop_duplicates keeps the latest value of each
    # (item, property) pair.
    latest = (
        item_properties
        .sort_values('timestamp', ascending=False)
        .drop_duplicates(['itemid', 'property'])
        .sort_values(by='itemid')
    )
    item_to_property_matrix = latest.pivot(index='itemid', columns='property', values='value')

    # Low-cardinality properties only; the reversed column order is kept so the
    # feature columns line up with matrices produced by earlier versions.
    useful_cols = [
        col for col in item_to_property_matrix.columns[::-1]
        if item_to_property_matrix[col].nunique() < max_distinct_values
    ]
    one_hot = pd.get_dummies(item_to_property_matrix[useful_cols])
    return csr_matrix(one_hot.values)


def retrain(events,items,action_weights=(1, 2, 3),max_distinct_values=50):
    """Rebuild the interaction and item-feature matrices from raw event data.

    Parameters
    ----------
    events : DataFrame with ``visitorid``, ``itemid`` and ``event`` columns.
    items : DataFrame with ``itemid``, ``timestamp``, ``property`` and
        ``value`` columns.
    action_weights : three weights for 'view', 'addtocart' and 'transaction'
        events, in that order. Other event types weigh 0. Values are stored
        in an int8 matrix.
    max_distinct_values : a property is kept as a feature only when it has
        fewer than this many distinct non-null values.

    Returns
    -------
    (user_to_item_matrix, item_to_property_matrix_sparse)
        ``user_to_item_matrix`` is an int8 CSR matrix holding, per
        (visitor, item) pair, the largest weight among that pair's events.
        Visitors whose tracked-event count is exactly one are excluded.
        Visitor rows and item columns are numbered by first appearance in
        ``events``. ``item_to_property_matrix_sparse`` is a CSR one-hot
        encoding of the retained properties, with rows in item-index order.
    """
    active_visitors = _select_active_visitors(events)
    user_to_item_matrix, item_index = _build_interaction_matrix(
        events, active_visitors, action_weights
    )
    item_to_property_matrix_sparse = _build_item_feature_matrix(
        items, item_index, max_distinct_values
    )
    return (user_to_item_matrix,item_to_property_matrix_sparse)

In [5]:
def make_train(ratings, pct_test = 0.2, seed = 0):
    """Mask a random share of the stored interactions to form a training set.

    Parameters
    ----------
    ratings : sparse user x item matrix of interactions; it is not modified.
    pct_test : fraction of the nonzero entries to hide from the training set.
        The number hidden is rounded up.
    seed : seed for the private random generator that picks the hidden
        entries. The default of 0 selects the same entries as seeding the
        global ``random`` module with 0.

    Returns
    -------
    (training_set, test_set, altered_users)
        ``training_set`` is a copy of ``ratings`` with the sampled entries
        removed. ``test_set`` is a copy of ``ratings`` with every nonzero
        entry set to 1. ``altered_users`` lists the distinct row indices that
        lost at least one entry.
    """
    test_set = ratings.copy()
    test_set[test_set != 0] = 1  # binary preference matrix
    training_set = ratings.copy()

    nonzero_rows, nonzero_cols = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_rows, nonzero_cols))

    # A private generator keeps the split reproducible without reseeding the
    # process-wide `random` state that other code may rely on.
    rng = random.Random(seed)
    num_samples = int(np.ceil(pct_test*len(nonzero_pairs)))
    samples = rng.sample(nonzero_pairs, num_samples)

    user_inds = [pair[0] for pair in samples]
    item_inds = [pair[1] for pair in samples]
    if samples:
        training_set[user_inds, item_inds] = 0
    # Drop zeros from the sparse storage after the update.
    training_set.eliminate_zeros()
    return training_set, test_set, list(set(user_inds))

In [6]:
# Build a recommender from the pickled model and item-feature matrix found in
# the working directory. 88000 is passed as n_items, so get_predictions scores
# item indices 0..87999.
r=recommender('model.pickle','item_to_property_matrix_sparse.pickle',88000)

In [7]:
# Open the pickled model file for binary reading; the handle `a` is read by the
# following cell.
a=open('model.pickle','rb')

In [8]:
# Load a standalone copy of the pickled model from the handle `a`, then close
# the handle so the file is not left open for the rest of the session.
# NOTE(review): pickle.load can execute arbitrary code -- load trusted files only.
try:
    model=pickle.load(a)
finally:
    a.close()

In [9]:
# Score every item index for user 12345; the cell output is the resulting
# array of scores, one per item index.
r.get_predictions(12345)

array([ 0.72322857,  1.51320255, -0.35197493, ...,  1.18143356,
        0.82504654,  0.06508896])