In [2]:
# Importing libraries in python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lightfm import LightFM
from lightfm.evaluation import precision_at_k,auc_score,reciprocal_rank
import scipy
import time
import math
from lightfm.data import Dataset
import warnings
warnings.filterwarnings('ignore')



In [1]:
# Loading data

users = pd.read_csv('users_tag.csv',index_col=0)
business = pd.read_csv('business_cj.csv',index_col=0)
review = pd.read_csv('IL_review.csv',index_col=0)

## Check Sparsity

In [None]:
#For recommender system you want to see which user bought which item or which user rated which item.
#In practical scenario users do not rate/ buy every item, large number of users are concentrated on few items 
# and hence a certain amount of items are untouched by users.

# Since we do not have any action of customers on some items while we have it on certain other items, 
# this emptiness of interaction is called sparsity problem.

n_users = review.user_id.unique().shape[0]
n_items = review.business_id.unique().shape[0]

print('Number of users: {}'.format(n_users))
print('Number of models: {}'.format(n_items))
print('Sparsity: {:4.3f}%'.format(float(review.shape[0]) / float(n_users*n_items) * 100))

In [479]:
# checking if there are duplicate reviews
a = review.groupby(['business_id','user_id']).agg(['count']).reset_index()

In [480]:
# computing users with more than one review

tmp = a.useful.sort_values(by = 'count',ascending = False).reset_index()
tmp[tmp['count'] >1].shape

(1081, 2)

### 1081 users have multiple review for one business 

In [481]:
multiple_reviewers = tmp[tmp['count']>1]['index']

In [482]:
multiple_reviewers

0       26917
1        5843
2        5233
3       22282
4          69
        ...  
1076    16741
1077     5210
1078    17853
1079     5204
1080    27569
Name: index, Length: 1081, dtype: int64

In [483]:
# removing duplicate entries and taking latest entries of users for a particular business
for ind in multiple_reviewers:
    tmp = review[(review.user_id == a.loc[ind, 'user_id'][0]) & 
                 (review.business_id == a.loc[ind, 'business_id'][0])].sort_values(by = 'date', ascending = True)
    review.drop(tmp.index[0:len(tmp.index)-1],inplace=True)

In [484]:
# checking if duplicate values have been removed
b = review.groupby(['business_id','user_id']).agg(['count']).reset_index()
tmp = b.useful.sort_values(by = 'count',ascending = False).reset_index()
tmp[tmp['count'] >1].shape

(0, 2)

In [485]:
# the review_stars contains bias, thus needs to be normalized by subtracting the average stars from it and 
# make negative stars to -1 and positive stars to +1

user_ind = review.user_id.unique()
for ind in user_ind:
    this_avg=users.average_stars[users.user_id == ind]
    temp = review.stars[review.user_id == ind] - float(this_avg)
    temp[temp>0] = 1
    temp[temp<0] = -1
    review.stars[review.user_id == ind] = temp


In [486]:
review.stars.unique()

array([-1.,  1.,  0.])

In [487]:
# The first thing we need to do is to create a mapping between the user id and business ids from our input data 
# This mapping will provide indices that will be used internally by our model.

# model establishment
dataset = Dataset()
dataset.fit(review.user_id,review.business_id)
num_users, num_items = dataset.interactions_shape()
print('Num users: {}, num_items {}.'.format(num_users, num_items))

Num users: 12859, num_items 845.


In [488]:
# fit item and user features.
dataset.fit_partial(items=business.business_id,
                    item_features=['stars'])
dataset.fit_partial(items=business.business_id,
                    item_features=['review_count'])


In [507]:
business.category #=business.drop(columns='category')

0      Ethnic Food, Food Trucks, Specialty Food, Impo...
1      Restaurants, Diners, Sandwiches, Breakfast & B...
2                        Hot Dogs, Restaurants, Barbeque
3                             Donuts, Food, Coffee & Tea
4                                     Pizza, Restaurants
                             ...                        
840            Sushi Bars, Chinese, Buffets, Restaurants
841      Fast Food, Restaurants, Tex-Mex, Mexican, Tacos
842                                 Italian, Restaurants
843                 Fast Food, Restaurants, Pizza, Salad
844                   Sandwiches, Fast Food, Restaurants
Name: category, Length: 845, dtype: object

In [508]:
# We have categories from column number 44
business.columns[45:]

Index(['Fast Food', 'Nightlife', 'American (Traditional)', 'Bars', 'Pizza',
       'Sandwiches', 'Burgers', 'Coffee & Tea', 'Mexican', 'Chinese',
       'American (New)', 'Breakfast & Brunch', 'Grocery', 'Bakeries',
       'Specialty Food', 'Italian', 'Shopping', 'Ice Cream & Frozen Yogurt',
       'Event Planning & Services', 'Desserts', 'Salad', 'Chicken Wings',
       'Barbeque', 'Delis', 'Cafes', 'Asian Fusion', 'Korean', 'Sushi Bars',
       'Beer', 'Wine & Spirits', 'Caterers', 'Sports Bars', 'Japanese',
       'Food Trucks', 'Thai', 'Drugstores', 'Convenience Stores', 'Seafood',
       'Steakhouses', 'Pubs', 'Juice Bars & Smoothies', 'Diners',
       'Mediterranean', 'Indian', 'Hot Dogs', 'Arts & Entertainment',
       'Bubble Tea', 'Donuts', 'Lounges', 'Venues & Event Spaces',
       'Flowers & Gifts', 'Fashion', 'Food Delivery Services', 'Vegetarian',
       'Ethnic Food', 'Tex-Mex', 'Breweries', 'Beauty & Spas',
       'Department Stores', 'Automotive', 'Bagels'],
      dtype

In [509]:
tar_cols = [x for x in business.columns[45:]]

dataset.fit_partial(items = business.business_id,
                   item_features = tar_cols) 

In [510]:
tar_cols

['Fast Food',
 'Nightlife',
 'American (Traditional)',
 'Bars',
 'Pizza',
 'Sandwiches',
 'Burgers',
 'Coffee & Tea',
 'Mexican',
 'Chinese',
 'American (New)',
 'Breakfast & Brunch',
 'Grocery',
 'Bakeries',
 'Specialty Food',
 'Italian',
 'Shopping',
 'Ice Cream & Frozen Yogurt',
 'Event Planning & Services',
 'Desserts',
 'Salad',
 'Chicken Wings',
 'Barbeque',
 'Delis',
 'Cafes',
 'Asian Fusion',
 'Korean',
 'Sushi Bars',
 'Beer',
 'Wine & Spirits',
 'Caterers',
 'Sports Bars',
 'Japanese',
 'Food Trucks',
 'Thai',
 'Drugstores',
 'Convenience Stores',
 'Seafood',
 'Steakhouses',
 'Pubs',
 'Juice Bars & Smoothies',
 'Diners',
 'Mediterranean',
 'Indian',
 'Hot Dogs',
 'Arts & Entertainment',
 'Bubble Tea',
 'Donuts',
 'Lounges',
 'Venues & Event Spaces',
 'Flowers & Gifts',
 'Fashion',
 'Food Delivery Services',
 'Vegetarian',
 'Ethnic Food',
 'Tex-Mex',
 'Breweries',
 'Beauty & Spas',
 'Department Stores',
 'Automotive',
 'Bagels']

In [511]:
users.columns

Index(['user_id', 'name', 'review_count', 'yelping_since', 'useful', 'funny',
       'cool', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos', 'is_elite', 'year', 'Shopping', 'Hot Dogs',
       'Cafes', 'American (New)', 'Vegetarian', 'American (Traditional)',
       'Beer', 'Delis', 'Grocery', 'Salad', 'Fast Food',
       'Ice Cream & Frozen Yogurt', 'Donuts', 'Breakfast & Brunch',
       'Chicken Wings', 'Burgers', 'Bakeries', 'Indian', 'Convenience Stores',
       'Bubble Tea', 'Steakhouses', 'Thai', 'Mexican', 'Mediterranean',
       'Food Delivery Services', 'Sushi Bars', 'Coffee & Tea',
       'Event Planning & Services', 'Chinese', 'Sandwiches', 'Seafood',
       'Specialty Food', 'Barbeque', 'Nightlife', 'Japanese', 'Desserts',
       'Arts &

In [512]:
user_cols = [x for x in users.drop(columns = ['user_id', 'name', 'yelping_since', 'elite', 'friends', 'fans', 'average_stars', 'compliment_hot',
       'compliment_more', 'compliment_profile', 'compliment_cute',
       'compliment_list', 'compliment_note', 'compliment_plain',
       'compliment_cool', 'compliment_funny', 'compliment_writer',
       'compliment_photos', 'year',]).columns]

In [513]:
dataset.fit_partial(users = users.user_id,
                    user_features = user_cols)

In [514]:
# Check data
print(type(dataset))
print(dataset.model_dimensions())
print(dataset.user_features_shape())
print(dataset.item_features_shape())
print(dataset.interactions_shape())

<class 'lightfm.data.Dataset'>
(12909, 909)
(12859, 12909)
(845, 909)
(12859, 845)


In [515]:
# look at item feature mapping
a = dataset.mapping()[3]
list(a.items())[0:10]

[('9A1C1f0m4nQltQrOOTl-Kw', 0),
 ('VHsNB3pdGVcRgs6C3jt6Zg', 1),
 ('Ah4i15g8Ow_zphzcpulTxQ', 2),
 ('9MnbQg7kfb_WgxoV0hXKSQ', 3),
 ('t_yiQnxUDdPPCN2z4QyezA', 4),
 ('-fiUXzkxRfbHY9TKWwuptw', 5),
 ('NEVA0IYbawceL6kz5v5DAw', 6),
 ('RwMlwusAtxZc5a3ZYduulg', 7),
 ('ObNQVg_ohRVLex4ppmMC5w', 8),
 ('-Jhlh8Scjy669NdtCfKSSg', 9)]

In [516]:
#build interaction
(interactions, weights) = dataset.build_interactions([(x['user_id'], x['business_id'], 
                                                       x['stars']) for index,x in review.iterrows()])

In [517]:
print(repr(interactions))

<12859x845 sparse matrix of type '<class 'numpy.int32'>'
	with 34223 stored elements in COOrdinate format>


In [518]:
def build_dict(df,tar_cols,val_list):
    rst = {}
    for col in tar_cols:
        rst[col] = df[col]
    sum_val = sum(list(rst.values())) # get sum of all the tfidf values
    
    if(sum_val == 0):
        return rst
    else:
        
        w = (2-sum(val_list))/sum_val # weight for each tag to be able to sum to 1
        for key,value in rst.items():
            rst[key] = value * w
    return rst

# get max of each column to regularize value to [0,1]
max_star = max(business.stars)
max_b_review_count = max(business.review_count)
print('maximum business review count')
print(max_b_review_count)

# give CF info weight 0.5, all other 0.5. Then in others, give (star, review count) 0.25 and tags 0.25
item_features = dataset.build_item_features([(x['business_id'], 
                                              {'stars':0.5*x['stars']/max_star,
                                               'review_count':0.5*x['review_count']/max_b_review_count,
                                               **build_dict(x,tar_cols,[0.5*x['stars']/max_star,
                                                           0.5*x['review_count']/max_b_review_count])})
                                              for index,x in business.iterrows()])

max_u_review_count = max(users.review_count)
max_useful = max(users.useful)
print('maximum user review count')
print(max_u_review_count)
user_features = dataset.build_user_features(((x['user_id'],
                                             {'review_count':0.35*x['review_count']/max_u_review_count,'is_elite':0.35*int(x['is_elite']),
                                              'useful':0.35*x['useful']/max_useful,
                                             **build_dict(x,user_cols,[0.35*x['review_count']/max_u_review_count,
                                                                            0.35*int(x['is_elite']),
                                                                            0.35*x['useful']/max_useful])})
                                           for index, x in users.iterrows()))

print(repr(item_features))
print(item_features.shape)

print(repr(user_features))
print(user_features.shape)

maximum business review count
6.723832440821209
maximum user review count
8.482394685873542
<845x909 sparse matrix of type '<class 'numpy.float32'>'
	with 54080 stored elements in Compressed Sparse Row format>
(845, 909)
<12859x12909 sparse matrix of type '<class 'numpy.float32'>'
	with 655809 stored elements in Compressed Sparse Row format>
(12859, 12909)


In [519]:

# check features to see if weights make sense
idx = 3
tt = list(item_features[idx].nonzero())
print(item_features[idx].shape)
print(tt)

uu = list(user_features[idx].nonzero())
print(user_features[idx].shape)
uu

(1, 909)
[array([0, 0, 0, 0, 0, 0, 0, 0]), array([  3, 845, 846, 849, 850, 851, 870, 887])]
(1, 12909)


[array([0, 0, 0, 0]), array([    3, 12859, 12860, 12869])]

In [520]:

tt = item_features[idx].todense()
import pandas as pd
tt = pd.DataFrame(tt)
print(list(map(set,tt.values)))# feature weight sum to 1, so if two are nonzero, each take weight 0.5. 

uu = user_features[idx].todense()
uu = pd.DataFrame(uu)
list(map(set,uu.values))# feature weight sum to 1, so if two are nonzero, each take weight 0.5.

[{0.0, 0.33333334, 0.07, 0.14999999, 0.16666667}]


[{0.0, 0.114341676, 0.23776683, 0.29328063, 0.35461086}]

In [521]:
tt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,899,900,901,902,903,904,905,906,907,908
0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [522]:
uu


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12899,12900,12901,12902,12903,12904,12905,12906,12907,12908
0,0.0,0.0,0.0,0.354611,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [523]:
#train-test split

seed = 1001
from lightfm.cross_validation import random_train_test_split
train,test=random_train_test_split(interactions,test_percentage=0.2,random_state=np.random.RandomState(seed))

print('The dataset has %s users and %s items, '
      'with %s interactions in the test and %s interactions in the training set.'
      % (train.shape[0], train.shape[1], test.getnnz(), train.getnnz()))

train.multiply(test).nnz == 0 # make sure train and test are truly disjoint

The dataset has 12859 users and 845 items, with 6845 interactions in the test and 27378 interactions in the training set.


True

In [524]:

from lightfm import LightFM

# Set the number of threads; you can increase this if you have more physical cores available.

## These parameters are obtained after tunning for specific loss through skopt in the bottom chunk
NUM_THREADS = 25
NUM_COMPONENTS = 43    
NUM_EPOCHS = 18
ITEM_ALPHA = 2.88752e-6
learning_rate=0.06652
k = 5 # for precision at k

## Pure Collaborative Filtering models

# Logistic loss
model_1 = LightFM(loss='logistic',random_state=seed,
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS,
               learning_rate=learning_rate)

# time it.
%time model_1 = model.fit(train,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)


# Import the evaluation routines
from lightfm.evaluation import auc_score

# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

test_auc = auc_score(model, test,num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

from lightfm.evaluation import precision_at_k,recall_at_k

print("Train precision: %.4f" % precision_at_k(model, train, k=k,num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model, test,train_interactions=train, k=k,num_threads=NUM_THREADS).mean())

Wall time: 3.48 s
Collaborative filtering train AUC: 0.8142713
Collaborative filtering test AUC: 0.805228
Train precision: 0.0429
Test precision: 0.0320


In [525]:

NUM_THREADS = 25
NUM_COMPONENTS = 45    
NUM_EPOCHS = 43
ITEM_ALPHA = 1.3846e-6
learning_rate=0.0161445

# BPR loss
model = LightFM(loss='bpr',random_state=seed,
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS,
               learning_rate=learning_rate)

# time it.
%time model = model.fit(train,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)


# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

test_auc = auc_score(model, test,num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

print("Train precision: %.4f" % precision_at_k(model, train, k=k,num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model, test,train_interactions=train, k=k,num_threads=NUM_THREADS).mean())

Wall time: 14.2 s
Collaborative filtering train AUC: 0.85738117
Collaborative filtering test AUC: 0.5631318
Train precision: 0.1526
Test precision: 0.0165


# Here we can see that the loss function bpr is showing serious overfitting with drop in accuracy of the train and test

In [526]:
NUM_THREADS = 25
NUM_COMPONENTS = 21    
NUM_EPOCHS = 16
ITEM_ALPHA = 5.97967e-6
learning_rate=0.033

# Let's fit a WARP model
model = LightFM(loss='warp',random_state=seed,
                item_alpha=ITEM_ALPHA,
               no_components=NUM_COMPONENTS,
               learning_rate=learning_rate)

#  time it.
%time model = model.fit(train,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)



# Compute and print the AUC score
train_auc = auc_score(model, train, num_threads=NUM_THREADS).mean()
print('Collaborative filtering train AUC: %s' % train_auc)

test_auc = auc_score(model, test,num_threads=NUM_THREADS).mean()
print('Collaborative filtering test AUC: %s' % test_auc)


print("Train precision: %.4f" % precision_at_k(model, train, k=k,num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model, test,train_interactions=train, k=k,num_threads=NUM_THREADS).mean())

Wall time: 1.95 s
Collaborative filtering train AUC: 0.96783483
Collaborative filtering test AUC: 0.8013681
Train precision: 0.2008
Test precision: 0.0294


In [527]:
NUM_THREADS = 25
NUM_COMPONENTS = 30
learning_rate = 0.0053
NUM_EPOCHS = 6
ITEM_ALPHA = 2.228e-5

## Hybrid models

#Combine user_feature and item feature

#logistic loss
model.iii = LightFM(loss='logistic',
                item_alpha=ITEM_ALPHA,
                    random_state=seed,
               no_components=NUM_COMPONENTS,learning_rate=learning_rate)

# time it.
%time model.iii = model.iii.fit(train,user_features=user_features,item_features=item_features,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)

# Compute and print the AUC score
train_auc = auc_score(model.iii, train,user_features=user_features,item_features=item_features, num_threads=NUM_THREADS).mean()
print(' Hybrid train AUC: %s' % train_auc)

test_auc = auc_score(model.iii, test,user_features=user_features,item_features=item_features,num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)

#precision @k
print("Train precision: %.4f" % precision_at_k(model.iii, train,
                                               item_features=item_features,user_features=user_features, k=k,
                                               num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model.iii, test,train_interactions=train,
                                              item_features=item_features,user_features=user_features, k=k,
                                             num_threads=NUM_THREADS).mean())


Wall time: 15.7 s
 Hybrid train AUC: 0.7422592
Hybrid test AUC: 0.73809147
Train precision: 0.0148
Test precision: 0.0097


In [528]:
NUM_THREADS = 25
NUM_COMPONENTS = 77    
NUM_EPOCHS = 80
ITEM_ALPHA = 1.419e-6
learning_rate=0.035


# BPR
model.iii = LightFM(loss='bpr',
                item_alpha=ITEM_ALPHA,
                     random_state=seed,
               no_components=NUM_COMPONENTS,learning_rate=learning_rate)

#  time it.
%time model.iii = model.iii.fit(train,user_features=user_features,item_features=item_features,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)


# Compute and print the AUC score
train_auc = auc_score(model.iii, train,user_features=user_features,item_features=item_features, num_threads=NUM_THREADS).mean()
print('Hybrid train AUC: %s' % train_auc)

test_auc = auc_score(model.iii, test,user_features=user_features,item_features=item_features,num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)

#precision @k
print("Train precision: %.4f" % precision_at_k(model.iii, train,
                                               item_features=item_features,user_features=user_features, k=k,
                                              num_threads=NUM_THREADS).mean())#0.41
print("Test precision: %.4f" % precision_at_k(model.iii, test,train_interactions=train,
                                              item_features=item_features,user_features=user_features, k=k,
                                             num_threads=NUM_THREADS).mean())#0.17


Wall time: 11min 6s
Hybrid train AUC: 0.93435514
Hybrid test AUC: 0.8115924
Train precision: 0.0502
Test precision: 0.0263


In [574]:
NUM_THREADS = 50
NUM_COMPONENTS = 42    
NUM_EPOCHS = 30
ITEM_ALPHA = 0.000256
learning_rate=0.0529
# WARP
model.iii = LightFM(loss='warp',
                item_alpha=ITEM_ALPHA, random_state=seed,
               no_components=NUM_COMPONENTS,learning_rate=learning_rate,learning_schedule='adagrad')

#time it.
%time model.iii = model.iii.fit(train,user_features=user_features,item_features=item_features,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)



# Compute and print the AUC score
train_auc = auc_score(model.iii, train,user_features=user_features,item_features=item_features, num_threads=NUM_THREADS).mean()
print('Hybrid train AUC: %s' % train_auc)

test_auc = auc_score(model.iii, test,user_features=user_features,item_features=item_features,num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)

#precision @k
print("Train precision: %.4f" % precision_at_k(model.iii, train,
                                               item_features=item_features,user_features=user_features, k=k,
                                              num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model.iii, test,train_interactions=train,
                                              item_features=item_features,user_features=user_features, k=k,
                                             num_threads=NUM_THREADS).mean())

Wall time: 1min 25s
Hybrid train AUC: 0.9542297
Hybrid test AUC: 0.8861522
Train precision: 0.0957
Test precision: 0.0578


In [575]:
NUM_THREADS = 50
NUM_COMPONENTS = 42    
NUM_EPOCHS = 30
ITEM_ALPHA = 0.000256
learning_rate=0.0529
# WARP
model.iii = LightFM(loss='warp-kos',
                item_alpha=ITEM_ALPHA, random_state=seed,
               no_components=NUM_COMPONENTS,learning_rate=learning_rate,learning_schedule='adagrad')

#time it.
%time model.iii = model.iii.fit(train,user_features=user_features,item_features=item_features,epochs=NUM_EPOCHS,num_threads=NUM_THREADS)



# Compute and print the AUC score
train_auc = auc_score(model.iii, train,user_features=user_features,item_features=item_features, num_threads=NUM_THREADS).mean()
print('Hybrid train AUC: %s' % train_auc)

test_auc = auc_score(model.iii, test,user_features=user_features,item_features=item_features,num_threads=NUM_THREADS).mean()
print('Hybrid test AUC: %s' % test_auc)

#precision @k
print("Train precision: %.4f" % precision_at_k(model.iii, train,
                                               item_features=item_features,user_features=user_features, k=k,
                                              num_threads=NUM_THREADS).mean())
print("Test precision: %.4f" % precision_at_k(model.iii, test,train_interactions=train,
                                              item_features=item_features,user_features=user_features, k=k,
                                             num_threads=NUM_THREADS).mean())

Wall time: 1min 37s
Hybrid train AUC: 0.9622207
Hybrid test AUC: 0.8829761
Train precision: 0.0981
Test precision: 0.0577


In [579]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
    
def sample_train_recommendation(model, train, data_meta, user_ids, k, name, mapping, tag=None, user_features=None,
                                item_features=None, num_threads=2):
    n_users, n_items = train.shape

    # =============================================================================
    #     ranks = model.predict_rank(interactions,
    #                                train_interactions=train_interactions,
    #                                user_features=user_features,
    #                                item_features=item_features,
    #                                num_threads=num_threads,
    #                                check_intersections=check_intersections,
    #                                )
    #
    #     ranks.data = np.less(ranks.data, k, ranks.data)
    #
    #     precision = np.squeeze(np.array(ranks.sum(axis=1))) / k
    #
    #     if not preserve_rows:
    #         precision = precision[test_interactions.getnnz(axis=1) > 0]
    #
    #     return precision
    # =============================================================================

    for user_id in user_ids:

        t_idx = {value: key for key, value in mapping.items()}
        u_idx = [x for x in train.tocsr()[user_id].indices]
        known_positives = data_meta.loc[u_idx, name]  # may need change
        if tag is not None:
            known_tags = data_meta.loc[u_idx, tag]  # get item tags.

        if (len(known_positives) < k):
            print('not enough known positives, return max number')

        scores = model.predict(user_id, np.arange(n_items), user_features=user_features, item_features=item_features,
                               num_threads=num_threads)
        i_idx = [x for x in np.argsort(-scores)]
        top_items = data_meta.loc[i_idx, name]
        if tag is not None:
            top_tags = data_meta.loc[i_idx, tag]  # get item tags.

        printmd("**User %s**" % user_id)
        printmd("**Known positives:**")

        if tag is not None:
            for x in range(len(known_positives)):
                print(" %s | %s" % (known_positives.values[x], known_tags.values[x]))
        else:
            for x in known_positives[:len(known_positives)]:
                print("        %s" % x)

        printmd("**Recommended:**")
        cnt = 0
        if tag is not None:
            for x in range(k):
                print(" %s | %s" % (top_items.values[x], top_tags.values[x]))
                if (top_items.values[x] in known_positives.values):
                    cnt += 1
                    print('This one clicked')
        else:
            for x in top_items[:k]:
                print("        %s" % x)
                if (x in known_positives.values):
                    cnt += 1
                    print('This one clicked')
        #printmd('*cnt: *' + str(cnt))
        printmd('*k_p: %s*'%str(len(known_positives)))
        p_k = cnt / k
        printmd('*precicion at k : %s*'%str(p_k))
        print('----------------------------------------------------------------------')


def sample_test_recommendation(model, train, test, data_meta, user_ids, k, name, mapping, tag=None,
                               train_interactions=None, user_features=None,
                               item_features=None, num_threads=2):
    n_users, n_items = test.shape

    for user_id in user_ids:
        
        printmd("**User %s**" % user_id)
        
        t_idx = {value: key for key, value in mapping.items()}
        u_idx = [x for x in test.tocsr()[user_id].indices]

        known_positives = data_meta.loc[u_idx, name]  # may need change

        print('length of known_positives: ' + str(len(known_positives)))
        if (len(known_positives) == 0):
            sample_train_recommendation(model, train, data_meta, [user_id], k, name, mapping, tag, user_features,
                                        item_features)
            continue

        elif (len(known_positives) < k):
            print('not enough known positives, return max number')

        if tag is not None:
            known_tags = data_meta.loc[u_idx, tag]  # get item tags.

        if (train_interactions is None):
            scores = model.predict(user_id, np.arange(n_items), user_features=user_features,
                                   item_features=item_features,
                                   num_threads=num_threads)
            i_idx = [x for x in np.argsort(-scores)]
            top_items = data_meta.loc[i_idx, name]
            if tag is not None:
                top_tags = data_meta.loc[i_idx, tag]  # get item tags.

        else:
            item_ids = np.delete(np.arange(n_items), train.tocsr()[user_id].indices)
            scores = model.predict(user_id, item_ids, user_features=user_features, item_features=item_features,
                                   num_threads=num_threads)
            i_idx = [x for x in np.argsort(-scores)]
            top_items = data_meta.loc[i_idx, name]
            if tag is not None:
                top_tags = data_meta.loc[i_idx, tag]  # get item tags.

        
        printmd("**Known positives:**")

        if tag is not None:
            for x in range(len(known_positives)):
                print(" %s | %s" % (known_positives.values[x], known_tags.values[x]))
        else:
            for x in known_positives[:len(known_positives)]:
                print("        %s" % x)

        printmd("**Recommended:**")
        cnt = 0
        if tag is not None:
            for x in range(k):
                print(" %s | %s" % (top_items.values[x], top_tags.values[x]))
                if (top_items.values[x] in known_positives.values):
                    cnt += 1
                    print('This one clicked')
        else:
            for x in top_items[:k]:
                print("        %s" % x)
                if (x in known_positives.values):
                    cnt += 1
                    print('This one clicked')
        #printmd('*cnt: *' + str(cnt))
        printmd('*k_p: %s*'%str(len(known_positives)))
        p_k = cnt / k
        printmd('*precicion at k : %s*'%str(p_k))
        print('----------------------------------------------------------------------')

def get_user_index(test):
    return scipy.sparse.find(test)[0]

In [580]:

#test corresponding recpmmendation

sample_train_recommendation(model.iii,train,business,[105],5,'name',mapping=dataset.mapping()[2],tag='category',
                              user_features = user_features,item_features=item_features)
user_index=list(set(get_user_index(test)))
sample_test_recommendation(model.iii,train,test,business,[user_index[51]],5,'name',mapping=dataset.mapping()[2],
                              train_interactions=train,tag='category',user_features = user_features,item_features=item_features)

not enough known positives, return max number


**User 105**

**Known positives:**

 Monical's Pizza | Italian, Pizza, Restaurants


**Recommended:**

 Monical's Pizza | Italian, Pizza, Restaurants
This one clicked
 Siam Terrace | Thai, Sushi Bars, Restaurants
 Dunkin' | Food, Coffee & Tea, Bagels, Donuts
 Schnucks Savoy | Pharmacy, Bakeries, Health & Medical, Food, Grocery, Beer, Wine & Spirits
 V Picasso | Wine Bars, Event Planning & Services, Farmers Market, Bars, Food, Tapas Bars, Venues & Event Spaces, Nightlife, Breakfast & Brunch, Restaurants, American (New), Sandwiches, American (Traditional)


*k_p: 1*

*precicion at k : 0.2*

----------------------------------------------------------------------


**User 62**

length of known_positives: 4
not enough known positives, return max number


**Known positives:**

 The Bread Company | Bakeries, American (New), Modern European, Food, Restaurants, Sandwiches, Delis
 Sushi Ichiban | Sushi Bars, Japanese, Restaurants
 Dunkin' | Coffee & Tea, Donuts, Food
 Home of Gourmet Chinese & Thai Restaurant | Restaurants, Thai, Chinese


**Recommended:**

 B Won | Korean, Restaurants
 Dunkin' | Donuts, Food, Coffee & Tea
This one clicked
 Alexander's Steakhouse | Steakhouses, Seafood, Restaurants
 Scratch | Sandwiches, American (Traditional), Seafood, Restaurants
 Jimmy John's | Fast Food, Sandwiches, Delis, Pizza, Food Delivery Services, Food, Restaurants


*k_p: 4*

*precicion at k : 0.2*

----------------------------------------------------------------------


In [583]:
# get similar tags
def get_similar_tags(model, tag_id,k):
    # Define similarity as the cosine of the angle
    # between the tag latent vectors

    # Normalize the vectors to unit length
    tag_embeddings = (model.item_embeddings[849:].T
                      / np.linalg.norm(model.item_embeddings[849:], axis=1)).T

    query_embedding = tag_embeddings[tag_id]
    similarity = np.dot(tag_embeddings, query_embedding)
    most_similar = np.argsort(-similarity)[1:k+1]

    return most_similar
tag_labels = list(dataset.mapping()[3].keys())[849:]
#tag_labels

target_ls = ['Chinese','Bars','Ice Cream & Frozen Yogurt','Italian']
for tag in target_ls:
   tag_id = tag_labels.index(tag)
   print('Most similar tags for %s: %s' % (tag_labels[tag_id],
                                           [tag_labels[x] for x in get_similar_tags(model.iii, tag_id,5)]))

Most similar tags for Chinese: ['Seafood', 'Barbeque', 'Asian Fusion', 'Sushi Bars', 'Convenience Stores']
Most similar tags for Bars: ['Nightlife', 'Breweries', 'Sports Bars', 'Lounges', 'Arts & Entertainment']
Most similar tags for Ice Cream & Frozen Yogurt: ['Desserts', 'Juice Bars & Smoothies', 'Bubble Tea', 'Bakeries', 'Food Trucks']
Most similar tags for Italian: ['Event Planning & Services', 'Pizza', 'Caterers', 'Salad', 'Sandwiches']
