# Recsys based on Stars

In this notebook, I will be creating various Recommender systems, taking different approaches based on Yelp review star ratings.

In [1]:
# Imports 
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import matplotlib.pyplot as plt
from IPython.display import Image
seed=42

In [78]:
data = pd.read_csv("/home/schubert/DSI/capstone_project/model_notebooks/star_rec_data_nv")

In [15]:
data.head()

Unnamed: 0,business_id,name,user_id,stars
0,-6e0liTvH5EoB4HuncuQgA,"""Small Bar Fort Mill""",Ow07iTOW_JUer07MWNhazg,5
1,-6e0liTvH5EoB4HuncuQgA,"""Small Bar Fort Mill""",nFS3GfDeOOzg-bXKuNNHsA,5
2,-6e0liTvH5EoB4HuncuQgA,"""Small Bar Fort Mill""",M3ncFIlEfaSdSpoiMINwBA,4
3,-6e0liTvH5EoB4HuncuQgA,"""Small Bar Fort Mill""",T81BRLNvCpRkgVtYyabImQ,4
4,-6e0liTvH5EoB4HuncuQgA,"""Small Bar Fort Mill""",QKm7NuvVMmvWHKmW7blABg,5


## Approach

![test](recsys_workflow.png)

In [16]:
users_interactions_count_df = data.groupby(['user_id', 'name']).size().groupby('user_id').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 2].reset_index()[['user_id']]
print('# users with at least 2 interactions: %d' % len(users_with_enough_interactions_df))

# users: 3361
# users with at least 2 interactions: 845


In [17]:
# Recsys Train-Test-Split
train_data, test_data = train_test_split(data,
                                   stratify=data['stars'], # we have imbalanced classes, re: J-shaped distribution
                                   test_size=0.20,
                                   random_state=42)

In [18]:
print('Size of training data: %d' % len(train_data))
print('Size of test data: %d' % len(test_data))

Size of training data: 4397
Size of test data: 1100


### Top N Accuracy

In [19]:
# Indexing by user_id  to speed up the searches during evaluation
data_indexed = data.set_index('user_id')
train_indexed = train_data.set_index('user_id')
test_indexed = test_data.set_index('user_id')

In [20]:
def find_reviwed_restaurants(user_id, data):
    '''
    Fuction that matches user_id to the restaurants they've reviewed
    ----
    Parameters:
    user_id: corresponding unique user id 
    data: pandas dataframe containing business_id, name, user_id, stars
    '''
    reviewed_restaurants = data.loc[user_id]['name']
    return set(reviewed_restaurants if type(reviewed_restaurants) == pd.Series else [reviewed_restaurants])

In [25]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 1

class RecsysEvaluator:


    def businesses_not_reviewed_sample(self, user_id, sample_size, seed=42):
        reviewed_restaurants = find_reviwed_restaurants(user_id, data_indexed)
        all_restaurants = set(data['name'])
        non_reviewed_restaurants = all_restaurants - reviewed_restaurants

        random.seed(seed)
        non_reviewed_restaurants_sample = random.sample(non_reviewed_restaurants, sample_size)
        return set(non_reviewed_restaurants_sample)

    def _check_hit_top_n(self, name, recommended_restaurants, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_restaurants) if c == name)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_reviewer(self, model, user_id):
        # Getting the items in test set
        reviewed_restaurants_testset = test_indexed.loc[user_id]
        if type(reviewed_restaurants_testset['name']) == pd.Series:
            person_reviewed_restaurant_testset = set(reviewed_restaurants_testset['name'])
        else:
            person_reviewed_restaurant_testset = set(reviewed_restaurants_testset['name'])  
        reviewed_restaurant_count_testset = len(reviewed_restaurants_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(user_id, 
                                               restaurants_to_ignore = find_reviwed_restaurants(user_id,
                                                                                                data_indexed),
                                               topn=10000000000)

        hits_at_2_count = 0
        hits_at_3_count = 0
        #For each item the user has interacted in test set
        for item_id in person_reviewed_restaurant_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_reviewed_restaurants_sample = self.businesses_not_reviewed_sample(user_id, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=42)

            #Combining the current interacted item with the 100 random items
            restaurants_to_filter_recs = non_reviewed_restaurants_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['name'].isin(restaurants_to_filter_recs)]                    
            valid_recs = valid_recs_df['name'].values
            
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_2, index_at_2 = self._check_hit_top_n(item_id, valid_recs, 2)
            hits_at_2_count += hit_at_2
            hit_at_3, index_at_3 = self._check_hit_top_n(item_id, valid_recs, 3)
            hits_at_3_count += hit_at_3

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_2 = hits_at_2_count / float(reviewed_restaurant_count_testset)
        recall_at_3 = hits_at_3_count / float(reviewed_restaurant_count_testset)

        person_metrics = {'hits@2_count':[hits_at_2_count], 
                          'hits@3_count':[hits_at_3_count], 
                          'interacted_count': [reviewed_restaurant_count_testset],
                          'recall@2': [recall_at_2],
                          'recall@3': [recall_at_3]}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        reviewers_metrics = []
        for idx, user_id in enumerate(list(test_indexed.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            reviewer_metrics = self.evaluate_model_for_reviewer(model, user_id)  
            reviewer_metrics['_user_id'] = user_id
            reviewers_metrics.append(reviewer_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(reviewer_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_2 = detailed_results_df['hits@2_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_3 = detailed_results_df['hits@3_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@2': global_recall_at_2,
                          'recall@3': global_recall_at_3}    
        return global_metrics, detailed_results_df
    
recsys_evaluator = RecsysEvaluator()

Popularity Model

In [26]:
restaurant_popularity = data.groupby('name')['stars'].median().sort_values(ascending=False).reset_index()
restaurant_popularity.head(10)

Unnamed: 0,name,stars
0,"""MorningStar Missions Cafe""",5.0
1,"""The Flipside Cafe""",5.0
2,"""Small Bar Fort Mill""",5.0
3,"""Jersey Mike's Subs""",5.0
4,"""Pelican's SnoBalls""",5.0
5,"""Sweet Dough""",5.0
6,"""China Inn""",5.0
7,"""Pasquale's Restaurant & Pub""",5.0
8,"""Tacos Nayarit #2""",5.0
9,"""Papa Murphy's""",5.0


In [27]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, restaurant_popularity, items_df=None):
        self.restaurant_popularity = restaurant_popularity
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, restaurants_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.restaurant_popularity[~self.restaurant_popularity['name'].isin(restaurants_to_ignore)] \
                               .sort_values('stars', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'contentId', 
                                                          right_on = 'contentId')[['eventStrength', 'contentId', 'title', 'url', 'lang']]


        return recommendations_df
    
popularity_model = PopularityRecommender(restaurant_popularity, data)

In [28]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = recsys_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df

Evaluating Popularity recommendation model...
906 users processed

Global metrics:
{'modelName': 'Popularity', 'recall@2': 0.0, 'recall@3': 0.0}


Unnamed: 0,hits@2_count,hits@3_count,interacted_count,recall@2,recall@3,_user_id
0,0,0,3,0.0,0.0,q4-Nvtr-FlHxAZu66vm-ig


7p4mOmAl0X1P-ACgLcH2Yg

## Trying something different

In [66]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate, KFold, GridSearchCV
from surprise import BaselineOnly, NormalPredictor, accuracy
from surprise import Reader
import os

In [72]:
# path to dataset file
# file_path = os.path.expanduser("/home/schubert/DSI/capstone_project/model_notebooks/star_rec_data_nv.csv")

In [79]:
data.head()

Unnamed: 0,business_id,name,user_id,stars
0,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",xP1IYu2eGfxMWV9tjrurIw,5
1,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",oFyOUOeGTRZhFPF9uTqrTQ,5
2,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",2aeNFntqY2QDZLADNo8iQQ,4
3,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",gmPP4YFrgYsYQqPYokMgFA,5
4,--9e1ONYQuAa-CB_Rrw7Tw,"""Delmonico Steakhouse""",9bxdPvAhP6cuipD5s2UnCg,5


In [80]:
# As we're loading a custom dataset, we need to define a reader. In the
# movielens-100k dataset, each line has the following format:
# 'user item rating timestamp', separated by '\t' characters.
# reader = Reader(line_format='business_id ; name ; user_id ; stars', sep=',')

In [81]:
reader = Reader(rating_scale=(1, 5))

In [82]:
data = Dataset.load_from_df(data[['name', 'user_id', 'stars']], reader)

In [83]:
cross_validate(BaselineOnly(), data, verbose=True)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2346  1.2347  1.2355  1.2338  1.2361  1.2349  0.0008  
MAE (testset)     0.9965  0.9978  0.9979  0.9970  0.9996  0.9978  0.0010  
Fit time          3.71    4.14    4.01    3.99    3.94    3.96    0.14    
Test time         1.21    1.22    1.19    1.49    1.24    1.27    0.11    


{'fit_time': (3.7063400745391846,
  4.13522481918335,
  4.01305627822876,
  3.990678548812866,
  3.9350712299346924),
 'test_mae': array([0.99650055, 0.99779882, 0.99790255, 0.99704806, 0.99957772]),
 'test_rmse': array([1.23460107, 1.23471877, 1.23545423, 1.23383139, 1.23612105]),
 'test_time': (1.2068123817443848,
  1.2165818214416504,
  1.189164638519287,
  1.493720531463623,
  1.2391531467437744)}

In [84]:
cross_validate(NormalPredictor(), data, cv=2)

{'fit_time': (0.5419800281524658, 0.749880313873291),
 'test_mae': array([1.40661143, 1.40456196]),
 'test_rmse': array([1.77800242, 1.77524064]),
 'test_time': (3.6128809452056885, 3.780947685241699)}

In [85]:
# define a cross-validation iterator
kf = KFold(n_splits=3)

algo = SVD()

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.2630
RMSE: 1.2620
RMSE: 1.2619


In [86]:
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])

1.2512658750475067
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [87]:
# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(data.build_full_trainset())

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2ecce23208>

In [88]:
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.head()

Unnamed: 0,split0_test_rmse,split1_test_rmse,split2_test_rmse,mean_test_rmse,std_test_rmse,rank_test_rmse,split0_test_mae,split1_test_mae,split2_test_mae,mean_test_mae,std_test_mae,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_lr_all,param_reg_all
0,1.268387,1.265663,1.267009,1.26702,0.001112,7,1.040979,1.037885,1.039616,1.039493,0.001266,6,9.864846,0.056042,2.638301,0.272984,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.4}",5,0.002,0.4
1,1.274219,1.271427,1.272727,1.272791,0.001141,8,1.048038,1.044812,1.046615,1.046488,0.00132,8,9.491911,0.012535,2.497279,0.302326,"{'n_epochs': 5, 'lr_all': 0.002, 'reg_all': 0.6}",5,0.002,0.6
2,1.258668,1.256008,1.257297,1.257324,0.001086,2,1.031735,1.02862,1.030123,1.030159,0.001272,2,9.685466,0.142121,2.497078,0.292577,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}",5,0.005,0.4
3,1.265265,1.26264,1.263859,1.263921,0.001072,5,1.039891,1.036772,1.038325,1.038329,0.001273,5,9.590522,0.135125,2.51186,0.338188,"{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.6}",5,0.005,0.6
4,1.260555,1.25797,1.259228,1.259251,0.001055,4,1.033356,1.030274,1.032089,1.031906,0.001265,3,17.901575,0.216289,2.855,0.072783,"{'n_epochs': 10, 'lr_all': 0.002, 'reg_all': 0.4}",10,0.002,0.4


In [54]:
algo = SVD()

In [55]:
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.3344  1.2859  1.3148  1.2859  1.3055  1.3053  0.0184  
MAE (testset)     1.1064  1.0571  1.0919  1.0597  1.0814  1.0793  0.0188  
Fit time          0.24    0.23    0.24    0.24    0.27    0.24    0.02    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'fit_time': (0.2445533275604248,
  0.2275230884552002,
  0.23551177978515625,
  0.2358841896057129,
  0.2743501663208008),
 'test_mae': array([1.10641844, 1.05713763, 1.09192113, 1.05970146, 1.0814054 ]),
 'test_rmse': array([1.33441692, 1.28590703, 1.31476337, 1.28588078, 1.30546843]),
 'test_time': (0.0077037811279296875,
  0.006069660186767578,
  0.009871244430541992,
  0.006154060363769531,
  0.00622105598449707)}