## This Jupyter file is to prepare the data for Recommendation of restaurants to customers based on different criteria. Below Recommendations are given
   * Popularity based recommendation for each city
   * Content based seach Engine
   * Item similarity based Collaborative filter with KNN and SVD techniques using 'surprise' library.

In [1]:
#Library to connect with Mongodb
import pymongo
from pymongo import MongoClient

import pandas as pd
import numpy as np
from pandas import DataFrame

import math

#Library for Collaborative filtering(Item based similarity)
from surprise import Reader, Dataset, SVD, evaluate
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from surprise import KNNBasic

import pickle

pd.options.display.max_columns = 999
pd.options.display.max_rows = 999
    
client = MongoClient('mongodb://localhost:27017')
#Get the yelpDB database
db = client.yelp

# Creating User-Restaurant Dataframe for Recommendation work

In [2]:
#Read AZ_restaurant_review_data collection from mongodb. This collection contains the Reviews details of each business.
#Filter the content for top 5 cities (Based on the EDA)
AZ_restaurant_review_data = db.AZ_restaurant_review_data.find({'city':{"$in" : ['Phoenix','Scottsdale','Mesa','Tempe','Chandler']}})
AZ_restaurant_review_data_df = DataFrame(list(AZ_restaurant_review_data))
AZ_restaurant_review_data_df.shape

(5451, 16)

In [3]:
#Read the cleaned AZ_Restaurant_Final_Clean_Data.csv file with flattened restaurant details. This is the output file from EDA &
# data cleansing. 
    AZ_Restaurant_Flat_Data= pd.read_csv("AZ_Restaurant_Final_Clean_Data.csv", sep =";", index_col="Unnamed: 0")
AZ_Restaurant_Flat_Data.shape

(7367, 234)

In [4]:
#Merge the above 2 dataframes to a single dataframe
AZ_restrnt_flatten_df = pd.merge(AZ_restaurant_review_data_df,AZ_Restaurant_Flat_Data,on = 'business_id')

#Below columns would be retained based on what goes into the UI. These columns are chose based on the valid data availble(less Nans)
column_list = ["business_id", "name_x", "address_x", "city_x", "state", "postal_code_x", "review_count_x", "review_details",
               "stars_x", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "category_Fast Food", "category_Sandwiches", "category_Mexican", 
               "category_American (Traditional)", "category_Nightlife", "category_Pizza", "category_Bars", "category_Burgers",
               "category_Breakfast & Brunch", "category_American (New)"]
AZ_restrnt_flatten_df = AZ_restrnt_flatten_df[column_list]

# Rename the column header with new name
column_list_updated = ["Business_id", "Name", "Address", "City", "State", "Postal_code", "Review_count", "Review_details",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_flatten_df.columns = column_list_updated
print(AZ_restrnt_flatten_df.shape)

(5451, 36)


In [5]:
AZ_restrnt_flatten_df.head(3)

Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Review_details,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,rDMptJYWtnMhpQu_rRXHng,McDonald's,719 E Thunderbird Rd,Phoenix,AZ,85022,10,"[{'review_id': 'bABGON0ehmb7MBJrI02l7Q', 'user...",1.0,1.0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0
1,1WBkAuQg81kokZIPMpn9Zg,Charr An American Burger Bar,"777 E Thunderbird Rd, Ste 107",Phoenix,AZ,85022,232,"[{'review_id': 'xj31weGCI08_2xGQwEx0hA', 'user...",3.0,2.0,0,0,1,0,2,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0
2,iPa__LOhse-hobC2Xmp-Kw,McDonald's,1635 E Camelback Rd,Phoenix,AZ,85016,34,"[{'review_id': 'myIo7pMWP6B4XiZYv2EsSQ', 'user...",3.0,1.0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,0,2,1,0,0,0,0,0,0,1,0,0


## Creating separate dataframe for each city 

In [6]:
AZ_restrnt_Phoenix_df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City']== 'Phoenix']
AZ_restrnt_Scottsdale_df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City']== 'Scottsdale']
AZ_restrnt_Mesa_df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City']== 'Mesa']
AZ_restrnt_Tempe_df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City']== 'Tempe']
AZ_restrnt_Chandler_df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City']== 'Chandler']

## Function to flatten the reviews of the Restaurants by each users

In [7]:
def review_flatten(Review_data_Price_rate):
    i = 0
    j = 0
    Price_review_df = pd.DataFrame()
    while i < len(Review_data_Price_rate):

        review1 = pd.DataFrame(Review_data_Price_rate.iloc[i]['Review_details'])
        review1['Business_id'] = Review_data_Price_rate.iloc[i]['Business_id']
        Price_review_df = pd.concat([review1,Price_review_df])
             
        #percentage = (i/len(Review_data_Price_rate))*100
        #if(int(round(percentage)) % 10 == 0):
        #    print (int(round(percentage)) ,"%", i,end =' ')
        i +=1
    return Price_review_df

## Creating Restaurant Data in Phoenix for Recommendation

In [8]:
#Spread of ratings given to Restaurants
AZ_restrnt_Phoenix_df['Restaurant_ratings'].value_counts()

4.0    633
3.5    544
3.0    401
4.5    340
2.5    267
2.0    191
5.0     85
1.5     82
1.0     28
Name: Restaurant_ratings, dtype: int64

In [9]:
AZ_restrnt_Phoenix_flatten = pd.DataFrame()
AZ_restrnt_Phoenix_flatten = review_flatten(AZ_restrnt_Phoenix_df) #calling the function to flatten the review details

AZ_restrnt_Phoenix_flatten.rename(columns={'stars':'Review_ratings', 'user_id' : 'User_id'}, inplace=True) #rename columns
AZ_restrnt_Phoenix_flatten.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Business_id,cool,date,funny,review_id,Review_ratings,text,useful,User_id
0,AEYNihHmGIjmUciRFo3qwA,0.0,2017-07-19,0.0,vXxwVW-hbf6NLYAyXTRO6w,5.0,Yins has some of the best Chinese food in town...,0.0,PxDKVBipTwYFaNBkvv9xbg
1,AEYNihHmGIjmUciRFo3qwA,2.0,2013-08-06,2.0,gPcN_EXBi2RBB7Rt7ObOgw,4.0,Run down little chinese place that hadn't been...,5.0,EWv-QUwo3hMfoM04sIXdUw
2,AEYNihHmGIjmUciRFo3qwA,0.0,2017-07-13,0.0,fh5ThP629BSniNT_e8OQHg,5.0,I live less than a mile from this place and on...,1.0,o3kKx7Yi4doooe8_o_Ja5w


**Merge the Business and user review dataframes and remove the unwanted columns(like cool, date, funny, useful, review_id, review_text)**

In [10]:
AZ_restrnt_Phoenix_flatten_df = pd.merge(AZ_restrnt_Phoenix_df,AZ_restrnt_Phoenix_flatten,on = 'Business_id')
column_list = ["User_id","Business_id", "Review_ratings","Name", "Address", "City", "State", "Postal_code", "Review_count",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_Phoenix_flatten_final = AZ_restrnt_Phoenix_flatten_df[column_list]
print(AZ_restrnt_Phoenix_flatten_final.shape)

(232597, 37)


In [11]:
AZ_restrnt_Phoenix_flatten_final.head(2)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,Ck73f1qtZbu68F_vjzsBrQ,rDMptJYWtnMhpQu_rRXHng,1.0,McDonald's,719 E Thunderbird Rd,Phoenix,AZ,85022,10,1.0,1.0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0
1,F95NFEFwuwA__SIRt9IJNA,rDMptJYWtnMhpQu_rRXHng,1.0,McDonald's,719 E Thunderbird Rd,Phoenix,AZ,85022,10,1.0,1.0,1,1,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0


In [12]:
print("Analysis of Phoenix data")
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Review_ratings']))

Analysis of Phoenix data
Number of Users :  102426
Number of Restaurants rated :  2556
Number of Ratings provided :  232597


In [13]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The density of the rating matrix for Restaurants in Phoenix is: " , 
     np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Business_id'].unique())))

The density of the rating matrix for Restaurants in Phoenix is:  0.08884501126306478


**The Data sparsity is high. We can increase the density of the dataset by filtering restaurant ratings above 3. We wouldnt recommend low rated restaurants to the users hence these restaurants can be removed from the dataset.**

In [14]:
AZ_restrnt_Phoenix_flatten_final = AZ_restrnt_Phoenix_flatten_final[AZ_restrnt_Phoenix_flatten_final['Restaurant_ratings'] > 3]

In [15]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Review_ratings']))

Number of Users :  91616
Number of Restaurants rated :  1600
Number of Ratings provided :  200958


In [16]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Phoenix is: " , 
     np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Phoenix_flatten_final['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Phoenix is:  0.13709259299685644


Let's look at the number of restaurants that were reviewed less than 5 times. We can remove the count from the recommendation as we dont have enough reviews to include it in the recommendation.

In [17]:
print("Number of restaurants with 3 reviews: ", len(AZ_restrnt_Phoenix_flatten_final[AZ_restrnt_Phoenix_flatten_final
                                                                                ['Review_count'] == 3]['Business_id'].unique()))
print("Number of restaurants with 4 reviews: ", len(AZ_restrnt_Phoenix_flatten_final[AZ_restrnt_Phoenix_flatten_final
                                                                                ['Review_count'] == 4]['Business_id'].unique()))
print("Number of restaurants with < 5 reviews: ", len(AZ_restrnt_Phoenix_flatten_final[AZ_restrnt_Phoenix_flatten_final
                                                                                ['Review_count'] < 5]['Business_id'].unique()))

Number of restaurants with 3 reviews:  42
Number of restaurants with 4 reviews:  40
Number of restaurants with < 5 reviews:  82


In [18]:
#We will not consider restaurants that were reviewed just 3 times
AZ_restrnt_Phoenix_flatten_final1 = AZ_restrnt_Phoenix_flatten_final[AZ_restrnt_Phoenix_flatten_final['Review_count'] >= 4]

In [19]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['Review_ratings']))

Number of Users :  91573
Number of Restaurants rated :  1558
Number of Ratings provided :  200836


In [20]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Phoenix is: " , 
     np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Phoenix_flatten_final1['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Phoenix is:  0.14076888396747156


In [21]:
# Lets group user ids by ratings to identify users who rated less number of times.
AZ_restrnt_Phoenix_user_grp = AZ_restrnt_Phoenix_flatten_final1.groupby('User_id')['Restaurant_ratings'].count().sort_values()

In [22]:
AZ_restrnt_Phoenix_user_grp = pd.DataFrame({'User_id': AZ_restrnt_Phoenix_user_grp.index,
                                            'User_total_ratings': AZ_restrnt_Phoenix_user_grp.values})
print(AZ_restrnt_Phoenix_user_grp.shape)
AZ_restrnt_Phoenix_user_grp.head()

(91573, 2)


Unnamed: 0,User_id,User_total_ratings
0,--3WaS23LcIXtxyFULJHTA,1
1,_zEjgPdkHXL_oOvQ2gLiWw,1
2,_zMQgIEqnNfj3JsfLvX_xg,1
3,_zNW1LpB-XktJvdHYBVKQg,1
4,_zOK112d-fiAvRydmSSckA,1


In [23]:
#Join the above table with the cleaned dataset
AZ_restrnt_Phoenix_flatten_final_df = AZ_restrnt_Phoenix_flatten_final1.merge(AZ_restrnt_Phoenix_user_grp, on='User_id', 
                                                                              how='inner')
print(AZ_restrnt_Phoenix_flatten_final_df.shape)
AZ_restrnt_Phoenix_flatten_final_df.head()

(200836, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,yv0wMqujPcOcer6A4GSrdw,YhV93k9uiMdr3FlV4FHjwA,5.0,Caviness Studio,,Phoenix,AZ,85001,4,5.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,2
1,yv0wMqujPcOcer6A4GSrdw,aKFA85Miwb96A0IAS3mRgQ,5.0,FEZ,105 W Portland St,Phoenix,AZ,85003,303,3.5,2.0,0,0,1,0,2,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,1,2
2,lLCWWDw3bcTreCbona1QsA,YhV93k9uiMdr3FlV4FHjwA,5.0,Caviness Studio,,Phoenix,AZ,85001,4,5.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,v-f06-9BKbYFe_IG7WSs-g,YhV93k9uiMdr3FlV4FHjwA,5.0,Caviness Studio,,Phoenix,AZ,85001,4,5.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
4,9ypQEpG-2JjydzquNF7vQA,YhV93k9uiMdr3FlV4FHjwA,5.0,Caviness Studio,,Phoenix,AZ,85001,4,5.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [24]:
#Analyze how many users gave least amount of rating
AZ_restrnt_Phoenix_user_grp['User_total_ratings'].value_counts()

1      59987
2      14682
3       6068
4       3249
5       1894
6       1279
7        843
8        630
9        489
10       373
11       295
12       228
13       184
14       155
15       127
16       102
17        99
18        92
19        70
22        55
20        50
21        45
27        45
26        41
25        41
24        40
23        33
28        24
29        23
30        23
32        23
33        22
31        20
36        17
41        15
38        14
35        14
34        13
42        11
40        10
39         9
45         8
43         8
50         8
37         7
44         7
57         7
46         6
64         6
49         5
59         4
47         4
71         4
55         4
79         4
62         4
51         4
53         3
66         3
48         3
67         3
99         2
56         2
60         2
75         2
61         2
129        1
65         1
262        1
70         1
128        1
135        1
68         1
63         1
72         1
108        1
125        1

Lets remove users who gave only 1 or 2 reviews

In [25]:
AZ_restrnt_Phoenix_flatten_final_df = AZ_restrnt_Phoenix_flatten_final_df[AZ_restrnt_Phoenix_flatten_final_df
                                                                          ['User_total_ratings'] >= 3]

In [26]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['Review_ratings']))

Number of Users :  16904
Number of Restaurants rated :  1543
Number of Ratings provided :  111485


In [27]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Phoenix is: " , 
     np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Phoenix_flatten_final_df['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Phoenix is:  0.427426090194362


In [28]:
AZ_restrnt_Phoenix_flatten_final_df.to_csv('AZ_restrnt_Phoenix_flatten_final_df.csv', sep='\t',index=False, encoding='utf-8')

**The above file will be persisted to be used for recommendation in UI. During production run, these information will be stored in Databases like Cassandra or other on memory storage to quickly read details and provide recommendations.**

# Creating Restaurant Data in Scottsdale for Recommendation

In [29]:
AZ_restrnt_Scottsdale_flatten = pd.DataFrame()
AZ_restrnt_Scottsdale_flatten = review_flatten(AZ_restrnt_Scottsdale_df) #calling the function to flatten the review details
AZ_restrnt_Scottsdale_flatten.rename(columns={'stars':'Review_ratings', 'user_id' : 'User_id'}, inplace=True) #rename columns
AZ_restrnt_Scottsdale_flatten.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Business_id,cool,date,funny,review_id,Review_ratings,text,useful,User_id
0,Gr-2oBg4XyduSKbvnE-i9g,0.0,2017-06-25,0.0,staWSGxbXTuDiEp1YIcgkQ,5.0,"This is a hidden gem. Amazing tacos, margarita...",1.0,wLuc66bIM1HMU5zGZPdUQg
1,Gr-2oBg4XyduSKbvnE-i9g,0.0,2016-11-22,0.0,wJFwRLW0bf72AA9bhkRi2Q,1.0,Will never go back. 3rd time is the charm. Wen...,1.0,bzlattlWsc4Y8QRpayI2pA
2,Gr-2oBg4XyduSKbvnE-i9g,0.0,2017-05-04,0.0,ALCnqhWhXftZB9sBS68HNg,1.0,Arrived around 7:40pm. My friend had a table a...,0.0,KN4ZcwkGOeAUq5f0tUih7g


**Merge the Business and user review dataframes and remove the unwanted columns (like cool, date, funny, useful, review_id, review_text)**

In [30]:
AZ_restrnt_Scottsdale_flatten_df = pd.merge(AZ_restrnt_Scottsdale_df,AZ_restrnt_Scottsdale_flatten,on = 'Business_id')
column_list = ["User_id","Business_id", "Review_ratings","Name", "Address", "City", "State", "Postal_code", "Review_count",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_Scottsdale_flatten_final = AZ_restrnt_Scottsdale_flatten_df[column_list]
print(AZ_restrnt_Scottsdale_flatten_final.shape)

(121869, 37)


In [31]:
AZ_restrnt_Scottsdale_flatten_final.head(2)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,-KmtFCYymBipzs6bmSfrKg,VdlPZg2NAu8t8GkdbPLecg,5.0,Tandoori Times Indian Bistro,"8140 N Hayden Rd, Ste H115",Scottsdale,AZ,85258,263,3.5,2.0,0,0,1,0,2,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
1,QLqQ5Cy9piHNo6KBuf_YdA,VdlPZg2NAu8t8GkdbPLecg,3.0,Tandoori Times Indian Bistro,"8140 N Hayden Rd, Ste H115",Scottsdale,AZ,85258,263,3.5,2.0,0,0,1,0,2,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0


In [32]:
print("Analysis of Scottsdale data")
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Review_ratings']))

Analysis of Scottsdale data
Number of Users :  60548
Number of Restaurants rated :  895
Number of Ratings provided :  121869


In [33]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The density of the rating matrix for Restaurants in Scottsdale is: " , 
     np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Business_id'].unique())))

The density of the rating matrix for Restaurants in Scottsdale is:  0.22489013748914477


**The Data sparsity is high. We can increase the density of the dataset by filtering restaurant ratings above 3. We wouldnt recommend low rated restaurants to the users hence these restaurants can be removed from the dataset.**

In [34]:
AZ_restrnt_Scottsdale_flatten_final = AZ_restrnt_Scottsdale_flatten_final[AZ_restrnt_Scottsdale_flatten_final
                                                                          ['Restaurant_ratings'] > 3]

In [35]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Review_ratings']))

Number of Users :  56604
Number of Restaurants rated :  680
Number of Ratings provided :  110806


In [36]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Scottsdale is: " , 
     np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Scottsdale is:  0.28787718182460603


Let's look at the number of restaurants that were reviewed less than 5 times. We can remove the count from the recommendation as we dont have enough reviews to include it in the recommendation.

In [37]:
print("Number of restaurants with 3 reviews: ", len(AZ_restrnt_Scottsdale_flatten_final[AZ_restrnt_Scottsdale_flatten_final
                                                                                ['Review_count'] == 3]['Business_id'].unique()))
print("Number of restaurants with 4 reviews: ", len(AZ_restrnt_Scottsdale_flatten_final[AZ_restrnt_Scottsdale_flatten_final
                                                                                ['Review_count'] == 4]['Business_id'].unique()))
print("Number of restaurants with < 5 reviews: ", len(AZ_restrnt_Scottsdale_flatten_final[AZ_restrnt_Scottsdale_flatten_final
                                                                                ['Review_count'] < 5]['Business_id'].unique()))

Number of restaurants with 3 reviews:  12
Number of restaurants with 4 reviews:  10
Number of restaurants with < 5 reviews:  22


In [38]:
#We will not consider restaurants that were reviewed just 3 times
AZ_restrnt_Scottsdale_flatten_final1 = AZ_restrnt_Scottsdale_flatten_final[AZ_restrnt_Scottsdale_flatten_final
                                                                           ['Review_count'] >= 4]

In [39]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['Review_ratings']))

Number of Users :  56587
Number of Restaurants rated :  668
Number of Ratings provided :  110772


In [40]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Scottsdale is: " , 
     np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final1['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Scottsdale is:  0.2930467197508071


In [41]:
# Lets group user ids by ratings to identify users who rated less number of times.
AZ_restrnt_Scottsdale_user_grp = AZ_restrnt_Scottsdale_flatten_final1.groupby('User_id')['Restaurant_ratings'] \
                                                                                        .count().sort_values()

In [42]:
AZ_restrnt_Scottsdale_user_grp = pd.DataFrame({'User_id': AZ_restrnt_Scottsdale_user_grp.index,
                                            'User_total_ratings': AZ_restrnt_Scottsdale_user_grp.values})
print(AZ_restrnt_Scottsdale_user_grp.shape)
AZ_restrnt_Scottsdale_user_grp.head()

(56587, 2)


Unnamed: 0,User_id,User_total_ratings
0,---PLwSf5gKdIoVnyRHgBA,1
1,aayy4ZYDcifp-kDGinhhTA,1
2,ab3ecerBXTG5zTC38NhTjg,1
3,ab5KKxwwATl_aoMlEKuNgQ,1
4,abAIOK-fx1pL8_8v6RY45w,1


In [43]:
#Join the above table with the cleaned dataset
AZ_restrnt_Scottsdale_flatten_final_df = AZ_restrnt_Scottsdale_flatten_final1.merge(AZ_restrnt_Scottsdale_user_grp, on='User_id', 
                                                                              how='inner')
print(AZ_restrnt_Scottsdale_flatten_final_df.shape)
AZ_restrnt_Scottsdale_flatten_final_df.head()

(110772, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,-KmtFCYymBipzs6bmSfrKg,VdlPZg2NAu8t8GkdbPLecg,5.0,Tandoori Times Indian Bistro,"8140 N Hayden Rd, Ste H115",Scottsdale,AZ,85258,263,3.5,2.0,0,0,1,0,2,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,2
1,-KmtFCYymBipzs6bmSfrKg,EC7NrIHKk6i3US_XJlDWDA,5.0,Julio's Too,7305 E Camelback Rd,Scottsdale,AZ,85251,166,4.0,1.0,0,1,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,2
2,QLqQ5Cy9piHNo6KBuf_YdA,VdlPZg2NAu8t8GkdbPLecg,3.0,Tandoori Times Indian Bistro,"8140 N Hayden Rd, Ste H115",Scottsdale,AZ,85258,263,3.5,2.0,0,0,1,0,2,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,5
3,QLqQ5Cy9piHNo6KBuf_YdA,orMyWvonOj55yswWf-k72Q,3.0,Farm & Craft,4302 N Scottsdale Rd,Scottsdale,AZ,85251,342,4.0,2.0,0,1,0,0,2,1,0,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,1,1,5
4,QLqQ5Cy9piHNo6KBuf_YdA,UOtoOybZ5vAB4aXBLc57vA,4.0,Bootleggers Modern American Smokehouse,7217 E 1st St,Scottsdale,AZ,85251,503,4.0,2.0,0,1,1,0,2,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,0,1,0,0,1,5


In [44]:
#Analyze how many users gave least amount of rating
AZ_restrnt_Scottsdale_user_grp['User_total_ratings'].value_counts()

1      38975
2       8665
3       3507
4       1744
5       1053
6        605
7        469
8        273
9        214
10       152
11       130
12       114
13        85
14        66
15        63
16        48
17        46
18        39
20        37
21        28
19        22
23        20
27        18
26        17
24        16
22        15
29        13
30        11
32        10
28        10
31         9
25         9
33         7
34         6
36         5
38         5
45         5
41         5
47         4
48         4
39         4
50         4
35         3
37         3
43         3
55         3
62         3
40         3
67         2
72         2
59         2
61         2
63         2
86         2
44         2
116        2
56         2
114        1
80         1
64         1
177        1
49         1
65         1
93         1
51         1
66         1
60         1
83         1
110        1
46         1
54         1
57         1
74         1
71         1
42         1
88         1
Name: User_t

Lets remove users who gave only 1 review

In [45]:
AZ_restrnt_Scottsdale_flatten_final_df = AZ_restrnt_Scottsdale_flatten_final_df[AZ_restrnt_Scottsdale_flatten_final_df
                                                                          ['User_total_ratings'] >= 2]

In [46]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['Review_ratings']))

Number of Users :  17612
Number of Restaurants rated :  666
Number of Ratings provided :  71797


In [47]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Scottsdale is: " , 
     np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Scottsdale_flatten_final_df['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Scottsdale is:  0.6121014268868005


In [48]:
AZ_restrnt_Scottsdale_flatten_final_df.to_csv('AZ_restrnt_Scottsdale_flatten_final_df.csv', sep='\t', index=False, encoding='utf-8')

# Creating Restaurant Data in Mesa for Recommendation

In [49]:
AZ_restrnt_Mesa_flatten = pd.DataFrame()
AZ_restrnt_Mesa_flatten = review_flatten(AZ_restrnt_Mesa_df) #calling the function to flatten the review details
AZ_restrnt_Mesa_flatten.rename(columns={'stars':'Review_ratings', 'user_id' : 'User_id'}, inplace=True)  #rename columns
AZ_restrnt_Mesa_flatten.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Business_id,cool,date,funny,review_id,Review_ratings,text,useful,User_id
0,sUAJOwb1yFaSDreFADEO5Q,0.0,2013-01-31,0.0,58NSa6w2f4ORNg3m2930bw,5.0,One of the BEST Thai food restaurants in the v...,0.0,ejePftHxbrq5qCEL0HzppA
1,sUAJOwb1yFaSDreFADEO5Q,1.0,2014-08-30,1.0,y_WAUxAtxxr9kOpkx2kt4g,4.0,"My visit was happenstance. I google ""Papaya r...",2.0,5ix6Hu_tOqhJZYnKO2VDqA
2,sUAJOwb1yFaSDreFADEO5Q,0.0,2016-04-04,0.0,aKpZWOETExjdzrz6Kn0q4A,3.0,The least spicy Thai food I have ever had. If ...,0.0,AfjVKIsLqzF71r2UccW2Fg


**Merge the Business and user review dataframes and remove the unwanted columns(like cool, date, funny, useful, review_id, review_text)**

In [50]:
AZ_restrnt_Mesa_flatten_df = pd.merge(AZ_restrnt_Mesa_df,AZ_restrnt_Mesa_flatten,on = 'Business_id')
column_list = ["User_id","Business_id", "Review_ratings","Name", "Address", "City", "State", "Postal_code", "Review_count",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_Mesa_flatten_final = AZ_restrnt_Mesa_flatten_df[column_list]
print(AZ_restrnt_Mesa_flatten_final.shape)

(52812, 37)


In [51]:
AZ_restrnt_Mesa_flatten_final.head(2)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,nqQsUiqwjBqwrtTjRjjRSw,kKx8iCJkomVQBdWHnmmOiA,5.0,Little Caesars Pizza,10720 E Southern Ave,Mesa,AZ,85209,4,2.5,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,kBROct5gqIm091v7VhBJRw,kKx8iCJkomVQBdWHnmmOiA,2.0,Little Caesars Pizza,10720 E Southern Ave,Mesa,AZ,85209,4,2.5,2.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [52]:
print("Analysis of Mesa data")
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Review_ratings']))

Analysis of Mesa data
Number of Users :  28904
Number of Restaurants rated :  825
Number of Ratings provided :  52812


In [53]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The density of the rating matrix is of Restaurants in Mesa is: " , 
     np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Mesa_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Business_id'].unique())))

The density of the rating matrix is of Restaurants in Mesa is:  0.22147296379236595


**The Data sparsity is high. We can increase the density of the dataset by filtering restaurant ratings above 3. We wouldnt recommend low rated restaurants to the users hence these restaurants can be removed from the dataset.**

In [54]:
AZ_restrnt_Mesa_flatten_final = AZ_restrnt_Mesa_flatten_final[AZ_restrnt_Mesa_flatten_final['Restaurant_ratings'] > 3]

In [55]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Review_ratings']))

Number of Users :  24516
Number of Restaurants rated :  465
Number of Ratings provided :  41787


In [56]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Mesa is: " , 
     np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Mesa_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Mesa_flatten_final['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Mesa is:  0.366554560813478


Let's look at the number of restaurants that were reviewed less than 5 times.

In [57]:
print("Number of restaurants with 3 reviews: ", len(AZ_restrnt_Mesa_flatten_final[AZ_restrnt_Mesa_flatten_final
                                                                                ['Review_count'] == 3]['Business_id'].unique()))
print("Number of restaurants with 4 reviews: ", len(AZ_restrnt_Mesa_flatten_final[AZ_restrnt_Mesa_flatten_final
                                                                                ['Review_count'] == 4]['Business_id'].unique()))
print("Number of restaurants with < 5 reviews: ", len(AZ_restrnt_Mesa_flatten_final[AZ_restrnt_Mesa_flatten_final
                                                                                ['Review_count'] < 5]['Business_id'].unique()))

Number of restaurants with 3 reviews:  13
Number of restaurants with 4 reviews:  9
Number of restaurants with < 5 reviews:  22


We will not remove any restaurants based on review count

In [58]:
# Lets group user ids by ratings to identify users who rated less number of times.
AZ_restrnt_Mesa_user_grp = AZ_restrnt_Mesa_flatten_final.groupby('User_id')['Restaurant_ratings'].count().sort_values()

In [59]:
AZ_restrnt_Mesa_user_grp = pd.DataFrame({'User_id': AZ_restrnt_Mesa_user_grp.index,
                                         'User_total_ratings': AZ_restrnt_Mesa_user_grp.values})
print(AZ_restrnt_Mesa_user_grp.shape)
AZ_restrnt_Mesa_user_grp.head()

(24516, 2)


Unnamed: 0,User_id,User_total_ratings
0,--4t44TiOHQ2rhqsrtBTuQ,1
1,b8pmVqgKOaBFGooT_FugyA,1
2,b8nWusu1o9HIZ9fk-gKV5Q,1
3,b8kTnsOtOLH_APLSmQRkTg,1
4,b8JVVTCXolNotLGa3wl2Tg,1


In [60]:
#Join the above table with the cleaned dataset
AZ_restrnt_Mesa_flatten_final_df = AZ_restrnt_Mesa_flatten_final.merge(AZ_restrnt_Mesa_user_grp, on='User_id', how='inner')
print(AZ_restrnt_Mesa_flatten_final_df.shape)
AZ_restrnt_Mesa_flatten_final_df.head()

(41787, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,wIA3P5Qgm4f52x_iM08n6w,MTH-AcNyWfsBa9sXp04HcQ,4.0,Firehouse Subs,"3420 E Baseline Rd, Ste 101",Mesa,AZ,85204,82,3.5,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,5
1,wIA3P5Qgm4f52x_iM08n6w,80gdzmn-E3IaXK7Z6qzC6A,4.0,Costa Vida Fresh Mexican Grill,"1744 S Val Vista Dr, Ste 106",Mesa,AZ,85204,161,3.5,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,5
2,wIA3P5Qgm4f52x_iM08n6w,YbweWBgYj9T3VqMFJcQlOw,4.0,Flaming Kabob,2252 E Baseline Rd,Mesa,AZ,85204,177,4.0,2.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,5
3,wIA3P5Qgm4f52x_iM08n6w,DT35ycaAVzbq83vVGRCoIQ,4.0,Cafe Zupas,"3420 E Baseline Rd, Ste 107",Mesa,AZ,85204,214,4.0,1.0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,5
4,wIA3P5Qgm4f52x_iM08n6w,DhRJJ5aUUuBp3XR1JbLeuw,3.0,Sauce Pizza and Wine,"3426 E Baseline Rd, Ste 119",Mesa,AZ,85204,151,4.0,2.0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,5


In [61]:
#Analyze how many users gave least amount of rating
AZ_restrnt_Mesa_user_grp['User_total_ratings'].value_counts()

1     17800
2      3522
3      1320
4       656
5       348
6       235
7       156
8       113
9        75
10       49
11       47
12       30
15       23
13       20
14       18
18       15
16       14
20       11
19       10
21        9
17        8
22        7
23        4
25        4
33        3
24        3
34        3
26        2
35        2
28        2
32        1
30        1
27        1
44        1
82        1
42        1
31        1
Name: User_total_ratings, dtype: int64

Lets remove users who gave only 1 review

In [62]:
AZ_restrnt_Mesa_flatten_final_df = AZ_restrnt_Mesa_flatten_final_df[AZ_restrnt_Mesa_flatten_final_df
                                                                          ['User_total_ratings'] >= 2]

In [63]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['Review_ratings']))

Number of Users :  6716
Number of Restaurants rated :  462
Number of Ratings provided :  23987


In [64]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Mesa is: " , 
     np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Mesa_flatten_final_df['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Mesa is:  0.773077924656245


In [65]:
AZ_restrnt_Mesa_flatten_final_df.to_csv('AZ_restrnt_Mesa_flatten_final_df.csv', sep='\t', index=False, encoding='utf-8')

# Creating Restaurant Data in Tempe for Recommendation

In [66]:
AZ_restrnt_Tempe_flatten = pd.DataFrame()
AZ_restrnt_Tempe_flatten = review_flatten(AZ_restrnt_Tempe_df) #calling the function to flatten the review details

AZ_restrnt_Tempe_flatten.rename(columns={'stars':'Review_ratings', 'user_id' : 'User_id'}, inplace=True) #rename columns
AZ_restrnt_Tempe_flatten.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Business_id,cool,date,funny,review_id,Review_ratings,text,useful,User_id
0,5zva2MTtB5IX6TaoVLL-NA,0.0,2014-01-31,0.0,Frd0FWwZHJTnFjMdVdPuGQ,1.0,Bummer!! They are no longer there. \nI don't...,0.0,zW_mtish6KUnHatYs_T67Q
0,hVC6E29dg5Rx4ADSxt3kTA,1.0,2016-05-10,0.0,df1rOqwxVVxWguC9-PPIVg,5.0,"A long time Tempe staple, Restaurant Mexico is...",10.0,AfjVKIsLqzF71r2UccW2Fg
1,hVC6E29dg5Rx4ADSxt3kTA,0.0,2017-03-31,0.0,ovnWzIC1Los5-dkpLXSSXQ,4.0,I visited this location on Tuesday to order so...,0.0,8oml7qh4oTsted_TNMzH6A


**Merge the Business and user review dataframes and remove the unwanted columns (like cool, date, funny, useful, review_id, review_text)**

In [67]:
AZ_restrnt_Tempe_flatten_df = pd.merge(AZ_restrnt_Tempe_df,AZ_restrnt_Tempe_flatten,on = 'Business_id')
column_list = ["User_id","Business_id", "Review_ratings","Name", "Address", "City", "State", "Postal_code", "Review_count",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_Tempe_flatten_final = AZ_restrnt_Tempe_flatten_df[column_list]
print(AZ_restrnt_Tempe_flatten_final.shape)

(63114, 37)


In [68]:
AZ_restrnt_Tempe_flatten_final.head(2)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,mx_wi036asQPrTaYDtCQ4w,IqsQRoiVE535Z2JR-7xoWQ,1.0,Jack in the Box,2145 E University Dr,Tempe,AZ,85281,12,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
1,BoQikn6ZQsBqbVaiYKmL5w,IqsQRoiVE535Z2JR-7xoWQ,1.0,Jack in the Box,2145 E University Dr,Tempe,AZ,85281,12,2.0,1.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [69]:
print("Analysis of Tempe data")
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Review_ratings']))

Analysis of Tempe data
Number of Users :  36368
Number of Restaurants rated :  615
Number of Ratings provided :  63114


In [70]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The density of the rating matrix is of Restaurants in Tempe is: " , 
     np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Tempe_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Business_id'].unique())))

The density of the rating matrix is of Restaurants in Tempe is:  0.28218321118538947


**The Data sparsity is high. We can increase the density of the dataset by filtering restaurant ratings above 3. We wouldnt recommend low rated restaurants to the users hence these restaurants can be removed from the dataset.**

In [71]:
AZ_restrnt_Tempe_flatten_final = AZ_restrnt_Tempe_flatten_final[AZ_restrnt_Tempe_flatten_final['Restaurant_ratings'] > 3]

In [72]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Review_ratings']))

Number of Users :  31091
Number of Restaurants rated :  399
Number of Ratings provided :  51675


In [73]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Tempe is: " , 
     np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Tempe_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Tempe_flatten_final['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Tempe is:  0.41655552473541774


Let's look at the number of restaurants that were reviewed less than 5 times.

In [74]:
print("Number of restaurants with 3 reviews: ", len(AZ_restrnt_Tempe_flatten_final[AZ_restrnt_Tempe_flatten_final
                                                                                ['Review_count'] == 3]['Business_id'].unique()))
print("Number of restaurants with 4 reviews: ", len(AZ_restrnt_Tempe_flatten_final[AZ_restrnt_Tempe_flatten_final
                                                                                ['Review_count'] == 4]['Business_id'].unique()))
print("Number of restaurants with < 5 reviews: ", len(AZ_restrnt_Tempe_flatten_final[AZ_restrnt_Tempe_flatten_final
                                                                                ['Review_count'] < 5]['Business_id'].unique()))

Number of restaurants with 3 reviews:  4
Number of restaurants with 4 reviews:  4
Number of restaurants with < 5 reviews:  8


We will not remove any restaurants based on review count

In [75]:
# Lets group user ids by ratings to identify users who rated less number of times.
AZ_restrnt_Tempe_user_grp = AZ_restrnt_Tempe_flatten_final.groupby('User_id')['Restaurant_ratings'].count().sort_values()

In [76]:
AZ_restrnt_Tempe_user_grp = pd.DataFrame({'User_id': AZ_restrnt_Tempe_user_grp.index,
                                          'User_total_ratings': AZ_restrnt_Tempe_user_grp.values})
print(AZ_restrnt_Tempe_user_grp.shape)
AZ_restrnt_Tempe_user_grp.head()

(31091, 2)


Unnamed: 0,User_id,User_total_ratings
0,--CIuK7sUpaNzalLAlHJKA,1
1,b_PcCAKjxEHpq-4XRNGmgA,1
2,b_LJldcEC9pAUud7lMGkIw,1
3,b_HFDkHMFMwlBkVA-LbeSw,1
4,b_8NLo7oAlfpz1E25tanzw,1


In [77]:
#Join the above table with the cleaned dataset
AZ_restrnt_Tempe_flatten_final_df = AZ_restrnt_Tempe_flatten_final.merge(AZ_restrnt_Tempe_user_grp, on='User_id', how='inner')
print(AZ_restrnt_Tempe_flatten_final_df.shape)
AZ_restrnt_Tempe_flatten_final_df.head()

(51675, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,mvq6mTZBd3mdDg_bZRWiuQ,aiX_WP7NKPTdF9CfI-M-wg,3.0,Culinary Dropout,149 S Farmer,Tempe,AZ,85281,984,4.0,2.0,0,1,1,0,2,1,0,0,1,0,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,5
1,mvq6mTZBd3mdDg_bZRWiuQ,-IZvuqxekWEvJqDw308daQ,3.0,Los Favoritos Tacos Shop,"1340 E Broadway Rd, Ste 105",Tempe,AZ,85282,174,3.5,1.0,1,1,1,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,1,0,5
2,mvq6mTZBd3mdDg_bZRWiuQ,OoYwJANV9zD-_OA4Atu-gg,3.0,Republic Ramen + Noodles,1301 E University Dr,Tempe,AZ,85281,571,3.5,1.0,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5
3,mvq6mTZBd3mdDg_bZRWiuQ,3vta1BSPwdSulcCFMpBjDw,2.0,Fujiya Market,"1335 W University Dr, Ste 5",Tempe,AZ,85281,91,4.0,1.0,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5
4,mvq6mTZBd3mdDg_bZRWiuQ,fZM_o3kKZ9mR-1pvBeow8A,2.0,Tempe Marketplace,2000 E Rio Salado Pkwy,Tempe,AZ,85281,220,3.5,2.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,5


In [78]:
#Analyze how many users gave least amount of rating
AZ_restrnt_Tempe_user_grp['User_total_ratings'].value_counts()

1      23026
2       4366
3       1572
4        696
5        428
6        272
7        175
8        121
9         96
10        49
12        46
11        46
13        36
16        23
14        23
15        14
17        11
19         9
18         9
20         8
22         8
21         8
24         6
31         6
27         5
26         5
32         3
23         3
29         3
25         2
49         2
36         2
35         2
30         1
37         1
46         1
54         1
33         1
39         1
55         1
128        1
42         1
47         1
Name: User_total_ratings, dtype: int64

Lets remove users who gave only 1 review

In [79]:
AZ_restrnt_Tempe_flatten_final_df = AZ_restrnt_Tempe_flatten_final_df[AZ_restrnt_Tempe_flatten_final_df
                                                                          ['User_total_ratings'] >= 2]

In [80]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['Review_ratings']))

Number of Users :  8065
Number of Restaurants rated :  399
Number of Ratings provided :  28649


In [81]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Tempe is: " , 
     np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Tempe_flatten_final_df['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Tempe is:  0.8902914446687084


In [82]:
AZ_restrnt_Tempe_flatten_final_df.to_csv('AZ_restrnt_Tempe_flatten_final_df.csv', sep='\t', index=False, encoding='utf-8')

# Creating Restaurant Data in Chandler for Recommendation

In [83]:
AZ_restrnt_Chandler_flatten = pd.DataFrame()
AZ_restrnt_Chandler_flatten = review_flatten(AZ_restrnt_Chandler_df) #calling the function to flatten the review details

AZ_restrnt_Chandler_flatten.rename(columns={'stars':'Review_ratings', 'user_id' : 'User_id'}, inplace=True) #rename columns
AZ_restrnt_Chandler_flatten.head(3)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




Unnamed: 0,Business_id,cool,date,funny,review_id,Review_ratings,text,useful,User_id
0,UdEmYOnk2iJDY9lpEPAlJQ,2.0,2014-12-30,0.0,OwX02bSKJ0dZVUhlHcpNiw,4.0,"Solid, cheap, delicious Italian. Good calzone...",2.0,mvq6mTZBd3mdDg_bZRWiuQ
1,UdEmYOnk2iJDY9lpEPAlJQ,2.0,2014-11-02,0.0,Ws-n-35XsXE-Cvgd8Biiig,5.0,We come here a lot. My kids love their pizza a...,2.0,eNf0HUg_VNjKe9enwimRsg
2,UdEmYOnk2iJDY9lpEPAlJQ,1.0,2015-08-29,0.0,g2cHc3gWnf0dEaWuu3RM1w,5.0,I've been coming here since I was 14 years old...,0.0,314dp8E50a7mjjSeZhKB-w


**Merge the Business and user review dataframes and remove the unwanted columns (like cool, date, funny, useful, review_id, review_text)**

In [84]:
AZ_restrnt_Chandler_flatten_df = pd.merge(AZ_restrnt_Chandler_df,AZ_restrnt_Chandler_flatten,on = 'Business_id')
column_list = ["User_id","Business_id", "Review_ratings","Name", "Address", "City", "State", "Postal_code", "Review_count",
               "Restaurant_ratings", "RestaurantsPriceRange2", "GoodForMeal_breakfast", "GoodForMeal_lunch", "GoodForMeal_dinner", 
               "GoodForMeal_latenight", "Alcohol", "Ambience_casual", "Ambience_classy", "Ambience_romantic", "Ambience_trendy",
               "Ambience_upscale", "HasTV", "BusinessParking_garage", "BusinessParking_lot", "BusinessParking_street",
               "BusinessParking_valet", "WiFi", "Category_Fast_Food", "Category_Sandwiches", "Category_Mexican", 
               "Category_American_Traditional", "Category_Nightlife", "Category_Pizza", "Category_Bars", "Category_Burgers",
               "Category_Breakfast_Brunch", "Category_American_New"]
AZ_restrnt_Chandler_flatten_final = AZ_restrnt_Chandler_flatten_df[column_list]
print(AZ_restrnt_Chandler_flatten_final.shape)

(48075, 37)


In [85]:
AZ_restrnt_Chandler_flatten_final.head(2)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New
0,trieqETxCrE8i2sNIWa3Dg,mFE7N0p3f_7vcMTUy76ifw,5.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0
1,M22zgFsTTQ_enDRKd47fEg,mFE7N0p3f_7vcMTUy76ifw,4.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0


In [86]:
print("Analysis of Chandler data")
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Review_ratings']))

Analysis of Chandler data
Number of Users :  25615
Number of Restaurants rated :  529
Number of Ratings provided :  48075


In [87]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The density of the rating matrix is of Restaurants in Chandler is: " , 
     np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Chandler_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Business_id'].unique())))

The density of the rating matrix is of Restaurants in Chandler is:  0.3547882764522058


**The Data sparsity is high. We can increase the density of the dataset by filtering restaurant ratings above 3. We wouldnt recommend low rated restaurants to the users hence these restaurants can be removed from the dataset.**

In [88]:
AZ_restrnt_Chandler_flatten_final = AZ_restrnt_Chandler_flatten_final[AZ_restrnt_Chandler_flatten_final
                                                                      ['Restaurant_ratings'] > 3]

In [89]:
print("Number of Users : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['User_id'].unique()))
print("Number of Restaurants rated : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Business_id'].unique()))
print("Number of Ratings provided : ", np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Review_ratings']))

Number of Users :  22947
Number of Restaurants rated :  358
Number of Ratings provided :  41042


In [90]:
# The density of rating matrix is calculated as total number of entries divided by observed entries
print("The updated density of the rating matrix for Restaurants in Chandler is: " , 
     np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Review_ratings']) * 100 / \
        float(np.count_nonzero(AZ_restrnt_Chandler_flatten_final['User_id'].unique()) * \
              np.count_nonzero(AZ_restrnt_Chandler_flatten_final['Business_id'].unique())))

The updated density of the rating matrix for Restaurants in Chandler is:  0.4995967146056507


Let's look at the number of restaurants that were reviewed less than 5 times.

In [91]:
print("Number of restaurants with 3 reviews: ", len(AZ_restrnt_Chandler_flatten_final[AZ_restrnt_Chandler_flatten_final
                                                                                ['Review_count'] == 3]['Business_id'].unique()))
print("Number of restaurants with 4 reviews: ", len(AZ_restrnt_Chandler_flatten_final[AZ_restrnt_Chandler_flatten_final
                                                                                ['Review_count'] == 4]['Business_id'].unique()))
print("Number of restaurants with < 5 reviews: ", len(AZ_restrnt_Chandler_flatten_final[AZ_restrnt_Chandler_flatten_final
                                                                                ['Review_count'] < 5]['Business_id'].unique()))

Number of restaurants with 3 reviews:  11
Number of restaurants with 4 reviews:  10
Number of restaurants with < 5 reviews:  21


We will not remove any restaurants based on review count

In [92]:
# Lets group user ids by ratings to identify users who rated less number of times.
AZ_restrnt_Chandler_user_grp = AZ_restrnt_Chandler_flatten_final.groupby('User_id')['Restaurant_ratings'].count().sort_values()

In [93]:
AZ_restrnt_Chandler_user_grp = pd.DataFrame({'User_id': AZ_restrnt_Chandler_user_grp.index,
                                            'User_total_ratings': AZ_restrnt_Chandler_user_grp.values})
print(AZ_restrnt_Chandler_user_grp.shape)
AZ_restrnt_Chandler_user_grp.head()

(22947, 2)


Unnamed: 0,User_id,User_total_ratings
0,--9kVKrIDkSP6lqK2PDTDw,1
1,axnfd_OY7X483_k9tw5dyw,1
2,axXmvGSUprV2YVlimazYsw,1
3,ax7Q514jNfZJ1JsjzvtKqw,1
4,awzmlqC_3G-OZV8HGcGXFA,1


In [94]:
#Join the above table with the cleaned dataset
AZ_restrnt_Chandler_flatten_final_df = AZ_restrnt_Chandler_flatten_final.merge(AZ_restrnt_Chandler_user_grp, on='User_id', 
                                                                              how='inner')
print(AZ_restrnt_Chandler_flatten_final_df.shape)
AZ_restrnt_Chandler_flatten_final_df.head()

(41042, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,trieqETxCrE8i2sNIWa3Dg,mFE7N0p3f_7vcMTUy76ifw,5.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,1
1,M22zgFsTTQ_enDRKd47fEg,mFE7N0p3f_7vcMTUy76ifw,4.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,1
2,_Ajvi5vsNJXDrGYL8XMBRA,mFE7N0p3f_7vcMTUy76ifw,3.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,1
3,3NQbw2S-iXSbP-mw9QyinA,mFE7N0p3f_7vcMTUy76ifw,1.0,Village Inn,2780 E Germann Rd,Chandler,AZ,85248,49,3.5,2.0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,1,1,0,4
4,3NQbw2S-iXSbP-mw9QyinA,UtsJThJWezQCAz1Ag2PaBQ,5.0,ATL Wings,"70 W Warner Rd, Ste 100",Chandler,AZ,85225,413,4.0,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,4


In [95]:
#Analyze how many users gave least amount of rating
AZ_restrnt_Chandler_user_grp['User_total_ratings'].value_counts()

1     16441
2      3311
3      1246
4       673
5       378
6       213
7       154
8       122
9        81
10       50
11       45
13       34
12       30
14       26
16       19
15       17
17       13
18       11
22       11
19        8
20        7
23        5
27        5
24        5
21        4
30        4
31        4
33        3
36        3
29        3
54        2
25        2
46        2
26        2
35        2
34        1
62        1
61        1
39        1
45        1
28        1
52        1
68        1
37        1
72        1
63        1
Name: User_total_ratings, dtype: int64

We will not remove users with less reviews as the sparsity seems accpetable even before removing users with 1 review

In [96]:
AZ_restrnt_Chandler_flatten_final_df.to_csv('AZ_restrnt_Chandler_flatten_final_df.csv', sep='\t', index=False, encoding='utf-8')

___________________________________________________________________________________________________________________________

**Below code is to read the CSV file as a dataframe**

In [97]:
AZ_restrnt_Phoenix_flatten_final_df_temp = pd.read_csv("AZ_restrnt_Phoenix_flatten_final_df.csv", sep  = '\t')

In [98]:
AZ_restrnt_Phoenix_flatten_final_df_temp.columns

Index(['User_id', 'Business_id', 'Review_ratings', 'Name', 'Address', 'City',
       'State', 'Postal_code', 'Review_count', 'Restaurant_ratings',
       'RestaurantsPriceRange2', 'GoodForMeal_breakfast', 'GoodForMeal_lunch',
       'GoodForMeal_dinner', 'GoodForMeal_latenight', 'Alcohol',
       'Ambience_casual', 'Ambience_classy', 'Ambience_romantic',
       'Ambience_trendy', 'Ambience_upscale', 'HasTV',
       'BusinessParking_garage', 'BusinessParking_lot',
       'BusinessParking_street', 'BusinessParking_valet', 'WiFi',
       'Category_Fast_Food', 'Category_Sandwiches', 'Category_Mexican',
       'Category_American_Traditional', 'Category_Nightlife', 'Category_Pizza',
       'Category_Bars', 'Category_Burgers', 'Category_Breakfast_Brunch',
       'Category_American_New', 'User_total_ratings'],
      dtype='object')

In [99]:
AZ_restrnt_Phoenix_flatten_final_df_temp.head(3)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,ymV6L8ziRAplNuZ2KA8OPA,M3uV9Y3EDSpy9d4YwyNSAQ,5.0,Yakiramen,10605 N 43rd Ave,Phoenix,AZ,85029.0,66,4.0,2.0,0,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,4
1,ymV6L8ziRAplNuZ2KA8OPA,0LmDg_Y6Ht3zFqtvmLhp6g,5.0,Cornish Pasty,7 W Monroe St,Phoenix,AZ,85003.0,168,4.0,2.0,0,1,1,0,2,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,4
2,ymV6L8ziRAplNuZ2KA8OPA,dR3HS1tLVo53KO7F8BF6og,4.0,Taylor's Chowder House,3538 W Calavar Rd,Phoenix,AZ,85053.0,102,3.5,2.0,0,1,1,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4


______________________

## Building Popularity matrix for each City

**In order to factor in the review counts along with the star ratings for the top performing restaurants, we will use Weighted Rating (WR) as shown below.** [Courtsey:  IMDB's weighted rating formula ]

$\large Weighted\; Rating (WR) = (\frac{Rev\_cnt}{Rev\_cnt + n} . Res\_rating) + (\frac{n}{Rev\_cnt + n} . Avg\_rating)$

where,
    <br>'Rev_cnt' is the number of Review_counts for the Restaurant <br>
    'n' is the minimum count required to be listed in the recommendation <br>
    'Res_rating' is the overall rating of the Restaurant in Business dataframe <br>
    'Avg_rating' is the mean Rating across all the restaurants for the city <br>
    
   As the next step, we will consider a restaurant to be listed in the recommendation if the number of review counts have more votes than at least 95% of all restaurants.

In [104]:
def Popularity_Recommendation(city, percentile=0.95):
    df = AZ_restrnt_flatten_df[AZ_restrnt_flatten_df['City'] == city]
    
    Rev_cnt = df[df['Review_count'].notnull()]['Review_count'].astype('int') # Review counts from the DF
    n = Rev_cnt.quantile(percentile)
    Res_rating = df[df['Restaurant_ratings'].notnull()]['Restaurant_ratings'].astype('int')  #Average rating from the DF
    Avg_rating = Res_rating.mean()
    
    qualified = df[(df['Review_count'] >= n) ]
    
    qualified['Weighted_rating'] = qualified.apply(lambda x: 
                        (x['Review_count']/(x['Review_count']+n) * x['Restaurant_ratings']) + (n/(n+x['Review_count']) * 
                                                                                               Avg_rating), axis=1)
    qualified = qualified.filter(['Business_id','Name','Address','City','State','Postal_code','Restaurant_ratings',
                                  'Review_count','Weighted_rating']).sort_values('Weighted_rating', ascending=False).head(10)
    
    return qualified

**Popular Restaurants in Phoenix**

In [105]:
Popularity_Recommendation('Phoenix')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
4755,Xg5qEQiB-7L6kGJ5F4K3bQ,Little Miss BBQ,4301 E University Dr,Phoenix,AZ,85034,5.0,1463,4.588637
3075,VyVIneSU7XAWgMBllI6LnQ,Bobby Q,8501 N 27th Ave,Phoenix,AZ,85051,4.5,1940,4.261803
3543,OgJ0KxwJcJ9R5bUK0ixCbg,Lux Central,4400 N Central Ave,Phoenix,AZ,85012,4.5,1770,4.243376
5340,9a3DrZvpYxVs3k_qwlCNSw,Cibo,603 N 5th Ave,Phoenix,AZ,85003,4.5,1698,4.234683
2178,u-SJ5QUwrNquL9VnXwl8cg,Postino Arcadia,3939 E Campbell Ave,Phoenix,AZ,85018,4.5,1186,4.150492
1036,E4JyAzB5_2quptwtemyhYA,Original Breakfast House,13623 N 32nd St,Phoenix,AZ,85032,4.5,1097,4.130088
71,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,4.5,1019,4.110141
1053,FogTa-wmjhVnJCoTiaxvZA,Postino Central,5144 N Central Ave,Phoenix,AZ,85012,4.5,940,4.087619
5172,S-oLPRdhlyL5HAknBKTUcQ,Harumi Sushi,"114 W Adams St, Ste C101",Phoenix,AZ,85003,4.5,906,4.077105
5001,Tw3miGKZHtmxmaQZIYFRrA,Federal Pizza,5210 N Central Ave,Phoenix,AZ,85012,4.5,762,4.025909


In [106]:
#Example to read the data
print(Popularity_Recommendation('Phoenix')['Name'].iloc[0] +'\n' + 
      Popularity_Recommendation('Phoenix')['Address'].iloc[0] +'\n' +
      Popularity_Recommendation('Phoenix')['City'].iloc[0] + ', ' +
      Popularity_Recommendation('Phoenix')['Postal_code'].iloc[0] + '\n' + "Review Count: " +
      Popularity_Recommendation('Phoenix')['Review_count'].astype(str).iloc[0] + '\n' + "Rating: " +
      Popularity_Recommendation('Phoenix')['Restaurant_ratings'].astype(str).iloc[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Little Miss BBQ
4301 E University Dr
Phoenix, 85034
Review Count: 1463
Rating: 5.0


**Popular Restaurants in Scottsdale**

In [107]:
Popularity_Recommendation('Scottsdale')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
4001,3l54GTr8-E3XPbIxnF_sAA,Rehab Burger Therapy,7210 E 2nd St,Scottsdale,AZ,85251,4.5,1724,4.234067
4891,d10IxZPirVJlOSpdRZJczA,Citizen Public House,"7111 E 5th Ave, Ste E",Scottsdale,AZ,85251,4.5,1550,4.212277
1542,r5PLDU-4mSbde5XekTXSCA,Defalco's Italian Grocery,"2334 N Scottsdale Rd, Ste A133",Scottsdale,AZ,85257,4.5,1057,4.125286
2449,Iq7NqQD-sESu3vr9iEGuTA,Butters Pancakes & Café,"8390 E Via De Ventura, Ste F-108",Scottsdale,AZ,85258,4.5,972,4.104679
2992,TkEMlu88OZn9TKZyeY9CJg,Mastro's City Hall,6991 E Camelback Rd,Scottsdale,AZ,85251,4.5,785,4.050266
3099,Noi53T0PWNEN9mQRS3-Ncg,D'Lite Healthy On The Go,2613 N Scottsdale Rd,Scottsdale,AZ,85257,4.5,767,4.044227
2040,vsFFbN71ehRCp46KeR5RdQ,Butterfield's Pancake House,7388 E Shea Blvd,Scottsdale,AZ,85260,4.5,732,4.032009
3518,C2BR0TjNacoNzItPLUHTAA,Coconut's Fish Cafe,16640 N Scottsdale Rd,Scottsdale,AZ,85254,4.5,702,4.021003
3015,PNTyiqS7R-0c1ofxOfDijQ,Cafe Monarch,6939 E 1st Ave,Scottsdale,AZ,85251,4.5,576,3.968503
1250,K-uQkfSUTwu5LIwPB4b_vg,The Mission Old Town,3815 N Brown Ave,Scottsdale,AZ,85251,4.0,1659,3.854784


**Popular Restaurants in Mesa**

In [108]:
Popularity_Recommendation('Mesa')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
787,ohEnmKpF7i2_ujme1p_vUQ,Cornish Pasty Company,"1941 W Guadalupe Rd, Ste 101",Mesa,AZ,85202,4.5,1023,4.215346
2706,16d3BlncEyCTzb0GxXrBXQ,Green Corner Restaurant,"1038 W Southern Ave, Ste 1",Mesa,AZ,85210,5.0,355,4.194439
4872,e4NQLZynhSmvwl38hC4m-A,Backyard Taco,1524 E University,Mesa,AZ,85203,4.5,892,4.182693
39,SSCH4Z2gw-hh2KZy7aH4qw,Worth Takeaway,218 W Main St,Mesa,AZ,85201,5.0,336,4.16832
2784,tV5PcOIFlB12dpcbCy78VA,India Oven,1315 S Gilbert Rd,Mesa,AZ,85204,4.5,623,4.08492
545,bsauQzwixetBDjA7SqUlVA,Crackers & Co Cafe,"535 W Iron Ave, Ste 131",Mesa,AZ,85210,4.5,458,3.988186
1586,c7lAaz-pW58i1h1rSlyqcw,Republica Empanada,204 E 1st Ave,Mesa,AZ,85210,4.5,434,3.970227
4014,LYgAf_vpMQU6JqBcTXg-Sw,Bobby Q - Mesa,1610 S Stapley Dr,Mesa,AZ,85204,4.5,414,3.95427
4148,i066yR2IDP4FWt6p-k9aFg,Ike's Love & Sandwiches,"1130 W Grove Ave, Ste 110",Mesa,AZ,85210,4.5,395,3.938194
348,iJBnqweAPDTCfyMcRrG90w,Giant Hamburgers,"2753 E Broadway Rd, Ste 104",Mesa,AZ,85204,4.5,343,3.88893


**Popular Restaurants in Tempe**

In [109]:
Popularity_Recommendation('Tempe')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
4719,JzOp695tclcNCNMuBl7oxA,Four Peaks Brewing,"1340 E 8th St, Ste 104",Tempe,AZ,85281,4.5,1965,4.245397
1988,wl0QZqAzr1DelslQ02JGCQ,Cornish Pasty,960 W University Dr,Tempe,AZ,85281,4.5,1504,4.18623
3167,5FIOXmUE3qMviX9GafGH-Q,Green New American Vegetarian,2240 N Scottsdale Rd,Tempe,AZ,85281,4.5,1068,4.097841
4502,2GmGT-7QjowR1ihup3FbVA,Haji-Baba,1513 E Apache Blvd,Tempe,AZ,85281,4.5,720,3.981191
1464,vK1_qKEG7zHvjiTOsN0CRg,Snooze An AM Eatery,"615 S College Ave, Ste 103",Tempe,AZ,85281,4.5,708,3.97595
2639,uKKNw68mZZaa1jcyszEbyA,Crêpe Bar,"7520 S Rural Rd, Ste A12",Tempe,AZ,85283,4.5,704,3.974179
4139,QuybD_bJcAB2CHcpTfREYg,Detroit Coney Grill,930 W Broadway Rd,Tempe,AZ,85282,4.5,546,3.893185
4797,366FaRQ1MWEHBhaEKTcriQ,Spinato's Pizza,227 S Smith Rd,Tempe,AZ,85281,4.5,530,3.883569
252,qgAZPDQStZP_Y5XTiQ6UqA,Postino Annex,615 S College Ave,Tempe,AZ,85281,4.5,513,3.873013
35,aiX_WP7NKPTdF9CfI-M-wg,Culinary Dropout,149 S Farmer,Tempe,AZ,85281,4.0,984,3.73864


**Popular Restaurants in Chandler**

In [110]:
Popularity_Recommendation('Chandler')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
3811,8yAVuMwtijEosdjiCrtyXQ,Fired Pie,"2855 W Ray Rd, Ste 5",Chandler,AZ,85224,4.5,454,3.9671
523,ujgpePdD8Q-fP1mPFnw0Qw,Peixoto Coffee,11 W Boston Suite,Chandler,AZ,85225,4.5,423,3.945593
4524,Pthe4qk5xh4n-ef-9bvMSg,Chon Thai Food,"2330 N Alma School Rd, Ste 116",Chandler,AZ,85224,4.5,420,3.943419
987,U4OLUFb9VxMmpMXm1ZF-cQ,Pho Chandler,"4055 S Arizona Ave, Ste 8",Chandler,AZ,85248,4.5,402,3.930009
124,S4qUsi44l5D9BbUCkhwQEA,New India Gate,"4939 W Ray Rd, Ste 1",Chandler,AZ,85226,4.5,354,3.890873
4784,L9pTWWAATj7HoLaRe6ZA1Q,Philly's Famous,"1250 N Alma School Rd, Ste 31",Chandler,AZ,85224,4.5,347,3.884713
3588,mD7zqv7Y3kvsa_p_MtTayg,SanTan Brewing Company,8 S San Marcos Pl,Chandler,AZ,85225,4.0,1100,3.824725
4222,wHVWfC_a6koeiJXAu78e1g,Rudy's Country Store And Bar-B-Q,7300 W Chandler Blvd,Chandler,AZ,85226,4.0,729,3.764184
3044,S37sKRRfkhFZRpxaYzWo_A,China Magic Noodle House,"2015 N Dobson Rd, Ste 2",Chandler,AZ,85224,4.0,505,3.702046
1155,6ZIHxvFTHC1pvAzAS0uLDA,Lee's Sandwiches,"1901 W Warner Rd, Ste 1",Chandler,AZ,85224,4.0,451,3.681836


**Below steps will help persist the data for popularity recommendation as a pickled binary file to read from the UI**

In [111]:
pickle.dump(Popularity_Recommendation('Phoenix'), open('Popularity_restaurant_Phoenix.pkl', 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [112]:
# Steps to read the pickled file
Popularity_restaurant_Phoenix = pickle.load(open('Popularity_restaurant_Phoenix.pkl', 'rb'))
Popularity_restaurant_Phoenix.head(3)

Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count,Weighted_rating
4755,Xg5qEQiB-7L6kGJ5F4K3bQ,Little Miss BBQ,4301 E University Dr,Phoenix,AZ,85034,5.0,1463,4.588637
3075,VyVIneSU7XAWgMBllI6LnQ,Bobby Q,8501 N 27th Ave,Phoenix,AZ,85051,4.5,1940,4.261803
3543,OgJ0KxwJcJ9R5bUK0ixCbg,Lux Central,4400 N Central Ave,Phoenix,AZ,85012,4.5,1770,4.243376


In [113]:
Popularity_restaurant_Phoenix = Popularity_restaurant_Phoenix.head(3)[['Name','Address','City','State','Postal_code',
                                                                       'Restaurant_ratings','Review_count']]
Popularity_restaurant_Phoenix.head()

Unnamed: 0,Name,Address,City,State,Postal_code,Restaurant_ratings,Review_count
4755,Little Miss BBQ,4301 E University Dr,Phoenix,AZ,85034,5.0,1463
3075,Bobby Q,8501 N 27th Ave,Phoenix,AZ,85051,4.5,1940
3543,Lux Central,4400 N Central Ave,Phoenix,AZ,85012,4.5,1770


In [114]:
Popularity_restaurant_Phoenix.index = ['Recommendation 1','Recommendation 2','Recommendation 3']
Popularity_restaurant_Phoenix.transpose()

Unnamed: 0,Recommendation 1,Recommendation 2,Recommendation 3
Name,Little Miss BBQ,Bobby Q,Lux Central
Address,4301 E University Dr,8501 N 27th Ave,4400 N Central Ave
City,Phoenix,Phoenix,Phoenix
State,AZ,AZ,AZ
Postal_code,85034,85051,85012
Restaurant_ratings,5,4.5,4.5
Review_count,1463,1940,1770


In [115]:
# Pickle the popularity based Restaurants for remaining cities 
pickle.dump(Popularity_Recommendation('Scottsdale'), open('Popularity_restaurant_Scottsdale.pkl', 'wb'))
pickle.dump(Popularity_Recommendation('Mesa'), open('Popularity_restaurant_Mesa.pkl', 'wb'))
pickle.dump(Popularity_Recommendation('Tempe'), open('Popularity_restaurant_Tempe.pkl', 'wb'))
pickle.dump(Popularity_Recommendation('Chandler'), open('Popularity_restaurant_Chandler.pkl', 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Building filter for Popular Restaurants based on Content based search Engine

In [116]:
#Reading the persisted file for recommendation. Currently its stored as a CSV file
AZ_restrnt_Phoenix_flatten_final_df.head(3)

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
5,ymV6L8ziRAplNuZ2KA8OPA,M3uV9Y3EDSpy9d4YwyNSAQ,5.0,Yakiramen,10605 N 43rd Ave,Phoenix,AZ,85029,66,4.0,2.0,0,1,1,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,4
6,ymV6L8ziRAplNuZ2KA8OPA,0LmDg_Y6Ht3zFqtvmLhp6g,5.0,Cornish Pasty,7 W Monroe St,Phoenix,AZ,85003,168,4.0,2.0,0,1,1,0,2,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,0,0,0,4
7,ymV6L8ziRAplNuZ2KA8OPA,dR3HS1tLVo53KO7F8BF6og,4.0,Taylor's Chowder House,3538 W Calavar Rd,Phoenix,AZ,85053,102,3.5,2.0,0,1,1,0,2,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,4


**Function to filter Restaurants based on the search criteria**

In [117]:
def content_based_filter(restaurant_df, price_range_val, meal_pref, alcohol_val,
                         ambience_pref, HasTV_val, parking_pref, WiFi_val,
                         cuisine_pref):
    if not math.isnan(price_range_val):
        restaurant_df = restaurant_df[restaurant_df['RestaurantsPriceRange2'] == price_range_val]
    if meal_pref !='' :
        restaurant_df = restaurant_df[restaurant_df[meal_pref] == 1]
    if not math.isnan(alcohol_val):
        restaurant_df = restaurant_df[restaurant_df['Alcohol'] == alcohol_val]
    if ambience_pref !='' :
        restaurant_df = restaurant_df[restaurant_df[ambience_pref] == 1]
    if not math.isnan(HasTV_val):
        restaurant_df = restaurant_df[restaurant_df['HasTV'] == HasTV_val]
    if parking_pref !='' :
        restaurant_df = restaurant_df[restaurant_df[parking_pref] == 1]
    if not math.isnan(WiFi_val):
        restaurant_df = restaurant_df[restaurant_df['WiFi'] == WiFi_val]
    if cuisine_pref !='' :
        for cuisine in cuisine_pref:
            restaurant_df = restaurant_df[restaurant_df[cuisine] == 1]
    return restaurant_df

**Function for weighted average rating of the restaurants**

In [118]:
def Content_Rating_weightage(User_Restaurant_df, percentile):
    # Review counts from the DF
    Rev_cnt = User_Restaurant_df[User_Restaurant_df['Review_count'].notnull()]['Review_count'].astype('int') 
    n = Rev_cnt.quantile(percentile)
    #Average rating from the DF
    Res_rating = User_Restaurant_df[User_Restaurant_df['Restaurant_ratings'].notnull()]['Restaurant_ratings'].astype('int')
    Avg_rating = Res_rating.mean()

    print('Considering only the top {}th Percentile' .format(percentile*100))
    qualified = User_Restaurant_df[(User_Restaurant_df['Review_count'] >= n) ]
    
    qualified['Weighted_rating'] = qualified.apply(lambda x: 
                        (x['Review_count']/(x['Review_count']+n) * x['Restaurant_ratings']) + (n/(n+x['Review_count']) * 
                                                                                               Avg_rating), axis=1)
    qualified = qualified.filter(['Business_id','Name','Address','City','State','Postal_code','Review_count','Restaurant_ratings',
                                  'Weighted_rating']).sort_values('Weighted_rating', ascending=False)
    
    return qualified

In [119]:
##%%timeit
# Remove duplicate rows by business_id
AZ_restrnt_Phoenix_flatten_final_nodup_df = AZ_restrnt_Phoenix_flatten_final_df.drop_duplicates('Business_id')

# Apply filter on the persisted data based on selection criteria
AZ_restrnt_Phoenix_filter_nodup_df = content_based_filter(AZ_restrnt_Phoenix_flatten_final_nodup_df, price_range_val = 1,
                                                    meal_pref = '', alcohol_val = 0,
                                                    ambience_pref = '', HasTV_val = np.nan,
                                                    parking_pref = 'BusinessParking_lot', WiFi_val = 0,
                                                    cuisine_pref = ['Category_Mexican'])

AZ_restrnt_Phoenix_filter_nodup_df = AZ_restrnt_Phoenix_filter_nodup_df[['Business_id','Name','Address','City',
                                                                         'State','Postal_code','Review_count',
                                                                         'Restaurant_ratings']]

print(AZ_restrnt_Phoenix_filter_nodup_df.shape)
AZ_restrnt_Phoenix_filter_nodup_df.head()

(70, 8)


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings
154,URwrgfuyfpVqgnKwkawlHw,Rolando's Mexican Food,10639 N 35th Ave,Phoenix,AZ,85029,85,4.5
166,pBwOqgpkaDmFUDYX8cbnNQ,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5
298,UDucqwO8PwUOwcmUsJ-gOA,Asadero Norte De Sonora,122 N 16th St,Phoenix,AZ,85034,89,4.5
361,tCkd8eAx2mR0oioQcqkxPw,Mucho Macho Taco,5341 N 7th Ave,Phoenix,AZ,85013,185,4.0
392,9u7knPXVCUOwnsz8sTTT_w,Las 15 Salsas,722 W Hatcher Rd,Phoenix,AZ,85021,48,4.5


In [120]:
#If the total filtered observations are less than 10, then we do not exclude any businesses based on percentile of review count.
# If the total filtered observation is above 9, then we take top 50% restaurants with top reviews to identify the weighted
# rating score.

if len(AZ_restrnt_Phoenix_filter_nodup_df) >= 10:
    percentile=0.5
    AZ_restrnt_Phoenix_content_weighted_df = Content_Rating_weightage(AZ_restrnt_Phoenix_filter_nodup_df,percentile)
elif len(AZ_restrnt_Phoenix_filter_nodup_df) == 0:
    print('No records found')
else:
    percentile=0
    AZ_restrnt_Phoenix_content_weighted_df = Content_Rating_weightage(AZ_restrnt_Phoenix_filter_nodup_df,percentile)

Considering only the top 50.0th Percentile


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [121]:
# We only want to recommend restaurants that has weighted rating of above 3.5
AZ_restrnt_Phoenix_content_weighted_df = AZ_restrnt_Phoenix_content_weighted_df \
                                        [AZ_restrnt_Phoenix_content_weighted_df['Weighted_rating'] > 3.5]
print(AZ_restrnt_Phoenix_content_weighted_df.shape)
AZ_restrnt_Phoenix_content_weighted_df

(35, 9)


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,Weighted_rating
1945,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,1019,4.5,4.463575
945,9vub2LM7Djy8P-LPumcLXA,Tacos Chiwas,1923 E Mcdowell Rd,Phoenix,AZ,85006,306,4.5,4.390929
1109,E_DXCeVllZtHV93hsAIZvw,Tacos Sahuaro,2320 N 32nd St,Phoenix,AZ,85008,207,4.5,4.349157
4277,G_mqw9nNYDFkwtCe5Jd8QA,Tacos Kissi,2720 W Bethany Home,Phoenix,AZ,85017,195,4.5,4.341814
166,pBwOqgpkaDmFUDYX8cbnNQ,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5,4.318215
1801,4aF5GfnVkJPj9IwJ2yxPtQ,The Tamale Store,13046 N Cave Creek Rd,Phoenix,AZ,85022,128,4.5,4.28277
2717,A2eA3LRbptrexCGw8fu67Q,United Lunchadores Street Gourmet,1339 E Northern Ave,Phoenix,AZ,85020,103,4.5,4.247619
1130,4RV97YE8VEw05tu0WO425g,La Frontera,209 N 16th St,Phoenix,AZ,85034,97,4.5,4.237422
298,UDucqwO8PwUOwcmUsJ-gOA,Asadero Norte De Sonora,122 N 16th St,Phoenix,AZ,85034,89,4.5,4.222471
154,URwrgfuyfpVqgnKwkawlHw,Rolando's Mexican Food,10639 N 35th Ave,Phoenix,AZ,85029,85,4.5,4.214338


**Before we recommend the above top restaurants, we need to identify if the customer has already visited any restaurant with the above filter and if he has rated the restaurant high. In that case we need to give an additional weightage to those restaurants.   
If he has visited any of the filtered restaurants but have not rated good, then we need to bring its weighted rating down, so that those restaurants wouldnt be recommeded to the customer.**

In [122]:
#Provide the customer ID
user_input = 'PxDKVBipTwYFaNBkvv9xbg'

In [123]:
# Capture the top restaurants based on the content filter results and find those restaurants from the persisted flat 
# restaurant data in CSV

Top_Content_based_Restaurant = list(AZ_restrnt_Phoenix_content_weighted_df['Business_id'])
AZ_restrnt_Phoenix_content_weighted_filtered =  AZ_restrnt_Phoenix_flatten_final_df[AZ_restrnt_Phoenix_flatten_final_df \
                                                    ['Business_id'].isin(Top_Content_based_Restaurant)]

In [124]:
print(AZ_restrnt_Phoenix_content_weighted_filtered.shape)
AZ_restrnt_Phoenix_content_weighted_filtered.head(3)

(3722, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
154,Axdhci7Y9g0Ft2WSA-LluQ,URwrgfuyfpVqgnKwkawlHw,5.0,Rolando's Mexican Food,10639 N 35th Ave,Phoenix,AZ,85029,85,4.5,1.0,1,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,8
166,lBrN1thEabiaQoPmhslunw,pBwOqgpkaDmFUDYX8cbnNQ,5.0,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5,1.0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,3
298,wMkG4oL_WoQ7KhB83zO2sQ,UDucqwO8PwUOwcmUsJ-gOA,5.0,Asadero Norte De Sonora,122 N 16th St,Phoenix,AZ,85034,89,4.5,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,3


In [125]:
#Identify if the userID is existing user or a new user.
#For new user, we will not check if there was an existing rating given to the filtered restaurants.
#For existing user, we will check if there was an existing rating by the user and take the score into consideration 
#before final recommendation

#%%timeit
if user_input in AZ_restrnt_Phoenix_content_weighted_filtered['User_id'].unique():
    AZ_restrnt_user_filtered = AZ_restrnt_Phoenix_content_weighted_filtered[AZ_restrnt_Phoenix_content_weighted_filtered\
                                                                            ['User_id'] == user_input]
    
else:
    # Final Content based recommentaion for a New User 
    AZ_restrnt_Phoenix_content_user_RS = AZ_restrnt_Phoenix_content_weighted_df.head(3)
    
AZ_restrnt_user_filtered.head()

Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
3412,PxDKVBipTwYFaNBkvv9xbg,pBwOqgpkaDmFUDYX8cbnNQ,5.0,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5,1.0,0,1,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,25
3415,PxDKVBipTwYFaNBkvv9xbg,sVv9E7PrAMPD9zjSHj21XA,5.0,Restaurants Atoyac Estilo Oaxaca,1830 W Glendale Ave,Phoenix,AZ,85021,89,4.0,1.0,0,1,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,25


In [126]:
#Perform a left outer join on the city based weighted rating and user filtered data by User ID
AZ_restrnt_Phoenix_content_weighted_user_merged = pd.merge(AZ_restrnt_Phoenix_content_weighted_df,
                                                           AZ_restrnt_user_filtered[['Business_id','User_id','Review_ratings']],
                                                          on = 'Business_id', how = 'left').sort_values(['User_id']) 

#Fill Nan in the Review_ratings with the Weighted_rating to take the new weighted rating. If the user has not rated a restaurant
# then the existing weighted rating is retained. Otherwise the new weighted rating would change based on the existing rating
# given by the user.

AZ_restrnt_Phoenix_content_weighted_user_merged['Review_ratings'].fillna(AZ_restrnt_Phoenix_content_weighted_user_merged['Weighted_rating'],inplace=True)
AZ_restrnt_Phoenix_content_weighted_user_merged['Weighted_rating_new'] = (AZ_restrnt_Phoenix_content_weighted_user_merged['Weighted_rating'] +
                                                                          AZ_restrnt_Phoenix_content_weighted_user_merged['Review_ratings']) /2
AZ_restrnt_Phoenix_content_user_RS = AZ_restrnt_Phoenix_content_weighted_user_merged.sort_values(['Weighted_rating_new'],ascending=False) 
AZ_restrnt_Phoenix_content_user_RS.head(10)

Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,Weighted_rating,User_id,Review_ratings,Weighted_rating_new
4,pBwOqgpkaDmFUDYX8cbnNQ,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5,4.318215,PxDKVBipTwYFaNBkvv9xbg,5.0,4.659108
0,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,1019,4.5,4.463575,,4.463575,4.463575
27,sVv9E7PrAMPD9zjSHj21XA,Restaurants Atoyac Estilo Oaxaca,1830 W Glendale Ave,Phoenix,AZ,85021,89,4.0,3.905745,PxDKVBipTwYFaNBkvv9xbg,5.0,4.452872
1,9vub2LM7Djy8P-LPumcLXA,Tacos Chiwas,1923 E Mcdowell Rd,Phoenix,AZ,85006,306,4.5,4.390929,,4.390929,4.390929
2,E_DXCeVllZtHV93hsAIZvw,Tacos Sahuaro,2320 N 32nd St,Phoenix,AZ,85008,207,4.5,4.349157,,4.349157,4.349157
3,G_mqw9nNYDFkwtCe5Jd8QA,Tacos Kissi,2720 W Bethany Home,Phoenix,AZ,85017,195,4.5,4.341814,,4.341814,4.341814
5,4aF5GfnVkJPj9IwJ2yxPtQ,The Tamale Store,13046 N Cave Creek Rd,Phoenix,AZ,85022,128,4.5,4.28277,,4.28277,4.28277
6,A2eA3LRbptrexCGw8fu67Q,United Lunchadores Street Gourmet,1339 E Northern Ave,Phoenix,AZ,85020,103,4.5,4.247619,,4.247619,4.247619
7,4RV97YE8VEw05tu0WO425g,La Frontera,209 N 16th St,Phoenix,AZ,85034,97,4.5,4.237422,,4.237422,4.237422
8,UDucqwO8PwUOwcmUsJ-gOA,Asadero Norte De Sonora,122 N 16th St,Phoenix,AZ,85034,89,4.5,4.222471,,4.222471,4.222471


**Top 3 Restaurants for the user based on the search criteria**

In [127]:
AZ_restrnt_Phoenix_content_user_RS[['Business_id', 'Name', 'Address', 'City', 'State', 'Postal_code', 'Review_count', 
                                    'Restaurant_ratings']].head(3)

Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings
4,pBwOqgpkaDmFUDYX8cbnNQ,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5
0,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,1019,4.5
27,sVv9E7PrAMPD9zjSHj21XA,Restaurants Atoyac Estilo Oaxaca,1830 W Glendale Ave,Phoenix,AZ,85021,89,4.0


## Creating a function for content based filter recommendation

In [128]:
#Define Content search function
def Content_search(user_input, AZ_restrnt_city_flatten_final_df, price_range_val,meal_pref, alcohol_val, ambience_pref, HasTV_val,
                                                    parking_pref, WiFi_val,cuisine_pref):
    
    #Function to filter Restaurants based on the search criteria
    def content_based_filter(restaurant_df, price_range_val, meal_pref, alcohol_val,
                         ambience_pref, HasTV_val, parking_pref, WiFi_val,
                         cuisine_pref):
        
        if not math.isnan(price_range_val):
            restaurant_df = restaurant_df[restaurant_df['RestaurantsPriceRange2'] == price_range_val]
        if meal_pref !='' :
            restaurant_df = restaurant_df[restaurant_df[meal_pref] == 1]
        if not math.isnan(alcohol_val):
            restaurant_df = restaurant_df[restaurant_df['Alcohol'] == alcohol_val]
        if ambience_pref !='' :
            restaurant_df = restaurant_df[restaurant_df[ambience_pref] == 1]
        if not math.isnan(HasTV_val):
            restaurant_df = restaurant_df[restaurant_df['HasTV'] == HasTV_val]
        if parking_pref !='' :
            restaurant_df = restaurant_df[restaurant_df[parking_pref] == 1]
        if not math.isnan(WiFi_val):
            restaurant_df = restaurant_df[restaurant_df['WiFi'] == WiFi_val]
        if cuisine_pref !='' :
            for cuisine in cuisine_pref:
                restaurant_df = restaurant_df[restaurant_df[cuisine] == 1]
        return restaurant_df
    
    # Function for weighted average rating of the restaurants
    def Content_Rating_weightage(User_Restaurant_df, percentile):
        # Review counts from the DF
        Rev_cnt = User_Restaurant_df[User_Restaurant_df['Review_count'].notnull()]['Review_count'].astype('int')
        n = Rev_cnt.quantile(percentile)
        #Average rating from the DF
        Res_rating = User_Restaurant_df[User_Restaurant_df['Restaurant_ratings'].notnull()]['Restaurant_ratings'].astype('int')
        Avg_rating = Res_rating.mean()
    
        #print('Considering only the top {}th Percentile' .format(percentile*100))
        qualified = User_Restaurant_df[(User_Restaurant_df['Review_count'] >= n) ]
    
        qualified['Weighted_rating'] = qualified.apply(lambda x: (x['Review_count']/(x['Review_count']+n) * 
                                                                  x['Restaurant_ratings']) + (n/(n+x['Review_count']) * 
                                                                                              Avg_rating), axis=1)
        qualified = qualified.filter(['Business_id','Name','Address','City','State','Postal_code','Review_count',
                                      'Restaurant_ratings','Weighted_rating']).sort_values('Weighted_rating', ascending=False)
    
        return qualified

    
    # Remove duplicate rows by business_ids
    AZ_restrnt_city_flatten_final_nodup_df = AZ_restrnt_city_flatten_final_df.drop_duplicates('Business_id')

    # Apply filter on the persisted data based on selection criteria
    AZ_restrnt_city_filter_nodup_df = content_based_filter(AZ_restrnt_city_flatten_final_nodup_df, price_range_val, meal_pref, 
                                                           alcohol_val,ambience_pref, HasTV_val, parking_pref, WiFi_val, 
                                                           cuisine_pref)

    AZ_restrnt_city_filter_nodup_df = AZ_restrnt_city_filter_nodup_df[['Business_id','Name','Address','City','State',
                                                                       'Postal_code','Review_count','Restaurant_ratings']]

    #If the total filtered observations are less than 10, then we do not exclude any businesses based on percentile of review count.
    # If the total filtered observation is above 9, then we take top 50% restaurants with top reviews to identify the weighted
    # rating score.

    if len(AZ_restrnt_city_filter_nodup_df) >= 10:
        percentile=0.5
        AZ_restrnt_city_content_weighted_df = Content_Rating_weightage(AZ_restrnt_city_filter_nodup_df,percentile)
    elif len(AZ_restrnt_city_filter_nodup_df) == 0:
        # If no restaurants are found with the filter condition, return empty dataframe
        AZ_restrnt_city_content_user_RS = pd.DataFrame(columns =['Business_id','Name','Address','City','State','Postal_code',
                                                                 'Review_count','Restaurant_ratings','Weighted_rating'])
        return AZ_restrnt_city_content_user_RS
    else:
        percentile=0
        AZ_restrnt_city_content_weighted_df = Content_Rating_weightage(AZ_restrnt_city_filter_nodup_df,percentile)
        
    # We only want to recommend restaurants that has weighted rating of above 3.5
    AZ_restrnt_city_content_weighted_df = AZ_restrnt_city_content_weighted_df \
                                         [AZ_restrnt_city_content_weighted_df['Weighted_rating'] > 3.5]
    
    # Before we recommend the above top restaurants, we need to identify if the customer has already visited any restaurant 
    # with the above filter and if he has rated the restaurant high. In that case we need to give an additional weightage 
    # to those restaurants.
    # If he has visited any of the filtered restaurants but have not rated good, then we need to bring its weighted rating down, 
    # so that those restaurants wouldnt be recommended to the customer.
    
    # Capture the top unique restaurants based on the content filter results and find those restaurants from the persisted flat 
    # restaurant data in CSV
    Top_Content_based_Restaurant = list(AZ_restrnt_city_content_weighted_df['Business_id'])
    AZ_restrnt_city_content_weighted_filtered =  AZ_restrnt_city_flatten_final_df[AZ_restrnt_city_flatten_final_df \
                                                    ['Business_id'].isin(Top_Content_based_Restaurant)]
    
    #Identify if the userID is existing user or a new user.
    #For new user, we will not check if there was an existing rating given to the filtered restaurants.
    #For existing user, we will check if there was an existing rating and take the score into consideration before final 
    #recommendation

    if user_input in AZ_restrnt_city_content_weighted_filtered['User_id'].unique():
        AZ_restrnt_user_filtered = AZ_restrnt_city_content_weighted_filtered[AZ_restrnt_city_content_weighted_filtered\
                                                                            ['User_id'] == user_input]
        
        #Perform a left outer join on the city based weighted rating and user filtered data by User ID
        AZ_restrnt_city_content_weighted_user_merged = pd.merge(AZ_restrnt_city_content_weighted_df,
                                                                AZ_restrnt_user_filtered[['Business_id','User_id',
                                                                                          'Review_ratings']],
                                                                on = 'Business_id', how = 'left').sort_values(['User_id']) 

        # Fill Nan in the Review_ratings with the Weighted_rating to take the new weighted rating. If the user has not rated 
        # a restaurant then the existing weighted rating is retained. Otherwise the new weighted rating would change based 
        # on the existing rating given by the user.

        AZ_restrnt_city_content_weighted_user_merged['Review_ratings'].fillna(AZ_restrnt_city_content_weighted_user_merged \
                                                                              ['Weighted_rating'],inplace=True)
        AZ_restrnt_city_content_weighted_user_merged['Weighted_rating_new'] = (AZ_restrnt_city_content_weighted_user_merged \
                                                                               ['Weighted_rating'] + 
                                                                               AZ_restrnt_city_content_weighted_user_merged \
                                                                               ['Review_ratings']) /2
        AZ_restrnt_city_content_user_RS = AZ_restrnt_city_content_weighted_user_merged.sort_values(['Weighted_rating_new'], \
                                                                                                   ascending=False)

    else:
        # Final Content based recommendation for a New User 
        AZ_restrnt_city_content_user_RS = AZ_restrnt_city_content_weighted_df
    
    return AZ_restrnt_city_content_user_RS.head(3)


In [129]:
#Input to the content based Recommendation filter
user_input = 'PxDKVBipTwYFaNBkvv9xbg'
city = "Phoenix"
AZ_restrnt_city_flatten_final_df = AZ_restrnt_Phoenix_flatten_final_df # This dataframe needs to be read from CSV which is persisted

AZ_restrnt_city_content_user_RS = Content_search(user_input, AZ_restrnt_city_flatten_final_df, price_range_val = 1, 
                                                 meal_pref = '',alcohol_val = 0,ambience_pref = 'Ambience_casual', 
                                                 HasTV_val = np.nan, parking_pref = 'BusinessParking_lot', 
                                                 WiFi_val = 0, cuisine_pref = ['Category_Mexican'])

#Display the recommended Restaurants
AZ_restrnt_city_content_user_RS.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,Weighted_rating,User_id,Review_ratings,Weighted_rating_new
3,pBwOqgpkaDmFUDYX8cbnNQ,Tortas El Guero,2518 N 16th St,Phoenix,AZ,85006,163,4.5,4.256979,PxDKVBipTwYFaNBkvv9xbg,5.0,4.628489
0,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,1019,4.5,4.446438,,4.446438,4.446438
19,sVv9E7PrAMPD9zjSHj21XA,Restaurants Atoyac Estilo Oaxaca,1830 W Glendale Ave,Phoenix,AZ,85021,89,4.0,3.885053,PxDKVBipTwYFaNBkvv9xbg,5.0,4.442526


In [130]:
#Input to the content based Recommendation filter
user_input = 'PxDKVBipTwYFaNBkvv9xbg'
city = "Phoenix"
AZ_restrnt_city_flatten_final_df = AZ_restrnt_Phoenix_flatten_final_df # This dataframe needs to be read from CSV which is persisted

AZ_restrnt_city_content_user_RS = Content_search(user_input, AZ_restrnt_city_flatten_final_df, price_range_val = np.nan, 
                                                 meal_pref = '',alcohol_val = np.nan,ambience_pref = '', 
                                                 HasTV_val = np.nan, parking_pref = '', 
                                                 WiFi_val = np.nan, cuisine_pref = [])

#Display the recommended Restaurants
AZ_restrnt_city_content_user_RS.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,Weighted_rating,User_id,Review_ratings,Weighted_rating_new
0,Xg5qEQiB-7L6kGJ5F4K3bQ,Little Miss BBQ,4301 E University Dr,Phoenix,AZ,85034,1463,5.0,4.92996,,4.92996,4.92996
3,9a3DrZvpYxVs3k_qwlCNSw,Cibo,603 N 5th Ave,Phoenix,AZ,85003,1698,4.5,4.462503,PxDKVBipTwYFaNBkvv9xbg,5.0,4.731252
8,S-oLPRdhlyL5HAknBKTUcQ,Harumi Sushi,"114 W Adams St, Ste C101",Phoenix,AZ,85003,906,4.5,4.432475,PxDKVBipTwYFaNBkvv9xbg,5.0,4.716238


In [131]:
#Input to the content based Recommendation filter for a new user
user_input = 'Shobin'
city = "Phoenix"
AZ_restrnt_city_flatten_final_df = AZ_restrnt_Phoenix_flatten_final_df # This dataframe needs to be read from CSV which is persisted

AZ_restrnt_city_content_user_RS = Content_search(user_input, AZ_restrnt_city_flatten_final_df, price_range_val = 1, 
                                                 meal_pref = '',alcohol_val = 0,ambience_pref = 'Ambience_casual', 
                                                 HasTV_val = np.nan, parking_pref = 'BusinessParking_lot', 
                                                 WiFi_val = 0, cuisine_pref = ['Category_Mexican'])

#Display the recommended Restaurants
AZ_restrnt_city_content_user_RS.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Business_id,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,Weighted_rating
1945,7m1Oa1VYV98UUuo_6i0EZg,Paradise Valley Burger Company,4001 E Bell Rd,Phoenix,AZ,85032,1019,4.5,4.446438
945,9vub2LM7Djy8P-LPumcLXA,Tacos Chiwas,1923 E Mcdowell Rd,Phoenix,AZ,85006,306,4.5,4.347244
4277,G_mqw9nNYDFkwtCe5Jd8QA,Tacos Kissi,2720 W Bethany Home,Phoenix,AZ,85017,195,4.5,4.285361


# Building Item similarity based Recommendation model using Surprise library

In [132]:
#Select the city as Mesa
AZ_restrnt_city_flatten_final_df = AZ_restrnt_Mesa_flatten_final_df
print(AZ_restrnt_city_flatten_final_df.shape)
AZ_restrnt_city_flatten_final_df.head(3)

(23987, 38)


Unnamed: 0,User_id,Business_id,Review_ratings,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings,RestaurantsPriceRange2,GoodForMeal_breakfast,GoodForMeal_lunch,GoodForMeal_dinner,GoodForMeal_latenight,Alcohol,Ambience_casual,Ambience_classy,Ambience_romantic,Ambience_trendy,Ambience_upscale,HasTV,BusinessParking_garage,BusinessParking_lot,BusinessParking_street,BusinessParking_valet,WiFi,Category_Fast_Food,Category_Sandwiches,Category_Mexican,Category_American_Traditional,Category_Nightlife,Category_Pizza,Category_Bars,Category_Burgers,Category_Breakfast_Brunch,Category_American_New,User_total_ratings
0,wIA3P5Qgm4f52x_iM08n6w,MTH-AcNyWfsBa9sXp04HcQ,4.0,Firehouse Subs,"3420 E Baseline Rd, Ste 101",Mesa,AZ,85204,82,3.5,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,1,1,0,0,0,0,0,0,0,0,5
1,wIA3P5Qgm4f52x_iM08n6w,80gdzmn-E3IaXK7Z6qzC6A,4.0,Costa Vida Fresh Mexican Grill,"1744 S Val Vista Dr, Ste 106",Mesa,AZ,85204,161,3.5,1.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,5
2,wIA3P5Qgm4f52x_iM08n6w,YbweWBgYj9T3VqMFJcQlOw,4.0,Flaming Kabob,2252 E Baseline Rd,Mesa,AZ,85204,177,4.0,2.0,0,1,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,5


In [133]:
#Take a column subset of the dataframe
AZ_restrnt_user_collab_df = AZ_restrnt_city_flatten_final_df[['User_id', 'Business_id', 'Review_ratings']]
#AZ_restrnt_user_collab_df.Review_ratings = AZ_restrnt_user_collab_df.Review_ratings.astype('int32')

In [134]:
# surprise reader API to read the dataset
reader = Reader(rating_scale=(1, 5))
#Load a dataset from a pandas dataframe.
data = Dataset.load_from_df(AZ_restrnt_user_collab_df, reader)
#data.split(n_folds=2)

In [135]:
#Do not split the dataset into folds and just return a trainset as is, built from the whole dataset.
trainset = data.build_full_trainset()
#trainset, testset = train_test_split(data, test_size=.25,random_state=26)
print("The number of users in this data is %s" % trainset.n_users)
print("The number of Restaurants in this data is %s" % trainset.n_items)

The number of users in this data is 6716
The number of Restaurants in this data is 462


Here the number of users and restaurants match the actual unique count in the original dataset

**Performing Grid Search on the SVD algorithm to identify hyper parameters**

In [138]:
#param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110], 'lr_all': [0.001, 0.003, 0.005, 0.008],
#              'reg_all': [0.08, 0.1, 0.15]}
param_grid = {'n_factors': [110, 140], 'n_epochs': [10, 30, 40], 'lr_all': [0.001, 0.003], 'reg_all': [0.08, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
algo = gs.best_estimator['rmse']
print("RMSE score", gs.best_score['rmse'])
print("RMSE Parameters", gs.best_params['rmse'])
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5)

RMSE score 1.1947234022464759
RMSE Parameters {'n_factors': 140, 'n_epochs': 40, 'lr_all': 0.003, 'reg_all': 0.1}


{'test_rmse': array([1.16632661, 1.1781625 , 1.19805187, 1.19853795, 1.20440856]),
 'test_mae': array([0.93165396, 0.94119723, 0.94512725, 0.95047884, 0.95037895]),
 'fit_time': (8.136074542999268,
  8.658329486846924,
  8.177520751953125,
  8.738985538482666,
  8.226534366607666),
 'test_time': (0.10267353057861328,
  0.07889318466186523,
  0.08778834342956543,
  0.07641911506652832,
  0.10118770599365234)}

In [139]:
#Train an algorithm on a given training set based on the best hyper-parameters
svd = SVD(n_factors= 140, n_epochs= 40, lr_all= 0.003, reg_all= 0.1)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cfc3a312b0>

In [140]:
#Return a list of ratings that can be used as a testset. The ratings are all the ratings that are not in the trainset, i.e.
# all the ratings where the user `u` is known, the item `i` is known, but the rating `r_{ui}`  is not in the trainset. 
testSet = trainset.build_anti_testset()
SVD_predictions = svd.test(testSet)

In [141]:
#Function to pick top 6 recommended Restaurants
from collections import defaultdict
def get_top6_recommendations(predictions, topN = 6):

    top_recs = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_recs[uid].append((iid, est))
    for uid, user_ratings in top_recs.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_recs[uid] = user_ratings[:topN]

    return top_recs

In [142]:
#creating a dictionary of all the users and the predicted restaurant ratings for the restaurants not visited
top6_recommendations_SVD = get_top6_recommendations(SVD_predictions)

In [143]:
top6_recommendations_SVD

defaultdict(list,
            {'wIA3P5Qgm4f52x_iM08n6w': [('wP8gL4KNYQQktelE-PSeFw',
               4.727713908022834),
              ('flCZyw8UBZF41dKMn8k_GQ', 4.712958983767785),
              ('7VYFL_s1HK1vOU-dzyW1Ew', 4.706337506911706),
              ('16d3BlncEyCTzb0GxXrBXQ', 4.657471145735526),
              ('nyyS63QvEzflDfOd5lLaIg', 4.646658642930871),
              ('aFJUDxHbALU-a7bfejpF2Q', 4.600605665505311)],
             'wKXgQ-KZsunr_G4Ipo1lqQ': [('_8AlKWBLY9F9zX8siTOr9A',
               4.900348202521897),
              ('6nmlclYyyLyH5EGLUpsmjw', 4.840836926370479),
              ('7VYFL_s1HK1vOU-dzyW1Ew', 4.836058251726171),
              ('nyyS63QvEzflDfOd5lLaIg', 4.813059468182365),
              ('16d3BlncEyCTzb0GxXrBXQ', 4.785565003987636),
              ('Sr-DO1Nt8X4I4Pjxapwz-g', 4.725658272855437)],
             'COJ1R7oZXqAr_15Z__rodQ': [('flCZyw8UBZF41dKMn8k_GQ',
               4.489225413234588),
              ('Sr-DO1Nt8X4I4Pjxapwz-g', 4.412897419028059),
   

In [145]:
#create a dataframe of recommended restaurants for a user
Top6_restaurant_collab_SVD_RS = pd.DataFrame(top6_recommendations_SVD['SI3h8naCrYA4183hAor3kA'],columns= \
                                             ['Business_id','Estimated_rating'])

#Recommed only the restaurants that has an estimated rating of > 3.5
Top6_restaurant_collab_SVD_RS = Top6_restaurant_collab_SVD_RS[Top6_restaurant_collab_SVD_RS['Estimated_rating'] > 3.5]
Top6_restaurant_collab_SVD_RS

Unnamed: 0,Business_id,Estimated_rating
0,7VYFL_s1HK1vOU-dzyW1Ew,4.824549
1,flCZyw8UBZF41dKMn8k_GQ,4.807064
2,eem6eoAiYL9UcQUE0kMD-g,4.772449
3,sPofLIXU9Ifot1n3iAOZHQ,4.753824
4,6nmlclYyyLyH5EGLUpsmjw,4.72898
5,eswEA8M7pyx9D_6RI-iZLw,4.715144


**Lets try KNN based cosine similarity of items**

In [146]:
sim_options = {'name': 'cosine',
               'user_based': False  # compute cosine similarities between items
               }
knn_cosine = KNNBasic(sim_options=sim_options)
knn_cosine.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1cfc490bc50>

In [147]:
sim_options = {'name': 'msd',
               'user_based': False  # compute Mean Squared Difference similarities between items
               }
knn_msd = KNNBasic(sim_options=sim_options)
knn_msd.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1cfc4a6a358>

In [148]:
sim_options = {'name': 'pearson',
               'user_based': False  # compute Pearson correlation coefficient between all pairs of items
               }
knn_pearson = KNNBasic(sim_options=sim_options)
knn_pearson.fit(trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1cfc47d51d0>

In [149]:
sim_options = {'name': 'pearson_baseline',
               'user_based': False  # compute (shrunk) Pearson correlation coefficient between all pairs of items
               }
knn_pearson_baseline = KNNBasic(sim_options=sim_options)
knn_pearson_baseline.fit(trainset)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1cfc47d5278>

**Evaluation of the KNN based models**

In [150]:
cross_validate(knn_cosine, data, measures=['RMSE', 'MAE'], cv=5)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.41392439, 1.3702721 , 1.3941455 , 1.37994864, 1.37225716]),
 'test_mae': array([1.00495541, 0.97666753, 1.00180278, 0.99025588, 0.97540987]),
 'fit_time': (0.16266703605651855,
  0.10363245010375977,
  0.13689637184143066,
  0.10416150093078613,
  0.10809493064880371),
 'test_time': (0.21179556846618652,
  0.157426118850708,
  0.2098090648651123,
  0.1676785945892334,
  0.15623927116394043)}

In [151]:
cross_validate(knn_msd, data, measures=['RMSE', 'MAE'], cv=5)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.37911087, 1.41279324, 1.3917208 , 1.38706933, 1.40847242]),
 'test_mae': array([0.97956527, 1.00165787, 0.9911146 , 0.98875959, 1.0030296 ]),
 'fit_time': (0.029266357421875,
  0.03521561622619629,
  0.03571152687072754,
  0.04116702079772949,
  0.029760122299194336),
 'test_time': (0.195420503616333,
  0.18352365493774414,
  0.23212862014770508,
  0.1617281436920166,
  0.16920804977416992)}

In [152]:
cross_validate(knn_pearson, data, measures=['RMSE', 'MAE'], cv=5)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.36519902, 1.32256785, 1.37540035, 1.35896141, 1.33197688]),
 'test_mae': array([1.01937273, 0.97884052, 1.02052509, 1.01461168, 0.98846071]),
 'fit_time': (0.20434784889221191,
  0.16169285774230957,
  0.1472792625427246,
  0.21774864196777344,
  0.14433717727661133),
 'test_time': (0.18104004859924316,
  0.15921783447265625,
  0.1518242359161377,
  0.21377158164978027,
  0.14780855178833008)}

In [153]:
cross_validate(knn_pearson_baseline, data, measures=['RMSE', 'MAE'], cv=5)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([1.35556723, 1.37351998, 1.37408723, 1.36487255, 1.35250172]),
 'test_mae': array([1.00119468, 1.00573561, 1.00819149, 1.01367169, 0.99789516]),
 'fit_time': (0.22645115852355957,
  0.2752830982208252,
  0.20676660537719727,
  0.2117910385131836,
  0.16613101959228516),
 'test_time': (0.28073787689208984,
  0.16218829154968262,
  0.15525245666503906,
  0.15822386741638184,
  0.1924440860748291)}

**As per Analysis SVD based collaborative filter is better than KNN based models**

**Remove Restaurants that are already recommended in the Content based search engine**

In [154]:
Top6_restaurant_collab_final = Top6_restaurant_collab_SVD_RS[~Top6_restaurant_collab_SVD_RS.Business_id.isin \
                                            (AZ_restrnt_city_content_user_RS.Business_id)]
Top6_restaurant_collab_final

Unnamed: 0,Business_id,Estimated_rating
0,7VYFL_s1HK1vOU-dzyW1Ew,4.824549
1,flCZyw8UBZF41dKMn8k_GQ,4.807064
2,eem6eoAiYL9UcQUE0kMD-g,4.772449
3,sPofLIXU9Ifot1n3iAOZHQ,4.753824
4,6nmlclYyyLyH5EGLUpsmjw,4.72898
5,eswEA8M7pyx9D_6RI-iZLw,4.715144


In [155]:
# Remove duplicate rows by business_ids on persisted city based restaurant
AZ_restrnt_city_flatten_final_nodup_df = AZ_restrnt_city_flatten_final_df.drop_duplicates('Business_id')

In [156]:
#Perform a left outer join on the city based weighted rating and user filtered data by User ID
AZ_restrnt_city_collaborative_RS = pd.merge(Top6_restaurant_collab_final,
                                            AZ_restrnt_city_flatten_final_nodup_df[['Business_id','Name', 'Address', 'City', 
                                                                                    'State', 'Postal_code', 'Review_count', 
                                                                                    'Restaurant_ratings']],
                                            on = 'Business_id', how = 'left').sort_values(['Estimated_rating'], ascending=False)
AZ_restrnt_city_collaborative_RS.head(3)

Unnamed: 0,Business_id,Estimated_rating,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings
0,7VYFL_s1HK1vOU-dzyW1Ew,4.824549,Z-Cafe,"1245 W Baseline Rd, Ste 105",Mesa,AZ,85202,170,4.5
1,flCZyw8UBZF41dKMn8k_GQ,4.807064,Joey's PSG,1038 E Main St,Mesa,AZ,85203,39,5.0
2,eem6eoAiYL9UcQUE0kMD-g,4.772449,Adrian's,1011 W Main St,Mesa,AZ,85201,215,4.5


**Pickling the models for collaborative filter**

In [157]:
#Function for collaborative model recommendation- For data to be prepared
def collaborative_model(AZ_restrnt_city_flatten_final_df):
    
    AZ_restrnt_user_collab_df = AZ_restrnt_city_flatten_final_df[['User_id', 'Business_id', 'Review_ratings']]
    #AZ_restrnt_user_collab_df.Review_ratings = AZ_restrnt_user_collab_df.Review_ratings.astype('int32')
   
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(AZ_restrnt_user_collab_df, reader)
    trainset = data.build_full_trainset()
    #trainset, testset = train_test_split(data, test_size=.25,random_state=26)
    svd = SVD(n_factors= 140, n_epochs= 40, lr_all= 0.003, reg_all= 0.1)
    svd.fit(trainset)

    from collections import defaultdict
    def get_top6_recommendations(predictions, topN = 6):

        top_recs = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_recs[uid].append((iid, est))
        for uid, user_ratings in top_recs.items():
            user_ratings.sort(key = lambda x: x[1], reverse = True)
            top_recs[uid] = user_ratings[:topN]

        return top_recs

    testSet = trainset.build_anti_testset()
    predictions = svd.test(testSet)

    top6_recommendations = get_top6_recommendations(predictions)
    return top6_recommendations 

In [158]:
#Calling the collabortive model function for city of Mesa and creating a dictionary of recommendations
Top6_restaurant_collab_Mesa = collaborative_model(AZ_restrnt_Mesa_flatten_final_df)

In [159]:
#predicting recommendaiton for a user
Top6_restaurant_collab_Mesa['SI3h8naCrYA4183hAor3kA']

[('flCZyw8UBZF41dKMn8k_GQ', 4.858547741054844),
 ('16d3BlncEyCTzb0GxXrBXQ', 4.771303170267561),
 ('7VYFL_s1HK1vOU-dzyW1Ew', 4.770409423029302),
 ('wP8gL4KNYQQktelE-PSeFw', 4.757279670273333),
 ('6nmlclYyyLyH5EGLUpsmjw', 4.744253216642304),
 ('nyyS63QvEzflDfOd5lLaIg', 4.711481296692021)]

**Creating data with list of recommendations for users in all top 5 cities**

In [160]:
Top6_restaurant_collab_Phoenix = collaborative_model(AZ_restrnt_Phoenix_flatten_final_df)

In [161]:
Top6_restaurant_collab_Scottsdale = collaborative_model(AZ_restrnt_Scottsdale_flatten_final_df)

In [162]:
Top6_restaurant_collab_Tempe = collaborative_model(AZ_restrnt_Tempe_flatten_final_df)

In [163]:
Top6_restaurant_collab_Chandler= collaborative_model(AZ_restrnt_Chandler_flatten_final_df)

**Pickle the recommendation data**

In [164]:
pickle.dump(Top6_restaurant_collab_Phoenix, open('Collaborative_filter_Phoenix.pkl', 'wb'))
pickle.dump(Top6_restaurant_collab_Scottsdale, open('Collaborative_filter_Scottsdale.pkl', 'wb'))
pickle.dump(Top6_restaurant_collab_Mesa, open('Collaborative_filter_Mesa.pkl', 'wb'))
pickle.dump(Top6_restaurant_collab_Tempe, open('Collaborative_filter_Tempe.pkl', 'wb'))
pickle.dump(Top6_restaurant_collab_Chandler, open('Collaborative_filter_Chandler.pkl', 'wb'))

**Read the pickled recommendation data**

In [165]:
Top6_restaurant_collab_Mesa_read = pickle.load(open('Collaborative_filter_Mesa.pkl', 'rb'))

**Processing the pickled recommendation for a specific user in a city**

In [166]:
#Collaborative filter function for recommending to a user
def Collaborative_filtering(user_name, pickled_model,AZ_restrnt_city_content_user_RS,AZ_restrnt_city_flatten_final_df):
    Top6_restaurant_collab_RS = pd.DataFrame(pickled_model[user_name],
                                             columns=['Business_id','Estimated_rating'])
    
    #Ensure that restaurants recommended in the content based recommendaion is not showing up in collaborative filter
    Top6_restaurant_collab_final = Top6_restaurant_collab_RS[~Top6_restaurant_collab_RS.Business_id.isin \
                                            (AZ_restrnt_city_content_user_RS.Business_id)]
    AZ_restrnt_city_flatten_final_nodup_df = AZ_restrnt_city_flatten_final_df.drop_duplicates('Business_id')
    
    AZ_restrnt_city_collaborative_RS = pd.merge(Top6_restaurant_collab_final,
                                            AZ_restrnt_city_flatten_final_nodup_df[['Business_id','Name', 'Address', 'City', 
                                                                                    'State', 'Postal_code', 'Review_count', 
                                                                                    'Restaurant_ratings']],
                                            on = 'Business_id', how = 'left').sort_values(['Estimated_rating'], ascending=False)
    #Restaurants should have estimated rating above 3.5
    AZ_restrnt_city_collaborative_RS = AZ_restrnt_city_collaborative_RS[AZ_restrnt_city_collaborative_RS['Estimated_rating'] 
                                                                        > 3.5] 
    return AZ_restrnt_city_collaborative_RS.head(3)

In [167]:
#Pass the information to the collborative filter function
user_name = 'SI3h8naCrYA4183hAor3kA'
pickled_model = Top6_restaurant_collab_Mesa_read
AZ_restrnt_city_flatten_final_df = AZ_restrnt_Mesa_flatten_final_df

Collaborative_filtering(user_name, pickled_model, AZ_restrnt_city_content_user_RS,AZ_restrnt_city_flatten_final_df) 

Unnamed: 0,Business_id,Estimated_rating,Name,Address,City,State,Postal_code,Review_count,Restaurant_ratings
0,flCZyw8UBZF41dKMn8k_GQ,4.858548,Joey's PSG,1038 E Main St,Mesa,AZ,85203,39,5.0
1,16d3BlncEyCTzb0GxXrBXQ,4.771303,Green Corner Restaurant,"1038 W Southern Ave, Ste 1",Mesa,AZ,85210,355,5.0
2,7VYFL_s1HK1vOU-dzyW1Ew,4.770409,Z-Cafe,"1245 W Baseline Rd, Ste 105",Mesa,AZ,85202,170,4.5


**Above 3 restaurants will be recommended to the user based on the previous similar restaurants visited by him**