# Recommender on Chicago Food Reviewers

The goal of this notebook is to build a rudimentary recomender system for South Carolina Reviewers

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances, cosine_similarity
%matplotlib inline 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
data = pd.read_csv("/home/schubert/DSI/capstone_project/data/food_reviews_il")

In [3]:
data.shape

(24174, 21)

In [4]:
data.head()

Unnamed: 0,business_id,review_id,user_id,stars,date,text,useful,funny,cool,name,...,address,city,state,postal_code,latitude,longitude,review_count,is_open,categories,avg_stars
0,-05uZNVbb8DhFweTEOoDVg,HPvCxoJuXqN1v1EooXRuPQ,ZzpNEiZQmrmqqRFxK8mD6w,3,2016-06-15,Brand new so everything looked great and food ...,0,0,0,"""Subway""",...,"""333 S Century Blvd""",Rantoul,IL,61866.0,40.306765,-88.155529,3.0,1.0,Sandwiches;Fast Food;Restaurants,2.0
1,-05uZNVbb8DhFweTEOoDVg,kGYjPWxmF_JM2074BSFhgQ,fVH8_iI99eiZ_CkWXRomYQ,1,2016-07-28,I would rather go to the older subway then the...,0,0,0,"""Subway""",...,"""333 S Century Blvd""",Rantoul,IL,61866.0,40.306765,-88.155529,3.0,1.0,Sandwiches;Fast Food;Restaurants,2.0
2,-05uZNVbb8DhFweTEOoDVg,ILIcT1r-0zV4BMUcvQ_hCA,G0sPf6-N_gWByYIA0ho95w,2,2016-07-05,To much bread! Ladies are rude and drive throu...,0,0,0,"""Subway""",...,"""333 S Century Blvd""",Rantoul,IL,61866.0,40.306765,-88.155529,3.0,1.0,Sandwiches;Fast Food;Restaurants,2.0
3,-2q4dnUw0gGJniGW2aPamQ,cU39f51OCGzn5vF_TvtO-g,fwSsSQXLvip6MkayAa_vyQ,3,2011-04-11,I used to really like El Toro which was in the...,5,1,3,"""Fiesta Ranchera""",...,"""1805 S Neil St""",Champaign,IL,61820.0,40.094068,-88.245785,4.0,0.0,Mexican;Restaurants,2.0
4,-2q4dnUw0gGJniGW2aPamQ,1lRmEQYCMtgG-bdPGtgudQ,G-mC1khkbT2G4nUtvTGjTQ,1,2011-06-20,"As others have said, steer clear of this place...",1,0,1,"""Fiesta Ranchera""",...,"""1805 S Neil St""",Champaign,IL,61820.0,40.094068,-88.245785,4.0,0.0,Mexican;Restaurants,2.0


In [5]:
# let's drop uncecessary columns 
to_drop = ["date", "text", "neighborhood", "address",
           "state", "postal_code", "latitude", "longitude",
           "review_count", "avg_stars"]

data.drop(to_drop, axis=1, inplace=True)

In [6]:
data.head()

Unnamed: 0,business_id,review_id,user_id,stars,useful,funny,cool,name,city,is_open,categories
0,-05uZNVbb8DhFweTEOoDVg,HPvCxoJuXqN1v1EooXRuPQ,ZzpNEiZQmrmqqRFxK8mD6w,3,0,0,0,"""Subway""",Rantoul,1.0,Sandwiches;Fast Food;Restaurants
1,-05uZNVbb8DhFweTEOoDVg,kGYjPWxmF_JM2074BSFhgQ,fVH8_iI99eiZ_CkWXRomYQ,1,0,0,0,"""Subway""",Rantoul,1.0,Sandwiches;Fast Food;Restaurants
2,-05uZNVbb8DhFweTEOoDVg,ILIcT1r-0zV4BMUcvQ_hCA,G0sPf6-N_gWByYIA0ho95w,2,0,0,0,"""Subway""",Rantoul,1.0,Sandwiches;Fast Food;Restaurants
3,-2q4dnUw0gGJniGW2aPamQ,cU39f51OCGzn5vF_TvtO-g,fwSsSQXLvip6MkayAa_vyQ,3,5,1,3,"""Fiesta Ranchera""",Champaign,0.0,Mexican;Restaurants
4,-2q4dnUw0gGJniGW2aPamQ,1lRmEQYCMtgG-bdPGtgudQ,G-mC1khkbT2G4nUtvTGjTQ,1,1,0,1,"""Fiesta Ranchera""",Champaign,0.0,Mexican;Restaurants


In [8]:
data["is_open"].value_counts()

1.0    5497
0.0    1311
Name: is_open, dtype: int64

In [7]:
# let's drop places that are no longer open. No point in recommending a place that is now closed 
data.drop(data[data.is_open == 0.0].index, inplace=True)

In [8]:
data["is_open"].value_counts()

1.0    20558
Name: is_open, dtype: int64

In [9]:
# making pure raitings dataframe 
ratings = data[["business_id", "name", "user_id", "stars"]]

In [10]:
ratings.head()

Unnamed: 0,business_id,name,user_id,stars
0,-05uZNVbb8DhFweTEOoDVg,"""Subway""",ZzpNEiZQmrmqqRFxK8mD6w,3
1,-05uZNVbb8DhFweTEOoDVg,"""Subway""",fVH8_iI99eiZ_CkWXRomYQ,1
2,-05uZNVbb8DhFweTEOoDVg,"""Subway""",G0sPf6-N_gWByYIA0ho95w,2
7,-5NXoZeGBdx3Bdk70tuyCw,"""Po' Boys Restaurant""",CAYavn4JlhvTnUGAhThZcQ,5
8,-5NXoZeGBdx3Bdk70tuyCw,"""Po' Boys Restaurant""",_VzM3P4SlL5BTPySmfelkg,2


In [11]:
# ratings.to_csv("star_rec_data_il", encoding='utf-8', index=False)

In [30]:
ratings["name"].unique()

array(['"Small Bar Fort Mill"', '"Hickory Tavern"',
       '"Drunken Goat Bar & Grill"',
       '"Zaxby\'s Chicken Fingers & Buffalo Wings"',
       '"Hwy 55 Burgers Shakes & Fries"', '"Red Bowl"', '"Local Dish"',
       '"Groucho\'s Deli"', '"Showmars Fort Mill-Hwy 521"',
       '"McDonald\'s"', '"Lee\'s Hoagie House"',
       '"Cracker Barrel Old Country Store"', '"Subway"',
       '"Bojangles Restaurant"', '"Tian Tian Asian Restaurant"',
       '"Cajun Yard Dog"', '"Arby\'s"', '"Mamas Pizza & Pasta"',
       '"Wing King Cafe"', '"Waffle House"', '"521 BBQ & Grill"',
       '"Toppers Pizza"', '"Golden China"',
       '"Bojangles\' Famous Chicken \'n Biscuits"', '"Big Wok"',
       '"Brixx Wood Fired Pizza"', '"Wendy\'s"',
       '"Danny\'s Pizza and Pasta"', '"Hanako"', '"China II"',
       '"Famous Toastery"', '"Bagel Boat"',
       '"Danny\'s Pizza and Pasta Fort Mill"', '"The Roasting Oven"',
       '"Sub Station II"', '"Carolina Ale House"', '"Papa John\'s Pizza"',
       '"Tacos

## Creating a Pivot Table 

In [33]:
# need an index of all users, and single column for every restaurant 
pivot = pd.pivot_table(ratings, index='user_id', columns='name', values='stars')

In [34]:
pivot.head()

name,"""521 BBQ & Grill""","""Akahana - Asian Bistro, Bar & Sushi""","""Arby's""","""Archie Boy's BBQ""","""Asian Roll & Grill""","""Bagel Boat""","""Baxter Social House""","""Big Wok""","""Blacow Burger""","""Bojangles Restaurant""",...,"""The Shore Club""","""Tian Tian Asian Restaurant""","""Toppers Pizza""","""Towne Tavern""","""Tropical Smoothie Cafe""","""Village Pizza""","""Waffle House""","""Wendy's""","""Wing King Cafe""","""Zaxby's Chicken Fingers & Buffalo Wings"""
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--44NNdtngXMzsxyN7ju6Q,,,,,,,,,,,...,,,,,,,,,,
--RNUEvgFSw-b-hhhgZzSg,,,,,,,,1.0,,,...,,,,,,,,,,
-0b84SUGVN0YkG5j2MCmBw,,,,,,,,,,,...,,,,,,,,,,
-1FEXqwQSerk4L7Vrdodgg,,,,,,,,,,,...,,,,,,,,,,
-29I2dtVmCaHZNCvUnCZGg,,,,,,,,,,,...,,,,,,,,,5.0,


In [36]:
pivot.shape

(3361, 116)

### Making Sparse matrix from Pivot (Item-Based Collaborative Filter 

In [39]:
# item based colaborative filter, so we transpose, and create a sparse dataframe 
sparse_pivot = sparse.csr_matrix(pivot.T.fillna(0))

In [40]:
print(sparse_pivot)

  (0, 26)	3.0
  (0, 43)	4.0
  (0, 48)	5.0
  (0, 55)	3.0
  (0, 71)	5.0
  (0, 80)	4.0
  (0, 112)	5.0
  (0, 117)	4.0
  (0, 155)	4.0
  (0, 177)	5.0
  (0, 187)	3.0
  (0, 198)	4.0
  (0, 207)	5.0
  (0, 251)	2.0
  (0, 263)	4.0
  (0, 265)	3.0
  (0, 269)	4.0
  (0, 281)	5.0
  (0, 287)	2.0
  (0, 292)	5.0
  (0, 299)	4.0
  (0, 305)	5.0
  (0, 307)	4.0
  (0, 320)	4.0
  (0, 347)	4.0
  :	:
  (115, 1169)	3.0
  (115, 1399)	1.0
  (115, 1521)	2.5
  (115, 1556)	2.0
  (115, 1610)	3.0
  (115, 1768)	3.0
  (115, 1886)	1.0
  (115, 1932)	4.0
  (115, 2116)	2.0
  (115, 2357)	1.0
  (115, 2371)	3.0
  (115, 2400)	1.0
  (115, 2415)	1.0
  (115, 2569)	4.0
  (115, 2736)	4.0
  (115, 2743)	1.0
  (115, 2849)	1.0
  (115, 2863)	3.0
  (115, 2869)	2.0
  (115, 2946)	5.0
  (115, 2953)	5.0
  (115, 3124)	4.0
  (115, 3176)	3.0
  (115, 3285)	3.0
  (115, 3357)	5.0


### Calculating Cosine Similarity 

In [42]:
sparse_pivot.shape

(116, 3361)

In [43]:
distances = pairwise_distances(sparse_pivot, metric='cosine')

In [44]:
distances.shape

(116, 116)

In [45]:
distance_df = pd.DataFrame(distances, index=pivot.columns, columns=pivot.columns)
distance_df.head()

name,"""521 BBQ & Grill""","""Akahana - Asian Bistro, Bar & Sushi""","""Arby's""","""Archie Boy's BBQ""","""Asian Roll & Grill""","""Bagel Boat""","""Baxter Social House""","""Big Wok""","""Blacow Burger""","""Bojangles Restaurant""",...,"""The Shore Club""","""Tian Tian Asian Restaurant""","""Toppers Pizza""","""Towne Tavern""","""Tropical Smoothie Cafe""","""Village Pizza""","""Waffle House""","""Wendy's""","""Wing King Cafe""","""Zaxby's Chicken Fingers & Buffalo Wings"""
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""521 BBQ & Grill""",0.0,0.932733,1.0,1.0,0.963776,1.0,1.0,1.0,0.975221,0.964785,...,0.979344,0.926913,1.0,0.918222,0.947555,0.994451,0.923165,0.979483,0.95687,0.956629
"""Akahana - Asian Bistro, Bar & Sushi""",0.932733,0.0,1.0,1.0,0.934566,1.0,1.0,0.960653,0.963379,0.961448,...,0.969849,0.966311,1.0,0.91484,0.960469,0.982687,1.0,0.92469,0.962227,0.943023
"""Arby's""",1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"""Archie Boy's BBQ""",1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
"""Asian Roll & Grill""",0.963776,0.934566,1.0,1.0,0.0,1.0,1.0,0.988573,0.996624,1.0,...,1.0,1.0,1.0,0.961951,0.967199,0.973536,1.0,0.930933,0.979431,0.972923


In [None]:
# buiild recommender on train, holding out other reviews users have rated 5 stars, 
# predict how many stars a user, 5 class output, NN, sigmoid on output, cap at 5 stars, careful
# as does introduce some error. 
# or turn it into a two class problem 

In [47]:
search = '"Arby\'s"'
for resto in ratings.loc[ratings['name'].str.contains(search), 'name'].values:
    print(resto)
    print('Average Rating', pivot[resto].mean())
    print('Count of ratings', pivot[resto].count())
    print('Similar Restaurants')
    print(distance_df[resto].sort_values()[1:11])
    print('')
    print('')

"Arby's"
Average Rating 1.3333333333333333
Count of ratings 3
Similar Restaurants
name
"Bojangles"                      0.729631
"The Flipside Cafe"              0.960277
"Local Dish"                     0.971977
"Jim 'N Nick's Bar-B-Q"          0.977320
"521 BBQ & Grill"                1.000000
"Papa Murphy's"                  1.000000
"Pasquale's Restaurant & Pub"    1.000000
"Peking Tokyo"                   1.000000
"Pizza Hut"                      1.000000
"Papa John's Pizza"              1.000000
Name: "Arby's", dtype: float64


"Arby's"
Average Rating 1.3333333333333333
Count of ratings 3
Similar Restaurants
name
"Bojangles"                      0.729631
"The Flipside Cafe"              0.960277
"Local Dish"                     0.971977
"Jim 'N Nick's Bar-B-Q"          0.977320
"521 BBQ & Grill"                1.000000
"Papa Murphy's"                  1.000000
"Pasquale's Restaurant & Pub"    1.000000
"Peking Tokyo"                   1.000000
"Pizza Hut"                      1.00

In [None]:
# Next is to create similar users recomender system. Script and scale. 