In [2]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os
from modules.dataImporter import yelp_import

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"

# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<a id="data"></a>
# Data

We are using subsets of each table since we have a large dataset to work with. For this notebook, we used _business_ and _review_ tables.

In [3]:
# import the data (chunksize returns jsonReader for iteration)

datasets, spark = yelp_import("small")

subset_business = datasets["businesses"]
subset_review = datasets["reviews"]

In [28]:
# peak the tables
display(subset_business.head(2))
display(subset_review.head(2))

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,KU_O5udG6zpxOg-VcAEodg,mh_-eMZ6K5RLWhZyISBhwA,XQfwVwDr-v0ZS3_CbbE5Xw,3,0,0,0,"If you decide to eat here, just be aware it is...",2018-07-07 22:09:11
1,BiTunyQ73aT9WBnpR9DZGw,OyoGAe7OKpv6SyGZT5g77Q,7ATYjTIgM3jUlt4UM3IypQ,5,1,0,1,I've taken a lot of spin classes over the year...,2012-01-03 15:28:18


In [29]:
print(subset_business.shape)
print(subset_review.shape)

(1500, 14)
(1500, 9)


In [30]:
# No New York, Maybe in other Naming convention ?
subset_business[(subset_business["city"].str.contains("New"))]["city"].value_counts()

New Orleans        68
New Port Richey     8
Newtown             4
New Hope            3
Newark              2
Name: city, dtype: int64

In [31]:
# No San Diego and San Francisco, Maybe in other Naming convention ?
subset_business[(subset_business["city"].str.contains("San"))]["city"].value_counts()

Santa Barbara    30
Name: city, dtype: int64

In [32]:
# No Paris, Maybe in other Naming convention ?
subset_business[(subset_business["city"].str.contains("Paris"))]["city"].value_counts()

Series([], Name: city, dtype: int64)

In [33]:
subset_business["city"].value_counts().sort_values(ascending=False)

Philadelphia        154
Tucson              117
Tampa                97
Indianapolis         84
Reno                 71
                   ... 
Clearwater Beach      1
Riverton              1
St Pete Beach         1
Lawnside              1
Hockessin             1
Name: city, Length: 283, dtype: int64

<a id="preprocessing-data"></a>
# Preprocessing the Data

We chose Philadelphia since it has the highest number of restraunts. The restaurant is the most popular category among businesses. 

In [34]:
# Businesses in Philadelphia and currently open business
city = subset_business[
    (subset_business["city"].str.contains("Philadelphia"))
    & (subset_business["is_open"] == 1)
]
Philadelphia = city[
    ["business_id", "name", "address", "categories", "attributes", "stars"]
]
Philadelphia

Unnamed: 0,business_id,name,address,categories,attributes,stars
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0
15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,"Sushi Bars, Restaurants, Japanese","{'RestaurantsReservations': 'True', 'Restauran...",4.0
19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,"Korean, Restaurants","{'NoiseLevel': 'u'quiet'', 'GoodForMeal': '{'d...",4.5
35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,"Eatertainment, Arts & Entertainment, Brewpubs,...","{'OutdoorSeating': 'True', 'RestaurantsPriceRa...",3.5
82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,"Restaurants, Automotive, Delis, Gas Stations, ...","{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...",3.0
...,...,...,...,...,...,...
1430,dE_MaaYrXBAEebtH2u_B-w,Wilson's Market,1337 W Olney Ave,"Specialty Food, Meat Shops, Sandwiches, Food, ...","{'RestaurantsPriceRange2': '1', 'Caters': 'Tru...",3.5
1434,w3R4cgg_HPI0F55PCLpufA,Green Street Property Management,"2015 Locust St, Ste 100","Property Management, Real Estate, Home Services",{'BusinessAcceptsCreditCards': 'True'},2.0
1437,68ThEdiK0eWCEgGEmV9Tng,Southgate,1801 Lombard St,"Breakfast & Brunch, American (Traditional), Ko...","{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...",4.0
1488,u8ZsK3py4y9xEOk-vox2ZA,Iheoma Esochi J DDS,233 McClellan St,"Health & Medical, Dentists, General Dentistry",{'ByAppointmentOnly': 'True'},3.5


In [35]:
# getting just restaurants from Philadelphia business
rest = Philadelphia[
    Philadelphia["categories"].str.contains("Restaurant.*") == True
].reset_index()
rest

Unnamed: 0,index,business_id,name,address,categories,attributes,stars
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0
1,15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,"Sushi Bars, Restaurants, Japanese","{'RestaurantsReservations': 'True', 'Restauran...",4.0
2,19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,"Korean, Restaurants","{'NoiseLevel': 'u'quiet'', 'GoodForMeal': '{'d...",4.5
3,35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,"Eatertainment, Arts & Entertainment, Brewpubs,...","{'OutdoorSeating': 'True', 'RestaurantsPriceRa...",3.5
4,82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,"Restaurants, Automotive, Delis, Gas Stations, ...","{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...",3.0
5,130,3BJxm-HnvzdwD1zjmSbmyQ,Golden Chopstick Chinese Restaurant,1800 Spring Garden St,"Restaurants, Chinese","{'GoodForKids': 'True', 'RestaurantsDelivery':...",3.0
6,199,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,"Burgers, Restaurants, Fast Food","{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",1.5
7,235,bTve2mwLk5Zc01vRKqc2KQ,Red Hook Coffee & Tea,765 S 4th St,"Restaurants, Breakfast & Brunch, Vegan, Bagels...","{'BikeParking': 'True', 'OutdoorSeating': 'Tru...",4.5
8,266,n8ecak12IF_jhnPNs37AZA,Baltic Bakery,2609 Edgemont St,"Food, Bakeries, Restaurants","{'RestaurantsDelivery': 'False', 'BusinessAcce...",4.5
9,274,_T0cPZE2ZJOTTlYYKMP64Q,Brown Sugar Bakery & Cafe,219 S 52nd St,"Cupcakes, Caribbean, Restaurants, Trinidadian,...","{'Caters': 'False', 'Alcohol': 'u'none'', 'Res...",4.5


<a id="get-dummies"></a>
* ** Get Dummies from attributes and categories columns**

> In "attributes" column has nested attributes. In order to create a feature table, we need to separate those nested attributes into their own columns. Therefore, the following functions will be used to achieve this goal.

In [36]:
# Function that extract keys from the nested dictionary
def extract_keys(attr, key):
    if attr == None:
        return "{}"
    if key in attr:
        return attr.pop(key)


# convert string to dictionary
import ast


def str_to_dict(attr):
    if attr != None:
        return ast.literal_eval(attr)
    else:
        return ast.literal_eval("{}")

In [37]:
# get dummies from nested attributes
rest["BusinessParking"] = rest.apply(
    lambda x: str_to_dict(extract_keys(x["attributes"], "BusinessParking")), axis=1
)
rest["Ambience"] = rest.apply(
    lambda x: str_to_dict(extract_keys(x["attributes"], "Ambience")), axis=1
)
rest["GoodForMeal"] = rest.apply(
    lambda x: str_to_dict(extract_keys(x["attributes"], "GoodForMeal")), axis=1
)
rest["Dietary"] = rest.apply(
    lambda x: str_to_dict(extract_keys(x["attributes"], "Dietary")), axis=1
)
rest["Music"] = rest.apply(
    lambda x: str_to_dict(extract_keys(x["attributes"], "Music")), axis=1
)

In [38]:
rest

Unnamed: 0,index,business_id,name,address,categories,attributes,stars,BusinessParking,Ambience,GoodForMeal,Dietary,Music
0,3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,"Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'RestaurantsDelivery': 'False', 'OutdoorSeati...",4.0,"{'garage': False, 'street': True, 'validated':...",{},{},{},{}
1,15,MUTTqe8uqyMdBl186RmNeA,Tuna Bar,205 Race St,"Sushi Bars, Restaurants, Japanese","{'RestaurantsReservations': 'True', 'Restauran...",4.0,"{'valet': False, 'garage': None, 'street': Tru...","{'touristy': False, 'hipster': False, 'romanti...","{'dessert': True, 'latenight': None, 'lunch': ...",{},{}
2,19,ROeacJQwBeh05Rqg7F6TCg,BAP,1224 South St,"Korean, Restaurants","{'NoiseLevel': 'u'quiet'', 'RestaurantsGoodFor...",4.5,"{'garage': False, 'street': True, 'validated':...","{'touristy': False, 'hipster': False, 'romanti...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}
3,35,aPNXGTDkf-4bjhyMBQxqpQ,Craft Hall,901 N Delaware Ave,"Eatertainment, Arts & Entertainment, Brewpubs,...","{'OutdoorSeating': 'True', 'RestaurantsPriceRa...",3.5,"{'garage': False, 'street': False, 'validated'...","{'touristy': False, 'hipster': False, 'romanti...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}
4,82,ppFCk9aQkM338Rgwpl2F5A,Wawa,3604 Chestnut St,"Restaurants, Automotive, Delis, Gas Stations, ...","{'Alcohol': 'u'none'', 'RestaurantsGoodForGrou...",3.0,"{'garage': False, 'street': True, 'validated':...","{'romantic': False, 'intimate': False, 'classy...",{},{},{}
5,130,3BJxm-HnvzdwD1zjmSbmyQ,Golden Chopstick Chinese Restaurant,1800 Spring Garden St,"Restaurants, Chinese","{'GoodForKids': 'True', 'RestaurantsDelivery':...",3.0,"{'garage': False, 'street': True, 'validated':...","{'touristy': False, 'hipster': False, 'romanti...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}
6,199,O1oZpbZNDMH_gz8DhsZCdA,Wendy's,700 E. Hunting Park,"Burgers, Restaurants, Fast Food","{'RestaurantsTakeOut': 'True', 'RestaurantsPri...",1.5,"{'garage': False, 'street': False, 'validated'...",{},{},{},{}
7,235,bTve2mwLk5Zc01vRKqc2KQ,Red Hook Coffee & Tea,765 S 4th St,"Restaurants, Breakfast & Brunch, Vegan, Bagels...","{'BikeParking': 'True', 'OutdoorSeating': 'Tru...",4.5,"{'garage': False, 'street': True, 'validated':...","{'romantic': False, 'intimate': False, 'classy...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}
8,266,n8ecak12IF_jhnPNs37AZA,Baltic Bakery,2609 Edgemont St,"Food, Bakeries, Restaurants","{'RestaurantsDelivery': 'False', 'BusinessAcce...",4.5,"{'garage': False, 'street': True, 'validated':...",{},{},{},{}
9,274,_T0cPZE2ZJOTTlYYKMP64Q,Brown Sugar Bakery & Cafe,219 S 52nd St,"Cupcakes, Caribbean, Restaurants, Trinidadian,...","{'Caters': 'False', 'Alcohol': 'u'none'', 'Res...",4.5,"{'garage': False, 'street': True, 'validated':...","{'romantic': False, 'intimate': False, 'classy...","{'dessert': False, 'latenight': False, 'lunch'...",{},{}


In [39]:
# create table with attribute dummies
df_attr = pd.concat(
    [
        rest["attributes"].apply(pd.Series),
        rest["BusinessParking"].apply(pd.Series),
        rest["Ambience"].apply(pd.Series),
        rest["GoodForMeal"].apply(pd.Series),
        rest["Dietary"].apply(pd.Series),
    ],
    axis=1,
)
df_attr_dummies = pd.get_dummies(df_attr)
df_attr_dummies

  rest["attributes"].apply(pd.Series),
  rest["BusinessParking"].apply(pd.Series),
  rest["BusinessParking"].apply(pd.Series),
  rest["BusinessParking"].apply(pd.Series),
  rest["BusinessParking"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["Ambience"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodForMeal"].apply(pd.Series),
  rest["GoodFor

Unnamed: 0,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,BikeParking_False,BikeParking_True,...,latenight_False,latenight_True,lunch_False,lunch_True,dinner_False,dinner_True,brunch_False,brunch_True,breakfast_False,breakfast_True
0,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,1,0,0,1,0,1,...,1,0,0,1,0,1,1,0,1,0
3,0,0,1,0,0,1,0,1,0,0,...,1,0,1,0,1,0,1,0,1,0
4,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1,1,0,0,0,1,0,1,...,1,0,0,1,0,0,1,0,1,0
6,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,1,1,0,0,1,...,1,0,0,1,1,0,0,1,0,1
8,1,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,0,0,1,1,0,0,0,1,0,1,...,1,0,1,0,1,0,1,0,1,0


In [40]:
# get dummies from categories
df_categories_dummies = pd.Series(rest["categories"]).str.get_dummies(",")
df_categories_dummies

Unnamed: 0,American (New),American (Traditional),Arts & Entertainment,Asian Fusion,Automotive,Bagels,Bakeries,Bars,Beer,Beer Bar,...,Eatertainment,Event Planning & Services,Food,Korean,Restaurants,Sandwiches,Shopping,Specialty Food,Sushi Bars,Trinidadian
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,1,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
9,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [41]:
# pull out names and stars from rest table
result = rest[["name", "stars"]]
result

Unnamed: 0,name,stars
0,St Honore Pastries,4.0
1,Tuna Bar,4.0
2,BAP,4.5
3,Craft Hall,3.5
4,Wawa,3.0
5,Golden Chopstick Chinese Restaurant,3.0
6,Wendy's,1.5
7,Red Hook Coffee & Tea,4.5
8,Baltic Bakery,4.5
9,Brown Sugar Bakery & Cafe,4.5


In [42]:
# Concat all tables and drop Restaurant column
df_final = pd.concat([df_attr_dummies, df_categories_dummies, result], axis=1)
df_final.drop("Restaurants", inplace=True, axis=1)

In [43]:
# map floating point stars to an integer
mapper = {1.0: 1, 1.5: 2, 2.0: 2, 2.5: 3, 3.0: 3, 3.5: 4, 4.0: 4, 4.5: 5, 5.0: 5}
df_final["stars"] = df_final["stars"].map(mapper)

In [44]:
# Final table for the models
df_final

Unnamed: 0,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,BikeParking_False,BikeParking_True,...,Event Planning & Services,Food,Korean,Sandwiches,Shopping,Specialty Food,Sushi Bars,Trinidadian,name,stars
0,1,0,0,1,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,St Honore Pastries,4
1,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,1,0,Tuna Bar,4
2,0,1,0,0,1,0,0,1,0,1,...,0,0,1,0,0,0,0,0,BAP,5
3,0,0,1,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,Craft Hall,4
4,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,Wawa,3
5,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,Golden Chopstick Chinese Restaurant,3
6,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Wendy's,2
7,1,0,0,0,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,Red Hook Coffee & Tea,5
8,1,0,0,0,0,0,1,0,0,1,...,0,1,0,0,0,0,0,0,Baltic Bakery,5
9,0,0,1,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,Brown Sugar Bakery & Cafe,5


## Check how many attributes(Tags) in the dataset for restraunts for our recomendation algorithms

In [45]:
# Check how many attributes(Tags) for restraunts for our recomendation algorithms
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 1000)
pd.set_option("display.width", 1000)
df_final.head()

Unnamed: 0,RestaurantsDelivery_False,RestaurantsDelivery_None,RestaurantsDelivery_True,OutdoorSeating_False,OutdoorSeating_None,OutdoorSeating_True,BusinessAcceptsCreditCards_False,BusinessAcceptsCreditCards_True,BikeParking_False,BikeParking_True,RestaurantsPriceRange2_1,RestaurantsPriceRange2_2,RestaurantsTakeOut_False,RestaurantsTakeOut_True,ByAppointmentOnly_False,WiFi_'free',WiFi_'no',WiFi_u'free',WiFi_u'no',Alcohol_'full_bar',Alcohol_'none',Alcohol_u'beer_and_wine',Alcohol_u'full_bar',Alcohol_u'none',Caters_False,Caters_True,RestaurantsReservations_False,RestaurantsReservations_True,RestaurantsGoodForGroups_False,RestaurantsGoodForGroups_True,RestaurantsAttire_'casual',RestaurantsAttire_u'casual',HasTV_False,HasTV_True,GoodForKids_False,GoodForKids_True,NoiseLevel_'average',NoiseLevel_'loud',NoiseLevel_'quiet',NoiseLevel_u'average',NoiseLevel_u'quiet',DogsAllowed_False,DogsAllowed_True,HappyHour_False,HappyHour_True,WheelchairAccessible_False,WheelchairAccessible_True,RestaurantsTableService_False,RestaurantsTableService_True,BusinessAcceptsBitcoin_False,BYOBCorkage_'yes_free',Corkage_False,Corkage_True,GoodForDancing_False,"BestNights_{'monday': False, 'tuesday': False, 'friday': True, 'wednesday': False, 'thursday': False, 'sunday': False, 'saturday': True}","BestNights_{'monday': True, 'tuesday': False, 'friday': False, 'wednesday': False, 'thursday': True, 'sunday': False, 'saturday': True}","BestNights_{u'monday': False, u'tuesday': False, u'wednesday': False, u'thursday': False, u'friday': False, u'saturday': True, u'sunday': False}",Smoking_u'no',DriveThru_False,DriveThru_True,BYOB_False,CoatCheck_False,garage_False,garage_True,street_False,street_True,validated_False,lot_False,lot_True,valet_False,touristy_False,touristy_True,hipster_False,hipster_True,romantic_False,romantic_True,divey_False,intimate_False,intimate_True,trendy_False,trendy_True,upscale_False,classy_False,classy_True,casual_False,casual_True,dessert_False,dessert_True,latenight_False,latenight_True,lunch_False,lunch_True,dinner_False,dinner_True,brunch_False,brunch_True,breakfast_False,breakfast_True,American (New),American (Traditional),Arts & Entertainment,Asian Fusion,Automotive,Bagels,Bakeries,Bars,Beer,Beer Bar,Beer Gardens,Breakfast & Brunch,Breweries,Brewpubs,Bubble Tea,Burgers,Burmese,Cafes,Cajun/Creole,Caribbean,Caterers,Chicken Shop,Chicken Wings,Chinese,Cocktail Bars,Coffee & Tea,Convenience Stores,Delis,Donuts,Event Planning & Services,Fashion,Fast Food,Food,Gas Stations,Gastropubs,Grocery,Halal,Ice Cream & Frozen Yogurt,Indian,Irish,Irish Pub,Japanese,Juice Bars & Smoothies,Korean,Malaysian,Meat Shops,Nightlife,Pakistani,Pizza,Pubs,Restaurants,Salad,Sandwiches,Seafood,Thai,Trinidadian,Vegan,Vegetarian,Venues & Event Spaces,Whiskey Bars,Wine & Spirits,Bagels.1,Bars.1,Breakfast & Brunch.1,Burgers.1,Cafes.1,Cheesesteaks,Chicken Wings.1,Cupcakes,Delis.1,Diners,Eatertainment,Event Planning & Services.1,Food.1,Korean.1,Sandwiches.1,Shopping,Specialty Food,Sushi Bars,Trinidadian.1,name,stars
0,1,0,0,1,0,0,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,St Honore Pastries,4
1,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,1,1,0,1,0,1,0,0,0,0,1,0,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,Tuna Bar,4
2,0,1,0,0,1,0,0,1,0,1,1,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,1,0,1,0,0,1,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1,1,0,0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,BAP,5
3,0,0,1,0,0,1,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,1,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0,1,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,Craft Hall,4
4,0,0,1,1,0,0,0,1,0,1,1,0,0,1,1,0,0,1,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,1,0,1,1,0,1,0,1,0,1,1,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Wawa,3


In [46]:
# List out all attributes with values
df_final.drop(["name", "stars"], axis=1).sum(axis=0).sort_values(ascending=False).head(
    100
)

RestaurantsTakeOut_True             36
valet_False                         35
BusinessAcceptsCreditCards_True     35
validated_False                     32
garage_False                        32
upscale_False                       28
hipster_False                       27
lot_False                           27
HasTV_True                          27
street_True                         27
touristy_False                      26
divey_False                         26
RestaurantsDelivery_True            25
GoodForKids_True                    25
trendy_False                        25
romantic_False                      25
 Restaurants                        24
intimate_False                      23
RestaurantsGoodForGroups_True       23
RestaurantsReservations_False       23
BikeParking_True                    23
RestaurantsPriceRange2_1            23
latenight_False                     23
dessert_False                       22
Alcohol_u'none'                     21
OutdoorSeating_False     

<a id="content-based"></a>
# Content Based Filtering- Model

In this section, we are going to build a system that recognizes the similarity between restaurants based on specific features and recommends restaurants that are most similar to a particular restaurant. __df_final__ (features) table used to build this system.

In [47]:
# Create X (all the features) and y (target)
X = df_final.iloc[:, :-2]
y = df_final["stars"]

* **Split the data into train and test set (80:20)**

In [48]:
# Split the data into train and test sets
from sklearn.model_selection import train_test_split

X_train_knn, X_test_knn, y_train_knn, y_test_knn = train_test_split(
    X, y, test_size=0.2, random_state=1
)

ModuleNotFoundError: No module named 'sklearn'

* **Instantiate and fit the model**

In [49]:
y_train_knn.head()

NameError: name 'y_train_knn' is not defined

In [50]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train_knn, y_train_knn)

# y_pred = knn.predict(X_test)

accuracy_train = knn.score(X_train_knn, y_train_knn)
accuracy_test = knn.score(X_test_knn, y_test_knn)

print(f"Score on training set: {accuracy_train}")
print(f"Score on test set: {accuracy_test}")

ModuleNotFoundError: No module named 'sklearn'

The restaurant of the validation set

In [None]:
# look at the last row for the test
display(df_final.iloc[-1:])

# look at the restaurant name from the last row.
print("Validation set (Restaurant name): ", df_final["name"].values[-1])

* **Test the model:** 

> We used the last row as a validation set (we didn't include this last row for modeling). 

In [None]:
# test set from the df_final table (only last row): Restaurant name: "Steak & Cheese & Quick Pita Restaurant"
test_set = df_final.iloc[-1:, :-2]

# validation set from the df_final table (exclude the last row)
X_val = df_final.iloc[:-1, :-2]
y_val = df_final["stars"].iloc[:-1]

In [None]:
# fit model with validation set
n_knn = knn.fit(X_val, y_val)

After fitting the KNN model to the validation set, we are going to find the distances between the validation set and the other restaurants based on their similar features. 

In [None]:
# distances and indeces from validation set (Steak & Cheese & Quick Pita Restaurant)
distances, indeces = n_knn.kneighbors(test_set)
# n_knn.kneighbors(test_set)[1][0]

# create table distances and indeces from "Steak & Cheese & Quick Pita Restaurant"
final_table = pd.DataFrame(n_knn.kneighbors(test_set)[0][0], columns=["distance"])
final_table["index"] = n_knn.kneighbors(test_set)[1][0]
final_table.set_index("index")

We are creating the following ***result*** table which displays similar restaurants to the validation restrauant by their distances. Based on this recommendation system, the short distance means having more similarity to the validation restrauant.

In [None]:
# get names of the restaurant that similar to the validation restrauant
result = final_table.join(df_final, on="index")
result[["distance", "index", "name", "stars"]].head(5)