# Yelp Data Analysis - Relationship between Density and Quality


In [87]:
import csv
import pandas as pd
import numpy as np
import pandas as pd
import seaborn as sns
import operator #need this for sorting dicts later
import matplotlib.cm as cm #for colormapping later
from matplotlib.colors import LogNorm
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [63]:
## import the yelp business dataset as a csv
business_df = pd.read_csv('business.csv')

In [64]:
business_df.head()

Unnamed: 0,attributes.Ambience.divey,attributes.RestaurantsDelivery,attributes.DogsAllowed,postal_code,hours.Thursday,attributes.HairSpecializesIn.coloring,attributes.BestNights.sunday,attributes.BYOB,attributes.AgesAllowed,attributes.Music.video,...,attributes.Caters,attributes.RestaurantsReservations,attributes.DietaryRestrictions.dairy-free,attributes.DietaryRestrictions.vegan,attributes.Ambience.romantic,attributes.Music.jukebox,attributes.Ambience.upscale,attributes.RestaurantsTakeOut,attributes.BikeParking,attributes.OutdoorSeating
0,,,,44143,10:00-21:00,,,,,,...,,,,,,,,,True,
1,False,True,,28215,10:00-22:00,,,,,,...,,False,,,False,,False,True,,False
2,,,,M4K 1N7,10:00-19:00,,,,,,...,,,,,,,,,True,False
3,,,,85258,9:00-17:00,,,,,,...,,,,,,,,,,
4,False,False,,85016,,,,,,,...,True,False,,,False,,False,True,True,False


## Cleaning the Dataset

We are going to remove the unecessary columns from this dataset:

In [65]:
business_df = business_df[['name','postal_code','latitude','longitude','city','neighborhood','categories','stars','review_count']]

In [66]:
business_df.head()

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count
0,Richmond Town Square,44143,41.541716,-81.493116,Richmond Heights,,"[u'Shopping', u'Shopping Centers']",2.0,17
1,South Florida Style Chicken & Ribs,28215,35.23687,-80.741976,Charlotte,Eastland,"[u'Food', u'Soul Food', u'Convenience Stores',...",4.5,4
2,The Tea Emporium,M4K 1N7,43.677126,-79.353285,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.5,7
3,TRUmatch,85258,33.565082,-111.9164,Scottsdale,,"[u'Professional Services', u'Matchmakers']",3.0,3
4,Blimpie,85016,33.505928,-112.038847,Phoenix,,"[u'Sandwiches', u'Restaurants']",4.5,10


In [67]:
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156639 entries, 0 to 156638
Data columns (total 9 columns):
name            156639 non-null object
postal_code     156086 non-null object
latitude        156638 non-null float64
longitude       156638 non-null float64
city            156636 non-null object
neighborhood    62236 non-null object
categories      156639 non-null object
stars           156639 non-null float64
review_count    156639 non-null int64
dtypes: float64(3), int64(1), object(5)
memory usage: 10.8+ MB


We can focus on examining businesses only in Toronto

In [68]:
toronto_restaurants_df = business_df[business_df['city'] == 'Toronto']
toronto_restaurants_df.reset_index(level=None, drop=False, inplace=False, col_level=0, col_fill='')
toronto_restaurants_df.head(10)

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count
2,The Tea Emporium,M4K 1N7,43.677126,-79.353285,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.5,7
26,Hart House Theatre,M5S 3H3,43.663669,-79.394687,Toronto,University of Toronto,"[u'Arts & Entertainment', u'Performing Arts']",4.0,8
38,McCarthy's Irish Pub,M4L 2B5,43.678049,-79.314774,Toronto,Upper Beach,"[u'Pubs', u'Restaurants', u'Bars', u'Irish', u...",4.0,5
44,Paris Bakery & Pastry,M6H 3Z5,43.662401,-79.444706,Toronto,Wallace Emerson,"[u'Bakeries', u'Food']",3.0,16
53,Oishi Sushi,M3J 2G5,43.76351,-79.49075,Toronto,,"[u'Asian Fusion', u'Restaurants', u'Sushi Bars']",2.0,27
55,Boardwalk Place,M4W 3L6,43.66301,-79.310898,Toronto,,"[u'American (Traditional)', u'Restaurants', u'...",3.0,13
62,Adelaide Hall,M5H 1X6,43.648172,-79.389077,Toronto,Entertainment District,"[u'Nightlife', u'Music Venues', u'DJs', u'Arts...",3.5,10
63,Sangria Lounge,M6R 2K9,43.643537,-79.447954,Toronto,High Park,"[u'Nightlife', u'Lounges', u'Beer', u'Wine & S...",3.5,23
134,Buonanotte,M5V 1H2,43.645687,-79.390493,Toronto,Entertainment District,"[u'Italian', u'Restaurants']",2.0,66
154,Big Daddy's Bourbon Street Bistro & Oyster Bar,M5H 1K5,43.647499,-79.386471,Toronto,Entertainment District,"[u'Cajun/Creole', u'Restaurants', u'Seafood']",3.5,132


Next, we create a DataFrame called not_restaurant. We filter through all the values in the categories column for non restaurant businesses and store these corresponding rows in it. At this time, we are only considering u'Restaurant as our restaurant variable. However, this can be extended to include Pubs with u'Pubs or coffee shops with u'Coffee & Tea

In [69]:
not_restaurant = pd.DataFrame()
for lst in toronto_restaurants_df['categories']:
    if "u'Restaurants'" not in lst:
        not_restaurant = not_restaurant.append(toronto_restaurants_df[toronto_restaurants_df['categories'] == lst])

This is what the dataframe with non retsaurant businesses looks like now:

In [70]:
not_restaurant.head(7)

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count
2,The Tea Emporium,M4K 1N7,43.677126,-79.353285,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.5,7
475,Jimmy's Coffee,M5T 1L8,43.654625,-79.401349,Toronto,Kensington Market,"[u'Food', u'Coffee & Tea']",4.0,108
1091,Second Cup,M5B 1W8,43.65284,-79.377373,Toronto,Downtown Core,"[u'Food', u'Coffee & Tea']",2.5,3
2297,Crafted Coffee,M6J 2Z6,43.64723,-79.41979,Toronto,Ossington Strip,"[u'Food', u'Coffee & Tea']",4.0,67
3157,Merchants of Green Coffee,M4M 1L9,43.659933,-79.353986,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.0,76
3839,Tim Horton's,M4A 1J8,43.725891,-79.313232,Toronto,,"[u'Food', u'Coffee & Tea']",3.0,4
4444,The Common II,M6H 1M4,43.66084,-79.431397,Toronto,Dovercourt,"[u'Food', u'Coffee & Tea']",4.0,28


For the sake of proper methodology, we also store all the restaurant businesses in a separate dataframe called is_restaurant and pick a single restaurant at random and store it in the variable restaurant_df:

In [71]:
is_restaurant = pd.DataFrame()
for lst in toronto_restaurants_df['categories']:
    if "u'Restaurants'" in lst:
        is_restaurant = is_restaurant.append(toronto_restaurants_df[toronto_restaurants_df['categories'] == lst])

In [72]:
is_restaurant.head()

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count
38,McCarthy's Irish Pub,M4L 2B5,43.678049,-79.314774,Toronto,Upper Beach,"[u'Pubs', u'Restaurants', u'Bars', u'Irish', u...",4.0,5
53,Oishi Sushi,M3J 2G5,43.76351,-79.49075,Toronto,,"[u'Asian Fusion', u'Restaurants', u'Sushi Bars']",2.0,27
55,Boardwalk Place,M4W 3L6,43.66301,-79.310898,Toronto,,"[u'American (Traditional)', u'Restaurants', u'...",3.0,13
134,Buonanotte,M5V 1H2,43.645687,-79.390493,Toronto,Entertainment District,"[u'Italian', u'Restaurants']",2.0,66
1312,Ceno,M5R 2H4,43.673959,-79.396121,Toronto,The Annex,"[u'Italian', u'Restaurants']",4.0,4


In [73]:
restaurant_df = is_restaurant.iloc[[0]]
restaurant_df

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count
38,McCarthy's Irish Pub,M4L 2B5,43.678049,-79.314774,Toronto,Upper Beach,"[u'Pubs', u'Restaurants', u'Bars', u'Irish', u...",4.0,5


Next, we will calculate the distances between all the non-restaurant businesses in not_restaurant and McCarthy's Irish Pub. The Haversine formula is needed to calculate the great-circle distance. The great-circle distance is simply the shortest distance over the Earth's surface. The advantage of using the Haversine formula is that it maintains its integrity in calculations, even over small distances unlike the Law of Cosine formula.

In [88]:
## Function that calculates the haversine


df = pd.DataFrame()

df['LAT_rad'], df['LON_rad'] = np.radians(not_restaurant['latitude']), np.radians(not_restaurant['longitude'])
df['dLON'] = df['LON_rad'] - math.radians(restaurant_df['longitude'])
df['dLAT'] = df['LAT_rad'] - math.radians(restaurant_df['latitude'])

# using heaversine's formula:
not_restaurant['distance'] = 6367 * 2 * np.arcsin(np.sqrt(np.sin(df['dLAT']/2)**2 + math.cos(math.radians(37.2175900)) * np.cos(df['LAT_rad']) * np.sin(df['dLON']/2)**2))
not_restaurant.head()

Unnamed: 0,name,postal_code,latitude,longitude,city,neighborhood,categories,stars,review_count,distance
2,The Tea Emporium,M4K 1N7,43.677126,-79.353285,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.5,7,3.249438
475,Jimmy's Coffee,M5T 1L8,43.654625,-79.401349,Toronto,Kensington Market,"[u'Food', u'Coffee & Tea']",4.0,108,7.752686
1091,Second Cup,M5B 1W8,43.65284,-79.377373,Toronto,Downtown Core,"[u'Food', u'Coffee & Tea']",2.5,3,5.977419
2297,Crafted Coffee,M6J 2Z6,43.64723,-79.41979,Toronto,Ossington Strip,"[u'Food', u'Coffee & Tea']",4.0,67,9.497646
3157,Merchants of Green Coffee,M4M 1L9,43.659933,-79.353986,Toronto,Riverdale,"[u'Food', u'Coffee & Tea']",4.0,76,3.871908


In [None]:

convert the categories from list of ojects tolist of strings
create a list with restaurant and not restaurant types