This dataset is a subset of Yelp's businesses, reviews, and user data. Conduct research or analysis on Yelp's data. In the most recent dataset you'll find information about businesses across 8 metropolitan areas in the USA and Canada.

https://www.kaggle.com/datasets/yelp-dataset/yelp-dataset

In [1]:
# import libraries
import pandas as pd
import numpy as np
import scipy.stats as st

In [2]:
# load the data
df = pd.read_json("yelp_academic_dataset_business.json", lines=True)
df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [3]:
# filter rows where the "categories" column contains the word "Restaurants"
restaurant_df = df[df["categories"].str.contains("Restaurants", case=False, na=False)]

restaurant_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '..."
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '..."
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'..."


In [5]:
# iterate over the attributes column and collect all unique keys
unique_attributes = set()
for attr in df["attributes"].dropna():
    unique_attributes.update(attr.keys())

# print out
unique_attributes_list = list(unique_attributes)
for attr in unique_attributes_list:
    print(attr)

RestaurantsPriceRange2
DietaryRestrictions
RestaurantsAttire
Open24Hours
BusinessAcceptsBitcoin
Corkage
HasTV
NoiseLevel
AcceptsInsurance
RestaurantsCounterService
DogsAllowed
OutdoorSeating
GoodForKids
Ambience
RestaurantsTakeOut
WheelchairAccessible
HairSpecializesIn
RestaurantsDelivery
RestaurantsGoodForGroups
Caters
RestaurantsReservations
ByAppointmentOnly
BestNights
BusinessParking
Music
BYOBCorkage
Alcohol
HappyHour
BikeParking
RestaurantsTableService
BusinessAcceptsCreditCards
CoatCheck
AgesAllowed
Smoking
GoodForMeal
GoodForDancing
DriveThru
WiFi
BYOB


In [6]:
# create a function to read the p-value
def p_value_reader(p_value, alpha):
    if p_value < alpha:
        print("Reject the Null Hypothesis.")
    else:
        print("Fail to reject the Null Hypothesis.")

### Hypothesis 1: Restaurants that are open have better ratings than those that are not.

Null Hypothesis: Open restaurants ratings <= Closed Restaurant ratings

Alternative Hypothesis: Open restaurants ratings > Closed restaurant ratings

(two sample test with a 1 tail)

In [7]:
# compare the length of the data
open_rest_stars = restaurant_df[restaurant_df["is_open"] == 1]["stars"]
close_rest_stars = restaurant_df[restaurant_df["is_open"] == 0]["stars"]

In [8]:
# check the mean
open_rest_stars.mean(), close_rest_stars.mean()

(3.523894589418927, 3.4975117180718707)

In [9]:
# function: performs 2 sample test based on the outcome of levene's test
def test_2sample(sample1, sample2, alpha, alternative):
    # levene's test
    stat, p_value = st.levene(sample1, sample2)
    # interpret the test
    if p_value < alpha:
        equal_var = False
        print("Reject the Null Hypothesis. Variances are unequal. Perform Welch's test.")
    else:
        equal_var = True
        print("Fail to reject the Null Hypothesis.Variances are equal. Perform 2-sample t test.")

    # 2 sample test
    t_stat, p_value = st.ttest_ind(
        sample1,
        sample2,
        equal_var = equal_var,
        alternative = alternative
    )
    print(f"The p_value is {p_value}.")
    p_value_reader(p_value, alpha)

In [10]:
# apply the function
test_2sample(open_rest_stars, close_rest_stars, 0.05, "greater")

Reject the Null Hypothesis. Variances are unequal. Perform Welch's test.
The p_value is 0.00017650010900392002.
Reject the Null Hypothesis.


In [11]:
open_rest_stars

3         4.0
5         2.0
9         1.5
11        4.0
12        2.5
         ... 
150323    4.5
150325    3.0
150327    4.0
150336    4.5
150339    4.5
Name: stars, Length: 34987, dtype: float64

### Hypothesis 2: Restaurants that deliver food have worse ratings

Null Hypothesis: Restaurants that deliver food ratings >= restaurants that don't deliver food ratings

Alternative Hypothesis: Restaurants that deliver food ratings < restaurants that don't deliver food ratings

(2 sample test with 1 tail)

In [12]:
# create a copy of the df
df_h2 = restaurant_df.copy()

In [13]:
# define a function to extract the "RestaurantsDelivery" value
def is_delivery(attributes):
    if attributes and "RestaurantsDelivery" in attributes:
        return attributes["RestaurantsDelivery"] == "True"
    return False

# apply the function to the attributes column and create a new column delivery_food
df_h2["delivery_food"] = df_h2["attributes"].apply(is_delivery)

In [14]:
df_h2.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours,delivery_food
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ...",False
5,CF33F8-E6oudUQ46HnavjQ,Sonic Drive-In,615 S Main St,Ashland City,TN,37015,36.269593,-87.058943,2.0,6,1,"{'BusinessParking': 'None', 'BusinessAcceptsCr...","Burgers, Fast Food, Sandwiches, Food, Ice Crea...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-22:0', '...",True
8,k0hlBqXX-Bt0vf1op7Jr1w,Tsevi's Pub And Grill,8025 Mackenzie Rd,Affton,MO,63123,38.565165,-90.321087,3.0,19,0,"{'Caters': 'True', 'Alcohol': 'u'full_bar'', '...","Pubs, Restaurants, Italian, Bars, American (Tr...",,False
9,bBDDEgkFA1Otx9Lfe7BZUQ,Sonic Drive-In,2312 Dickerson Pike,Nashville,TN,37207,36.208102,-86.76817,1.5,10,1,"{'RestaurantsAttire': ''casual'', 'Restaurants...","Ice Cream & Frozen Yogurt, Fast Food, Burgers,...","{'Monday': '0:0-0:0', 'Tuesday': '6:0-21:0', '...",True
11,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,,Tampa Bay,FL,33602,27.955269,-82.45632,4.0,10,1,"{'Alcohol': ''none'', 'OutdoorSeating': 'None'...","Vietnamese, Food, Restaurants, Food Trucks","{'Monday': '11:0-14:0', 'Tuesday': '11:0-14:0'...",False


In [15]:
# star rating for delivery and non delivery restaurants
delivery_stars = df_h2[df_h2["delivery_food"] == True]["stars"]
non_delivery_stars = df_h2[df_h2["delivery_food"] == False]["stars"]

In [16]:
# check the mean
delivery_stars.mean(), non_delivery_stars.mean()

(3.406033450640351, 3.6369358429467717)

In [17]:
# apply the test function
test_2sample(delivery_stars, non_delivery_stars, 0.05, "less")

Reject the Null Hypothesis. Variances are unequal. Perform Welch's test.
The p_value is 3.1154302069409457e-227.
Reject the Null Hypothesis.


### Hypothesis 3: Restaurants that allow smoking are less likely to be open

Null Hypothesis: There is no relationship between the variables

Alternative Hypothesis: Restaurants that allow smoking are less likely to be open

(chisquare test)

In [18]:
# create a copy of df
df_h3 = restaurant_df.copy()

In [19]:
# assuming the attributes for smoking is in a dictionary and "True" indicates smoking is allowed
df_h3["allow_smoking"] = df_h3["attributes"].apply(lambda x: x.get("Smoking") == "True" if x else False)

In [23]:
df_h3.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours', 'allow_smoking'],
      dtype='object')

In [24]:
# create a contingency table to observe the frequency distribution between the smoking
contingency_table = pd.crosstab(df_h3["allow_smoking"], df_h3["is_open"])
print(contingency_table)

is_open            0      1
allow_smoking              
False          17281  34987


In [25]:
# Chi Square Test
stat, p_value, dof, expected_freq = st.chi2_contingency(observed = contingency_table)

# Print and interpret the p_value
print(f"The p-value is {p_value}")
p_value_reader(p_value, 0.05)

The p-value is 1.0
Fail to reject the Null Hypothesis.
