# Code for Yelp Data Analysis

In [1]:
import numpy as np
import json
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import re
import copy
import os
import csv
from sklearn import tree 
from sklearn.model_selection import train_test_split
from scipy import stats
import researchpy as rp

## Load Data, Description, Preprocessing
#### First load data and filter targeting business.

In [2]:
def read_json(path):
    with open(path, encoding='utf-8') as f:
        iter_f = iter(f)
        line = f.readline()
        text = []
        for line in iter_f: 
            d = json.loads(line)
            text.append(d)
        f.close()
    return(text)

In [3]:
def write_json(path, data):
    with open(path, 'w', encoding="utf-8") as f:
        return json.dump(data, f, indent=2)

In [4]:
raw_business=read_json("Stat628/business.json")
raw_review=read_json("Stat628/review.json")
raw_tip=read_json("Stat628/tip.json")
raw_user=read_json("Stat628/user.json")

In [5]:
# filter business 
# filter rule: containing 'seafood', 'restaurant', but no 'Steakhouse'; category's length less than 50(roughly); number of reviews larger than 50;
seafood_business=[]
index=[]
for i in range(len(raw_business)):
    if raw_business[i]['categories'] is None:
        continue
    if "Seafood" in raw_business[i]['categories'] and "Restaurants" in  raw_business[i]['categories'] and 'Steakhouse' not in raw_business[i]['categories'] and len(raw_business[i]['categories'])<=50 and raw_business[i]['review_count'] >= 50:
        seafood_business.append(raw_business[i])
        index.append(i)
# 433 businesses

In [6]:
'''
# store filtered data
# business
business_df=pd.DataFrame(seafood_business)
business_df.to_csv('seafood_business.csv')
#reviews
review_df=pd.DataFrame(all_review)
review_df.to_csv('all_review.csv')
'''

"\n# store filtered data\n# business\nbusiness_df=pd.DataFrame(seafood_business)\nbusiness_df.to_csv('seafood_business.csv')\n#reviews\nreview_df=pd.DataFrame(all_review)\nreview_df.to_csv('all_review.csv')\n"

#### Analysis of attributes in business data.
* Count number of attributes
* For each attribute, count the number of business providing this attribute

In [8]:
# Put attributes together
attributes_all=[]
for item in seafood_business:
    if item['attributes'] != None:
        attributes_all.extend(list(item['attributes'].keys()))
attributes_all = list(set(attributes_all))

In [9]:
def attributes_counts(attribute):
    count = 0
    for j in range(len(seafood_business)):
        if seafood_business[j]['attributes'] is None:
            continue
        else:
            attributes = seafood_business[j]['attributes'].keys()
            attributes = list(attributes)
            if attribute in attributes: # if the given attribute exists
                count += 1
    return(count)

In [10]:
# Count number of business containing these attributes
# Then sort accroding to counts.
attributes_all = sorted(attributes_all, key=str.lower)
counts = list()

for i in range(len(attributes_all)):
    count = attributes_counts(attributes_all[i])
    counts.append(count)

attributes_all_df = pd.DataFrame(attributes_all)
counts_df = pd.DataFrame(counts)
attributes_counts = pd.concat([attributes_all_df, counts_df], axis=1)
attributes_counts.columns = ["Attributes", "Counts"]

attributes_counts = attributes_counts.sort_values(by='Counts')
attributes_counts

Unnamed: 0,Attributes,Counts
7,BYOB,1
13,DriveThru,5
17,HappyHour,9
29,Smoking,9
14,GoodForDancing,10
10,CoatCheck,10
2,BestNights,10
19,Music,11
11,Corkage,12
8,BYOBCorkage,35


Drop attributes with less than 50% businesses

In [11]:
attributes_selected=list(attributes_counts[attributes_counts['Counts']>=225]['Attributes'])
attributes_counts[attributes_counts['Counts']>=225]
#len(selected_attributes)
#19 attributes are selected

Unnamed: 0,Attributes,Counts
27,RestaurantsTableService,225
5,BusinessAcceptsCreditCards,347
16,GoodForMeal,381
9,Caters,401
3,BikeParking,414
31,WiFi,426
20,NoiseLevel,427
22,RestaurantsAttire,430
0,Alcohol,431
18,HasTV,432


Consider all attributes, 5 attributes don't have missing values; 5 miss only 1 value; 1 miss 2 values; 1 miss 3 values; 

In [12]:
business_attributes=[]
for business in seafood_business:
    new_dic={}
    new_dic['business_id']=business['business_id']
    new_dic['stars']=business['stars']
    for attribute in attributes_selected:
        if attribute not in business['attributes'].keys():
            new_dic[attribute]=None
        else:
            new_dic[attribute]=business['attributes'][attribute]
    business_attributes.append(new_dic)

Correct the form of some attributes.\
Imputation

In [13]:
def correct_form(attribute):
    for business in business_attributes:
        if business[attribute] != None:
            if re.search('^u(.*)',business[attribute]):
                business[attribute]=re.search('^u(.*)',business[attribute]).group(1)

In [14]:
correct_form('WiFi')
correct_form('NoiseLevel')
correct_form('RestaurantsAttire')
correct_form('Alcohol')

In [15]:
business_attributes_df=pd.DataFrame(business_attributes)
#business_attributes_df

In [16]:
def find_none(attribute):
    indexs=[]
    for index, row in business_attributes_df.iterrows():
        if row[attribute] is None:
            indexs.append(index)
    return(indexs)
def exclude_none(attribute):
    indexs=[]
    for index, row in business_attributes_df.iterrows():
        if row[attribute] != None:
            indexs.append(index)
    return(indexs)

Impute manually for the attributes missing only few businesses.\
Ways: search businesses on Yelp website and impute

'HasTV','RestaurantsDelivery','RestaurantsPriceRange2','Ambience','OutdoorSeating' only miss 1 attribute

In [17]:
for item in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','Ambience','OutdoorSeating']:
    print(find_none(item))

[267]
[258]
[267]
[3]
[267]


In [18]:
seafood_business.pop(3)
business_df=pd.DataFrame(seafood_business)
business_df.to_csv('seafood_business.csv')

In [19]:
# For HasTV: 267, True
business_attributes[267]['HasTV']='True'

# For RestaurantsDelivery: 258, False
business_attributes[258]['RestaurantsDelivery']='False'

# For RestaurantsPriceRange2: 267, 2
business_attributes[267]['RestaurantsPriceRange2']='2'

# OutdoorSeating: Yes
business_attributes[267]['OutdoorSeating']='True'

# For Ambience: 3, closed, delete
#business_attributes.pop(3)

In [20]:
business_attributes_df=pd.DataFrame(business_attributes)

Alcohol miss 2 values and RestaurantsAttire miss 3 values

In [21]:
for item in ['Alcohol','RestaurantsAttire']:
    print(find_none(item))

[218, 267]
[267, 390, 415]


In [22]:
# For Alcohol, 217,  266, full_bar

# For RestaurantsAttire, 266, 389, 414 

##### Use decison tree to impute missing attributes.

##### For Alcohol, 2 missing values

In [32]:
Al_dict={"'beer_and_wine'":1,"'full_bar'":2,"'none'":3}
reverse_Al={1:"'beer_and_wine'",2:"'full_bar'",3:"'none'"}
t_Al=exclude_none('Alcohol')
f_Al=find_none('Alcohol')
X_t=[]
Y_t=[]
for i in t_Al:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(Al_dict[business_attributes[i]['Alcohol']])
X_f=[]
for i in f_Al:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_Al=clf.predict(X_f)

0.632183908045977


In [34]:
for i in range(2):
    business_attributes[f_Al[i]]['Alcohol']=reverse_Al[pre_Al[i]]
    business_attributes_df['Alcohol'].loc[f_Al[i]]=reverse_Al[pre_Al[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For Noiselevel, 5 missing values

In [35]:
No_dict={"'average'":1, "'loud'":2, "'quiet'":3, "'very_loud'":4}
reverse_No={1:"'average'", 2:"'loud'", 3:"'quiet'", 4:"'very_loud'"}
t_No=exclude_none('NoiseLevel')
f_No=find_none('NoiseLevel')
X_t=[]
Y_t=[]
for i in t_No:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(No_dict[business_attributes[i]['NoiseLevel']])
X_f=[]
for i in f_No:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_No=clf.predict(X_f)

0.8255813953488372


In [37]:
for i in range(len(f_No)):
    business_attributes[f_No[i]]['NoiseLevel']=reverse_No[pre_No[i]]
    business_attributes_df['NoiseLevel'].loc[f_No[i]]=reverse_No[pre_No[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For BusinessAcceptCreditCard ,  85 missing values

In [38]:
Credit_dict={'False':0,'True':1}
reverse_Credit={0:'False',1:'True'}
t_Credit=exclude_none('BusinessAcceptsCreditCards')
f_Credit=find_none('BusinessAcceptsCreditCards')
X_t=[]
Y_t=[]
for i in t_Credit:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(Credit_dict[business_attributes[i]['BusinessAcceptsCreditCards']])
X_f=[]
for i in f_Credit:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_Credit=clf.predict(X_f)

0.9714285714285714


In [40]:
for i in range(len(f_Credit)):
    business_attributes[f_Credit[i]]['BusinessAcceptsCreditCards']=reverse_Credit[pre_Credit[i]]
    business_attributes_df['BusinessAcceptsCreditCards'].loc[f_Credit[i]]=reverse_Credit[pre_Credit[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For WiFi, 6 missing values

In [42]:
wifi_dict={"'free'":1,"'no'":2,"'paid'":3}
reverse_wifi={1:"'free'",2:"'no'",3:"'paid'"}
t_wifi=exclude_none('WiFi')
f_wifi=find_none('WiFi')
X_t=[]
Y_t=[]
for i in t_wifi:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(wifi_dict[business_attributes[i]['WiFi']])
X_f=[]
for i in f_wifi:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_wifi=clf.predict(X_f)

0.5930232558139535


In [44]:
for i in range(len(f_wifi)):
    business_attributes[f_Credit[i]]['WiFi']=reverse_wifi[pre_wifi[i]]
    business_attributes_df['WiFi'].loc[f_wifi[i]]=reverse_wifi[pre_wifi[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For Caters, 32 missing values

In [45]:
Cater_dict={'False':0,'True':1}
reverse_Cater={0:'False',1:'True'}
t_Cater=exclude_none('Caters')
f_Cater=find_none('Caters')
X_t=[]
Y_t=[]
for i in t_Cater:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(Cater_dict[business_attributes[i]['Caters']])
X_f=[]
for i in f_Cater:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [46]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_Cater=clf.predict(X_f)

0.7160493827160493


In [47]:
for i in range(len(f_Cater)):
    business_attributes[f_Cater[i]]['Caters']=reverse_Cater[pre_Cater[i]]
    business_attributes_df['Caters'].loc[f_Cater[i]]=reverse_Cater[pre_Cater[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For BikeParking, 19 missing values

In [48]:
bike_dict={'False':0,'True':1}
reverse_bike={0:'False',1:'True'}
t_bike=exclude_none('BikeParking')
f_bike=find_none('BikeParking')
X_t=[]
Y_t=[]
for i in t_bike:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(Cater_dict[business_attributes[i]['BikeParking']])
X_f=[]
for i in f_bike:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [49]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_bike=clf.predict(X_f)

0.7349397590361446


In [50]:
for i in range(len(f_bike)):
    business_attributes[f_bike[i]]['BikeParking']=reverse_bike[pre_bike[i]]
    business_attributes_df['BikeParking'].loc[f_bike[i]]=reverse_bike[pre_bike[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For RestaurantsAttire, 3 missing values

In [52]:
attr_dict={"'casual'":1,"'dressy'":2}
reverse_attr={1:"'casual'",2:"'dressy'"}
t_attr=exclude_none('RestaurantsAttire')
f_attr=find_none('RestaurantsAttire')
X_t=[]
Y_t=[]
for i in t_attr:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(attr_dict[business_attributes[i]['RestaurantsAttire']])
X_f=[]
for i in f_attr:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [53]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_attr=clf.predict(X_f)

0.9534883720930233


In [54]:
for i in range(len(f_attr)):
    business_attributes[f_attr[i]]['RestaurantsAttire']=reverse_attr[pre_attr[i]]
    business_attributes_df['RestaurantsAttire'].loc[f_attr[i]]=reverse_attr[pre_attr[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


##### For RestaurantsTableService, 208 missing values

In [55]:
ser_dict={'False':0,'True':1}
reverse_ser={0:'False',1:'True'}
t_ser=exclude_none('RestaurantsTableService')
f_ser=find_none('RestaurantsTableService')
X_t=[]
Y_t=[]
for i in t_ser:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_t.append(x)
    Y_t.append(ser_dict[business_attributes[i]['RestaurantsTableService']])
X_f=[]
for i in f_ser:
    x=[]
    for attribute in ['HasTV','RestaurantsDelivery','RestaurantsPriceRange2','OutdoorSeating','RestaurantsGoodForGroups','RestaurantsReservations','RestaurantsTakeOut','GoodForKids']:
        if business_attributes[i][attribute]=='False':
            x.append(0)
        elif business_attributes[i][attribute]=='True':
            x.append(1)
        else:
            x.append(float(business_attributes[i][attribute]))
    X_f.append(x)

In [56]:
X_train, X_test, y_train, y_test = train_test_split(X_t, Y_t, test_size=0.2, random_state=0)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)
print(clf.score(X_test,y_test))
pre_ser=clf.predict(X_f)

0.9555555555555556


In [57]:
for i in range(len(f_ser)):
    business_attributes[f_ser[i]]['RestaurantsTableService']=reverse_ser[pre_ser[i]]
    business_attributes_df['RestaurantsTableService'].loc[f_ser[i]]=reverse_ser[pre_ser[i]]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [58]:
# restore the data
business_attributes_df.to_csv('business_attributes.csv')

#### Anova analysis on different attributes

In [24]:
# GoodForKids
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['GoodForKids'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['GoodForKids'] == 'True'])

F_onewayResult(statistic=9.100096148112431, pvalue=0.0027072886734485963)

In [25]:
# RestaurantsTakeOut
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['RestaurantsTakeOut'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['RestaurantsTakeOut'] == 'True'])

F_onewayResult(statistic=4.090351873401226, pvalue=0.04374575702447464)

In [26]:
# RestaurantsReservations
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['RestaurantsReservations'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['RestaurantsReservations'] == 'True'])

F_onewayResult(statistic=0.15540637866149778, pvalue=0.6936164344420097)

In [27]:
# RestaurantsGoodForGroups
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['RestaurantsGoodForGroups'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['RestaurantsGoodForGroups'] == 'True'])

F_onewayResult(statistic=3.8154963753985185, pvalue=0.051427197717029804)

In [28]:
# OutdoorSeating
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['OutdoorSeating'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['OutdoorSeating'] == 'True'])

F_onewayResult(statistic=0.05842790095446311, pvalue=0.8091131472636344)

In [29]:
# RestaurantsPriceRange2
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['RestaurantsPriceRange2'] == '1'], 
             business_attributes_df['stars'][business_attributes_df['RestaurantsPriceRange2'] == '2'],
              business_attributes_df['stars'][business_attributes_df['RestaurantsPriceRange2'] == '3'],
              business_attributes_df['stars'][business_attributes_df['RestaurantsPriceRange2'] == '4'])

F_onewayResult(statistic=1.9688781222229454, pvalue=0.1179467902424339)

In [30]:
# RestaurantsDelivery
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['RestaurantsDelivery'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['RestaurantsDelivery'] == 'True'])

F_onewayResult(statistic=1.1917046680733365, pvalue=0.27559583220665174)

In [31]:
# HasTV
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['HasTV'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['HasTV'] == 'True'])

F_onewayResult(statistic=0.3704126167173948, pvalue=0.5431000615575006)

In [59]:
# NoiseLevel
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['NoiseLevel'] == "'average'"], 
             business_attributes_df['stars'][business_attributes_df['NoiseLevel'] == "'loud'"],
             business_attributes_df['stars'][business_attributes_df['NoiseLevel'] == "'quiet'"],
              business_attributes_df['stars'][business_attributes_df['NoiseLevel'] == "'very_loud'"])

F_onewayResult(statistic=6.831904166749505, pvalue=0.00016618433473521553)

In [60]:
# Alcohol
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['Alcohol'] == "'beer_and_wine'"], 
             business_attributes_df['stars'][business_attributes_df['Alcohol'] == "'full_bar'"],
             business_attributes_df['stars'][business_attributes_df['Alcohol'] == "'none'"] )

F_onewayResult(statistic=4.546269917669344, pvalue=0.011121551251475353)

In [61]:
# BusinessAcceptsCreditCards
stats.f_oneway(business_attributes_df['stars'][business_attributes_df['BusinessAcceptsCreditCards'] == 'False'], 
             business_attributes_df['stars'][business_attributes_df['BusinessAcceptsCreditCards'] == 'True'])

F_onewayResult(statistic=4.3751761991748825, pvalue=0.037050784854024195)

In [None]:
# use R to fit a linear regression model.