In [2]:
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 144

In [3]:
from grader import score

In [4]:
import gzip
import ujson as json
import pprint

import pandas as pd
import numpy as np

# PS Miniproject

## Introduction

The objective of this miniproject is to apply probability and statistics to yelp data for businesses primarily in AZ and NV. We will study the data to find meaningful patterns in the ratings and data for these businesses.
`
## Metric

Your answers will be assessed based based on how well you apply these standard statistical techniques.

## Download and parse the incoming data

The data are [here](s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz).
Notice that each row of the file is a json blurb.  You can read it with Python.

In [5]:
# !mkdir -p data/
# !aws s3 cp s3://dataincubator-course/mldata/yelp_train_academic_dataset_business.json.gz data

In [6]:
!zcat data/yelp_train_academic_dataset_business.json.gz | head -5


{"business_id": "vcNAWiLM4dR7D2nwwJ7nCA", "full_address": "4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018", "hours": {"Tuesday": {"close": "17:00", "open": "08:00"}, "Friday": {"close": "17:00", "open": "08:00"}, "Monday": {"close": "17:00", "open": "08:00"}, "Wednesday": {"close": "17:00", "open": "08:00"}, "Thursday": {"close": "17:00", "open": "08:00"}}, "open": true, "categories": ["Doctors", "Health & Medical"], "city": "Phoenix", "review_count": 7, "name": "Eric Goldberg, MD", "neighborhoods": [], "longitude": -111.98375799999999, "state": "AZ", "stars": 3.5, "latitude": 33.499313000000001, "attributes": {"By Appointment Only": true}, "type": "business"}
{"business_id": "JwUE5GmEO-sH1FuwJgKBlQ", "full_address": "6162 US Highway 51\nDe Forest, WI 53532", "hours": {}, "open": true, "categories": ["Restaurants"], "city": "De Forest", "review_count": 26, "name": "Pine Cone Restaurant", "neighborhoods": [], "longitude": -89.335843999999994, "state": "WI", "stars": 4.0, "latitude

In [7]:
with gzip.open("data/yelp_train_academic_dataset_business.json.gz") as fin:
    data = [json.loads(line) for line in fin]
df = pd.DataFrame(data)

In [8]:
data[:2]

[{u'attributes': {u'By Appointment Only': True},
  u'business_id': u'vcNAWiLM4dR7D2nwwJ7nCA',
  u'categories': [u'Doctors', u'Health & Medical'],
  u'city': u'Phoenix',
  u'full_address': u'4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
  u'hours': {u'Friday': {u'close': u'17:00', u'open': u'08:00'},
   u'Monday': {u'close': u'17:00', u'open': u'08:00'},
   u'Thursday': {u'close': u'17:00', u'open': u'08:00'},
   u'Tuesday': {u'close': u'17:00', u'open': u'08:00'},
   u'Wednesday': {u'close': u'17:00', u'open': u'08:00'}},
  u'latitude': 33.499313,
  u'longitude': -111.983758,
  u'name': u'Eric Goldberg, MD',
  u'neighborhoods': [],
  u'open': True,
  u'review_count': 7,
  u'stars': 3.5,
  u'state': u'AZ',
  u'type': u'business'},
 {u'attributes': {u'Accepts Credit Cards': True,
   u'Alcohol': u'none',
   u'Ambience': {u'casual': False,
    u'classy': False,
    u'divey': False,
    u'hipster': False,
    u'intimate': False,
    u'romantic': False,
    u'touristy': False,
    u'

In [9]:
df.head(2)

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,{u'By Appointment Only': True},vcNAWiLM4dR7D2nwwJ7nCA,"[Doctors, Health & Medical]",Phoenix,"4840 E Indian School Rd\nSte 101\nPhoenix, AZ ...","{u'Thursday': {u'close': u'17:00', u'open': u'...",33.499313,-111.983758,"Eric Goldberg, MD",[],True,7,3.5,AZ,business
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,-89.335844,Pine Cone Restaurant,[],True,26,4.0,WI,business


## city_z_score
Do certain cities have better amenities than others?  We can test this by comparing the average score or rating of businesses in one city vs. the average across all businesses in the dataset.  We will also want to use statistics to help determine if this difference is statistically significant.

Compute the z score for average ratings for every city.

**Note**:
1. Only look at cities with at least five businesses, both for the city averages and when computing the average across all cities.
2. Estimate the population's standard deviation and mean.
3. Estimate each city's mean and use the population statistics to compute the z score.
4. Do not do any cleaning of the city names.  (You will get Las Vegas twice.  That's okay.)

In [10]:
city_counts = df.groupby('city').size().sort_values()

In [11]:
popular_cities = city_counts[city_counts >= 5].index   #Cool filter.  Functions like a where clause.  Will be applied to popular cities.

In [12]:
df_popular_cities = df[df['city'].isin(city_counts[city_counts >= 5].index)]

In [75]:
states = df_popular_cities.groupby('state')
states


<pandas.core.groupby.DataFrameGroupBy object at 0x7ff5f8f3f650>

In [14]:
average = df_popular_cities['stars'].mean()

In [15]:
stdev = df_popular_cities['stars'].std(ddof=1)

In [16]:
mean_by_city = df_popular_cities.groupby('city')['stars'].mean()

In [17]:
count_by_city = df_popular_cities.groupby('city').size()

In [18]:
# ~1:45 in #9: 
# the normalization '(stdev / np.sqrt(count_ty_city_))' is trying to ensure that outliers are normalized out.
# As the count_by_city gets larger, its square root also gets larger.
# Dividing the Standard Deviation by the square root of the count 
# As the count gets larger, any differences between the mean by city and average will result in a larger z-score.
# If you had a city with a million business and the mean was .1 higher than the average, because there is so much data there,
# you may be able to conclude that, although the difference is small, it is statistically significantly.  
# If you had a city with only 5 businesses with the same difference between the mean and the population average, 
# you would likely conclude that the difference is not as significant.  
# This is how count by city works itself into this equation... to equalize the differences between city sizes.

#####  This is what standard error hopes to accomplish #####

solution = (mean_by_city - average) / (stdev / np.sqrt(count_by_city))

In [19]:
# We're done the calculations, but the data type is a series.
# We need it to be a set of tuples to do the scoring piece. 
#list(solution.to_frame().itertuples())
#list(solution.iteritems()) # I like this solution best.  Seems simplest.
#zip(solution.index, solution)

In [20]:
# Convert the series into a set of tuples... 
# We learned about itertuples() at the end of the pandas lecture.
# TODO: Review Pandas Lecture !!!!!

city_z_score_tuples = list(solution.to_frame().itertuples())
#city_z_score_tuples

In [21]:
# This defaults to a partial answer.  If you execute this, you will get a low score.  
def city_z_score():
    # return [('Ahwatukee', 0.047382042549430063)] * 70
    return city_z_score_tuples
score('ps__city_z_score', city_z_score)

Your score:  1.0


**Question:**
1. For computing the standard deviation, you should really be assuming a single degree of freedom.  Why is this not that important in this case?
> don't know
2. Which cities have the most statistically high ratings?  Do you notice a pattern?
> Does this mean that Las Vegas has low ratings in general?

## Data Sets Notation Review (RBM)

**Note**:
    
    > Square Brackets ([]) are used for python lists
    > Curly Brackets ({}) with no colons are for sets
    > Curly Brackets ({}) with colons are for dictionaries
    > Parentheses (()) are used for tuples

## good_for_kids_ci
Which cities are the most child friendly?  Let's estimate the 2-sigma confidence interval for fraction of venues which are 'Good for Kids' in each city.

**Note**:
1. Ignore any businesses that don't have the 'Good for Kids' attribute specified. **IMPORTANT**: These are the businesses for which you have no information and they should be filtered out. All businesses that have the 'Good for Kids' attribute, whether that be True or False, should be included.
2. Only look at cities with at least five businesses (satisfying the above condition).
3. In this simplified schema, a venue is either 'Good for Kids' or not.  What variable are we using to model the underlying distribution?
4. Notice that some of have a unrealistic confidence intervals.  Can you apply the "Rule of Three" to generate more realistic confidence intervals in this case?

In [58]:
d1 = {"Good for Kids": True, "Formal": False}
d2 = {"Good for Kids": False, "Formal": True}
d3 = {"Allows Pets": True}

def get_GfK(x):
    if x.has_key('Good for Kids'):
        return x["Good for Kids"]
    return None
    
print get_GfK(d1)
print get_GfK(d2)
print get_GfK(d3)
    

True
False
None


In [69]:
#df['has_gfk'] = df['attributes'].apply(lambda x: x.has_key("Good for Kids"))
df['gfk'] = df['attributes'].apply(get_GfK)
df_hasgfk = df[df['gfk'].notnull()]

In [84]:
city_hasgfk_gt5 = df_hasgfk.groupby('city')['gfk'].count()


In [86]:
city_hasgfk_gt5  >= 5

city
Ahwatukee               False
Anthem                   True
Apache Junction          True
Arcadia                 False
Atlanta                 False
Avondale                 True
Boulder City             True
Buckeye                  True
Cambridge               False
Carefree                 True
Casa Grande              True
Cave Creek               True
Central City Village    False
Chandler                 True
City of Edinburgh       False
Clark County            False
Coolidge                 True
Cottage Grove            True
Cramond                 False
Dalkeith                False
Dane                    False
De Forest               False
DeForest                False
Edinburgh                True
El Mirage                True
Enterprise              False
Fitchburg                True
Florence                 True
Fort McDowell           False
Fort Mcdowell           False
                        ...  
San Tan Valley           True
Scottsdale               True
Sedon

In [76]:
# popular_cities = city_counts[city_counts >= 5].index
gfk_cities = df_hasgfk[df_hasgfk >= 5]
gfk_cities


Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,has_gfk,gfk
1,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",JwUE5GmEO-sH1FuwJgKBlQ,[Restaurants],De Forest,"6162 US Highway 51\nDe Forest, WI 53532",{},43.238893,,Pine Cone Restaurant,[],,26.0,,WI,business,,
2,"{u'Take-out': True, u'Outdoor Seating': False,...",uGykseHzyS5xAMWoN6YUqA,"[American (Traditional), Restaurants]",De Forest,"505 W North St\nDe Forest, WI 53532","{u'Monday': {u'close': u'22:00', u'open': u'06...",43.252267,,Deforest Family Restaurant,[],,16.0,,WI,business,,
9,"{u'Alcohol': u'full_bar', u'Noise Level': u'lo...",_wZTYYL7cutanzAnJUTGMA,"[Bars, American (Traditional), Nightlife, Loun...",Mc Farland,"4506 Larson Beach Rd\nMc Farland, WI 53558",{},43.017701,,Beach House Restaurant & Lounge,[],,31.0,,WI,business,,
11,{u'Good for Kids': True},1tkeiIa-daD8LbM6mHm_9A,"[Active Life, Bowling]",Mc Farland,"4711 Farwell St\nMc Farland, WI 53558","{u'Monday': {u'close': u'02:00', u'open': u'11...",43.013156,,Spartan Bowl,[],,,,WI,business,,
12,"{u'Take-out': True, u'Price Range': 1, u'Outdo...",zOc8lbjViUZajbY7M0aUCQ,"[Pizza, Restaurants]",Mc Farland,"5813 Main St\nMc Farland, WI 53558","{u'Tuesday': {u'close': u'22:00', u'open': u'1...",43.014164,,Spartan Pizza,[],,,,WI,business,,
13,"{u'Take-out': True, u'Accepts Credit Cards': T...",UgjVZTSOaYoEvws_lAP_Dw,"[Chinese, Restaurants]",Mc Farland,"4850 Larson Beach Rd\nMc Farland, WI 53558",{},43.017939,,Main Moon Chinese Restaurant,[],,8.0,,WI,business,,
14,"{u'Take-out': True, u'Caters': True, u'Attire'...",HxPpZSY6Q1eARuiahhra6A,"[Event Planning & Services, Party & Event Plan...",Middleton,"6401 University Ave\nMiddleton, WI 53562",{},43.093265,,Crandalls Carryout & Catering,[],,5.0,,WI,business,,
15,"{u'Take-out': True, u'Price Range': 2, u'Outdo...",SKLw05kEIlZcpTD5pqma8Q,"[Party & Event Planning, Asian Fusion, Event P...",Middleton,"2039 Allen Blvd\nMiddleton, WI 53562","{u'Monday': {u'close': u'14:00', u'open': u'11...",43.090642,,Imperial Garden Chinese Restaurant,[],,41.0,,WI,business,,
16,"{u'Take-out': True, u'Accepts Credit Cards': T...",77ESrCo7hQ96VpCWWdvoxg,"[Mexican, Restaurants]",Middleton,"6230 University Ave\nMiddleton, WI 53562","{u'Monday': {u'close': u'21:00', u'open': u'06...",43.091061,,Mi Cocina,[],,17.0,,WI,business,,
17,{u'Good for Kids': True},368m5-RtrrYWf4hPM_bMlg,"[Active Life, Arts & Entertainment, Stadiums &...",Middleton,"8312 Forsythia St\nMiddleton, WI 53562","{u'Monday': {u'close': u'21:00', u'open': u'09...",43.110157,,Keva Sports Center,[],,,,WI,business,,


In [None]:
# Cities that have 5 business with GFK set (either T or F)


In [None]:

good_for_kids_ci_tuples = ...

In [None]:
    return [('Anthem', (0.85714285714285721, 1.0))] * 51

score('ps__good_for_kids_ci', good_for_kids_ci)

**Question**: For which cities do you need to apply the Rule of Three?  Is there a data feature that's common amongst them?

## category_star_ci
Some categories may be more popular than others.  Compute the 3-sigma confidence interval for the average number of reviews for businesses in each category.

**Note**:
1. Category is actually a list of categories that apply to the business.  Let's just set a business's category to be the first one in this list.  Ignore ones that do not have categories defined.
2. Only consider categories that have at least 40 businesses.
3. What is a good distribution to assume for the number of reviews?  Use the standard deviation from this distribution rather than the sample deviation in computing the confidence interval.

In [None]:

category_reviews_ci_tuples = ...

In [None]:
def category_reviews_ci():
    return [('Active Life', (16.788552393175845, 17.431729675326505))] * 119

score('ps__category_reviews_ci', category_reviews_ci)

**Questions**:
1. What categories of businesses tend to have fewer reviews?
2. What categories of businesses tend to have more reviews?

## review_count_z_score
Are more popular venues more likely to be highly rated?  Given the large variation in reporting amongst categories, we know that popularity depends on the category.

Separate the venues into those that have (strictly) more reviews than the median for their category, and those that have the same number or fewer.  For each category, compute the average number of stars for both those businesses with more than and less than or equal to the median number of reviews.  Report the z score of the difference of those means.

**Note**:
1. Again, category of a business will be defined as the first category and you should ignore businesses that do not have categories.
2. Likewise, only consider categories with at least 40 businesses.

In [None]:

review_count_z_score_tuples = ...

In [None]:
def review_count_z_score():
    return [('Active Life', -4.8885384947587749)] * 119

score('ps__review_count_z_score', review_count_z_score)

**Questions**:
1. What categories benefit from having more reviewers?
2. What categories are hurt by having more reviewers?
3. Why did we choose to separate each category by the median number of reviews rather than the mean number of reviews?

## For Fun (not graded)!
What types of categories have the most disagreement?  Use the variance as a proxy.  How would you compute the 2-sigma confidence interval?

*Copyright &copy; 2017 The Data Incubator.  All rights reserved.*