# Yelp Data Cleaning
This script imports, cleans, and filters variables for input into a regression model to predict median home price for a neighborhood. 
data source: https://www.yelp.com/dataset/download

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Set working directory
project_dir = os.getcwd()
yelp_dir = os.path.join(project_dir, 'data', 'yelp_dataset')

## Business dataset

In [3]:
# Load business dataset
business_df = pd.read_json(os.path.join(yelp_dir, 'business.json'), lines=True)
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [4]:
business_df.state.unique()

array(['AZ', 'ON', 'NC', 'AB', 'NV', 'OH', 'PA', 'QC', 'WI', 'IL', 'NY',
       'SC', 'TX', 'UT', 'NM', 'FL', 'CA', 'VA', 'BAS', 'NE', 'AK', 'XGM',
       'WA', 'XWY', 'CON', 'BC', 'GA', 'VT', 'CT', 'AL', 'DUR', 'TN',
       'NJ', 'AR', 'XGL', 'DOW'], dtype=object)

In [5]:
# Load table for most populated cities in these states
pop = pd.read_csv('/Users/AuerPower/Metis/git/predict_gentrification_with_yelp/data/most_populous_cities.csv')
pop = pop.sort_values(by='estimate2018', ascending=False)
pop = pop[pop['rank']<=50]

In [6]:
# Select top 50 most populous cities for dataset
#business_df  = business_df[business_df['city'].isin(pop['City'])]

In [7]:
business_df.city.unique()

array(['Phoenix', 'Mississauga', 'Charlotte', ..., 'Henderson Nevada',
       'Boston', 'Spring Hill City View'], dtype=object)

In [8]:
business_df.shape

(192609, 14)

### Average number of yelp reviews per business

In [9]:
# groupby postal_code and find mean of review_count
reviews_per_business = business_df.groupby(['postal_code']).mean().iloc[:,-2]
reviews_per_business

postal_code
             9.100152
02645        4.000000
05440        4.000000
06032        9.000000
06280        3.000000
              ...    
V0J         10.000000
V5H 1J9    373.000000
W8M 3T5     20.000000
m2k 0c5      3.000000
t2g 3m5      3.000000
Name: review_count, Length: 17541, dtype: float64

### Average number of stars per business

In [10]:
# groupby postal_code and find mean of stars
avg_stars = business_df.groupby(['postal_code']).mean().iloc[:,-3]
avg_stars

postal_code
           3.358877
02645      5.000000
05440      4.250000
06032      4.000000
06280      5.000000
             ...   
V0J        4.500000
V5H 1J9    3.500000
W8M 3T5    4.000000
m2k 0c5    4.500000
t2g 3m5    5.000000
Name: stars, Length: 17541, dtype: float64

### Number of unique catgories of business

In [11]:
categories = business_df.groupby('postal_code')['categories'].nunique()

# did this work? what is the number of unique catgories for zip '28210'
categories.loc['28210']

323

### Sushi

In [12]:
business_df['sushi'] = business_df['categories'].str.contains('Sushi Bars', regex=True)
business_df['sushi'] = business_df['sushi'].replace({False: 0, True: 1})
sushi = business_df.groupby('postal_code').sum().iloc[:,-1]
sushi.loc['28210']

5.0

In [13]:
sushi[sushi == 20]

postal_code
89102    20.0
Name: sushi, dtype: float64

In [14]:
### New American Restaurants
business_df['New_american'] = business_df['categories'].str.contains('American (New)', regex=True)
business_df['New_american'] = business_df['New_american'].replace({False: 0, True: 1})
New_american = business_df.groupby('postal_code').sum().iloc[:,-1]
New_american.loc['28210']

  return func(self, *args, **kwargs)


5.0

### Number of wine bars/cocktail bars/breweries

In [15]:
business_df['bars'] = business_df['categories'].str.contains('Wine Bars|Cocktail Bars|Breweries|Brewpubs', regex=True)
business_df['bars'] = business_df['bars'].replace({False: 0, True: 1})
bars = business_df.groupby('postal_code').sum().bars #iloc[:,-1]
bars.loc['28210']

9.0

### Number of coffee shops

In [16]:
business_df['coffee'] = business_df['categories'].str.contains('Coffee & Tea|Coffee Roasteries|Bakeries|Ice Cream', regex=True)
business_df['coffee'] = business_df['coffee'].replace({False: 0, True: 1})
coffee = business_df.groupby('postal_code').sum().coffee #.iloc[:,-1]
coffee.loc['28210']

18.0

### Yoga studios

In [17]:
business_df['yoga'] = business_df['categories'].str.contains('Fitness & Instruction|Active Life|Yoga', regex=True)
business_df['yoga'] = business_df['yoga'].replace({False: 0, True: 1})
yoga = business_df.groupby('postal_code').sum().yoga #.iloc[:,-1]
yoga.loc['28210']

15.0

## Reviews dataset - Food Trends
- Avocado toast
- third wave coffee

In [18]:
#from collections import defaultdict
# default value of int is 0 with defaultdict
#date_dict = defaultdict(int)

#for gm_chunk in pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=2):
#    for c in gm_chunk['date']:
#        date_dict += 1

# print the continent_dict 
#print(date_dict)

#defaultdict(int,
#            {'Africa': 624,
#             'Americas': 300,
#             'Asia': 396,
#             'Europe': 360,
#             'Oceania': 24})

In [19]:
#reviews = []
#reader = pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=1)
#for chunk in reader:
#    reviews.append(chunk)

In [20]:
# Load yelp reviews (5GB dataset)
#reader = pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=1)

#import json
#with open(os.path.join(yelp_dir, 'review.json')) as f:
#    for line in f:
#        data.append(json.loads(line))

#reviews = pd.DataFrame(chunk)

#reviews.append(pd.DataFrame(chunk)) 
#reviews = pd.DataFrame(chunk)
#reviews.head()

### Avocado Toast

In [21]:
# test 
#def avocado_toast():
#reviews['avocado'] = reviews['text'].str.contains('Avocado toast', case=False, regex=True)
#reviews['avocado'] = reviews['avocado'].replace({False: 0, True: 1})
#avocado = reviews.groupby('postal_code').sum().iloc[:,-1]
#avocado.loc['28210']

In [22]:
# Function for Avocado toast
#def avocado_toast():
    
#for i in chunks:
#    reader = pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=i)
    # Filter the data by business ID and date
    # Run calculations on it?
#for chunk in reader:
#    reviews.append(pd.DataFrame(chunk)) 
#    print(chunk)

### Identify third wave coffee shops
Third wave coffee shops are defined as having direct trade sourcing, innovative brewing methods, and a smooth type of foam that creates decorative patterns when poured (i.e. latte art). To identify third wave coffee shops I'm going to choose businesses that have photo captions with terms associated with these characteristics:
* latte art
* avocado toast
* single origin
* brewing methods: chemex, V60, AeroPress, vac pot syphon

In [23]:
# Load yelp photo data
photo_df = pd.read_json(os.path.join(yelp_dir, 'photo.json'), lines=True)
photo_df.head()

Unnamed: 0,caption,photo_id,business_id,label
0,,MllA1nNpcp1kDteVg6OGUw,rcaPajgKOJC2vo_l3xa42A,inside
1,,YjxBE88Bf6CmTEF2LP1UNA,Kn23LDd740SBVJ7mum0fwg,inside
2,,1f7izSjM0WjkDRIVbPy1yw,ZkGDCVKSdf8m76cnnalL-A,food
3,,NcSlcDTEEeOaixotOPk-rA,bF8gv7k_rwZtiDLP2ZB04w,inside
4,,5IiIo5UKEW0lWqZ6sWrY_A,50Anorn0DJXFhBr9a9_gHQ,inside


In [24]:
# filter data based on businesses id
#photo_df  = photo_df[photo_df['business_id'].isin(business_df['business_id'])]

In [25]:
# Select records that have a photo caption
photo_df = photo_df.replace(r'^\s*$', np.nan, regex=True)
photo_df = photo_df.dropna(subset=['caption'])
photo_df.head()

Unnamed: 0,caption,photo_id,business_id,label
16,Outside,-bpyOFpGiJsOzh_y17cTMQ,-KIdCJnkt5N8rnnmWR5MQg,outside
18,"""last neighborhood bar in Vegas""",YkW51dD0Hzw1572XLzrV5w,JIl4gbnh_cORSjSrZgOjAQ,outside
19,now this is a sandwich,fFf5HfvOZZBM_u-9fFSiHw,zU9w_xRlQSRIYXxGo-HSOA,food
20,Kai Restaurant,VTRKZpezwa25pyc8ePWLQQ,AkpuhGyLAxhD_sLMQv3kOg,inside
21,Resort lounge.,2fp5KiQd91qw351ea2V4Xw,AkpuhGyLAxhD_sLMQv3kOg,inside


In [26]:
# join with bussiness df so you can group by zip code
photo_df = photo_df.merge(business_df, how ='right', on = 'business_id')
photo_df.head()

Unnamed: 0,caption,photo_id,business_id,label,name,address,city,state,postal_code,latitude,...,review_count,is_open,attributes,categories,hours,sushi,New_american,bars,coffee,yoga
0,Outside,-bpyOFpGiJsOzh_y17cTMQ,-KIdCJnkt5N8rnnmWR5MQg,outside,Corleone's,"1640 E Camelback Rd, Ste 140",Phoenix,AZ,85016,33.510153,...,233,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Cheesesteaks, Restaurants, Chicken Wings, Pizza","{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...",0.0,0,0.0,0.0,0.0
1,Sweet pepper bar,k_xC_oh7EBcWjgIPQ9C16Q,-KIdCJnkt5N8rnnmWR5MQg,inside,Corleone's,"1640 E Camelback Rd, Ste 140",Phoenix,AZ,85016,33.510153,...,233,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Cheesesteaks, Restaurants, Chicken Wings, Pizza","{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...",0.0,0,0.0,0.0,0.0
2,Inside,HfTMxGkgfrkQyGUT3IdAvA,-KIdCJnkt5N8rnnmWR5MQg,inside,Corleone's,"1640 E Camelback Rd, Ste 140",Phoenix,AZ,85016,33.510153,...,233,1,"{'RestaurantsTakeOut': 'True', 'RestaurantsAtt...","Cheesesteaks, Restaurants, Chicken Wings, Pizza","{'Monday': '10:30-21:0', 'Tuesday': '10:30-21:...",0.0,0,0.0,0.0,0.0
3,"""last neighborhood bar in Vegas""",YkW51dD0Hzw1572XLzrV5w,JIl4gbnh_cORSjSrZgOjAQ,outside,Dino's Lounge,1516 Las Vegas Blvd S,Las Vegas,NV,89104,36.152505,...,229,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Karaoke, Nightlife, Bars, Dive Bars","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0.0,0,0.0,0.0,0.0
4,Can't mess with us!,O8v5q7KeAh9IfAr7L3EOUw,JIl4gbnh_cORSjSrZgOjAQ,inside,Dino's Lounge,1516 Las Vegas Blvd S,Las Vegas,NV,89104,36.152505,...,229,1,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Karaoke, Nightlife, Bars, Dive Bars","{'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'W...",0.0,0,0.0,0.0,0.0


In [27]:
# Select records that have one of the key terms that indicate third wave
photo_df['avocado'] = photo_df['caption'].str.contains(
    'avocado toast|latte art|pour over|single origin|house made|chemex|V60|AeroPress|vac pot syphon', case=False, regex=True)
photo_df['avocado'] = photo_df['avocado'].replace({False: 0, True: 1})
avocado = photo_df.groupby('postal_code').sum().iloc[:, -1]
avocado.loc['28210']

0.0

In [28]:
print(avocado.values)

[0. 0. 0. ... 0. 0. 0.]


In [29]:
avocado.max()

13.0

### Bring variables together into one dataset

In [30]:
frame = {'reviews_per_business': reviews_per_business, 'avg_stars': avg_stars, 'categories': categories, 'sushi': sushi,
         'New_american': New_american, 'bars': bars, 'coffee': coffee, 'yoga': yoga, 'avocado': avocado} 

final_df = pd.DataFrame(frame)
final_df

Unnamed: 0_level_0,reviews_per_business,avg_stars,categories,sushi,New_american,bars,coffee,yoga,avocado
postal_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,9.100152,3.358877,512,0.0,0.0,4.0,34.0,39.0,0.0
02645,4.000000,5.000000,1,0.0,0.0,0.0,0.0,1.0,0.0
05440,4.000000,4.250000,2,0.0,0.0,0.0,0.0,0.0,0.0
06032,9.000000,4.000000,1,0.0,0.0,0.0,0.0,0.0,0.0
06280,3.000000,5.000000,1,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
V0J,10.000000,4.500000,1,0.0,0.0,0.0,0.0,0.0,0.0
V5H 1J9,373.000000,3.500000,1,0.0,0.0,1.0,0.0,0.0,0.0
W8M 3T5,20.000000,4.000000,1,0.0,0.0,0.0,1.0,0.0,0.0
m2k 0c5,3.000000,4.500000,1,0.0,0.0,0.0,0.0,1.0,0.0


In [32]:
# get rid of first row summary
final_df = final_df.iloc[1:]
final_df.to_csv('yelp_predictors.csv')