# Yelp Data Cleaning
This script imports, cleans, and filters variables for input into a regression model to predict median home price for a neighborhood. 
data source: https://www.yelp.com/dataset/download

In [5]:
import os
import numpy as np
import pandas as pd

In [6]:
# Set working directory
project_dir = os.getcwd()
yelp_dir = os.path.join(project_dir, 'data', 'yelp_dataset')

## Variables that use the business dataset

In [7]:
# Load business dataset
business_df = pd.read_json(os.path.join(yelp_dir, 'business.json'), lines=True)
business_df.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,2818 E Camino Acequia Drive,Phoenix,AZ,85016,33.522143,-112.018481,3.0,5,0,{'GoodForKids': 'False'},"Golf, Active Life",
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,128,1,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...","{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W..."
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"10110 Johnston Rd, Ste 15",Charlotte,NC,28210,35.092564,-80.859132,4.0,170,1,"{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...","Sushi Bars, Restaurants, Japanese","{'Monday': '17:30-21:30', 'Wednesday': '17:30-..."
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"15655 W Roosevelt St, Ste 237",Goodyear,AZ,85338,33.455613,-112.395596,5.0,3,1,,"Insurance, Financial Services","{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ..."
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"4209 Stuart Andrew Blvd, Ste F",Charlotte,NC,28217,35.190012,-80.887223,4.0,4,1,"{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...","Plumbing, Shopping, Local Services, Home Servi...","{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ..."


In [8]:
business_df.state.unique()

array(['AZ', 'ON', 'NC', 'AB', 'NV', 'OH', 'PA', 'QC', 'WI', 'IL', 'NY',
       'SC', 'TX', 'UT', 'NM', 'FL', 'CA', 'VA', 'BAS', 'NE', 'AK', 'XGM',
       'WA', 'XWY', 'CON', 'BC', 'GA', 'VT', 'CT', 'AL', 'DUR', 'TN',
       'NJ', 'AR', 'XGL', 'DOW'], dtype=object)

In [9]:
# What are the most populated cities in these states

In [10]:
len(business_df.city.unique())

1204

### Average number of yelp reviews per business

### Number of unique catgories of business

## Reviews dataset

In [15]:
# Load yelp reviews (5GB dataset)
reader = pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=1)
reviews = pd.DataFrame(chunk)
reviews.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
167120,qLQXinoMTnM6l4DjDB-GiA,Yq1wJRZvKzTtJEDLGhvPKQ,aV4lmp3mQHhi6eykm2VOBg,5,0,0,0,My daughter has been going here for sometime n...,2016-10-30 00:12:05
167121,1OAEIaakVPtSFlYdEm3o0A,KRLO2RJaF0FGTohm92lk9g,SX7-vTahSnoVeJQ6N86PMw,3,0,0,0,If I was rating only on the quality of my spa ...,2018-05-21 22:00:19
167122,ConV_R3gUEcYZoOy1EW-Pg,_VP0TEuEWANvti52O7zB_g,0atzNJ2l4qHNnwLupGdq7w,5,1,0,2,Best Vet in Town! Dr Kanarish has been taking ...,2016-03-21 20:10:54
167123,4fm7pWt1DW15vEjaHv9wNA,UhnwlZARii0_thG8dckugg,yWwIUmeenyAO7nwc7U0U5A,4,3,0,1,I would come back here only for the rolls. It'...,2014-08-15 21:30:01
167124,Vl9rGJ5LIjhJ2ZF5A8q0Eg,825Ol4rTvAZnUyFMke37jA,O7UMzd3i-Zk8dMeyY9ZwoA,5,7,4,4,My girlfriends and I have been looking forward...,2014-06-15 20:33:59


In [None]:
for i in chunks:
    reader = pd.read_json(os.path.join(yelp_dir, 'review.json'), lines=True, chunksize=i)
    # Filter the data in some way or run calculations on it
#for chunk in reader:
#    reviews.append(pd.DataFrame(chunk)) 
#    print(chunk)

### Identify third wave coffee shops
Third wave coffee shops are defined as having direct trade sourcing, innovative brewing methods, and a smooth type of foam that creates decorative patterns when poured (i.e. latte art). To identify third wave coffee shops I'm going to choose businesses that have photo captions with terms associated with these characteristics:
* latte art
* single origin
* brewing methods: chemex, V60, AeroPress, vac pot syphon

In [6]:
# Load yelp photo data
photo_df = pd.read_json(os.path.join(yelp_dir, 'photo.json'), lines=True)
photo_df.head()

Unnamed: 0,caption,photo_id,business_id,label
0,,MllA1nNpcp1kDteVg6OGUw,rcaPajgKOJC2vo_l3xa42A,inside
1,,YjxBE88Bf6CmTEF2LP1UNA,Kn23LDd740SBVJ7mum0fwg,inside
2,,1f7izSjM0WjkDRIVbPy1yw,ZkGDCVKSdf8m76cnnalL-A,food
3,,NcSlcDTEEeOaixotOPk-rA,bF8gv7k_rwZtiDLP2ZB04w,inside
4,,5IiIo5UKEW0lWqZ6sWrY_A,50Anorn0DJXFhBr9a9_gHQ,inside


In [7]:
# Select records that have a photo caption
photo_df = photo_df.replace(r'^\s*$', np.nan, regex=True)
photo_df = photo_df.dropna(subset=['caption'])
photo_df.head()

Unnamed: 0,caption,photo_id,business_id,label
16,Outside,-bpyOFpGiJsOzh_y17cTMQ,-KIdCJnkt5N8rnnmWR5MQg,outside
18,"""last neighborhood bar in Vegas""",YkW51dD0Hzw1572XLzrV5w,JIl4gbnh_cORSjSrZgOjAQ,outside
19,now this is a sandwich,fFf5HfvOZZBM_u-9fFSiHw,zU9w_xRlQSRIYXxGo-HSOA,food
20,Kai Restaurant,VTRKZpezwa25pyc8ePWLQQ,AkpuhGyLAxhD_sLMQv3kOg,inside
21,Resort lounge.,2fp5KiQd91qw351ea2V4Xw,AkpuhGyLAxhD_sLMQv3kOg,inside


In [None]:
# Select records that have one of the key terms that indicate third wave
third_wave_terms = ['latte art', 'single origin', 'house made',
                    'chemex', 'Chemex', 'V60', 'AeroPress', 'vac pot syphon']

### Avocado Toast

### Business turnover