### Where does the dataset come from? Download from yelp.com

### Businesses

In [2]:
import pandas as pd
import json

In [3]:
data = []

with open('data/yelp_academic_dataset_business.json') as f:

    for line in f:
        
        json_dict = json.loads(line)

        if json_dict['attributes']:
            if 'BusinessParking' in json_dict['attributes'].keys():

                json_dict['attributes']['BusinessParking'] = eval(json_dict['attributes']['BusinessParking'])

        data.append(json_dict)

In [4]:
businesses = pd.json_normalize(data)

In [5]:
businesses.columns = businesses.columns.str.split(('.')).str[-1]

In [9]:
businesses.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'categories', 'hours', 'by_appointment_only', 'accepts_credit_cards',
       'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday',
       'bike_parking', 'price_range', 'coat_check', 'take_out', 'delivery',
       'caters', 'wi_fi', 'garage', 'street', 'validated', 'lot', 'valet',
       'wheelchair_accessible', 'happy_hour', 'outdoor_seating', 'has_tv',
       'reservations', 'dogs_allowed', 'sunday', 'alcohol', 'good_for_kids',
       'parking', 'attire', 'ambience', 'table_service', 'good_for_groups',
       'drive_thru', 'attributes', 'noise_level', 'good_for_meal',
       'accepts_bitcoin', 'smoking', 'music', 'good_for_dancing',
       'accepts_insurance', 'best_nights', 'b_yo_b', 'corkage',
       'b_yo_bcorkage', 'hair_specializes_in', 'open24_hours',
       'counter_service', 'ages_allowed', 'dietary_restrictions'],
 

In [7]:
import re

In [8]:
columns = []

for column in businesses.columns:
    column = re.sub(r'(\w)([A-Z])', r'\1_\2', column)
    column = re.sub(r'Restaurants_', r'', column)
    column = re.sub(r'Business_', r'', column)
    column = re.sub(r'(Range)([0-9])', r'\1', column)
    columns.append(column.lower())

businesses.columns = columns

In [10]:
businesses.drop(
  [
    'attributes',
    'hair_specializes_in',
    'counter_service',
    'open24_hours',
    'dietary_restrictions',
    'accepts_insurance',
    'ages_allowed',
    'b_yo_bcorkage',
    'corkage',
    'smoking',
    'b_yo_b',
    'good_for_dancing',
    'coat_check',
    'by_appointment_only',
    'best_nights',
    'music',
    'drive_thru',
    'business_accepts_bitcoin',
    'dogs_allowed',
    'happy_hour',
    'wheelchair_accessible',
    'good_for_meal',
    'ambience',
    'business_parking',
    'address',
    'postal_code',
    'hours',
    'is_open',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday'
  ],
  axis=1, errors='ignore', inplace=True
)

In [11]:
businesses['alcohol'] = businesses['alcohol'].str.replace("u'", "").str.replace("'", "")
businesses['noise_level'] = businesses['noise_level'].str.replace("u'", "").str.replace("'", "")
businesses['attire'] = businesses['attire'].str.replace("u'", "").str.replace("'", "")
businesses['wi_fi'] = businesses['wi_fi'].str.replace("u'", "").str.replace("'", "")

In [12]:
import numpy as np

In [13]:
def parking_check(row):
    if row['garage'] == True or row['street'] == True or row['validated'] == True or row['lot'] == True or row['valet'] == True:
        return 'True'
    if row['garage'] == False or row['street'] == False or row['validated'] == False or row['lot'] == False or row['valet'] == False:
        return 'False'
    else:
        return np.nan

In [14]:
businesses['parking_available'] = businesses.apply(parking_check, axis=1)

In [15]:
businesses.drop(['garage', 'street','validated','lot','valet'], axis=1, inplace=True)

### We have created a table with all businesses and the attributes we want to have a look at

### Businesses in Pennsylvania

In [16]:
pennsylvania = businesses[businesses['state'] == 'PA']

### Restaurants in Pennsylvania

In [17]:
pennsylvania.dropna(subset='categories', inplace=True)
pennsylvania.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pennsylvania.dropna(subset='categories', inplace=True)


In [18]:
categories =[
    'Coffee & Tea',
    'Bistros',
    'Breakfast & Brunch',
    'Cafes',
    'French',
    'Greek',
    'Italian',
    'Mexican',
    'Tacos',
    'Egyptian',
    'Pizza',
    'Soup',
    'Sushi Bars',
    'Vegetarian',
    'Waffles',
    'Food',
    'Restaurants',
    'Bars'
]

restaurants = pd.DataFrame()

for index, item in pennsylvania['categories'].items():

    for category in categories:
        if category in item:
            restaurants = pd.concat([restaurants, pennsylvania[index:index+1]])
            break

In [20]:
with open('data/restaurants_pennsylvania.csv', 'w') as f:
    restaurants.to_csv(f, mode='a', header=True, index=False)

### We have created a table with all restaurants in Pennsylvania and saved it in a .csv for later analysis

### Now we load the table with all reviews and create a .csv which only contains reviews from restaurants in Pennsylvania

In [None]:
businesses_ids = restaurants['business_id'].to_list()

column_names = ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']

chunks = pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=100000)

with open('data/reviews_pennsylvania.csv', 'w') as f:
    header = ','.join(column_names)
    f.write(header + '\n')

    for chunk in chunks:
        reviews = pd.DataFrame(chunk)
        
        reviews[reviews['business_id'].isin(businesses_ids)].to_csv(f, mode='a', header=False, index=False)