# Data cleaning

### Where does the dataset come from? Download from yelp.com

### Businesses

In [91]:
import pandas as pd
import json

In [92]:
#

data = []

with open('data/yelp_academic_dataset_business.json') as f:

    for line in f:
        
        #Parse the JSON data
        json_dict = json.loads(line)

        if json_dict['attributes']:
            if 'BusinessParking' in json_dict['attributes'].keys():

                json_dict['attributes']['BusinessParking'] = eval(json_dict['attributes']['BusinessParking'])

        data.append(json_dict)

In [93]:
businesses = pd.json_normalize(data)

In [94]:
businesses.drop(
  [
    'attributes',
    'attributes.HairSpecializesIn',
    'attributes.RestaurantsCounterService',
    'attributes.Open24Hours',
    'attributes.DietaryRestrictions',
    'attributes.AcceptsInsurance',
    'attributes.AgesAllowed',
    'attributes.BYOBCorkage',
    'attributes.Corkage',
    'attributes.Smoking',
    'attributes.BYOB',
    'attributes.GoodForDancing',
    'attributes.CoatCheck',
    'attributes.ByAppointmentOnly',
    'attributes.BestNights',
    'attributes.Music',
    'attributes.DriveThru',
    'attributes.BusinessAcceptsBitcoin',
    'attributes.DogsAllowed',
    'attributes.HappyHour',
    'attributes.WheelchairAccessible',
    'attributes.GoodForMeal',
    'attributes.Ambience',
    'attributes.BusinessParking',
    'address',
    'postal_code',
    'hours',
    'is_open',
    'hours.Tuesday',
    'hours.Wednesday',
    'hours.Thursday',
    'hours.Friday',
    'hours.Saturday',
    'hours.Sunday'
  ],
  axis=1, errors='ignore', inplace=True
)

In [95]:
businesses.rename(
    columns={
        'attributes.BusinessAcceptsCreditCards': 'accepts_credit_cards',
        'attributes.RestaurantsPriceRange2':'price_range',
        'attributes.RestaurantsTakeOut':'take_out',
        'attributes.RestaurantsDelivery':'delivery',
        'attributes.BusinessParking.garage':'parking_garage',
        'attributes.BusinessParking.street':'parking_street',
        'attributes.BusinessParking.validated':'parking_validated',
        'attributes.BusinessParking.lot':'parking_lot',
        'attributes.BusinessParking.valet':'parking_valet',
        'attributes.OutdoorSeating':'outdoor_seating',
        'attributes.RestaurantsReservations':'seats_reservations',
        'attributes.Alcohol':'alcohol',
        'attributes.Ambience':'ambience',
        'attributes.RestaurantsTableService':'table_service',
        'attributes.RestaurantsGoodForGroups':'for_groups',
        'attributes.DriveThru':'drive_thru',
        'attributes.Open24Hours':'open_24_hours',
        'attributes.RestaurantsCounterService':'counter_service',
        'attributes.NoiseLevel': 'noise_level',
        'attributes.GoodForKids' : 'good_for_kids',
        'attributes.RestaurantsAttire': 'attire',
        'attributes.WiFi': 'wifi',
        'attributes.HasTV': 'tv',
        'attributes.Caters': 'caters',
        'attributes.BikeParking': 'bike_parking'
    },
    inplace=True
)

In [96]:
businesses['alcohol'] = businesses['alcohol'].str.replace("u'", "").str.replace("'", "")
businesses['noise_level'] = businesses['noise_level'].str.replace("u'", "").str.replace("'", "")
businesses['attire'] = businesses['attire'].str.replace("u'", "").str.replace("'", "")
businesses['wifi'] = businesses['wifi'].str.replace("u'", "").str.replace("'", "")

In [97]:
import numpy as np

In [98]:
def parking_check(row):
    if row['parking_garage'] == True or row['parking_street'] == True or row['parking_validated'] == True or row['parking_lot'] == True or row['parking_valet'] == True:
        return 'True'
    if row['parking_garage'] == False or row['parking_street'] == False or row['parking_validated'] == False or row['parking_lot'] == False or row['parking_valet'] == False:
        return 'False'
    else:
        return np.nan

In [99]:
businesses['parking_available'] = businesses.apply(parking_check, axis=1)

In [100]:
businesses.drop(['parking_garage', 'parking_street','parking_validated','parking_lot','parking_valet'], axis=1, inplace=True)

### We have created a table with all businesses and the attributes we want to have a look at

### Businesses in Pennsylvania

In [101]:
pennsylvania = businesses[businesses['state'] == 'PA']

### Restaurants in Pennsylvania

In [102]:
pennsylvania.dropna(subset='categories', inplace=True)
pennsylvania.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pennsylvania.dropna(subset='categories', inplace=True)


In [108]:
categories =[
    'Coffee & Tea',
    'Bistros',
    'Breakfast & Brunch',
    'Cafes',
    'French',
    'Greek',
    'Italian',
    'Mexican',
    'Tacos',
    'Egyptian',
    'Pizza',
    'Soup',
    'Sushi Bars',
    'Vegetarian',
    'Waffles',
    'Food',
    'Restaurants',
    'Bars'
]

restaurants = pd.DataFrame()

for index, item in pennsylvania['categories'].items():

    for category in categories:
        if category in item:
            restaurants = pd.concat([restaurants, pennsylvania[index:index+1]])
            break

In [105]:
with open('data/restaurants_pennsylvania.csv', 'w') as f:
    restaurants.to_csv(f, mode='a', header=True, index=False)

### We have created a table with all restaurants in Pennsylvania and saved it in a .csv for later analysis

### Now we load the table with all reviews and create a .csv which only contains reviews from restaurants in Pennsylvania

In [106]:
businesses_ids = restaurants['business_id'].to_list()

column_names = ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']

chunks = pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=100000)

with open('data/reviews_pennsylvania.csv', 'w') as f:
    header = ','.join(column_names)
    f.write(header + '\n')

    for chunk in chunks:
        reviews = pd.DataFrame(chunk)
        
        reviews[reviews['business_id'].isin(businesses_ids)].to_csv(f, mode='a', header=False, index=False)