# Data cleaning

### Where does the dataset come from? Download from yelp.com

### Businesses

In [139]:
import pandas as pd
import json

In [140]:
data = []

with open('data/yelp_academic_dataset_business.json') as f:

    for line in f:
        
        json_dict = json.loads(line)

        if json_dict['attributes']:
            if 'BusinessParking' in json_dict['attributes'].keys():

                json_dict['attributes']['BusinessParking'] = eval(json_dict['attributes']['BusinessParking'])

        data.append(json_dict)

In [141]:
businesses = pd.json_normalize(data)

In [142]:
businesses.columns = businesses.columns.str.split('.').str[-1]

In [143]:
import re

In [144]:
columns = []

for column in businesses.columns:
    column = re.sub(r'(\w)([A-Z])', r'\1_\2', column)
    column = re.sub(r'Restaurants_', r'', column)
    column = re.sub(r'(Range)([0-9])', r'\1', column)
    columns.append(column.lower())

businesses.columns = columns

In [145]:
businesses.drop(
  [
    'attributes',
    'hair_specializes_in',
    'counter_service',
    'open24_hours',
    'dietary_restrictions',
    'accepts_insurance',
    'ages_allowed',
    'b_yo_bcorkage',
    'corkage',
    'smoking',
    'b_yo_b',
    'good_for_dancing',
    'coat_check',
    'by_appointment_only',
    'best_nights',
    'music',
    'drive_thru',
    'business_accepts_bitcoin',
    'dogs_allowed',
    'happy_hour',
    'wheelchair_accessible',
    'good_for_meal',
    'ambience',
    'business_parking',
    'address',
    'postal_code',
    'hours',
    'is_open',
    'tuesday',
    'wednesday',
    'thursday',
    'friday',
    'saturday',
    'sunday'
  ],
  axis=1, errors='ignore', inplace=True
)

In [147]:
businesses['alcohol'] = businesses['alcohol'].str.replace()

In [129]:
businesses['alcohol'].value_counts()

# 'None': no mentioning of Alcohol/No alcohol
# "'none'": No Alcohol
# u'none': No Alcohol

alcohol
u'none'             15977
u'full_bar'         12968
'none'               4933
u'beer_and_wine'     4880
'full_bar'           3024
'beer_and_wine'      1369
None                   38
Name: count, dtype: int64

In [131]:
businesses['wi_fi'].value_counts()

wi_fi
u'free'    27029
u'no'      15221
'free'      7385
'no'        6610
u'paid'      486
'paid'       133
None          50
Name: count, dtype: int64

In [149]:
businesses['alcohol'] = businesses['alcohol'].str.replace("u'", "").str.replace("'", "")
businesses['noise_level'] = businesses['noise_level'].str.replace("u'", "").str.replace("'", "")
businesses['attire'] = businesses['attire'].str.replace("u'", "").str.replace("'", "")
businesses['wi_fi'] = businesses['wi_fi'].str.replace("u'", "").str.replace("'", "")

In [151]:
businesses['alcohol'].value_counts()

alcohol
hallo            20910
full_bar         15992
beer_and_wine     6249
None                38
Name: count, dtype: int64

In [None]:
import numpy as np

In [None]:
def parking_check(row):
    if row['parking_garage'] == True or row['parking_street'] == True or row['parking_validated'] == True or row['parking_lot'] == True or row['parking_valet'] == True:
        return 'True'
    if row['parking_garage'] == False or row['parking_street'] == False or row['parking_validated'] == False or row['parking_lot'] == False or row['parking_valet'] == False:
        return 'False'
    else:
        return np.nan

In [None]:
businesses['parking_available'] = businesses.apply(parking_check, axis=1)

In [None]:
businesses.drop(['parking_garage', 'parking_street','parking_validated','parking_lot','parking_valet'], axis=1, inplace=True)

### We have created a table with all businesses and the attributes we want to have a look at

### Businesses in Pennsylvania

In [None]:
pennsylvania = businesses[businesses['state'] == 'PA']

### Restaurants in Pennsylvania

In [None]:
pennsylvania.dropna(subset='categories', inplace=True)
pennsylvania.reset_index(drop=True, inplace=True)

In [None]:
categories =[
    'Coffee & Tea',
    'Bistros',
    'Breakfast & Brunch',
    'Cafes',
    'French',
    'Greek',
    'Italian',
    'Mexican',
    'Tacos',
    'Egyptian',
    'Pizza',
    'Soup',
    'Sushi Bars',
    'Vegetarian',
    'Waffles',
    'Food',
    'Restaurants',
    'Bars'
]

restaurants = pd.DataFrame()

for index, item in pennsylvania['categories'].items():

    for category in categories:
        if category in item:
            restaurants = pd.concat([restaurants, pennsylvania[index:index+1]])
            break

In [None]:
with open('data/restaurants_pennsylvania.csv', 'w') as f:
    restaurants.to_csv(f, mode='a', header=True, index=False)

### We have created a table with all restaurants in Pennsylvania and saved it in a .csv for later analysis

### Now we load the table with all reviews and create a .csv which only contains reviews from restaurants in Pennsylvania

In [None]:
businesses_ids = restaurants['business_id'].to_list()

column_names = ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'text', 'date']

chunks = pd.read_json('data/yelp_academic_dataset_review.json', lines=True, chunksize=100000)

with open('data/reviews_pennsylvania.csv', 'w') as f:
    header = ','.join(column_names)
    f.write(header + '\n')

    for chunk in chunks:
        reviews = pd.DataFrame(chunk)
        
        reviews[reviews['business_id'].isin(businesses_ids)].to_csv(f, mode='a', header=False, index=False)