# Getting and Cleaning Data Exercise: Easy Version

Based on the Yelp dataset challenge: https://www.yelp.com/dataset_challenge

*For excerises 1-4, use the Yelp business json file. For exercises 5-6, use the Yelp review json file.*

In [1]:
# Import modules
import numpy as np
import pandas as pd
import simplejson
import re
from datetime import datetime, timedelta

Use simplejson module to read in data/yelp_academic_dataset_business.json for use in exercises 1-4.

In [2]:
business_data = []
with open('data/yelp_academic_dataset_business.json') as data_file:
    for line in data_file:
        business_data.append(simplejson.loads(line))

Convert business_data to pandas dataframe.

In [3]:
business_df = pd.DataFrame(business_data)
business_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],False,5,2.5,PA,business
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business


# Exercise 1: Create a new column that contains only the zipcode.

In [4]:
# Extract zip code from the 'full_address' column
business_df['Zipcode'] = business_df.apply(lambda x: x['full_address'].split(' ')[-1], 1)
business_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,Zipcode
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business,15034
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business,15034
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],False,5,2.5,PA,business,15234
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business,15104
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business,15104


# Exercise 2: The table contains a column called 'categories' and each entry in this column is populated by a list. We are interested in those businesses that restaurants. Create a new column 'Restaurant_type' that contains a description of the restaurant based on the other elements of 'categories. 
## That is, if we have '[Sushi Bars, Japanese, Restaurants]' in categories the 'Restaurant_type will be '{'SushiBars': 1, 'Japanese': 1, 'Mexican': 0, ...}'

Create list of unique restaurants based on 'categories' column.

In [5]:
# Retrieve column from data frame
category_list = list(business_df['categories'])

# Keep only categories that contain 'Restaurants' and flatten
restaurants = [sublist for sublist in category_list if 'Restaurants' in sublist]
restaurants = [val for sublist in restaurants for val in sublist]

# Collect unique restaurants used in data frame
unique_restaurants = set(restaurants)

Create dictionary of applicable categories for each restaurant in data frame and append as new column.

In [6]:
# Create helper function to build dictionary from presence/absence of each category
def create_dictionary(row_categories):
    value_list =  [1 if (category in row_categories) else 0 for category in unique_restaurants]
    return {key: value for (key, value) in zip(unique_restaurants, value_list)}

# Apply function to each row in 'categories' column if business is a restaurant
Restaurant_type = []
for row in business_df['categories']:
    if 'Restaurants' in row:
        Restaurant_type.append(create_dictionary(row))
    else:
        Restaurant_type.append(None)

# Append result to data frame
business_df['Restaurant_type'] = Restaurant_type
business_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,Zipcode,Restaurant_type
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business,15034,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0..."
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business,15034,
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],False,5,2.5,PA,business,15234,
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business,15104,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0..."
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business,15104,


# Exercise 3: Lets clean the 'attributes' column. The entries in this column are dictionaries. We need to do two things: 
## 1) Turn all the True or False in the dictionary to 1s and 0s.
## 2) There are some entries within dictionaries that are dictionaries themselves, lets turn the whole entry into just one dictionary, for example if we have 
### '{'Accepts Credit Cards': True, 'Alcohol': 'none','Ambience': {'casual': False,'classy': False}}' 
### then turn it into
### '{'Accepts Credit Cards':1, 'Alcohol_none': 1, 'Ambience_casual': 0, 'Ambience_classy': 0}'. 
### There might be other entries like {'Price Range': 1} where the values are numerical so we might want to change that into {'Price_Range_1': 1}.

*The reason we modify categorical variables like this is that machine learning algorithms cannot interpret textual data like "True" and "False". They need numerical inputs such as 1 and 0.*

In [7]:
# Create function to convert attributes to appropriate structure
def attributes_conversion(row):
    # Define existing and new attributes dictionaries
    dictionary = row['attributes']
    new_dictionary = {}
    
    # Loop through attributes dictionary to convert structures
    for key in dictionary.keys():
        value = dictionary[key]
        if type(value) == bool:
            new_dictionary[key] = int(value)
        elif type(value) == str or type(value) == int:
            new_key = "{}_{}".format(key, value)
            new_dictionary[new_key] = 1
        elif type(value) == dict:
            for value_key in value:
                new_key = "{}_{}".format(key, value_key)
                new_dictionary[new_key] = int(value[value_key])
    
    # Return new dictionary
    return new_dictionary

Apply attributes_conversion function to each row of data frame.

In [8]:
# Apply conversion function to 'attributes' column
business_df['flat_attributes'] = business_df.apply(lambda x: attributes_conversion(x), 1)
business_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,open,review_count,stars,state,type,Zipcode,Restaurant_type,flat_attributes
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],True,7,3.5,PA,business,15034,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Take-out': 1, 'Drive-Thru': 0, 'Good For_des..."
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],True,5,3.0,PA,business,15034,,"{'Happy Hour': 1, 'Accepts Credit Cards': 1, '..."
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],False,5,2.5,PA,business,15234,,{'Good for Kids': 1}
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],True,26,4.5,PA,business,15104,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Alcohol_full_bar': 1, 'Noise Level_average':..."
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],True,3,5.0,PA,business,15104,,"{'Parking_garage': 0, 'Parking_street': 0, 'Pa..."


# Exercise 4: Create a new column for every day of the week and fill it with the amount of hours the business is open that day.

*Your approach should handle businesses that stay open late like bars and nightclubs.*

In [9]:
# Create function to analyze hours per day
def count_hours(daily_hours):
    # Convert opening and closing times to datetime objects
    FMT = '%H:%M'
    time_close = datetime.strptime(daily_hours['close'], FMT)
    time_open = datetime.strptime(daily_hours['open'], FMT)
    
    # Calculate hours open (handling after midnight hours)
    if time_close < time_open:
        time_close += timedelta(days = 1)
    tdelta = time_close - time_open
    return tdelta.total_seconds() / 3600

# Create function to count hours for a given day of week
def hours_per_day(row, day):
    # Extract business hours for row of interest
    hours = row['hours']
    
    # Get hours open for day of interest for that row
    if day in hours.keys():
        daily_hours = count_hours(hours[day])
    else:
        daily_hours = None
        
    # Return result
    return daily_hours

Loop through days of week to create new columns for hours per day

In [10]:
# Create list containing days of the week
days = ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']

# Apply hours_per_day function for each day of week
for day in days:
    business_df[day] = business_df.apply(lambda x: hours_per_day(x, day), 1)
business_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,...,Zipcode,Restaurant_type,flat_attributes,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],...,15034,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Take-out': 1, 'Drive-Thru': 0, 'Good For_des...",,10.0,10.0,10.0,10.0,10.0,
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],...,15034,,"{'Happy Hour': 1, 'Accepts Credit Cards': 1, '...",,,,,,,
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],...,15234,,{'Good for Kids': 1},,,,,,,
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],...,15104,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Alcohol_full_bar': 1, 'Noise Level_average':...",,,9.0,9.0,9.0,10.0,6.0
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],...,15104,,"{'Parking_garage': 0, 'Parking_street': 0, 'Pa...",5.0,,,9.0,9.0,9.0,9.0


# Exercise 5: Create a table with the average review for a business.

*You will need to pull in a new json file and merge DataFrames for the next 2 exercises.*

Use simplejson module to read in data/yelp_academic_dataset_review.json for use in exercises 5-6.

In [11]:
review_data = []
with open('data/yelp_academic_dataset_review.json') as data_file:
    for line in data_file:
        review_data.append(simplejson.loads(line))

Convert review_data to pandas dataframe.

In [12]:
review_df = pd.DataFrame(review_data)
review_df.head()

Unnamed: 0,business_id,date,review_id,stars,text,type,user_id,votes
0,5UmKMjUEUNdYWqANhGckJw,2012-08-01,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",review,PUFPaY9KxDAcGqfsorJp3Q,"{'funny': 0, 'useful': 0, 'cool': 0}"
1,5UmKMjUEUNdYWqANhGckJw,2014-02-13,KPvLNJ21_4wbYNctrOwWdQ,5,Excellent food. Superb customer service. I mis...,review,Iu6AxdBYGR4A0wspR9BYHA,"{'funny': 0, 'useful': 0, 'cool': 0}"
2,5UmKMjUEUNdYWqANhGckJw,2015-10-31,fFSoGV46Yxuwbr3fHNuZig,5,Yes this place is a little out dated and not o...,review,auESFwWvW42h6alXgFxAXQ,"{'funny': 1, 'useful': 1, 'cool': 0}"
3,5UmKMjUEUNdYWqANhGckJw,2015-12-26,pVMIt0a_QsKtuDfWVfSk2A,3,PROS: Italian hoagie was delicious. Friendly ...,review,qiczib2fO_1VBG8IoCGvVg,"{'funny': 0, 'useful': 0, 'cool': 0}"
4,5UmKMjUEUNdYWqANhGckJw,2016-04-08,AEyiQ_Y44isJmNbMTyoMKQ,2,First the only reason this place could possibl...,review,qEE5EvV-f-s7yHC0Z4ydJQ,"{'funny': 0, 'useful': 1, 'cool': 0}"


For creating an average rating by business, we will want to aggregate (group by) business_id and collect the mean number of stars on this id.

In [13]:
avg_reviews = review_df[['business_id', 'stars']].groupby(['business_id']).mean()
avg_reviews.head()

Unnamed: 0_level_0,stars
business_id,Unnamed: 1_level_1
--0ZoBTQWQra1FxD4rBWmg,2.0
--1emggGHgoG6ipd_RMb-g,3.333333
--4Pe8BZ6gj57VFL5mUE8g,2.75
--5jkZ3-nUPZxUvtcbr8Uw,4.580357
--7PRjnsjMA6uhPK8mW13Q,2.666667


Merge average_reviews with business_df to combine data into one dataframe.

In [14]:
merged_df = pd.merge(business_df, avg_reviews, left_on = 'business_id', right_index = True)
merged_df.head()

Unnamed: 0,attributes,business_id,categories,city,full_address,hours,latitude,longitude,name,neighborhoods,...,Restaurant_type,flat_attributes,Sunday,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,stars_y
0,"{'Take-out': True, 'Drive-Thru': False, 'Good ...",5UmKMjUEUNdYWqANhGckJw,"[Fast Food, Restaurants]",Dravosburg,"4734 Lebanon Church Rd\nDravosburg, PA 15034","{'Friday': {'close': '21:00', 'open': '11:00'}...",40.354327,-79.900706,Mr Hoagie,[],...,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Take-out': 1, 'Drive-Thru': 0, 'Good For_des...",,10.0,10.0,10.0,10.0,10.0,,3.428571
1,"{'Happy Hour': True, 'Accepts Credit Cards': T...",UsFtqoBl7naz8AVUBZMjQQ,[Nightlife],Dravosburg,"202 McClure St\nDravosburg, PA 15034",{},40.350553,-79.886814,Clancy's Pub,[],...,,"{'Happy Hour': 1, 'Accepts Credit Cards': 1, '...",,,,,,,,3.0
2,{'Good for Kids': True},cE27W9VPgO88Qxe4ol6y_g,"[Active Life, Mini Golf, Golf]",Bethel Park,"1530 Hamilton Rd\nBethel Park, PA 15234",{},40.354115,-80.01466,Cool Springs Golf Center,[],...,,{'Good for Kids': 1},,,,,,,,2.6
3,"{'Alcohol': 'full_bar', 'Noise Level': 'averag...",mVHrayjG3uZ_RLHkLj-AMg,"[Bars, American (New), Nightlife, Lounges, Res...",Braddock,"414 Hawkins Ave\nBraddock, PA 15104","{'Tuesday': {'close': '19:00', 'open': '10:00'...",40.40883,-79.866211,Emil's Lounge,[],...,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...","{'Alcohol_full_bar': 1, 'Noise Level_average':...",,,9.0,9.0,9.0,10.0,6.0,4.68
4,"{'Parking': {'garage': False, 'street': False,...",mYSpR_SLPgUVymYOvTQd_Q,"[Active Life, Golf]",Braddock,"1000 Clubhouse Dr\nBraddock, PA 15104","{'Sunday': {'close': '15:00', 'open': '10:00'}...",40.403405,-79.855782,Grand View Golf Club,[],...,,"{'Parking_garage': 0, 'Parking_street': 0, 'Pa...",5.0,,,9.0,9.0,9.0,9.0,5.0


# Exercise 6: Create a new table that only contains restaurants with the following schema:
## Business_Name | Restaurant_type | Friday hours | Saturday hours | Attributes | Zipcode | Average Rating

In [15]:
# Create new table with specified elements
restaurants_df = merged_df[['name', 'Restaurant_type', 'Friday', 'Saturday', 'flat_attributes', 'Zipcode', 'stars_y']]
restaurants_df = restaurants_df.rename(index = str, columns = {'name': 'Business_Name',
                                                               'Friday': 'Friday hours', 
                                                               'Saturday': 'Saturday hours', 
                                                               'Attributes': 'flat_attributes',
                                                               'Average Rating': 'stars_y'})

# Remove businesses missing elements
restaurants_df = restaurants_df.dropna()
restaurants_df.head()

Unnamed: 0,Business_Name,Restaurant_type,Friday hours,Saturday hours,flat_attributes,Zipcode,stars_y
3,Emil's Lounge,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...",10.0,6.0,"{'Alcohol_full_bar': 1, 'Noise Level_average':...",15104,4.68
5,Alexion's Bar & Grill,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...",15.0,14.0,"{'Alcohol_full_bar': 1, 'Noise Level_loud': 1,...",15106,3.894737
12,Kings Family Restaurant,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...",18.0,18.0,"{'Take-out': 1, 'Drive-Thru': 0, 'Good For_des...",15106,3.25
17,Rocky's Lounge,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...",12.0,12.0,"{'Alcohol_full_bar': 1, 'Noise Level_average':...",15106,3.8
18,Gab & Eat,"{'Haitian': 0, 'Vegan': 0, 'Ethnic Grocery': 0...",8.5,6.5,"{'Alcohol_none': 1, 'Noise Level_average': 1, ...",15106,4.25
