# Convert JSON to CSV

Extracting the Yelp dataset archive `yelp_academic.tar` produces a file named `yelp_dataset` that has no file extension. This file is itself a tar archive, so manually add the `.tar` extension and extract it again to reach the actual data files.

The Yelp Dataset is a series of JSON files. Before performing data analysis, convert the JSON files to CSVs.

### Import Libraries

In [196]:
import collections.abc
import csv
import time

import pandas as pd
import simplejson as sjson

### JSON and CSV File Paths

In [148]:
# Dataset file categories; each name is substituted into the JSON and CSV path templates.
categories = ['business', 'checkin', 'photo', 'review', 'tip', 'user']

In [150]:
# Map every category name to its input JSON path and its output CSV path.
json_file_paths = {
    cat: f'../data/json/yelp_academic_dataset_{cat}.json' for cat in categories
}

csv_file_paths = {
    cat: f'../data/csv/yelp_academic_dataset_{cat}.csv' for cat in categories
}

In [152]:
# Display the category -> JSON path mapping to check the generated paths.
json_file_paths

{'business': '../data/json/yelp_academic_dataset_business.json',
 'checkin': '../data/json/yelp_academic_dataset_checkin.json',
 'photo': '../data/json/yelp_academic_dataset_photo.json',
 'review': '../data/json/yelp_academic_dataset_review.json',
 'tip': '../data/json/yelp_academic_dataset_tip.json',
 'user': '../data/json/yelp_academic_dataset_user.json'}

In [151]:
# Display the category -> CSV path mapping to check the generated paths.
csv_file_paths

{'business': '../data/csv/yelp_academic_dataset_business.csv',
 'checkin': '../data/csv/yelp_academic_dataset_checkin.csv',
 'photo': '../data/csv/yelp_academic_dataset_photo.csv',
 'review': '../data/csv/yelp_academic_dataset_review.csv',
 'tip': '../data/csv/yelp_academic_dataset_tip.csv',
 'user': '../data/csv/yelp_academic_dataset_user.csv'}

### Read in JSON, Write out to CSV

In [123]:
def read_and_write_file(json_file_path, csv_file_path, column_names):
    '''
    Arguments:
    json_file_path : string pathname of a JSON file holding one JSON object per line
    csv_file_path : string pathname of the CSV file to write (overwritten if it exists)
    column_names : Iterable of flattened column names, e.g. 'a.b'

    Reads the JSON file line by line and writes every record out as one CSV row,
    with column_names as the header row. Blank lines in the input are skipped.

    Return:
    None
    '''

    # Freeze the column order once so the header and every data row agree,
    # even when column_names is a set or a one-shot iterator.
    columns = list(column_names)

    # The input is opened first so that a missing JSON file does not create or
    # truncate the output. newline='' is what the csv module requires to avoid
    # extra blank rows on platforms that translate line endings; utf-8 keeps the
    # result independent of the platform's default encoding.
    with open(json_file_path, encoding='utf-8') as f_in, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as f_out:
        csv_file = csv.writer(f_out)
        csv_file.writerow(columns)
        for line in f_in:
            # A blank line (e.g. a trailing newline at end of file) is not a record.
            if not line.strip():
                continue
            d = sjson.loads(line)
            csv_file.writerow(get_row(d, columns))
                
                


### Get Superset of Column Names
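The superset is the union of the flattened column names (keys) found across all records of a JSON file, i.e. every column that at least one record can contribute.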

In [181]:
def get_superset_of_column_names(json_file_path):
    '''
    Arguments:
    json_file_path : string pathname of a JSON file holding one JSON object per line

    Extracts the unique flattened column names (keys) over every record of a
    JSON file. Blank lines are skipped. A key can show up both on its own and
    as a prefix (e.g. 'a' and 'a.b') when it holds a dictionary in some records
    and a non-dictionary value in others.

    Return:
    column_names : Set of all column names (keys) of a JSON file

    Example:
    json_file = {'a' : {'b' : 1, 'c' : 2}}
    column_names = {'a.b', 'a.c'}
    '''

    column_names = set()
    # utf-8 keeps the result independent of the platform's default encoding.
    with open(json_file_path, encoding='utf-8') as f_in:
        for line in f_in:
            # A blank line (e.g. a trailing newline at end of file) is not a record.
            if not line.strip():
                continue
            d = sjson.loads(line)
            # set.update accepts any iterable, so no intermediate set is needed.
            column_names.update(get_column_names(d))

    return column_names

In [182]:
# Example: list every flattened column name found in the business file.
json_file_path = '../data/json/yelp_academic_dataset_business.json'
get_superset_of_column_names(json_file_path)

{'address',
 'attributes',
 'attributes.AcceptsInsurance',
 'attributes.AgesAllowed',
 'attributes.Alcohol',
 'attributes.Ambience',
 'attributes.BYOB',
 'attributes.BYOBCorkage',
 'attributes.BestNights',
 'attributes.BikeParking',
 'attributes.BusinessAcceptsBitcoin',
 'attributes.BusinessAcceptsCreditCards',
 'attributes.BusinessParking',
 'attributes.ByAppointmentOnly',
 'attributes.Caters',
 'attributes.CoatCheck',
 'attributes.Corkage',
 'attributes.DietaryRestrictions',
 'attributes.DogsAllowed',
 'attributes.DriveThru',
 'attributes.GoodForDancing',
 'attributes.GoodForKids',
 'attributes.GoodForMeal',
 'attributes.HairSpecializesIn',
 'attributes.HappyHour',
 'attributes.HasTV',
 'attributes.Music',
 'attributes.NoiseLevel',
 'attributes.Open24Hours',
 'attributes.OutdoorSeating',
 'attributes.RestaurantsAttire',
 'attributes.RestaurantsCounterService',
 'attributes.RestaurantsDelivery',
 'attributes.RestaurantsGoodForGroups',
 'attributes.RestaurantsPriceRange2',
 'attributes

In [96]:
# Example: list every flattened column name found in the review file.
json_file_path = '../data/json/yelp_academic_dataset_review.json'
get_superset_of_column_names(json_file_path)

{'business_id',
 'cool',
 'date',
 'funny',
 'review_id',
 'stars',
 'text',
 'useful',
 'user_id'}

### Get Column Names

In [167]:
def get_column_names(d, parent_key = ''):
    '''
    Arguments:
    d : Nested dictionary of contents
    parent_key : Flattened key of the dictionary that contains d;
        leave empty for the top-level call

    Recursively extracts the keys of d and
    returns a list of flattened keys as column_names.
    Nested keys are joined with '.'. Any value that is not a dictionary
    (including None) is a leaf; an empty nested dictionary contributes
    no column at all.

    Return:
    column_names : Flattened list of key names

    Example:
    d = {'a' : {'b' : 0, 'c' : 1}}
    column_names = ['a.b', 'a.c']
    '''

    column_names = []

    for key, val in d.items():
        column_name = f'{parent_key}.{key}' if parent_key else key

        # Descend into nested dictionaries. The abstract base classes live in
        # collections.abc; the old collections.MutableMapping alias was removed
        # in Python 3.10.
        if isinstance(val, collections.abc.Mapping):
            column_names.extend(get_column_names(val, column_name))
        else:
            column_names.append(column_name)

    return column_names

In [168]:
# Example: the nested keys under 'a' are flattened to 'a.b' and 'a.c'.
d = {'a' : {'b' : 0, 'c' : 1}}
get_column_names(d)

['a.b', 'a.c']

### Get Nested Value

In [179]:
def get_nested_value(d, key):
    '''
    Arguments:
    d : Nested dictionary of contents (may be None)
    key : Flattened key string, with nesting levels separated by '.'

    Walks down the nested dictionary one key segment at a time and
    extracts the value stored under the flattened key.

    Return:
    Value of d[key], or None when d is None, when any segment of the key
    is missing, or when an intermediate value is not a dictionary

    Example:
    d = {'a' : {'b' : 0, 'c' : 1}}
    key = 'a.b'
    value = 0
    '''

    node = d
    for part in key.split('.'):
        # Stop as soon as the path cannot continue. This covers None, a
        # missing key, and a scalar/string/list where a dictionary was
        # expected (the latter used to raise TypeError).
        if not isinstance(node, collections.abc.Mapping) or part not in node:
            return None
        node = node[part]

    return node


In [180]:
# Example: 'd.f' selects key 'f' inside the dictionary stored under 'd'.
d = {'a' : {'b' : 0, 'c' : 1}, 'd' : {'e' : None, 'f' : 2}}
get_nested_value(d, 'd.f')

2

### Get Row

In [90]:
def get_row(d, column_names):
    '''
    Arguments:
    d : Nested dictionary of contents
    column_names : List of flattened column names.
        ** ASSUMES ALL column_names ARE VALID **

    Builds a csv compatible row: one cell per column name, in the order given.
    A value of None (which is also what a missing key resolves to) becomes
    an empty string; every other value is converted with str().

    Return:
    row : List of values as strings
    '''
    looked_up = (get_nested_value(d, name) for name in column_names)
    return ['' if value is None else str(value) for value in looked_up]
    

In [91]:
# Example: 'a' resolves to the whole nested dictionary, while 'b' and 'a.d'
# do not exist in d and therefore produce empty cells.
d = {'a' : {'b' : 0, 'c' : 'C'}}
get_row(d, ['a', 'b', 'a.b', 'a.c', 'a.d'])

["{'b': 0, 'c': 'C'}", '', '0', 'C', '']

## Convert JSON to CSV

#### Business

In [183]:
# Convert the business JSON file to CSV and time the whole run.
t0 = time.time()

json_file_path = json_file_paths['business']
csv_file_path = csv_file_paths['business']

# First pass over the file: collect every flattened key that occurs in any record.
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

# Second pass over the file: write the header row and one CSV row per record.
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

{'longitude', 'attributes.BYOB', 'attributes.Open24Hours', 'attributes.NoiseLevel', 'attributes.CoatCheck', 'attributes.BYOBCorkage', 'stars', 'attributes.HasTV', 'hours.Monday', 'attributes.BestNights', 'attributes.DriveThru', 'attributes.RestaurantsPriceRange2', 'attributes.RestaurantsTableService', 'review_count', 'business_id', 'attributes.RestaurantsCounterService', 'attributes.GoodForKids', 'attributes.RestaurantsGoodForGroups', 'state', 'attributes.Caters', 'attributes.Ambience', 'neighborhood', 'attributes.RestaurantsDelivery', 'address', 'attributes.WiFi', 'hours', 'attributes.HappyHour', 'attributes.RestaurantsAttire', 'attributes.OutdoorSeating', 'attributes.ByAppointmentOnly', 'attributes.Smoking', 'hours.Thursday', 'attributes.AcceptsInsurance', 'attributes.Alcohol', 'categories', 'attributes.Corkage', 'attributes.BusinessAcceptsCreditCards', 'attributes.BusinessParking', 'name', 'attributes.DogsAllowed', 'attributes.Music', 'attributes.GoodForMeal', 'hours.Friday', 'attri

#### Checkin

In [188]:
# Convert the checkin JSON file to CSV and time the whole run.
t0 = time.time()

json_file_path = json_file_paths['checkin']
csv_file_path = csv_file_paths['checkin']

# First pass over the file: collect every flattened key that occurs in any record.
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

# Second pass over the file: write the header row and one CSV row per record.
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

{'time.Mon-7', 'time.Thu-8', 'time.Tue-6', 'time.Sun-5', 'time.Mon-1', 'time.Sun-10', 'time.Tue-1', 'time.Sat-22', 'time.Fri-13', 'time.Sat-5', 'time.Fri-3', 'time.Fri-0', 'time.Fri-20', 'time.Sun-8', 'time.Tue-14', 'time.Sat-23', 'time.Fri-23', 'time.Thu-11', 'time.Sat-16', 'time.Tue-17', 'time.Tue-21', 'time.Wed-19', 'time.Wed-16', 'time.Tue-11', 'time.Sat-7', 'time.Mon-10', 'time.Sat-19', 'time.Mon-11', 'time.Sat-4', 'time.Thu-5', 'time.Wed-11', 'time.Thu-4', 'time.Sat-6', 'time.Sun-20', 'time.Sat-21', 'time.Thu-12', 'time.Sat-1', 'time.Sat-11', 'time.Thu-17', 'time.Mon-5', 'time.Mon-16', 'time.Fri-8', 'time.Tue-12', 'time.Mon-3', 'time.Thu-16', 'time.Tue-18', 'time.Tue-22', 'time.Sun-16', 'time.Sat-8', 'time.Tue-2', 'time.Wed-1', 'time.Fri-16', 'time.Mon-0', 'time.Wed-8', 'time.Thu-6', 'time.Thu-22', 'time.Mon-4', 'time.Sat-17', 'time.Mon-13', 'time.Fri-17', 'time.Mon-14', 'time.Sun-4', 'time.Thu-9', 'time.Sat-12', 'time.Thu-19', 'time.Sun-23', 'time.Sun-7', 'time.Sun-21', 'time.Th

#### Photo

In [189]:
# Convert the photo JSON file to CSV and time the whole run.
t0 = time.time()

json_file_path = json_file_paths['photo']
csv_file_path = csv_file_paths['photo']

# First pass over the file: collect every flattened key that occurs in any record.
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

# Second pass over the file: write the header row and one CSV row per record.
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

{'label', 'caption', 'photo_id', 'business_id'}
5.052654027938843


#### Review

In [169]:
# Convert the review JSON file to CSV and time the whole run.
t0 = time.time()

# Take both paths from the shared path dictionaries, like the other conversion
# cells. Deriving the CSV path with str.replace('json', 'csv') rewrites every
# occurrence of 'json' in the path, which only works by coincidence of the
# current directory layout.
json_file_path = json_file_paths['review']
csv_file_path = csv_file_paths['review']

# First pass collects every flattened key; second pass writes header and rows.
column_names = get_superset_of_column_names(json_file_path)
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

294.8083598613739


#### Tip

In [190]:
# Convert the tip JSON file to CSV and time the whole run.
t0 = time.time()

json_file_path = json_file_paths['tip']
csv_file_path = csv_file_paths['tip']

# First pass over the file: collect every flattened key that occurs in any record.
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

# Second pass over the file: write the header row and one CSV row per record.
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

{'text', 'likes', 'date', 'business_id', 'user_id'}
27.073715925216675


#### User

In [191]:
# Convert the user JSON file to CSV and time the whole run.
t0 = time.time()

json_file_path = json_file_paths['user']
csv_file_path = csv_file_paths['user']

# First pass over the file: collect every flattened key that occurs in any record.
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

# Second pass over the file: write the header row and one CSV row per record.
read_and_write_file(json_file_path, csv_file_path, column_names)

# Elapsed seconds for both passes.
print(time.time() - t0)

{'name', 'elite', 'user_id', 'fans', 'average_stars', 'review_count', 'compliment_hot', 'compliment_writer', 'compliment_profile', 'compliment_note', 'compliment_cool', 'compliment_plain', 'useful', 'compliment_cute', 'yelping_since', 'friends', 'cool', 'compliment_photos', 'funny', 'compliment_more', 'compliment_funny', 'compliment_list'}
143.07469487190247


### Load CSV

Verify that the JSON file was successfully converted to CSV.

In [193]:
# Load the generated user CSV back into a DataFrame as a sanity check.
user_df = pd.read_csv(csv_file_paths['user'])

In [194]:
# Preview the first rows to confirm the header and values line up.
user_df.head()

Unnamed: 0,name,elite,user_id,fans,average_stars,review_count,compliment_hot,compliment_writer,compliment_profile,compliment_note,...,useful,compliment_cute,yelping_since,friends,cool,compliment_photos,funny,compliment_more,compliment_funny,compliment_list
0,Susan,,lzlZwIpuSWXEnNS91wxjHw,0,2.0,1,0,0,0,0,...,0,0,2015-09-28,,0,0,0,0,0,0
1,Daipayan,,XvLBr-9smbI0m_a7dXtB7w,0,5.0,2,0,0,0,0,...,0,0,2015-09-05,,0,0,0,0,0,0
2,Andy,,QPT4Ud4H5sJVr68yXhoWFw,0,4.0,1,0,0,0,0,...,0,0,2016-07-21,,0,0,0,0,0,0
3,Jonathan,,i5YitlHZpf0B3R0s_8NVuw,0,4.05,19,0,0,0,0,...,0,0,2014-08-04,,0,0,0,0,0,0
4,Shashank,,s4FoIXE_LSGviTHBe8dmcg,0,3.0,3,0,0,0,0,...,0,0,2017-06-18,,0,0,0,0,0,0


In [195]:
# (rows, columns) of the loaded CSV.
user_df.shape

(1518169, 22)