# Convert JSON to CSV

Extracting the Yelp dataset archive (`yelp_academic.tar`) produces a single file with no extension (`yelp_dataset`). That file is itself a tar archive, so rename it to add the `.tar` extension and extract it again to get to the actual data files.

The Yelp Dataset is a series of JSON files. I will convert the JSON files to CSVs.

### Import Libraries

In [1]:
import collections
import collections.abc
import csv
import time

import pandas as pd
import simplejson as sjson

### JSON and CSV File Paths

In [2]:
# Names of the Yelp dataset files; each one is substituted into the
# JSON/CSV path templates built in the next cell.
categories = ['business', 'checkin', 'photo', 'review', 'tip', 'user']

In [3]:
# Input paths: the extracted Yelp JSON files, keyed by category name.
json_file_paths = {cat : f'../data/external/yelp_academic_dataset_{cat}.json'
                   for cat in categories}

# Output paths: one CSV per category under ../data/raw, keyed the same way.
csv_file_paths = {cat : f'../data/raw/yelp_academic_dataset_{cat}.csv'
                   for cat in categories}

In [4]:
json_file_paths

{'business': '../data/external/yelp_academic_dataset_business.json',
 'checkin': '../data/external/yelp_academic_dataset_checkin.json',
 'photo': '../data/external/yelp_academic_dataset_photo.json',
 'review': '../data/external/yelp_academic_dataset_review.json',
 'tip': '../data/external/yelp_academic_dataset_tip.json',
 'user': '../data/external/yelp_academic_dataset_user.json'}

In [5]:
csv_file_paths

{'business': '../data/raw/yelp_academic_dataset_business.csv',
 'checkin': '../data/raw/yelp_academic_dataset_checkin.csv',
 'photo': '../data/raw/yelp_academic_dataset_photo.csv',
 'review': '../data/raw/yelp_academic_dataset_review.csv',
 'tip': '../data/raw/yelp_academic_dataset_tip.csv',
 'user': '../data/raw/yelp_academic_dataset_user.csv'}

### Read in JSON, Write out to CSV

In [6]:
def read_and_write_file(json_file_path, csv_file_path, column_names):
    '''
    Arguments:
    json_file_path (str) : pathname of a JSON file with one JSON object per line
    csv_file_path (str) : pathname of CSV file (created or overwritten)
    column_names (set) : column names (flattened keys) of the JSON file

    Returns:
    None

    Read in JSON file one line at a time
    Write out as CSV file with column_names as header and one row per
    JSON object. Blank lines in the JSON file are skipped.
    '''

    # Fix the column order once so the header and every row are guaranteed
    # to line up, whatever kind of iterable was passed in.
    columns = list(column_names)

    # Open the input first so a missing JSON file does not truncate an
    # existing CSV. newline='' is required by the csv module: without it,
    # newlines embedded in quoted fields are not handled correctly and
    # platforms that use \r\n get an extra \r on every row.
    with open(json_file_path, encoding='utf-8') as f_in, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as f_out:
        csv_file = csv.writer(f_out)
        csv_file.writerow(columns)
        for line in f_in:
            # A blank line (e.g. at the end of the file) is not a record.
            if not line.strip():
                continue
            d = sjson.loads(line)
            csv_file.writerow(get_row(d, columns))
                
                


### Get Column Names

In [7]:
def get_column_names(d, parent_key = ''):
    '''
    Arguments:
    d (dict) : Nested dictionary of contents
    parent_key (str) : Dotted prefix for the keys of d; empty at the top level

    Recursively extracts the keys of d and
    returns a list of flattened keys as column_names.
    Only leaf (non-dictionary) values produce a column, so an empty
    nested dictionary contributes no column at all.

    Returns:
    column_names (list) : Flattened list of key names

    Example:
    d = {'a' : {'b' : 0, 'c' : 1}}
    column_names = ['a.b', 'a.c']
    '''

    column_names = []

    for key, val in d.items():
        column_name = f'{parent_key}.{key}' if parent_key else key

        # if val is a dictionary, recursively call get_column_names.
        # The ABC lives in collections.abc; the old collections.MutableMapping
        # alias was removed in Python 3.10.
        if isinstance(val, collections.abc.MutableMapping):
            column_names.extend(get_column_names(val, column_name))
        else:
            column_names.append(column_name)

    return column_names

In [8]:
# Sanity check: nested keys are flattened into dotted column names.
d = {'a' : {'b' : 0, 'c' : 1}}
get_column_names(d)

['a.b', 'a.c']

### Get Nested Value

In [9]:
def get_nested_value(d, key):
    '''
    Arguments:
    d (dict) : Nested dictionary of contents
    key (str) : Flattened key string

    Extracts the value from a nested dictionary given a dotted key by
    walking down one level per key segment.

    Returns:
    Value of d[key], or None when any segment of the key is missing or
    when a value on the way down is not a dictionary (e.g. None, a
    string or a list), so it cannot be descended into.

    Example:
    d = {'a' : {'b' : 0, 'c' : 1}}
    key = 'a.b'
    value = 0
    '''

    node = d
    for part in key.split('.'):
        # Only mappings can be descended into. Without this check a string
        # would be substring-searched and a list membership-tested, and the
        # subsequent lookup by a string key would raise TypeError.
        if not isinstance(node, collections.abc.Mapping):
            return None
        if part not in node:
            return None
        node = node[part]

    return node


In [10]:
# Sanity check: a dotted key is resolved through the nested dictionaries.
d = {'a' : {'b' : 0, 'c' : 1}, 'd' : {'e' : None, 'f' : 2}}
get_nested_value(d, 'd.f')

2

### Get Row

In [11]:
def get_row(d, column_names):
    '''
    Arguments:
    d : Nested dictionary of contents
    column_names : Iterable of flattened (dotted) column names.
                   ** ASSUMES ALL column_names ARE VALID **

    Returns:
    row (list) : One string per column name, in the same order

    Builds a csv compatible row: each column name is looked up in d with
    get_nested_value; a found value is converted with str(), and a missing
    or None value becomes the empty string.
    '''
    looked_up = (get_nested_value(d, name) for name in column_names)
    return ['' if value is None else str(value) for value in looked_up]
    

In [12]:
# Sanity check: 'a' resolves to a whole sub-dictionary and is stringified,
# while 'b' and 'a.d' do not exist and come back as empty strings.
d = {'a' : {'b' : 0, 'c' : 'C'}}
get_row(d, ['a', 'b', 'a.b', 'a.c', 'a.d'])

["{'b': 0, 'c': 'C'}", '', '0', 'C', '']

### Get Superset of Column Names
The superset is the set of all possible column names.

In [13]:
def get_superset_of_column_names(json_file_path):
    '''
    Arguments:
    json_file_path (str) : pathname of a JSON file with one JSON object per line

    Extracts the unique column names (flattened keys) of a JSON file by
    parsing every line and taking the union of the keys seen.
    Blank lines are skipped.

    Returns:
    column_names (set) : Column names (keys) of a JSON file

    Example:
    json_file = {'a' : {'b' : 1, 'c' : 2}}
    column_names = {'a.b', 'a.c'}
    '''

    column_names = set()
    # Read as UTF-8 explicitly rather than relying on the platform's
    # default text encoding.
    with open(json_file_path, encoding='utf-8') as f_in:
        for line in f_in:
            # A blank line (e.g. at the end of the file) is not a record.
            if not line.strip():
                continue
            d = sjson.loads(line)
            column_names.update(get_column_names(d))

    return column_names

#### Business column names

In [18]:
%%time

# Scan the business file once to collect every flattened key it contains.
json_file_path = '../data/external/yelp_academic_dataset_business.json'
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

{'attributes.WiFi', 'hours.Monday', 'attributes.RestaurantsPriceRange2', 'hours.Sunday', 'attributes.CoatCheck', 'attributes.RestaurantsGoodForGroups', 'neighborhood', 'attributes.DriveThru', 'city', 'attributes.NoiseLevel', 'address', 'attributes.BestNights', 'attributes.GoodForKids', 'attributes.GoodForMeal', 'attributes.ByAppointmentOnly', 'attributes.Open24Hours', 'attributes.RestaurantsTakeOut', 'postal_code', 'hours.Friday', 'attributes.BusinessAcceptsBitcoin', 'attributes.RestaurantsReservations', 'business_id', 'hours.Wednesday', 'attributes.RestaurantsCounterService', 'attributes.HappyHour', 'attributes.HairSpecializesIn', 'attributes.GoodForDancing', 'attributes.AgesAllowed', 'attributes.Caters', 'is_open', 'attributes', 'categories', 'attributes.RestaurantsDelivery', 'attributes.Alcohol', 'latitude', 'hours.Saturday', 'attributes.DietaryRestrictions', 'attributes.DogsAllowed', 'attributes.Smoking', 'attributes.Ambience', 'attributes.AcceptsInsurance', 'attributes.BYOB', 'sta

#### Review column names

In [19]:
%%time 

# Same scan for the review file. Every line has to be parsed just to learn
# the keys, which is why this cell takes about two minutes (see timing below).
json_file_path = '../data/external/yelp_academic_dataset_review.json'
column_names = get_superset_of_column_names(json_file_path)
print(column_names)

{'text', 'user_id', 'useful', 'business_id', 'date', 'stars', 'review_id', 'funny', 'cool'}
CPU times: user 1min 52s, sys: 1.58 s, total: 1min 53s
Wall time: 1min 55s


## Convert JSON to CSV

#### Business

In [20]:
%%time

# Two passes over the business file: the first collects the full set of
# column names, the second writes one CSV row per JSON record.
json_file_path = json_file_paths['business']
csv_file_path = csv_file_paths['business']

column_names = get_superset_of_column_names(json_file_path)
print(column_names)

read_and_write_file(json_file_path, csv_file_path, column_names)

{'attributes.WiFi', 'hours.Monday', 'attributes.RestaurantsPriceRange2', 'hours.Sunday', 'attributes.CoatCheck', 'attributes.RestaurantsGoodForGroups', 'neighborhood', 'attributes.DriveThru', 'city', 'attributes.NoiseLevel', 'address', 'attributes.BestNights', 'attributes.GoodForKids', 'attributes.GoodForMeal', 'attributes.ByAppointmentOnly', 'attributes.Open24Hours', 'attributes.RestaurantsTakeOut', 'postal_code', 'hours.Friday', 'attributes.BusinessAcceptsBitcoin', 'attributes.RestaurantsReservations', 'business_id', 'hours.Wednesday', 'attributes.RestaurantsCounterService', 'attributes.HappyHour', 'attributes.HairSpecializesIn', 'attributes.GoodForDancing', 'attributes.AgesAllowed', 'attributes.Caters', 'is_open', 'attributes', 'categories', 'attributes.RestaurantsDelivery', 'attributes.Alcohol', 'latitude', 'hours.Saturday', 'attributes.DietaryRestrictions', 'attributes.DogsAllowed', 'attributes.Smoking', 'attributes.Ambience', 'attributes.AcceptsInsurance', 'attributes.BYOB', 'sta

#### Checkin

In [21]:
%%time

# Convert the checkin file. Judging by the printed column names, the nested
# `time` object flattens into one column per day-and-hour slot.
json_file_path = json_file_paths['checkin']
csv_file_path = csv_file_paths['checkin']

column_names = get_superset_of_column_names(json_file_path)
print(column_names)

read_and_write_file(json_file_path, csv_file_path, column_names)

{'time.Sun-6', 'time.Sat-21', 'time.Fri-5', 'time.Sat-4', 'time.Tue-8', 'time.Sat-13', 'time.Mon-13', 'time.Sat-8', 'time.Wed-22', 'time.Sat-3', 'time.Wed-21', 'time.Tue-13', 'time.Mon-22', 'time.Mon-15', 'time.Tue-18', 'time.Sat-10', 'time.Sun-9', 'time.Wed-18', 'time.Tue-23', 'time.Mon-16', 'time.Sun-23', 'time.Wed-8', 'time.Mon-11', 'time.Mon-12', 'time.Thu-15', 'time.Fri-18', 'time.Tue-17', 'time.Tue-14', 'time.Mon-5', 'time.Sat-23', 'time.Wed-16', 'time.Fri-23', 'time.Sun-12', 'time.Thu-18', 'time.Thu-22', 'time.Tue-4', 'time.Sun-4', 'time.Thu-6', 'time.Sun-22', 'time.Sat-6', 'time.Mon-19', 'time.Mon-23', 'time.Thu-1', 'time.Thu-23', 'time.Mon-3', 'time.Wed-12', 'time.Mon-10', 'time.Wed-13', 'time.Fri-22', 'time.Wed-15', 'time.Fri-17', 'time.Fri-20', 'time.Fri-13', 'time.Mon-0', 'time.Sun-11', 'time.Thu-3', 'time.Sun-20', 'time.Sat-20', 'time.Sat-5', 'time.Mon-9', 'time.Thu-16', 'time.Wed-20', 'time.Tue-0', 'time.Tue-21', 'time.Wed-11', 'time.Sun-17', 'time.Thu-20', 'time.Fri-9', 

#### Photo

In [22]:
# Convert the photo file; t0 is used to report the wall-clock seconds
# taken by both passes (column discovery + CSV writing).
t0 = time.time()

json_file_path = json_file_paths['photo']
csv_file_path = csv_file_paths['photo']

column_names = get_superset_of_column_names(json_file_path)
print(column_names)

read_and_write_file(json_file_path, csv_file_path, column_names)

print(time.time() - t0)

{'photo_id', 'business_id', 'caption', 'label'}
5.322238922119141


#### Review

In [23]:
# Convert the review file and report the wall-clock seconds taken.
t0 = time.time()

# Use the shared path tables like every other conversion cell. Deriving the
# CSV path with json_file_path.replace('json', 'csv') put the output in
# ../data/external instead of the ../data/raw location that
# csv_file_paths['review'] points to.
json_file_path = json_file_paths['review']
csv_file_path = csv_file_paths['review']

column_names = get_superset_of_column_names(json_file_path)
read_and_write_file(json_file_path, csv_file_path, column_names)

print(time.time() - t0)

306.009996175766


#### Tip

In [24]:
# Convert the tip file and print the elapsed wall-clock seconds.
t0 = time.time()

json_file_path = json_file_paths['tip']
csv_file_path = csv_file_paths['tip']

column_names = get_superset_of_column_names(json_file_path)
print(column_names)

read_and_write_file(json_file_path, csv_file_path, column_names)

print(time.time() - t0)

{'text', 'likes', 'user_id', 'date', 'business_id'}
28.98428988456726


#### User

In [25]:
# Convert the user file and print the elapsed wall-clock seconds.
# The resulting CSV is the one loaded back for verification further down.
t0 = time.time()

json_file_path = json_file_paths['user']
csv_file_path = csv_file_paths['user']

column_names = get_superset_of_column_names(json_file_path)
print(column_names)

read_and_write_file(json_file_path, csv_file_path, column_names)

print(time.time() - t0)

{'compliment_profile', 'yelping_since', 'compliment_plain', 'compliment_photos', 'compliment_writer', 'compliment_list', 'compliment_note', 'useful', 'compliment_cute', 'friends', 'compliment_funny', 'compliment_more', 'average_stars', 'user_id', 'fans', 'compliment_cool', 'review_count', 'name', 'compliment_hot', 'funny', 'cool', 'elite'}
151.38960790634155


### Load CSV

Verify that JSON file was successfully converted to CSV.

In [26]:
# Load the converted user CSV back in as a spot check of the conversion.
user_df = pd.read_csv(csv_file_paths['user'])

In [27]:
user_df.head()

Unnamed: 0,compliment_profile,yelping_since,compliment_plain,compliment_photos,compliment_writer,compliment_list,compliment_note,useful,compliment_cute,friends,...,average_stars,user_id,fans,compliment_cool,review_count,name,compliment_hot,funny,cool,elite
0,0,2015-09-28,0,0,0,0,0,0,0,,...,2.0,lzlZwIpuSWXEnNS91wxjHw,0,0,1,Susan,0,0,0,
1,0,2015-09-05,0,0,0,0,0,0,0,,...,5.0,XvLBr-9smbI0m_a7dXtB7w,0,0,2,Daipayan,0,0,0,
2,0,2016-07-21,0,0,0,0,0,0,0,,...,4.0,QPT4Ud4H5sJVr68yXhoWFw,0,0,1,Andy,0,0,0,
3,0,2014-08-04,0,0,0,0,0,0,0,,...,4.05,i5YitlHZpf0B3R0s_8NVuw,0,0,19,Jonathan,0,0,0,
4,0,2017-06-18,0,0,0,0,0,0,0,,...,3.0,s4FoIXE_LSGviTHBe8dmcg,0,0,3,Shashank,0,0,0,


In [28]:
user_df.shape

(1518169, 22)