## Import JSON files into pandas

In [6]:
# Load dependencies
import pandas as pd
import json
import os

In [10]:
# Read the directory with the data and save the file names in a list.
# os.listdir returns entries in arbitrary order, so sort them to make the
# row order of the combined dataframe reproducible across runs and machines.
path_to_json_files = 'data/'
json_files = sorted(
    entry for entry in os.listdir(path_to_json_files) if entry.endswith('.json')
)

In [11]:
# Column order applied to every per-table dataframe
column_order = ['_id', 'name', 'created', 'delay', 'accept']

# Collect one dataframe per table and concatenate once at the end,
# instead of growing main_df with pd.concat on every iteration.
frames = []
for file_name in json_files:
    # Import the json and read it
    with open(os.path.join(path_to_json_files, file_name)) as json_data:
        tables = json.load(json_data)

    # The top-level object maps table names to their content (assumed to be
    # a list of row records -- TODO confirm against the export format).
    # Iterating over the values avoids rebinding the dict while looping over
    # its keys, which failed for files holding more than one table.
    for records in tables.values():
        # Load content into a pandas dataframe and re-order the columns.
        # reindex(columns=...) replaces reindex_axis, removed in pandas 1.0.
        frames.append(pd.DataFrame(records).reindex(columns=column_order))

# The original prepended each new frame, so concatenate in reverse to keep
# the same row order; fall back to an empty dataframe when nothing was read.
main_df = pd.concat(frames[::-1]) if frames else pd.DataFrame()

In [12]:
# Replace the per-file row labels with a fresh 0..n-1 index,
# discarding the old labels instead of keeping them as a column
main_df = main_df.reset_index(drop=True)

In [19]:
# Preview the first rows only rather than dumping the whole dataframe
main_df.head(10)

Unnamed: 0,_id,name,created,delay,accept
0,1,Ricky,2017-02-18 23:33:11,654,0
1,2,Ricky,2017-02-19 09:00:25,356,1
2,3,Ricky,2017-02-19 09:07:39,564,1
3,4,Ricky,2017-02-19 12:24:08,398,1
4,5,Ricky,2017-02-20 18:15:11,538,0
5,6,Ricky,2017-02-22 08:59:31,456,1
6,7,Ricky,2017-02-22 09:00:54,600,0
7,8,Ricky,2017-02-22 09:01:29,200,1
8,9,Ricky,2017-02-22 12:08:03,280,1
9,1,Ricky,2017-02-18 23:33:11,654,0


## Start the analysis

In [20]:
# Inspect the index range, column dtypes, non-null counts and memory usage
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 117 entries, 0 to 116
Data columns (total 5 columns):
_id        117 non-null object
name       117 non-null object
created    117 non-null datetime64[ns]
delay      117 non-null object
accept     117 non-null object
dtypes: datetime64[ns](1), object(4)
memory usage: 4.6+ KB


In [24]:
# Convert data types
main_df['created'] = pd.to_datetime(main_df['created'])
main_df['delay'] = main_df['delay'].astype('int')
# 'accept' is an object column; if its values are the strings '0'/'1'
# (presumably, since 'delay' also needs an int conversion -- verify against
# the raw json), astype('bool') alone would map every non-empty string,
# including '0', to True. Converting through int first keeps 0 -> False.
main_df['accept'] = main_df['accept'].astype('int').astype('bool')

In [33]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
main_df.describe()

Unnamed: 0,delay
count,117.0
mean,449.555556
std,145.08597
min,200.0
25%,356.0
50%,456.0
75%,564.0
max,654.0
