In [1]:
import pandas as pd 

## Get all the API files path in the directory 'transit_9am'

In [2]:
from os import listdir
from os.path import isfile, join
files = [join('data/transit_9am',f) for f in listdir('data/transit_9am')
         if (isfile(join('data/transit_9am', f)))&(f.endswith('.json'))]
files[:5]

### same as above
# import os
# files=[]
# for f in os.listdir('transit_9am'):
#     if f.endswith(".json"):
#         files.append(join('transit_9am',f)) 

['data/transit_9am/10001_10002.json',
 'data/transit_9am/10001_10003.json',
 'data/transit_9am/10001_10004.json',
 'data/transit_9am/10001_10005.json',
 'data/transit_9am/10001_10006.json']

## Set up dataframe template of API info

In [3]:
df=pd.DataFrame(columns=['origin','destination','route','o_lat','o_long','d_lat','d_long',
                         'total_distance','total_duration','total_departure','total_arrival',
                         'step','mode','line','start_station','end_station','direction',
                         'stop_num','on_subway','off_subway'])

In [4]:
df

Unnamed: 0,origin,destination,route,o_lat,o_long,d_lat,d_long,total_distance,total_duration,total_departure,total_arrival,step,mode,line,start_station,end_station,direction,stop_num,on_subway,off_subway


## Extract data from API files

In [5]:
import json
for fi in files:
    with open(fi) as json_data:
        data = json.load(json_data)
        origin = fi.split('/')[1].split('.json')[0].split('_')[0]
        destination = fi.split('/')[1].split('.json')[0].split('_')[1]
            
        for route in range(len(data['routes'])):
    
            steps=[]
            for step in data['routes'][route]['legs'][0]['steps']:
                if step['travel_mode']=='TRANSIT':
                    total_arrival = data['routes'][route]['legs'][0]['arrival_time']['text']
                    total_departure = data['routes'][route]['legs'][0]['departure_time']['text']
                    total_distance = data['routes'][route]['legs'][0]['distance']['text']
                    total_duration = data['routes'][route]['legs'][0]['duration']['text']
                    o_lat = data['routes'][route]['legs'][0]['start_location']['lat']
                    o_long = data['routes'][route]['legs'][0]['start_location']['lng']
                    d_lat = data['routes'][route]['legs'][0]['end_location']['lat']
                    d_long = data['routes'][route]['legs'][0]['end_location']['lng']
                    steps.append( step)
            for step in range(len(steps)):
                mode=steps[step]['transit_details']['line']['vehicle']['type']
                if 'short_name' in steps[step]['transit_details']['line']:
                    line = steps[step]['transit_details']['line']['short_name']
                elif 'agencies' in steps[step]['transit_details']['line']:
                    line = steps[step]['transit_details']['line']['agencies'][0]['name']
                start_station= steps[step]['transit_details']['departure_stop']['name']
                end_station= steps[step]['transit_details']['arrival_stop']['name']
                direction = steps[step]['transit_details']['headsign']
                stop_num = steps[step]['transit_details']['num_stops']
                on_subway = steps[step]['transit_details']['departure_time']['text']
                off_subway = steps[step]['transit_details']['arrival_time']['text']
                dataframe = pd.DataFrame([[origin,destination,route,
                                o_lat,o_long,d_lat,d_long,
                                total_distance,total_duration,total_departure,total_arrival,
                                step,mode,line,start_station,end_station,
                                direction,stop_num, on_subway, off_subway]],
                            columns=['origin','destination','route','o_lat','o_long','d_lat','d_long',
                         'total_distance','total_duration','total_departure','total_arrival',
                         'step','mode','line','start_station','end_station','direction',
                         'stop_num','on_subway','off_subway'])
                df=df.append(dataframe)

In [6]:
df=df.reset_index(drop=True)
df['end_station']= df['end_station'].str.replace(u"\u2022" ,'-')
df['start_station']= df['start_station'].str.replace(u"\u2022" ,'-')

In [8]:
df.describe()

Unnamed: 0,route,o_lat,o_long,d_lat,d_long,step,stop_num
count,67655.0,67655.0,67655.0,67655.0,67655.0,67655.0,67655.0
mean,1.529274,40.768104,-73.882627,40.722931,-73.925916,0.673816,9.773764
std,1.11941,0.016986,0.085273,0.077669,0.105122,0.789934,8.69136
min,0.0,40.741346,-74.008432,40.507029,-74.244345,0.0,1.0
25%,1.0,40.7537,-73.952559,40.674866,-73.985997,0.0,4.0
50%,2.0,40.765754,-73.906467,40.724848,-73.943149,1.0,7.0
75%,3.0,40.778841,-73.812133,40.764672,-73.856759,1.0,13.0
max,3.0,40.803247,-73.709559,40.898855,-73.682835,5.0,74.0


In [10]:
df.head(130)

Unnamed: 0,origin,destination,route,o_lat,o_long,d_lat,d_long,total_distance,total_duration,total_departure,total_arrival,step,mode,line,start_station,end_station,direction,stop_num,on_subway,off_subway
0,transit,9am,0.0,40.7537,-73.999152,40.713731,-73.985971,4.2 mi,32 mins,8:25am,8:57am,0.0,SUBWAY,F,34 Street - Herald Sq Station,East Broadway Station,Kings Hwy,7.0,8:40am,8:52am
1,transit,9am,1.0,40.7537,-73.999152,40.713731,-73.985971,4.2 mi,32 mins,8:23am,8:55am,0.0,SUBWAY,F,34 Street - Herald Sq Station,East Broadway Station,Coney Island - Stillwell Av,7.0,8:38am,8:50am
2,transit,9am,2.0,40.7537,-73.999152,40.713731,-73.985971,4.3 mi,35 mins,8:22am,8:57am,0.0,SUBWAY,C,34 St - Penn Station,West 4 Street - Washington Square Station,Euclid Av,3.0,8:30am,8:36am
3,transit,9am,2.0,40.7537,-73.999152,40.713731,-73.985971,4.3 mi,35 mins,8:22am,8:57am,1.0,SUBWAY,F,West 4 Street - Washington Square Station,East Broadway Station,Kings Hwy,4.0,8:45am,8:52am
4,transit,9am,3.0,40.7537,-73.999152,40.713731,-73.985971,4.2 mi,33 mins,8:18am,8:51am,0.0,SUBWAY,F,34 Street - Herald Sq Station,East Broadway Station,Kings Hwy,7.0,8:33am,8:46am
5,transit,9am,0.0,40.7537,-73.999152,40.732437,-73.987276,3.0 mi,27 mins,8:31am,8:58am,0.0,SUBWAY,7,34th Street-Hudson Yards Subway Station,Times Square 42nd Street Station,Flushing - Main St,1.0,8:35am,8:39am
6,transit,9am,0.0,40.7537,-73.999152,40.732437,-73.987276,3.0 mi,27 mins,8:31am,8:58am,1.0,SUBWAY,N,Times Sq-42 St Station,Union Square,Coney Island - Stillwell Av,2.0,8:47am,8:51am
7,transit,9am,1.0,40.7537,-73.999152,40.732437,-73.987276,2.4 mi,29 mins,8:28am,8:57am,0.0,SUBWAY,E,34 St - Penn Station,14 Street / 8 Av,World Trade Center,2.0,8:36am,8:40am
8,transit,9am,1.0,40.7537,-73.999152,40.732437,-73.987276,2.4 mi,29 mins,8:28am,8:57am,1.0,SUBWAY,L,14 Street / 8 Av,3 Avenue Station,Canarsie - Rockaway Pkwy,3.0,8:50am,8:54am
9,transit,9am,2.0,40.7537,-73.999152,40.732437,-73.987276,3.0 mi,27 mins,8:27am,8:54am,0.0,SUBWAY,7,34th Street-Hudson Yards Subway Station,Times Square 42nd Street Station,Flushing - Main St,1.0,8:31am,8:34am


In [15]:
df.to_csv('aggregated_api', encoding = "utf-8")