<a href="https://colab.research.google.com/github/rajatrh/MTA-Schedule-Delays/blob/master/Static_MTA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy as sc
import seaborn as sb
from sklearn import preprocessing
import json
import csv
import requests

In [0]:
# Location of the static LIRR GTFS feed, served as a single JSON document.
url = "http://web.mta.info/developers/data/lirr/lirr_gtfs.json"
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error status instead of handing an error page to the
# JSON parser further down.
response = requests.get(url, timeout=30)
response.raise_for_status()

In [0]:
# Decode the response body to text, then parse it into Python objects.
feed_text = response.text
json_parsed = json.loads(feed_text)

In [0]:
# Show the top-level keys of the parsed feed.
json_parsed.keys()

dict_keys(['agency_id', 'feed_version', 'revised', 'gtfs'])

In [0]:
# Show which tables the 'gtfs' section of the feed contains.
json_parsed['gtfs'].keys()

## Stops - LIRR

In [16]:
# Show the field names of the first stop record.
json_parsed['gtfs']['stops'][0].keys()

dict_keys(['stop_id', 'stop_name', 'stop_desc', 'stop_lat', 'stop_lon', 'stop_url', 'wheelchair_boarding'])

In [0]:
# Empty frame whose columns are the field names of the first stop record.
df_stops = pd.DataFrame(columns=list(json_parsed['gtfs']['stops'][0]))

In [0]:
# Build the stops table in a single constructor call. Every feed record is a
# dict, so pandas matches values to columns by key; the column order comes
# from the existing (empty) df_stops template. This replaces row-by-row
# DataFrame.append, which was removed in pandas 2.0 and copied the whole frame
# on every iteration. Rebuilding from the feed also makes the cell safe to
# re-run without duplicating rows.
df_stops = pd.DataFrame(json_parsed['gtfs']['stops'], columns=df_stops.columns)

In [0]:
# Write the stops table to stops_lirr.csv (relative path; default to_csv options).
df_stops.to_csv('stops_lirr.csv')

## Routes - LIRR

In [15]:
# Show the field names of the first route record.
json_parsed['gtfs']['routes'][0].keys()

dict_keys(['route_id', 'route_short_name', 'route_long_name', 'route_type', 'route_color', 'route_text_color'])

In [0]:
# Empty frame whose columns are the field names of the first route record.
df_routes = pd.DataFrame(columns=list(json_parsed['gtfs']['routes'][0]))

In [0]:
# Build the routes table in a single constructor call. Every feed record is a
# dict, so pandas matches values to columns by key; the column order comes
# from the existing (empty) df_routes template. This replaces row-by-row
# DataFrame.append, which was removed in pandas 2.0 and copied the whole frame
# on every iteration. Rebuilding from the feed also makes the cell safe to
# re-run without duplicating rows.
df_routes = pd.DataFrame(json_parsed['gtfs']['routes'], columns=df_routes.columns)

In [0]:
# Write the routes table to routes_lirr.csv (relative path; default to_csv options).
df_routes.to_csv('routes_lirr.csv')

## Shapes - LIRR - Not Needed

In [0]:
# Show the field names of the first shape record.
json_parsed['gtfs']['shapes'][0].keys()

dict_keys(['shape_id', 'shape_pt_lat', 'shape_pt_lon', 'shape_pt_sequence'])

In [0]:
# Empty frame whose columns are the field names of the first shape record.
df_shapes = pd.DataFrame(columns=list(json_parsed['gtfs']['shapes'][0]))

In [0]:
# Build the shapes table in a single constructor call. Every feed record is a
# dict, so pandas matches values to columns by key; the column order comes
# from the existing (empty) df_shapes template. This replaces row-by-row
# DataFrame.append, which was removed in pandas 2.0 and copied the whole frame
# on every iteration. The progress counter that loop printed is dropped
# because there is no longer a Python-level loop to report on. Rebuilding
# from the feed also makes the cell safe to re-run without duplicating rows.
df_shapes = pd.DataFrame(json_parsed['gtfs']['shapes'], columns=df_shapes.columns)

In [0]:
# Number of shape records in the feed.
len(json_parsed['gtfs']['shapes'])

89312

In [0]:
# Write the shapes table to shapes_lirr.csv (relative path; default to_csv options).
df_shapes.to_csv('shapes_lirr.csv')

## Trips - LIRR

In [0]:
# Show the field names of the first trip record.
json_parsed['gtfs']['trips'][0].keys()

dict_keys(['route_id', 'service_id', 'trip_id', 'trip_headsign', 'trip_short_name', 'direction_id', 'shape_id'])

In [0]:
# Empty frame whose columns are the field names of the first trip record.
df_trips = pd.DataFrame(columns=list(json_parsed['gtfs']['trips'][0]))

In [0]:
# Number of trip records in the feed.
len(json_parsed['gtfs']['trips'])

2618

In [0]:
# Build the trips table in a single constructor call. Every feed record is a
# dict, so pandas matches values to columns by key; the column order comes
# from the existing (empty) df_trips template. This replaces row-by-row
# DataFrame.append, which was removed in pandas 2.0 and copied the whole frame
# on every iteration. Rebuilding from the feed also makes the cell safe to
# re-run without duplicating rows.
df_trips = pd.DataFrame(json_parsed['gtfs']['trips'], columns=df_trips.columns)

In [0]:
# Write the trips table to trips_lirr.csv (relative path; default to_csv options).
df_trips.to_csv('trips_lirr.csv')

## Stop Times - LIRR

In [0]:
# Show the field names of the first stop-time record.
json_parsed['gtfs']['stop_times'][0].keys()

dict_keys(['trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence'])

In [0]:
# Empty frame whose columns are the field names of the first stop-time record.
df_stop_times = pd.DataFrame(columns=list(json_parsed['gtfs']['stop_times'][0]))

In [0]:
# Number of stop-time records in the feed.
len(json_parsed['gtfs']['stop_times'])

29372

In [0]:
# Build the stop-times table in a single constructor call. Every feed record
# is a dict, so pandas matches values to columns by key; the column order
# comes from the existing (empty) df_stop_times template. This replaces
# row-by-row DataFrame.append, which was removed in pandas 2.0 and copied the
# whole frame on every iteration. The progress counter that loop printed is
# dropped because there is no longer a Python-level loop to report on.
# Rebuilding from the feed also makes the cell safe to re-run without
# duplicating rows.
df_stop_times = pd.DataFrame(json_parsed['gtfs']['stop_times'], columns=df_stop_times.columns)

In [0]:
# Write the stop-times table to stop_times_lirr.csv (relative path; default to_csv options).
df_stop_times.to_csv('stop_times_lirr.csv')

## Calendar Dates - LIRR

In [0]:
# Show the field names of the first calendar-date record.
json_parsed['gtfs']['calendar_dates'][0].keys()

dict_keys(['service_id', 'date', 'exception_type'])

In [0]:
# Empty frame whose columns are the field names of the first calendar-date record.
df_calendar_dates = pd.DataFrame(columns=list(json_parsed['gtfs']['calendar_dates'][0]))

In [0]:
# Number of calendar-date records in the feed.
len(json_parsed['gtfs']['calendar_dates'])

337

In [0]:
# Build the calendar-dates table in a single constructor call. Every feed
# record is a dict, so pandas matches values to columns by key; the column
# order comes from the existing (empty) df_calendar_dates template. This
# replaces row-by-row DataFrame.append, which was removed in pandas 2.0 and
# copied the whole frame on every iteration. Rebuilding from the feed also
# makes the cell safe to re-run without duplicating rows.
df_calendar_dates = pd.DataFrame(json_parsed['gtfs']['calendar_dates'], columns=df_calendar_dates.columns)

In [0]:
# Write the calendar-dates table to calendar_dates_lirr.csv (relative path; default to_csv options).
df_calendar_dates.to_csv('calendar_dates_lirr.csv')

## Distance Between Different Stations

In [0]:
# Haversine Formula
import math

def distance(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points (haversine formula).

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the origin, in decimal degrees.
    lat2, lon2 : float
        Latitude and longitude of the destination, in decimal degrees.
    radius : float, optional
        Radius of the sphere. The result is expressed in the same unit.
        Defaults to 6371, i.e. kilometres.

    Returns
    -------
    float
        Distance along the sphere's surface between the two points.
    """
    sin_half_dlat = math.sin(math.radians(lat2 - lat1) / 2)
    sin_half_dlon = math.sin(math.radians(lon2 - lon1) / 2)

    a = sin_half_dlat * sin_half_dlat + math.cos(math.radians(lat1)) \
        * math.cos(math.radians(lat2)) * sin_half_dlon * sin_half_dlon
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make sqrt(1 - a) raise ValueError; clamp it.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    return radius * c

In [0]:
# Distance between every ordered pair of distinct stops (both A->B and B->A
# are kept). Rows are collected in a plain list and the frame is built once:
# DataFrame.append was removed in pandas 2.0, and calling it per pair copied
# the growing frame on every iteration.
stop_points = [
    (stop_id, stop_name, float(stop_lat), float(stop_lon))
    for stop_id, stop_name, stop_lat, stop_lon in zip(
        df_stops['stop_id'], df_stops['stop_name'],
        df_stops['stop_lat'], df_stops['stop_lon'])
]
pair_rows = [
    (name1, name2, distance(lat1, lon1, lat2, lon2))
    for id1, name1, lat1, lon1 in stop_points
    for id2, name2, lat2, lon2 in stop_points
    if id1 != id2  # skip pairing a stop with itself
]
df_stops_distance = pd.DataFrame(pair_rows, columns=['stop1', 'stop2', 'distance'])


In [30]:
# Summary statistics of the pairwise stop distances.
df_stops_distance['distance'].describe()
# Minimum noted from a previous run:
#min          0.640545

count    15252.000000
mean        39.732956
std         34.855434
min          0.640545
25%         14.781754
50%         26.661561
75%         53.420863
max        174.748899
Name: distance, dtype: float64

In [35]:
# Display the pairwise distance table.
df_stops_distance

Unnamed: 0,stop1,stop2,distance
0,Penn Station,Atlantic Terminal,7.600665
1,Penn Station,Greenport,142.376899
2,Penn Station,Great River,69.373662
3,Penn Station,Glen Street,33.539425
4,Penn Station,Greenvale,31.713384
...,...,...,...
15247,Westhampton,Gibson,90.848204
15248,Westhampton,Glen Cove,81.335443
15249,Westhampton,Garden City,84.212068
15250,Westhampton,Glen Head,82.097619
