In [2]:
import pandas as pd
import glob
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
import re
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
root = 'data/'

In [3]:
from geopandas import GeoDataFrame
from geopandas import sjoin
from shapely.geometry import Point
from shapely.geometry import LineString

# LOAD STATION DATA

Source : NYC MTA (Subway Stations Data)

Description : Description of all the subway stations in NYC. Useful fields are 'STOP_NAME' (station name) and 'GTFS Latitude' and 'GTFS Longitude' (geographic coordinates of the station)

Processing : This dataset has been processed by the stations.ipynb python notebook and saved to Stations_geomerged.geojson

Following data issues have been addressed :

    1. There is no unique identifier that represents stations across the NYC MTA database : The 'STATION' column of the MTA turnstile dataset is the only identifier for the station in that set. The contents of this column differed significantly from the 'STOP_NAME' column of the stations dataset. For example, the station named 'TIMES SQ-42 ST' in one set was represented as 'Times Sq - 42 St' in the other. Although, issues like this were easy to fix, there were a fair number of cases where a station named 'Astoria - Ditmars Blvd' did not have any obvious match in the other data set. A possible cause for cases like this is the use of different station names for the same station ('Astoria - Ditmars Blvd' station was earlier known as 'Second Avenue'). Cases like this are hard, if not impossible to match. A python string-matching library called 'fuzzy-wuzzy' was used to find the best matches using 3 Levenshtein closeness ratios (normal ratio, partial ratio and token sort ratio). The match was accepted only if one of the three matching methods returned a ratio of 88% or higher. The matching station names from the trunstile dataset were added to the 'STATION' column of the stations dataset.

    2. The columns 'GTFS Latitude' and 'GTFS Longitude' required further processing in order to be readily consumable for joins (geographic) across different datasets (for example with the traffic and taxi/cab datasets) : The python geopandas library was leveraged was this purpose (This library in turn depends on shapely, fiona and rtree). 'GTFS Latitude' and 'GTFS Longitude' were merged into a single 'Point' geometry (shapely.geometry.Point) and the entire datset converted to a geopandas GeoDataFrame. This allows for fairly easy (though sometimes computationally expensive) joins across datasets using the geometry attributes like, points, lines and polygons. A circle of customizable radius, centered at each station, was also drawn and added to a new geometry column containing the circles as polygons. These circles represent 'circles of influence' or zones for each station and will be used to find intersection with traffic and taxi/cab data.

The processed data is saved in geojson format, to enable quick reading in the clean_and_wrangle notebook

In [4]:
file = root+'transit/Stations.csv'
df_stations = pd.read_csv(file,usecols=['Station ID','GTFS Stop ID','Stop Name','Borough','GTFS Latitude','GTFS Longitude'])
df_stations.info()
df_stations.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 6 columns):
Station ID        497 non-null int64
GTFS Stop ID      497 non-null object
Stop Name         497 non-null object
Borough           497 non-null object
GTFS Latitude     497 non-null float64
GTFS Longitude    497 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 23.4+ KB


Unnamed: 0,Station ID,GTFS Stop ID,Stop Name,Borough,GTFS Latitude,GTFS Longitude
0,1,R01,Astoria - Ditmars Blvd,Q,40.775036,-73.912034
1,2,R03,Astoria Blvd,Q,40.770258,-73.917843
2,3,R04,30 Av,Q,40.766779,-73.921479
3,4,R05,Broadway,Q,40.76182,-73.925508
4,5,R06,36 Av,Q,40.756804,-73.929575


In [5]:
df_stations.columns = ['STATION_ID','STOP_ID','STOP_NAME','BOROUGH','LATITUDE','LONGITUDE']
df_stations.info()
df_stations.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 6 columns):
STATION_ID    497 non-null int64
STOP_ID       497 non-null object
STOP_NAME     497 non-null object
BOROUGH       497 non-null object
LATITUDE      497 non-null float64
LONGITUDE     497 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 23.4+ KB


Unnamed: 0,STATION_ID,STOP_ID,STOP_NAME,BOROUGH,LATITUDE,LONGITUDE
0,1,R01,Astoria - Ditmars Blvd,Q,40.775036,-73.912034
1,2,R03,Astoria Blvd,Q,40.770258,-73.917843
2,3,R04,30 Av,Q,40.766779,-73.921479
3,4,R05,Broadway,Q,40.76182,-73.925508
4,5,R06,36 Av,Q,40.756804,-73.929575


#convert to geodataframe

In [6]:
geometry = [Point(xy) for xy in zip(df_stations.LATITUDE,df_stations.LONGITUDE)]
df_stations = df_stations.drop(['LATITUDE','LONGITUDE'],axis=1)
crs={'init':'epsg:4326'}
geodf_stations = GeoDataFrame(df_stations,crs=crs,geometry=geometry)

In [7]:
geodf_stations.info()
geodf_stations.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 5 columns):
STATION_ID    497 non-null int64
STOP_ID       497 non-null object
STOP_NAME     497 non-null object
BOROUGH       497 non-null object
geometry      497 non-null object
dtypes: int64(1), object(4)
memory usage: 19.5+ KB


Unnamed: 0,STATION_ID,STOP_ID,STOP_NAME,BOROUGH,geometry
0,1,R01,Astoria - Ditmars Blvd,Q,POINT (40.775036 -73.91203399999999)
1,2,R03,Astoria Blvd,Q,POINT (40.770258 -73.917843)
2,3,R04,30 Av,Q,POINT (40.766779 -73.92147900000001)
3,4,R05,Broadway,Q,POINT (40.76182 -73.92550799999999)
4,5,R06,36 Av,Q,POINT (40.756804 -73.929575)


In [8]:
#add a new geometry to geodf_stations of a circle of X miles around each station
X = 0.01
geodf_stations['CIRCLE'] = geodf_stations.geometry.buffer(X)
geodf_stations.geometry.name
geodf_stations = geodf_stations.rename(columns={'geometry':'POINT'}).set_geometry('CIRCLE')
geodf_stations.geometry.name
geodf_stations.info()
geodf_stations.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 497 entries, 0 to 496
Data columns (total 6 columns):
STATION_ID    497 non-null int64
STOP_ID       497 non-null object
STOP_NAME     497 non-null object
BOROUGH       497 non-null object
POINT         497 non-null object
CIRCLE        497 non-null object
dtypes: int64(1), object(5)
memory usage: 23.4+ KB


Unnamed: 0,STATION_ID,STOP_ID,STOP_NAME,BOROUGH,POINT,CIRCLE
0,1,R01,Astoria - Ditmars Blvd,Q,POINT (40.775036 -73.91203399999999),"POLYGON ((40.785036 -73.91203399999999, 40.784..."
1,2,R03,Astoria Blvd,Q,POINT (40.770258 -73.917843),"POLYGON ((40.780258 -73.917843, 40.78020984726..."
2,3,R04,30 Av,Q,POINT (40.766779 -73.92147900000001),"POLYGON ((40.776779 -73.92147900000001, 40.776..."
3,4,R05,Broadway,Q,POINT (40.76182 -73.92550799999999),"POLYGON ((40.77182 -73.92550799999999, 40.7717..."
4,5,R06,36 Av,Q,POINT (40.756804 -73.929575),"POLYGON ((40.76680399999999 -73.929575, 40.766..."


# LOAD STATIONS FROM TRANSIT DATA

In [9]:
col_func = lambda x:x.strip().upper() in ['STATION']

In [31]:
#consider exits and entries both or just one of them? and why?
file = root + 'transit/all_turnstile_1617.txt'
transit_df = pd.read_csv(file,header=0,
                         usecols = col_func,skipinitialspace=True, low_memory=False, squeeze=True)

In [32]:
transit_df.head()

0    59 ST
1    59 ST
2    59 ST
3    59 ST
4    59 ST
Name: STATION, dtype: object

In [33]:
transit_df = transit_df.rename(columns=lambda x: x.strip())

#read multiple files in a loop into dataframes and then concat them

In [34]:
transit_df = transit_df.drop_duplicates()
transit_df = transit_df.dropna()

In [35]:
transit_df.head()

0                59 ST
504         5 AV/59 ST
1092        57 ST-7 AV
1679             49 ST
2603    TIMES SQ-42 ST
dtype: object

# FUZZY EXTRACT STATION NAMES FOR TRANSIT DATA

In [36]:
stations_fuzz_1 = [process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.ratio) for station in transit_df]
stations_fuzz_2 = [process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.partial_ratio) for station in transit_df]
stations_fuzz_3 = [process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.token_sort_ratio) for station in transit_df]


In [40]:
stations_fuzzy = []
for station in transit_df:
    station_fuzz_1 = process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.ratio)
    station_fuzz_2 = process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.partial_ratio)
    station_fuzz_3 = process.extractOne(station,geodf_stations.STOP_NAME, scorer=fuzz.token_sort_ratio)
    stations = {station_fuzz_1[0]:station_fuzz_1[1],station_fuzz_2[0]:station_fuzz_2[1],station_fuzz_3[0]:station_fuzz_3[1]}
    station_max = max(stations.keys(),key=lambda key: stations[key])
    if stations[station_max] > 88:
        stations_fuzzy.append(station_max)
    else:
        stations_fuzzy.append(np.nan)

In [54]:
st_df = pd.concat([transit_df.reset_index(drop=True),pd.DataFrame(stations_fuzzy,columns=['fuzzy_stop'])],axis=1,ignore_index=True)
st_df.head(15)

Unnamed: 0,0,1
0,59 ST,59 St
1,5 AV/59 ST,5 Av/59 St
2,57 ST-7 AV,57 St - 7 Av
3,49 ST,49 St
4,TIMES SQ-42 ST,Times Sq - 42 St
5,34 ST-HERALD SQ,34 St - Herald Sq
6,28 ST,28 St
7,23 ST,23 St
8,14 ST-UNION SQ,14 St - Union Sq
9,8 ST-NYU,8 St - NYU


# ADD A COLUMN STATION, IN TURNSTILE FORMAT, TO THE STATIONS GEO DF

In [53]:
geodf_stations_merged = pd.merge(geodf_stations,st_df.dropna(),how='left',left_on='STOP_NAME',right_on=1)
geodf_stations_merged = geodf_stations_merged.drop(columns=[1]).rename(columns={0:'STATION'})
geodf_stations_merged.info()
geodf_stations_merged.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 538 entries, 0 to 537
Data columns (total 7 columns):
STATION_ID    538 non-null int64
STOP_ID       538 non-null object
STOP_NAME     538 non-null object
BOROUGH       538 non-null object
POINT         538 non-null object
CIRCLE        538 non-null object
STATION       444 non-null object
dtypes: int64(1), object(6)
memory usage: 33.6+ KB


Unnamed: 0,STATION_ID,STOP_ID,STOP_NAME,BOROUGH,POINT,CIRCLE,STATION
0,1,R01,Astoria - Ditmars Blvd,Q,POINT (40.775036 -73.91203399999999),"POLYGON ((40.785036 -73.91203399999999, 40.784...",
1,2,R03,Astoria Blvd,Q,POINT (40.770258 -73.917843),"POLYGON ((40.780258 -73.917843, 40.78020984726...",ASTORIA BLVD
2,3,R04,30 Av,Q,POINT (40.766779 -73.92147900000001),"POLYGON ((40.776779 -73.92147900000001, 40.776...",30 AV
3,4,R05,Broadway,Q,POINT (40.76182 -73.92550799999999),"POLYGON ((40.77182 -73.92550799999999, 40.7717...",BROADWAY
4,5,R06,36 Av,Q,POINT (40.756804 -73.929575),"POLYGON ((40.76680399999999 -73.929575, 40.766...",36 AV


In [60]:
geodf_stations_merged.drop(columns=['POINT']).to_file(root+'transit/Stations_geomerged.geojson',driver='GeoJSON')