In [1]:
#Initial Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import glob
import os
import googlemaps

In [2]:
# Load the weekly NYCT*.csv exports (presumably NYC MTA turnstile data) for
# March 3rd to March 30th (4 weeks) into a single DataFrame.
#df = pd.read_csv("NYCT180310.csv")
path = r'/Users/petermin/metis/01-benson'  # use your path
# glob returns matches in arbitrary order; sort them so the row order of df
# (and everything derived from it, such as the station order) is the same on
# every run.
all_files = sorted(glob.glob(os.path.join(path, "NYCT*.csv")))
if not all_files:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError("No NYCT*.csv files found in {!r}".format(path))
# ignore_index=True: every file restarts its row labels at 0, so a plain concat
# would leave duplicate index labels in the combined frame.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
# Some column headers carry stray whitespace; normalise them
df.columns = df.columns.str.strip()

In [5]:
# Collect the distinct station names found in the STATION column
station_column = df["STATION"]
stations = station_column.unique()
# Preview the first ten names
stations[:10]

array(['59 ST', '5 AV/59 ST', '57 ST-7 AV', '49 ST', 'TIMES SQ-42 ST',
       '34 ST-HERALD SQ', '28 ST', '23 ST', '14 ST-UNION SQ', '8 ST-NYU'],
      dtype=object)

## Get Station Zipcode

In [None]:
# Fetch the ZIP code of each station from the Geocoding API.
# Each station name is turned into a query by appending " station, NY".
# The suffix is only added when it is missing, so re-running this cell does not
# produce queries such as "59 ST station, NY station, NY".
_QUERY_SUFFIX = " station, NY"
stations = np.array(
    [name if name.endswith(_QUERY_SUFFIX) else name + _QUERY_SUFFIX for name in stations],
    dtype=object,
)
def getzipcode(ser, client=None):
    """Look up a ZIP code for every station query via a geocoding client.

    Parameters
    ----------
    ser : iterable of str
        Geocoding queries, one per station.
    client : optional
        Object exposing ``geocode(query)`` that returns a list of result
        dicts. When omitted, the notebook-level ``gmaps`` client is used.

    Returns
    -------
    dict
        Maps each query to the ``long_name`` of the ``postal_code`` address
        component of its first result, or to ``None`` when that result has no
        such component. Queries that return no result at all are left out.
    """
    if client is None:
        # Looked up at call time, so the global client only has to exist
        # when the function is actually used without an explicit client.
        client = gmaps
    station_dict = dict()
    for station in ser:
        results = client.geocode(station)
        if not results:
            # No match for this query: report it and move on to the next station
            print("index error at index=", len(station_dict))
            continue
        # Pick the component by its type instead of by position: the last
        # component is frequently the postal-code suffix or the country,
        # which is how wrong "ZIP codes" ended up in the result.
        zipcode = None
        for component in results[0]["address_components"]:
            if "postal_code" in component.get("types", []):
                zipcode = component["long_name"]
                break
        station_dict[station] = zipcode
        # Progress report every 50 stored stations
        if len(station_dict) % 50 == 0:
            print("index =", len(station_dict), "zipcode =", zipcode)
    return station_dict
# Geocode every station query; the result maps query string -> ZIP code value
station_dict = getzipcode(ser=stations)

In [None]:
# Save the station -> ZIP code dictionary as a text file (its Python repr)
import csv  # not used in this cell; kept in case other cells rely on it
# The context manager closes the file even if the write fails
with open("station_dict.txt", "w") as f:
    f.write(str(station_dict))

In [None]:
# Convert the station -> ZIP code dictionary into a DataFrame with one row per
# station query. (The numeric conversion and the CSV export happen in later cells.)
zipcode_df = pd.DataFrame(list(station_dict.items()), columns=['STATION', 'zipcode'])
# latlong_df is not built here: latlong_dict only exists once the coordinates
# section has run, and that section creates latlong_df itself.
# Strip the " station, NY" query suffix to recover the plain station names
# (raw string so that \s reaches the regex engine without an invalid-escape warning).
zipcode_df["STATION"] = zipcode_df["STATION"].replace(r"\sstation,\sNY", "", regex=True)
zipcode_df.head()

In [None]:
# Coerce the geocoded values to numbers; anything non-numeric becomes NaN
zipcode_numeric = pd.to_numeric(zipcode_df["zipcode"], errors='coerce', downcast='integer')
zipcode_df["zipcode"] = zipcode_numeric

# Find the unassigned stations: those where geocoding returned no usable ZIP
# code (NaN) or a value below 10000
is_missing = zipcode_df.zipcode.isnull()
is_below_range = zipcode_df.zipcode < 10000
unassigned = zipcode_df[is_missing | is_below_range]
unassigned

In [None]:
# Manually assign ZIP codes for stations that were mislocated or missing.
# Keys are row positions in zipcode_df; values are written to column position 1.
manual_zipcodes = {
    0: 11207.0,
    4: 10018.0,
    8: 10003.0,
    10: 10012.0,
    16: 10002.0,
    41: 11217.0,
    61: 11219.0,
    67: 10019.0,
    78: 11207.0,
    87: 11430.0,
    107: 11418.0,
    129: 10023.0,
    152: 11416.0,
    190: 11375.0,
    193: 11415.0,
    229: 11432.0,
    # New Jersey stations & Lackawanna are skipped
    245: 10001.0,
    # New Jersey / Newark city is skipped
    258: 10040.0,
    324: 11101.0,
    330: 11377.0,
    334: 11372.0,
    352: 11212.0,
    # RIT-MANHATTAN (no value assigned)
}
for row_position, manual_zipcode in manual_zipcodes.items():
    zipcode_df.iloc[row_position, 1] = manual_zipcode

# To re-check what is still unassigned:
# unassigned = zipcode_df[(zipcode_df.zipcode.isnull()) | (zipcode_df.zipcode < 10000)]
# unassigned

In [None]:
# Cast the ZIP code column to 64-bit integers ahead of the CSV export.
# NOTE: astype("int64") raises if any NaN is still present in the column.
zipcode_df["zipcode"] = zipcode_df["zipcode"].astype("int64")
#zipcode_df.info()

In [None]:
# Persist the station / ZIP code table as CSV (the row index is written as well)
zipcode_csv_path = "zipcode_df.csv"
zipcode_df.to_csv(zipcode_csv_path)

## Get station coordinates

In [None]:
# Fetch lat & long data from the Geocoding API.
# The ZIP code section already appends " station, NY" to `stations`; appending
# it again here would produce queries like "59 ST station, NY station, NY".
# Add the suffix only where it is missing so this cell works whether or not
# the earlier cell has run, and can be re-run safely.
_QUERY_SUFFIX = " station, NY"
stations = np.array(
    [name if name.endswith(_QUERY_SUFFIX) else name + _QUERY_SUFFIX for name in stations],
    dtype=object,
)
def getlatlong(ser, client=None):
    """Look up the coordinates of every station query via a geocoding client.

    Parameters
    ----------
    ser : iterable of str
        Geocoding queries, one per station.
    client : optional
        Object exposing ``geocode(query)`` that returns a list of result
        dicts. When omitted, the notebook-level ``gmaps`` client is used.

    Returns
    -------
    dict
        Maps each query to the ``["geometry"]["location"]`` value of its first
        geocoding result. Queries that return no result are left out.
    """
    if client is None:
        # Looked up at call time, so the global client only has to exist
        # when the function is actually used without an explicit client.
        client = gmaps
    latlong_dict = dict()
    for station in ser:
        results = client.geocode(station)
        if not results:
            # No match for this query: report it and move on to the next station
            print("index error at index=", len(latlong_dict))
            continue
        latlong = results[0]["geometry"]["location"]
        latlong_dict[station] = latlong
        # Progress report every 50 stored stations
        if len(latlong_dict) % 50 == 0:
            print("index =", len(latlong_dict), "latlong =", latlong)
    return latlong_dict
# Geocode every station query; the result maps query string -> location value
latlong_dict = getlatlong(ser=stations)

In [None]:
# Save the station -> coordinates dictionary as a text file (its Python repr)
import csv  # not used in this cell; kept in case other cells rely on it
# Write latlong_dict here: the previous version wrote station_dict (the ZIP
# codes) into latlong_dict.txt. The context manager also guarantees the file
# is closed even if the write fails.
with open("latlong_dict.txt", "w") as f:
    f.write(str(latlong_dict))

In [None]:
# Turn the station -> location dictionary into a DataFrame with one row per
# station query. from_dict puts the queries in the columns, so transpose,
# then move the query strings out of the index into a STATION column.
# Built as a single chained expression (no inplace mutation) so the cell can be
# re-run without depending on its own previous result.
latlong_df = (
    pd.DataFrame.from_dict(latlong_dict).T
    .reset_index()
    .rename(columns={"index": "STATION"})
)
# Strip the " station, NY" query suffix to recover the plain station names
# (raw string so that \s reaches the regex engine without an invalid-escape warning).
latlong_df["STATION"] = latlong_df["STATION"].replace(r"\sstation,\sNY", "", regex=True)
# Sanity-check the range of the longitudes
latlong_df.lng.describe()