In [1]:
# ------------------------------------------------ #
# IMPORT LIBRARIES AND SET VARIABLES
# ------------------------------------------------ #

# Adapted from https://dev.socrata.com/foundry/data.ny.gov/wujg-7c2s 

# --- imports ---- #

import pandas as pd
from sodapy import Socrata

# secrets handling: values are loaded from a local .env file into the
# process environment so the token never has to appear in the notebook
from dotenv import load_dotenv
import os

load_dotenv()

# app token for data.ny.gov, read from the API_TOKEN environment variable
# (evaluates to None when the variable is not set)
my_token = os.environ.get("API_TOKEN")



In [150]:
# ------------------------------------------------ #
# MAKE CALL TO API AND TURN INTO DATAFRAME
# ------------------------------------------------ #

# Socrata client for data.ny.gov, created with the app token read from the
# environment. (Per the Socrata example this was adapted from, a client built
# with None in place of the token is unauthenticated and only works with
# public data sets.)
client = Socrata("data.ny.gov", my_token)

# ----- Change these variables to download each month ------ #

# By default, SODA will only return 1000 records max; the limit parameter raises it to 50,000
# In theory this API is v2.1 which should be endless, but the request times out for downloading a year

# Some context of record size:
# Woodside 61-St   | Bryant Park
# Jan 1 - 10: 1771 | 1868  records
# Jan 1 - 31: 5503 | 5810  records

# year is in XXXX form, m is month in XX form
# NOTE: station_name below also embeds the year -- keep the two in sync
year = '2023'
m = '12'

# dlast is last day of month in XX form. It is derived from year and m so it
# can never disagree with the month being downloaded (a hand-typed value could
# produce an invalid date or silently skip the final days of the month).
dlast = f"{pd.Period(year=int(year), month=int(m), freq='M').days_in_month:02d}"


# Filenames to save to

# alternative station: station_name = '456_61St-Woodside_2023' (with station_id = '456')
station_name = '609_BryantPk5Av_2023'
station_id = '609'
save_string_raw = f"./data/{year}/raw/{station_id}/{station_name}_{m}_raw.csv"
save_string_cleaned = f"./data/{year}/cleaned/{station_id}/{station_name}_{m}_cleaned.csv"

# Woodside Station ID is 456
# 42nd St Bryant Park ID is 609

# Maximum number of records requested in one call
record_limit = 50000

# SoQL filter: one station, from the first hour of the month to the 23:00 hour of its last day
where_clause = (
    f"station_complex_id = '{station_id}' "
    f"AND transit_timestamp between '{year}-{m}-01T00:00:00' "
    f"and '{year}-{m}-{dlast}T23:00:00'"
)

results = client.get("wujg-7c2s",
                     limit=record_limit,
                     where=where_clause)

# Fail loudly instead of carrying an empty or truncated month into the CSVs.
if not results:
    raise ValueError(
        "No records returned for station " + station_id + " in " + year + "-" + m
        + "; check station_id, year and m."
    )
if len(results) >= record_limit:
    # Hitting the limit means more rows may exist on the server than were returned.
    raise RuntimeError(
        "Received " + str(len(results)) + " records, which reaches the limit of "
        + str(record_limit) + "; the download is probably truncated. "
        "Narrow the date range or page through the results."
    )


# Convert to pandas DataFrame
results_df = pd.DataFrame.from_records(results)

# Print shape of array
# Rows x Cols (# records, # parameters)
# Expected 12 parameters
results_df.shape


(6050, 12)

In [151]:
# ------------------------------------------------ #
# WRITE RAW TO CSV
# ------------------------------------------------ #

# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout or the first download for a station).
raw_dir = os.path.dirname(save_string_raw)
if raw_dir:
    os.makedirs(raw_dir, exist_ok=True)

# Save the untouched API response, without the DataFrame index column
results_df.to_csv(save_string_raw, index = False)

In [152]:
# ------------------------------------------------ #
# REMOVE NA & UNNECESSARY COLUMNS
# ------------------------------------------------ #

# Columns that are not used by the rest of the notebook
unused_columns = ['latitude',
                  'longitude',
                  'georeference',
                  'transit_mode',
                  'payment_method',
                  'fare_class_category']

# Drop the unused columns FIRST and only then remove rows with NaNs.
# Doing it the other way round would throw away a row (and its ridership)
# just because a column that is discarded anyway had a missing value.
# The data seems relatively clean, so few rows are expected to be removed.
results_clean = (
    results_df
    .drop(columns=unused_columns)
    .dropna()
    .sort_values(by=['transit_timestamp'])
)

# Print head
# the 'earlier' records are larger indexes in the df
results_clean[:10]


Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,transfers
2506,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,289.0,0.0
2459,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,17.0,0.0
2602,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,64.0,0.0
2616,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,17.0,0.0
2440,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,117.0,1.0
2481,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,20.0,1.0
2608,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,1.0,0.0
2619,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,82.0,0.0
2574,2023-12-01T01:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,1.0,0.0
2569,2023-12-01T01:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,23.0,0.0


In [153]:
# ------------------------------------------------ #
# GROUP & SUM DATATYPES
# ------------------------------------------------ #

# ridership and transfers start out as object dtype; convert both to float
# in one step so they can be summed
results_clean = results_clean.astype({'ridership': 'float', 'transfers': 'float'})

# (inspect results_clean.dtypes here if the conversion needs checking)

# Records that share these key columns are collapsed into a single row
group_keys = ["transit_timestamp", "station_complex_id", "station_complex", "borough"]

# Sum the remaining numeric columns per key, then turn the keys back into ordinary columns
summed_by_key = results_clean.groupby(by=group_keys).sum()
results_grouped = summed_by_key.reset_index()

results_grouped[:10]

Unnamed: 0,transit_timestamp,station_complex_id,station_complex,borough,ridership,transfers
0,2023-12-01T00:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,607.0,2.0
1,2023-12-01T01:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,168.0,0.0
2,2023-12-01T02:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,76.0,2.0
3,2023-12-01T03:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,56.0,2.0
4,2023-12-01T04:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,47.0,0.0
5,2023-12-01T05:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,96.0,15.0
6,2023-12-01T06:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,206.0,45.0
7,2023-12-01T07:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,337.0,49.0
8,2023-12-01T08:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,478.0,48.0
9,2023-12-01T09:00:00.000,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan,663.0,29.0


In [154]:
# ------------------------------------------------ #
# SPLIT OUT DATETIME 
# ------------------------------------------------ #

# Parse the timestamp strings once; every calendar field below is read from this
parsed_ts = pd.to_datetime(results_grouped['transit_timestamp'])

# Replace the string column with real datetimes and add one column per calendar part
results_grouped = results_grouped.assign(
    transit_timestamp=parsed_ts,
    year=parsed_ts.dt.year,
    month=parsed_ts.dt.month,
    day=parsed_ts.dt.day,
    hour=parsed_ts.dt.hour,
    dayofweek=parsed_ts.dt.dayofweek,
)

# Final column layout: time fields first, then the counts, then the station details
column_order = [
    'transit_timestamp',
    'year',
    'month',
    'day',
    'dayofweek',
    'hour',
    'ridership',
    'transfers',
    'station_complex_id',
    'station_complex',
    'borough',
]
results_grouped = results_grouped.reindex(columns=column_order)


results_grouped[:10]


Unnamed: 0,transit_timestamp,year,month,day,dayofweek,hour,ridership,transfers,station_complex_id,station_complex,borough
0,2023-12-01 00:00:00,2023,12,1,4,0,607.0,2.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
1,2023-12-01 01:00:00,2023,12,1,4,1,168.0,0.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
2,2023-12-01 02:00:00,2023,12,1,4,2,76.0,2.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
3,2023-12-01 03:00:00,2023,12,1,4,3,56.0,2.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
4,2023-12-01 04:00:00,2023,12,1,4,4,47.0,0.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
5,2023-12-01 05:00:00,2023,12,1,4,5,96.0,15.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
6,2023-12-01 06:00:00,2023,12,1,4,6,206.0,45.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
7,2023-12-01 07:00:00,2023,12,1,4,7,337.0,49.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
8,2023-12-01 08:00:00,2023,12,1,4,8,478.0,48.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan
9,2023-12-01 09:00:00,2023,12,1,4,9,663.0,29.0,609,"Bryant Pk (B,D,F,M)/5 Av (7)",Manhattan


In [155]:
# ------------------------------------------------ #
# WRITE TO CSV
# ------------------------------------------------ #

# to_csv does not create missing folders, so make sure the target directory
# exists first (e.g. on a fresh checkout or the first download for a station).
cleaned_dir = os.path.dirname(save_string_cleaned)
if cleaned_dir:
    os.makedirs(cleaned_dir, exist_ok=True)

# Save the grouped, datetime-expanded table, without the DataFrame index column
results_grouped.to_csv(save_string_cleaned, index = False)