In [32]:
import pandas as pd

In [33]:
# Load the origin/destination ridership table and clean it for joining with schedules.
riders = pd.read_csv("../data/Ridership/ridership.csv")

# Drop rows we never use:
#  - same-station origin/destination pairs (no one does this, ridership is always 0)
#  - weekend ridership (we don't have schedule data for weekends)
# .copy() makes the filtered frame independent of the raw one, so the column
# assignment below is not a chained assignment (no SettingWithCopyWarning).
is_weekday_trip = (riders["Origin_ID"] != riders["Dest_ID"]) & (riders["TOD"] != "wknd")
riders = riders[is_weekday_trip].copy()

# Ridership counts are read as strings with thousands separators ("1,234");
# strip the literal commas and convert to integers.
riders["Ridership_Number"] = (
    riders["Ridership_Number"].str.replace(",", "", regex=False).astype("int64")
)

# Reduce to only the scenarios we care about.
riders = riders[riders["Scenario"].isin([
    "Ridership_2017_observed",
    "Ridership_2040_HSR2",
    "Ridership_2040_moderate",
    "Ridership_2040_high",
])]

In [34]:
# Load the full combined schedule metrics table; it is used by the inspection cells that follow.
new_trains = pd.read_csv("../data/parsed_data/schedules/combined_schedules_metrics.csv")

In [35]:
# Narrow the schedule table to the scenario / hour / station-pair columns plus the train count.
schedule_key_columns = [
    'scenario',
    'departure_hour',
    'count_journey_time',
    'departure_station',
    'arrival_station',
]
new_trains2 = new_trains[schedule_key_columns]

In [36]:
# Row count before and after de-duplication; equal numbers mean the narrowed table has no duplicate rows.
print(len(new_trains2), len(new_trains2.drop_duplicates()), sep="\n")

37169
37169


In [37]:
# Same duplicate check on the full schedule table: row count before and after de-duplication.
print(len(new_trains), len(new_trains.drop_duplicates()), sep="\n")

37169
37169


In [38]:
# Display the column names available in the full schedule metrics table.
new_trains.columns

Index(['scenario', 'departure_station', 'arrival_station', 'departure_hour',
       'count_journey_time', 'mean_journey_time', 'min_journey_time',
       'max_journey_time', 'count_wait_time', 'mean_wait_time',
       'min_wait_time', 'max_wait_time', 'lat_departure_station',
       'lon_departure_station', 'SB_order_departure_station',
       'NB_order_departure_station', 'lat_arrival_station',
       'lon_arrival_station', 'SB_order_arrival_station',
       'NB_order_arrival_station', 'isochrone_circle_distance'],
      dtype='object')

In [39]:
# Columns needed to count trains per scenario, departure hour and station pair.
schedule_columns = ['scenario', 'departure_hour', 'count_journey_time',
                    'departure_station', 'arrival_station']

# Read the schedule metrics, keep just the information we care about, drop
# same-station pairs (nobody rides from a station to itself) and remove duplicates.
trains = pd.read_csv("../data/parsed_data/schedules/combined_schedules_metrics.csv")[schedule_columns]
trains = trains.loc[trains['arrival_station'] != trains['departure_station']].drop_duplicates()

# Schedule scenario label -> ridership scenario label.
# 'HIgh' is a spelling variant that is mapped to the same ridership scenario as 'High'.
scenario_dict = {
    'Baseline': 'Ridership_2040_HSR2',
    'Existing': 'Ridership_2017_observed',
    'High': 'Ridership_2040_high',
    'HIgh': 'Ridership_2040_high',
    'Moderate': 'Ridership_2040_moderate',
}

# Departure hour -> ridership time-of-day (TOD) block.
time_dict = {
    4: 'ea', 5: 'ea',
    6: 'am', 7: 'am', 8: 'am', 9: 'am', 10: 'am',
    11: 'md', 12: 'md', 13: 'md', 14: 'md',
    15: 'pm', 16: 'pm', 17: 'pm', 18: 'pm', 19: 'pm',
    20: 'ev', 21: 'ev', 22: 'ev', 23: 'ev', 24: 'ev', 25: 'ev',
}

# Translate scenario names and add the TOD block so the schedule lines up with ridership.
# The lookups index the dicts directly, so an unknown label or hour raises KeyError
# rather than silently producing a missing value.
trains = (
    trains
    .assign(Scenario=trains['scenario'].map(lambda label: scenario_dict[label]))
    .drop(columns='scenario')
    .assign(TOD=lambda frame: frame['departure_hour'].map(lambda hour: time_dict[hour]))
)

# Trains per TOD block: drop the hour, then sum the train counts within each
# station pair / scenario / TOD group.
trains_by_tod = (
    trains
    .drop(columns='departure_hour')
    .groupby(['departure_station', 'arrival_station', 'Scenario', 'TOD'])
    .sum()
    .reset_index()
)

In [40]:
# Display the ridership columns; the merge below joins on TOD, Scenario, Origin_Name and Dest_Name.
riders.columns

Index(['TOD', 'Origin_ID', 'Origin_Name', 'Dest_ID', 'Dest_Name', 'Scenario',
       'Ridership_Number'],
      dtype='object')

In [41]:
# Display the schedule columns after the scenario rename and TOD tagging.
trains.columns

Index(['departure_hour', 'count_journey_time', 'departure_station',
       'arrival_station', 'Scenario', 'TOD'],
      dtype='object')

In [42]:
# Spot-check: schedule rows for South San Francisco -> Belmont with departure_hour 11.
ssf_to_belmont_at_11 = (
    new_trains['departure_station'].eq('South San Francisco')
    & new_trains['arrival_station'].eq('Belmont')
    & new_trains['departure_hour'].eq(11)
)
new_trains[ssf_to_belmont_at_11]

Unnamed: 0,scenario,departure_station,arrival_station,departure_hour,count_journey_time,mean_journey_time,min_journey_time,max_journey_time,count_wait_time,mean_wait_time,...,max_wait_time,lat_departure_station,lon_departure_station,SB_order_departure_station,NB_order_departure_station,lat_arrival_station,lon_arrival_station,SB_order_arrival_station,NB_order_arrival_station,isochrone_circle_distance
4436,High,South San Francisco,Belmont,11,2,23.0,23,23,60,18.25,...,44.0,37.654343,-122.406324,4,27,37.521203,-122.276244,12,19,3108.0
4450,Moderate,South San Francisco,Belmont,11,1,21.0,21,21,60,29.5,...,59.0,37.654343,-122.406324,4,27,37.521203,-122.276244,12,19,3276.0
4469,Existing,South San Francisco,Belmont,11,1,27.0,27,27,60,29.5,...,59.0,37.654343,-122.406324,4,27,37.521203,-122.276244,12,19,2772.0


In [44]:
# Attach the per-TOD train counts to every ridership row. The inner join keeps only
# station pairs present in both tables; the schedule's station columns duplicate
# Origin_Name / Dest_Name after the join, so they are dropped.
riders_trains_by_tod = (
    riders
    .merge(
        trains_by_tod,
        how='inner',
        left_on=['TOD', 'Scenario', 'Origin_Name', 'Dest_Name'],
        right_on=['TOD', 'Scenario', 'departure_station', 'arrival_station'],
    )
    .drop(columns=['departure_station', 'arrival_station'])
)

In [45]:
# Distinct stations (name + ID) seen as an origin within each Scenario / TOD.
df2 = (
    riders_trains_by_tod[['TOD', 'Scenario', 'Origin_Name', 'Origin_ID']]
    .rename(columns={"Origin_Name": "Station_Name", "Origin_ID": "Station_ID"})
    .drop_duplicates()
)

# Pair every origin/destination row with every station of the same Scenario / TOD,
# so each row is repeated once per station.
df_tod = riders_trains_by_tod.merge(df2, on=['Scenario', 'TOD'])

In [46]:
origin_id, dest_id, station_id = df_tod['Origin_ID'], df_tod['Dest_ID'], df_tod['Station_ID']

# Direction comes from the ID ordering: origin ID below destination ID is southbound,
# above is northbound. Within a direction keep only the stations at or between the
# origin and destination -- riders of that trip are on the train at those stations.
sb_tod = df_tod[(origin_id < dest_id) & station_id.between(origin_id, dest_id)]
nb_tod = df_tod[(origin_id > dest_id) & station_id.between(dest_id, origin_id)]

# Per station: total riders on board (sum) and the largest train count among the
# contributing station pairs (max).
station_keys = ['TOD', 'Scenario', 'Station_Name']
station_totals = {'Ridership_Number': 'sum', 'count_journey_time': 'max'}
sb_tod_grouped = sb_tod.groupby(station_keys).agg(station_totals).reset_index()
nb_tod_grouped = nb_tod.groupby(station_keys).agg(station_totals).reset_index()

# Riders per train at each station.
sb_tod_grouped['per_train'] = sb_tod_grouped['Ridership_Number'].div(sb_tod_grouped['count_journey_time'])
nb_tod_grouped['per_train'] = nb_tod_grouped['Ridership_Number'].div(nb_tod_grouped['count_journey_time'])

In [47]:
# Eyeball a random sample of the southbound per-TOD results.
# No random_state is passed, so the rows shown differ on every run.
sb_tod_grouped.sample(20)

Unnamed: 0,TOD,Scenario,Station_Name,Ridership_Number,count_journey_time,per_train
479,pm,Ridership_2040_HSR2,Menlo Park,24090,50,481.8
295,ev,Ridership_2040_high,San Mateo,5290,32,165.3125
501,pm,Ridership_2040_high,Broadway,20283,40,507.075
393,md,Ridership_2040_high,Menlo Park,2741,28,97.892857
147,ea,Ridership_2040_HSR2,Menlo Park,154,10,15.4
472,pm,Ridership_2040_HSR2,Burlingame,26925,50,538.5
548,pm,Ridership_2040_moderate,San Jose Diridon,10430,44,237.045455
15,am,Ridership_2017_observed,San Carlos,4152,15,276.8
90,am,Ridership_2040_moderate,California Ave,6635,40,165.875
304,ev,Ridership_2040_moderate,Bayshore,5985,31,193.064516


In [88]:
'''
Ridership is given by Time of day (TOD) blocks. Train schedule is more precise than this.
The below maps can be used to divide up ridership into hourly blocks for better visualizations.

Hour to TOD map:
{4: 'ea', 5: 'ea', 
    6: 'am', 7: 'am', 8: 'am', 9: 'am', 10: 'am', 
    11: 'md', 12: 'md', 13: 'md', 14: 'md', 
    15: 'pm', 16: 'pm', 17: 'pm', 18: 'pm', 19: 'pm', 
    20: 'ev', 21: 'ev', 22: 'ev', 23: 'ev', 24: 'ev', 25: 'ev'}
'''

# Both maps give, for each departure hour, the share of its TOD block's ridership
# assigned to that hour; the shares inside every TOD block add up to 1.

# Uniform split: every hour of a block gets an equal share.
# Hour 25 is given 0.0, so the 'ev' block is shared equally by hours 20-24 only.
hourly_division_uniform = {
    **dict.fromkeys((4, 5), 0.5),            # ea
    **dict.fromkeys(range(6, 11), 0.2),      # am
    **dict.fromkeys(range(11, 15), 0.25),    # md
    **dict.fromkeys(range(15, 20), 0.2),     # pm
    **dict.fromkeys(range(20, 25), 0.2),     # ev
    25: 0.0,
}

# Smoothed split: peaks in the middle of the morning/evening commute blocks and
# tapering edges, to even out the jumps between neighbouring TOD blocks.
hourly_division_smoothed = {
    **dict(zip((4, 5), (0.5, 0.5))),                                   # ea
    **dict(zip(range(6, 11), (0.1, 0.25, 0.3, 0.25, 0.1))),            # am
    **dict(zip(range(11, 15), (0.25, 0.25, 0.25, 0.25))),              # md
    **dict(zip(range(15, 20), (0.1, 0.25, 0.3, 0.25, 0.1))),           # pm
    **dict(zip(range(20, 26), (0.3, 0.25, 0.2, 0.1, 0.1, 0.05))),      # ev
}

In [89]:
# Attach the hourly train counts to every ridership row. `trains` still carries
# departure_hour, so each ridership row is repeated once per scheduled hour in its
# TOD block. The inner join keeps only station pairs present in both tables.
riders_trains_by_hour = (
    riders
    .merge(
        trains,
        how='inner',
        left_on=['TOD', 'Scenario', 'Origin_Name', 'Dest_Name'],
        right_on=['TOD', 'Scenario', 'departure_station', 'arrival_station'],
    )
    .drop(columns=['departure_station', 'arrival_station'])
)

In [90]:
# Distinct stations (name + ID) seen as an origin within each Scenario / TOD.
df2 = (
    riders_trains_by_hour[['TOD', 'Scenario', 'Origin_Name', 'Origin_ID']]
    .rename(columns={"Origin_Name": "Station_Name", "Origin_ID": "Station_ID"})
    .drop_duplicates()
)

# Pair every hourly origin/destination row with every station of the same
# Scenario / TOD, so each row is repeated once per station.
df_hourly = riders_trains_by_hour.merge(df2, on=['Scenario', 'TOD'])

In [91]:
origin_id, dest_id, station_id = df_hourly['Origin_ID'], df_hourly['Dest_ID'], df_hourly['Station_ID']

# Direction comes from the ID ordering: origin ID below destination ID is southbound,
# above is northbound. Within a direction keep only the stations at or between the
# origin and destination -- riders of that trip are on the train at those stations.
sb_hourly = df_hourly[(origin_id < dest_id) & station_id.between(origin_id, dest_id)]
nb_hourly = df_hourly[(origin_id > dest_id) & station_id.between(dest_id, origin_id)]

# Per station and departure hour: total riders on board (sum) and the largest train
# count among the contributing station pairs (max).
hourly_keys = ['departure_hour', 'Scenario', 'Station_Name']
station_totals = {'Ridership_Number': 'sum', 'count_journey_time': 'max'}
sb_hourly_grouped = sb_hourly.groupby(hourly_keys).agg(station_totals).reset_index()
nb_hourly_grouped = nb_hourly.groupby(hourly_keys).agg(station_totals).reset_index()



In [92]:
# Uniform distribution: scale each row's TOD-block ridership by its hour's share,
# then divide by the train count to get riders per train.
# The same two columns are added in place to the southbound and northbound frames.
for grouped in (sb_hourly_grouped, nb_hourly_grouped):
    hour_share = grouped['departure_hour'].map(lambda hour: hourly_division_uniform[hour])
    grouped['Hourly_Ridership_uniform'] = hour_share * grouped['Ridership_Number']
    grouped['per_train_uniform'] = grouped['Hourly_Ridership_uniform'] / grouped['count_journey_time']


In [93]:
# Smoothed distribution: scale each row's TOD-block ridership by its hour's share,
# then divide by the train count to get riders per train.
# The same two columns are added in place to the southbound and northbound frames.
for grouped in (sb_hourly_grouped, nb_hourly_grouped):
    hour_share = grouped['departure_hour'].map(lambda hour: hourly_division_smoothed[hour])
    grouped['Hourly_Ridership_smoothed'] = hour_share * grouped['Ridership_Number']
    grouped['per_train_smoothed'] = grouped['Hourly_Ridership_smoothed'] / grouped['count_journey_time']

In [94]:
# Eyeball a random sample of the southbound hourly results (uniform and smoothed columns).
# No random_state is passed, so the rows shown differ on every run.
sb_hourly_grouped.sample(20)

Unnamed: 0,departure_hour,Scenario,Station_Name,Ridership_Number,count_journey_time,Hourly_Ridership_uniform,per_train_uniform,Hourly_Ridership_smoothed,per_train_smoothed
228,6,Ridership_2040_moderate,Gilroy,99,9,19.8,2.2,9.9,1.1
1346,16,Ridership_2040_moderate,Palo Alto,18509,8,3701.8,462.725,4627.25,578.40625
814,12,Ridership_2017_observed,Palo Alto,535,1,133.75,133.75,133.75,133.75
258,7,Ridership_2017_observed,Mountain View,1949,5,389.8,77.96,487.25,97.45
877,12,Ridership_2040_high,South San Francisco,3517,11,879.25,79.931818,879.25,79.931818
1783,20,Ridership_2040_high,South San Francisco,6073,10,1214.6,121.46,1821.9,182.19
753,11,Ridership_2040_high,Capitol,352,8,88.0,11.0,88.0,11.0
885,12,Ridership_2040_moderate,Belmont,2810,5,702.5,140.5,702.5,140.5
1364,17,Ridership_2017_observed,College Park,2861,4,572.2,143.05,858.3,214.575
1817,21,Ridership_2017_observed,Bayshore,11,1,2.2,2.2,2.75,2.75


In [96]:
# Load the station key table; add_location_data() joins it to the results on "pretty_station".
locs = pd.read_csv("../data/station_lat_lon_key.csv")

In [97]:
def add_location_data(ridership_df, locations_df):
    """Attach station location columns to a per-station ridership table.

    Parameters
    ----------
    ridership_df : DataFrame
        Must contain a "Station_Name" column; its values are matched against
        the "pretty_station" values of ``locations_df``.
    locations_df : DataFrame
        Station key table with "pretty_station", "nasty_station", "SB_order"
        and "NB_order" columns plus the location columns to attach.

    Returns
    -------
    DataFrame
        ``ridership_df`` with every remaining ``locations_df`` column appended
        (left join: stations without a match get missing values). Neither
        input frame is modified.
    """
    # Index the key table by display name and discard the columns the output doesn't need.
    station_lookup = (
        locations_df
        .set_index("pretty_station")
        .drop(columns=["nasty_station", "SB_order", "NB_order"])
    )

    # Left join on the station name carried by each ridership row.
    return ridership_df.join(station_lookup, on="Station_Name")

In [98]:
# Attach the station location columns to each direction / granularity result.
sb_tod_data = add_location_data(sb_tod_grouped, locs)
nb_tod_data = add_location_data(nb_tod_grouped, locs)
# The southbound hourly table must be built from the southbound frame. It was
# previously built from nb_hourly_grouped, which made the southbound hourly
# export a copy of the northbound one.
sb_hourly_data = add_location_data(sb_hourly_grouped, locs)
nb_hourly_data = add_location_data(nb_hourly_grouped, locs)

In [99]:
# Write each result table to its CSV, without the row index, in the order listed.
csv_outputs = {
    "../data/Ridership/sb_ridership_bytod_bystation.csv": sb_tod_data,
    "../data/Ridership/nb_ridership_bytod_bystation.csv": nb_tod_data,
    "../data/Ridership/sb_ridership_byhour_bystation.csv": sb_hourly_data,
    "../data/Ridership/nb_ridership_byhour_bystation.csv": nb_hourly_data,
}
for csv_path, table in csv_outputs.items():
    table.to_csv(csv_path, index=False)