In [37]:
import pandas as pd

In [38]:
riders = pd.read_csv("../data/Ridership/ridership.csv")

#don't care about origin and destination of same station, no one does this, ridership is always 0
riders = riders[riders["Origin_ID"] != riders['Dest_ID']]

#make ridership column a number not a string
riders['Ridership_Number'] = riders['Ridership_Number'].str.replace(',', '')
riders = riders.astype({'Ridership_Number': 'int64'})

In [39]:
#this merge will add a station to every row (and repeat that row for each station)
df = riders.copy()
df2 = riders[['TOD', 'Scenario', 'Origin_Name', 'Origin_ID']]
df2 = df2.rename(columns={"Origin_Name":"Station_Name", "Origin_ID": "Station_ID"})

df = df.merge(df2, on=['Scenario', 'TOD'], suffixes=('_station', '_train'))

In [40]:
df.columns

Index(['TOD', 'Origin_ID', 'Origin_Name', 'Dest_ID', 'Dest_Name', 'Scenario',
       'Ridership_Number', 'Station_Name', 'Station_ID'],
      dtype='object')

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 64948224 entries, 0 to 64948223
Data columns (total 9 columns):
TOD                 object
Origin_ID           int64
Origin_Name         object
Dest_ID             int64
Dest_Name           object
Scenario            object
Ridership_Number    int64
Station_Name        object
Station_ID          int64
dtypes: int64(4), object(5)
memory usage: 4.8+ GB


In [42]:
#separate northbound and southbound trips
sb = df[df['Origin_ID'] < df['Dest_ID']]
nb = df[df['Origin_ID'] > df['Dest_ID']]

In [43]:
#now we only care about the stations that are equal to or between the origin and destination - 
#those riders are people who are on the train at that station

sb = sb[(sb['Station_ID'] >= sb['Origin_ID']) & (sb['Station_ID'] <= sb['Dest_ID'])]
nb = nb[(nb['Station_ID'] <= nb['Origin_ID']) & (nb['Station_ID'] >= nb['Dest_ID'])]

In [48]:
#group by station and sum up how many riders on the train at that station
sb_grouped = sb.groupby(['TOD', 'Scenario', 'Station_Name'])['Ridership_Number'].sum().reset_index()
nb_grouped = nb.groupby(['TOD', 'Scenario', 'Station_Name'])['Ridership_Number'].sum().reset_index()

In [56]:
locs = pd.read_csv("../data/station_lat_lon_key.csv")

In [57]:
def add_location_data(ridership_df, locations_df):
    
    #drop unneeded data from locations, reset index for join
    locs = locations_df.drop(columns=["nasty_station", "SB_order", "NB_order"]).set_index("pretty_station")
    
    #join location and ridership on station name (station of ridership origin)
    data = df.join(locs, on="Origin_Name")
    
#     #join location and ridership on station name again, this time for ridership destination
#     #so there are now two sets of location data in each row, for both origin and destination
#     data = data.join(locs, on="Dest_Name", rsuffix="_dest")
    
#     #rename some columns for clarity
#     data = data.rename(columns={"lat":"lat_origin", 
#                              "lon":"lon_origin", 
#                              "SB_order":"SB_order_origin", 
#                              "NB_order":"NB_order_origin"
#                                })
    
    return data

In [60]:
sb_grouped

Unnamed: 0,TOD,Scenario,Station_Name,Ridership_Number
0,am,Ridership_2017_observed,22nd St,160611
1,am,Ridership_2017_observed,4th and King,117800
2,am,Ridership_2017_observed,Atherton,235569
3,am,Ridership_2017_observed,Bayshore,164517
4,am,Ridership_2017_observed,Belmont,254851
5,am,Ridership_2017_observed,Blossom Hill,0
6,am,Ridership_2017_observed,Broadway,214892
7,am,Ridership_2017_observed,Burlingame,227819
8,am,Ridership_2017_observed,California Ave,123659
9,am,Ridership_2017_observed,Capitol,0


In [58]:
sb_data = add_location_data(sb_grouped, locs)

In [59]:
sb_data.head()

Unnamed: 0,TOD,Origin_ID,Origin_Name,Dest_ID,Dest_Name,Scenario,Ridership_Number,Station_Name,Station_ID,lat,lon
0,ea,2,4th and King,1,Transbay,Ridership_2017_observed,0,4th and King,2,37.776738,-122.394947
1,ea,2,4th and King,1,Transbay,Ridership_2017_observed,0,22nd St,4,37.776738,-122.394947
2,ea,2,4th and King,1,Transbay,Ridership_2017_observed,0,Bayshore,7,37.776738,-122.394947
3,ea,2,4th and King,1,Transbay,Ridership_2017_observed,0,South San Francisco,8,37.776738,-122.394947
4,ea,2,4th and King,1,Transbay,Ridership_2017_observed,0,San Bruno,9,37.776738,-122.394947


In [None]:
nb_data = add_location_data(nb_grouped, locs)

In [55]:
#Still need to normalize ridership by number of trains going through that station - 
#to get an average number of riders per train. Need to align scenario names in schedule and ridership data

In [50]:
trains = pd.read_csv("../data/parsed_data/schedules/all_schedules_metrics_final.csv")

In [52]:
trains['scenario'].unique()

array(['Baseline NB', 'HIgh NB', 'High SB', 'Moderate NB', 'Baseline SB',
       'Moderate SB'], dtype=object)

In [53]:
riders['Scenario'].unique()

array(['Ridership_2017_observed', 'Ridership_2022_diesel',
       'Ridership_2022_PCEP', 'Ridership_2029_PCEP_noTransbay',
       'Ridership_2029_PCEP', 'Ridership_2029_HSR1',
       'Ridership_2033_HSR1', 'Ridership_2033_HSR2',
       'Ridership_2040_HSR2', 'Ridership_2040_moderate',
       'Ridership_2040_high'], dtype=object)