In [66]:
'''
This script takes in the riderhsip data and returns a single dataframe, then writes the data out to a csv
'''

'\nThis script takes in the riderhsip data and returns a single dataframe, then writes the data out to a csv\n'

In [57]:
import pandas as pd

In [43]:
def reshape(df, label):
    '''
    Takes in a csv parsed from Caltrain Visualization Challenge Excel files
    Returns a reshaped dataframe 
    
    Parameters:
    ----------
    input:
    df {dataframe}: a Pandas dataframe
    label {string}: a string identifying the time of day represented by the dataframe
    
    output:
    out {dataframe}: reshaped dataframe
    '''
    
    out = pd.DataFrame(df.set_index(['Origin_ID', 'Origin_Name', 'Dest_ID', 'Dest_Name']).stack()).reset_index()
    out["TOD"] = label
    out.rename(index=str, columns={'level_4':'Scenario', 0:'Ridership_Number'}, inplace=True)
    return out[['TOD', 'Origin_ID', 'Origin_Name', 'Dest_ID', 'Dest_Name', 'Scenario', 'Ridership_Number']]

In [60]:
def build_df(dataframes, labels):
    '''
    Takes lists of dataframes and labels and returns concatenated, reshaped dataframe
    
    Parameters:
    ----------
    input
    dataframes {list: dataframe}: a list of Pandas dataframe objects
    labels {list: string}: a list of labels identifying the times of day (same index as dataframes)
    
    output:
    out {dataframe}: a single reshaped and concatenated dataframe
    
    '''
    for i in range(len(dataframes)):
        if i == 0:
            out = reshape(dataframes[i], labels[i])
        else:
            out = pd.concat([out, reshape(dataframes[i], labels[i])], axis=0)
    return out

In [61]:
ea_df = pd.read_csv('../data/Ridership/EA-Table1.csv')
am_df = pd.read_csv('../data/Ridership/AM-Table1.csv')
md_df = pd.read_csv('../data/Ridership/MD-Table1.csv')
ev_df = pd.read_csv('../data/Ridership/EV-Table1.csv')
pm_df = pd.read_csv('../data/Ridership/PM-Table1.csv')
wknd_df = pd.read_csv('../data/Ridership/Wknd-Table1.csv')
dataframes = [ea_df, am_df, md_df, ev_df, pm_df, wknd_df]
labels = ['ea', 'am', 'md', 'ev', 'pm', 'wknd']

In [62]:
ridership = build_df(dataframes, labels)

In [67]:
ridership.to_csv('../data/ridership/ridership.csv', index=False)

In [63]:
ridership.info()

<class 'pandas.core.frame.DataFrame'>
Index: 67584 entries, 0 to 11263
Data columns (total 7 columns):
TOD                 67584 non-null object
Origin_ID           67584 non-null int64
Origin_Name         67584 non-null object
Dest_ID             67584 non-null int64
Dest_Name           67584 non-null object
Scenario            67584 non-null object
Ridership_Number    67584 non-null object
dtypes: int64(2), object(5)
memory usage: 4.1+ MB


In [64]:
ridership.sample(100)

Unnamed: 0,TOD,Origin_ID,Origin_Name,Dest_ID,Dest_Name,Scenario,Ridership_Number
9768,md,31,College Park,34,Capitol,Ridership_2017_observed,0
1403,am,38,Gilroy,7,Bayshore,Ridership_2033_HSR1,11
1585,wknd,21,Menlo Park,8,South San Francisco,Ridership_2022_diesel,1
10480,wknd,31,College Park,36,Morgan Hill,Ridership_2040_HSR2,0
10237,am,4,22nd St,36,Morgan Hill,Ridership_2033_HSR2,0
2903,am,11,Broadway,12,Burlingame,Ridership_2040_high,0
2634,am,20,Atherton,11,Broadway,Ridership_2029_HSR1,0
2377,wknd,31,College Park,10,Millbrae,Ridership_2022_diesel,0
9287,am,16,Belmont,33,Tamien,Ridership_2029_PCEP_noTransbay,0
10082,wknd,26,Mountain View,35,Blossom Hill,Ridership_2033_HSR1,0


In [28]:
ridership.to_csv()

Unnamed: 0,Origin_ID,Origin_Name,Dest_ID,Dest_Name,level_4,0
0,1,Transbay,1,Transbay,Ridership_2017_observed,0
1,1,Transbay,1,Transbay,Ridership_2022_diesel,0
2,1,Transbay,1,Transbay,Ridership_2022_PCEP,0
3,1,Transbay,1,Transbay,Ridership_2029_PCEP_noTransbay,0
4,1,Transbay,1,Transbay,Ridership_2029_PCEP,0
5,1,Transbay,1,Transbay,Ridership_2029_HSR1,0
6,1,Transbay,1,Transbay,Ridership_2033_HSR1,0
7,1,Transbay,1,Transbay,Ridership_2033_HSR2,0
8,1,Transbay,1,Transbay,Ridership_2040_HSR2,0
9,1,Transbay,1,Transbay,Ridership_2040_moderate,0
