In [559]:
import numpy as np
import pandas as pd

In [564]:
#This adds dummy train names
def fill_train_ids(df):
    """Return a copy of ``df`` with a dummy suffix appended to every train ID.

    The train IDs are read from the second row (position 1). The first two
    columns are left untouched; every later column is labelled as follows:

    * a column containing a '↳' cell ('to future column' arrow) gets
      ``_pair_<n>``;
    * a column containing a '↴' cell ('from past column' arrow) gets
      ``_pair_<n>`` with the same ``n``, after which ``n`` is advanced;
    * any other column is a single train and gets ``_single_<m>``, after
      which ``m`` is advanced.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw timetable with the train IDs in its second row.

    Returns
    -------
    pandas.DataFrame
        A modified copy; the input frame is not changed.
    """
    df = df.copy()
    pair_no = 0    # counter for paired trains
    single_no = 0  # counter for single trains

    # Rows and columns are addressed by position throughout, so the result
    # does not depend on the column labels being exactly 0..n-1.
    for pos in range(2, df.shape[1]):
        # astype(str) keeps the arrow test working on columns that hold
        # non-string cells (the .str accessor rejects non-object columns).
        cells = df.iloc[:, pos].astype(str).str.strip()
        train_id = str(df.iloc[1, pos])

        if (cells == '↳').any():
            # First half of a pair: the counter is advanced by its partner.
            df.iloc[1, pos] = train_id + '_pair_' + str(pair_no)
        elif (cells == '↴').any():
            # Second half of a pair: reuse the number, then move on.
            df.iloc[1, pos] = train_id + '_pair_' + str(pair_no)
            pair_no += 1
        else:
            df.iloc[1, pos] = train_id + '_single_' + str(single_no)
            single_no += 1
    return df

In [565]:
#This sets the top two rows as column names
def set_column_names(df):
    """Turn the two leading rows of ``df`` into its column names.

    Each column is renamed to "<row 0 value> <row 1 value>", except the
    first two columns, which are named 'km' and 'station'. The two leading
    rows are then removed from the returned frame.

    The input is not modified. The result is not re-indexed after the final
    drop, so for an input with a default 0-based index the returned index
    labels start at 2.
    """
    table = df.copy()

    # Build the combined header and insert it as a row labelled -1; with a
    # default 0-based index, sorting puts that row first.
    combined = table.iloc[0].astype(str) + ' ' + table.iloc[1].astype(str)
    table.loc[-1] = combined
    table = table.sort_index()

    # The first two columns are variables, not trains.
    table.iloc[0, 0] = 'km'
    table.iloc[0, 1] = 'station'

    # Use the header row as the column names, then discard that row.
    header = table.iloc[0]
    header_label = table.index[0]
    table = table.rename(columns=header)
    table = table.drop(header_label).reset_index(drop=True)

    # Remove the two original leading rows.
    return table.drop(table.index[0:2])

In [566]:
#This converts the time column to minutes since midnight
def convert_times(df):
    """Convert the 'time' column of ``df`` to minutes since midnight.

    The 'time' strings are stripped of the markers '|', '↳', '↴' and 'o'
    and of surrounding whitespace, then split on ':' into hour and minute.
    Hours 0 and 1 are treated as after midnight and become 24 and 25.
    Rows whose time cannot be computed (blank after cleaning, i.e. the
    train does not stop) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string column named 'time'.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0-based index, where 'time' holds integer
        minutes since midnight and the added 'hr' and 'min' columns hold
        the integer hour and minute. The input frame is not changed.

    Raises
    ------
    ValueError
        If a non-blank hour or minute part is not numeric.
    """
    df = df.copy()

    # Strip the timetable markers literally; regex=False keeps '|' from
    # being interpreted as a regular-expression alternation.
    cleaned = df['time']
    for marker in ('|', '↳', '↴', 'o'):
        cleaned = cleaned.str.replace(marker, '', regex=False)
    parts = cleaned.str.strip().str.split(':')

    # .str.get yields NaN where a part is missing (e.g. a blank cell).
    hr = parts.str.get(0)
    minute = parts.str.get(1)

    # Blank hours become NaN so the column can be converted to float.
    hr = hr.where(hr != '').astype(float)
    minute = minute.astype(float)

    # Tweak times after midnight: hours 0 and 1 continue the service day.
    # Comparing numerically also covers zero-padded hours such as '00'.
    after_midnight = hr.isin([0, 1])
    hr = hr.where(~after_midnight, hr + 24)

    df['hr'] = hr
    df['min'] = minute
    df['time'] = hr * 60 + minute

    # Remove rows where the time is NaN (train does not stop).
    df = df[df['time'].notna()].reset_index(drop=True)

    for col in ('time', 'hr', 'min'):
        df[col] = df[col].astype(int)

    return df

In [567]:
#Load all schedules and process
#Each sheet of the workbook is parsed, cleaned, unpivoted to one row per
#(station, train) stop and tagged with the sheet name as its scenario.
li = []

#Open the workbook in a with-block so the file handle is closed afterwards
with pd.ExcelFile('../data/1 - Weekday Schedule.xlsx') as xls:
    for sheet_name in xls.sheet_names:
        print('Loading {}...'.format(sheet_name))

        #Skip the first seven rows of the sheet; no header row is assumed
        df = xls.parse(sheet_name, header=None, skiprows=list(range(7)))

        #Remove useless rows (positions 2-5) and renumber the rest from 0
        df = df.drop(df.index[2:6]).reset_index(drop=True)

        df = fill_train_ids(df)

        df = set_column_names(df)

        #San Jose is double-entered in some timetables; drop one of them by dropping rows where km is NaN
        df = df[df['km'].notna()]

        #Unpivot table
        df = df.melt(id_vars=['km', 'station'], var_name='train_id', value_name='time')

        df = convert_times(df)

        #Set scenario name (taken verbatim from the sheet name)
        df['scenario'] = sheet_name

        li.append(df)

df_final = pd.concat(li, axis=0, ignore_index=True)

Loading Baseline SB...
Loading Baseline NB...
Loading Moderate SB...
Loading Moderate NB...
Loading High SB...
Loading HIgh NB...


In [568]:
#Sanity check: count the non-null values in each column, per scenario
df_final.groupby('scenario').count()

Unnamed: 0_level_0,km,station,train_id,time,hr,min
scenario,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Baseline NB,1739,1739,1739,1739,1739,1739
Baseline SB,1653,1653,1653,1653,1653,1653
HIgh NB,3432,3432,3432,3432,3432,3432
High SB,3432,3432,3432,3432,3432,3432
Moderate NB,2836,2836,2836,2836,2836,2836
Moderate SB,2836,2836,2836,2836,2836,2836


In [569]:
#Save the combined schedules to CSV; index=False leaves out the row index
df_final.to_csv('../data/parsed_data/schedules/all_schedules_cleaned.csv', index=False)