In [437]:
import glob
import os

import numpy as np
import pandas as pd

In [438]:
#This adds dummy train names for any trains that are missing them
def fill_train_ids(df):
    """Return a copy of ``df`` with dummy names filled in for unnamed trains.

    Row 1 is treated as the train-name row. For every column after the first
    two whose row-1 cell is missing, a placeholder name is written:

    * ``pair_<n>`` if the column contains a '↳' cell (continues in a future
      column) or a '↴' cell (continues from a past column). The pair counter
      only advances after a '↴' column, so a '↳' column and the following
      '↴' column receive the same name.
    * ``single_<m>`` otherwise.

    Column labels are compared with ``> 1`` and also used positionally, so the
    frame is expected to carry default integer labels (as produced by
    ``pd.read_csv(..., header=None)``).
    """
    df = df.copy()
    n = 0  #Counter for paired trains
    m = 0  #Counter for single trains
    #Series.iteritems() was removed in pandas 2.0; items() exists in every version
    for i, val in df.iloc[1].items():
        #Only train columns (past the first two) that have no train name
        if i > 1 and pd.isna(val):
            #Strip once and reuse for both arrow checks
            cells = df.iloc[:, i].str.strip()
            #If we have that 'to future column' arrow...
            if (cells == '↳').any():
                df.at[1, i] = 'pair_' + str(n)
            #If we have that 'from past column' arrow...
            elif (cells == '↴').any():
                df.at[1, i] = 'pair_' + str(n)
                #The pair is complete: advance n to get a new dummy train ID
                n += 1
            #This is a single train
            else:
                df.at[1, i] = 'single_' + str(m)
                m += 1
    return df

In [439]:
#This sets the top two rows as column names
def set_column_names(df):
    """Return a copy of ``df`` whose column names are built from its top two rows.

    Each name is row 0 and row 1 joined by a single space, except for the
    first two columns, which are named 'km' and 'station'. The two header rows
    are removed from the result; the remaining rows keep their positional
    labels (so the returned index starts at 2).
    """
    table = df.copy()

    #Add the combined header as a new row labelled -1, then sort it to the top
    table.loc[-1] = table.iloc[0] + ' ' + table.iloc[1]
    table = table.sort_index()

    #The first two columns hold variables, not trains: name them explicitly
    table.iloc[0, 0] = 'km'
    table.iloc[0, 1] = 'station'

    #Promote the combined row to column names, then discard that row
    header = table.iloc[0]
    table = table.rename(columns=header)
    table = table.drop(table.index[0])
    table = table.reset_index(drop=True)

    #Remove the two original header rows
    return table.drop(table.index[0:2])

In [469]:
#This converts the time column to minutes since midnight
def convert_times(df):
    """Return a copy of ``df`` with 'time' converted to minutes since midnight.

    The 'time' column must hold strings such as '5:30'. Surrounding whitespace
    and the marker characters '|', '↳', '↴' and 'o' are removed before parsing.
    Hours 0 and 1 are treated as after midnight and counted as 24 and 25.
    Cells with no hour left after cleaning, and missing cells, become NaN.

    Raises ValueError if a remaining hour or minute part is not numeric.
    """
    df = df.copy()

    #Remove marker characters; regex=False keeps '|' a literal character
    #regardless of the pandas version's default for `regex`
    cleaned = df['time'].str.strip()
    for marker in ('|', '↳', '↴', 'o'):
        cleaned = cleaned.str.replace(marker, '', regex=False)
    cleaned = cleaned.str.strip()

    #Split into hour and minute parts. Unpacking the .str accessor directly
    #(`a, b = s.str.split(':').str`) stopped working in pandas 2.0, so the
    #parts are fetched explicitly; a missing part comes back as NaN.
    parts = cleaned.str.split(':')
    hr = parts.str.get(0)
    minute = parts.str.get(1)

    #Blank hours (cells that held only a marker) become NaN so we can convert to float
    hours = hr.mask(hr == '').astype(float)
    minutes = minute.astype(float)

    #Tweak times after midnight: 0:xx and 1:xx count as hours 24 and 25.
    #Done on numbers so no integers are written into a string column.
    hours = hours.mask(hours.isin([0, 1]), hours + 24)

    #Calculate time in minutes
    df['time'] = hours * 60 + minutes

    return df

In [472]:
#Load all schedules and process
#glob returns paths in arbitrary, OS-dependent order; sort them so the row
#order of the combined table is the same on every run
schedules = sorted(glob.glob('../data/1-WeekdaySchedule/*.csv'))

#Fail early with a clear message rather than pd.concat's "No objects to concatenate"
if not schedules:
    raise FileNotFoundError('No schedule CSVs found in ../data/1-WeekdaySchedule/')

li = []

for filename in schedules:
    print('Loading {}...'.format(filename))
    df = pd.read_csv(filename, header=None)

    df = fill_train_ids(df)

    df = set_column_names(df)

    #San Jose is double-entered in some timetables; drop one of them by dropping rows where km is NaN
    df = df[df['km'].notna()]

    #Unpivot table: one row per (station, train)
    df = df.melt(id_vars=['km', 'station'], var_name='train_id', value_name='time')

    df = convert_times(df)

    #Remove rows where time column is NaN (train does not stop)
    df = df[df['time'].notna()].reset_index(drop=True)

    #Set scenario name: the part of the file name before the first '-'
    #(e.g. 'HighNB' for 'HighNB-Table1.csv'). Taken from the base name so it
    #does not depend on directory depth or the path separator.
    df['scenario'] = os.path.basename(filename).split('-')[0]

    li.append(df)

df_final = pd.concat(li, axis=0, ignore_index=True)

Loading ../data/1-WeekdaySchedule/HighNB-Table1.csv...
Loading ../data/1-WeekdaySchedule/ModerateNB-Table1.csv...
Loading ../data/1-WeekdaySchedule/BaselineSB-Table1.csv...
Loading ../data/1-WeekdaySchedule/BaselineNB-Table1.csv...
Loading ../data/1-WeekdaySchedule/HighSB-Table1.csv...
Loading ../data/1-WeekdaySchedule/ModerateSB-Table1.csv...


In [474]:
#Write the combined, cleaned schedules to disk (without the row index).
#to_csv does not create missing folders, so make sure the target exists first.
output_path = '../data/parsed_data/schedules/all_schedules_cleaned.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_final.to_csv(output_path, index=False)