In [8]:
import numpy as np
import pandas as pd
from scipy import stats as st

In [9]:
# Load the cleaned stop-level timetable: one row per (train, station) stop with
# columns km, station, train_id, time, hr, min and scenario.
# NOTE(review): `time` looks like minutes after midnight (hr * 60 + min) -- confirm.
df = pd.read_csv('../data/parsed_data/schedules/all_schedules_cleaned.csv')

In [17]:
df.head()

Unnamed: 0,km,station,train_id,time,hr,min,scenario
0,78,SAN JOSE,IC (HSR) 701_single_1,348,5,48,Baseline SB
1,126,GILROY (HSR) $,IC (HSR) 701_single_1,373,6,13,Baseline SB
2,0,SALESFORCE TRANSIT CENTER (STC) $,HSR 801_single_2,310,5,10,Baseline SB
3,24,MILLBRAE,HSR 801_single_2,330,5,30,Baseline SB
4,78,SAN JOSE,HSR 801_single_2,365,6,5,Baseline SB


In [18]:
def stops_to_durations(df):
    """
    Expand a stop-level timetable into one row per journey between two stops.

    Every stop of a train is paired with every later stop of the same train
    (same `scenario` and `train_id`), giving all origin/destination journeys
    that the train serves.

    Args:
        df (DataFrame): One row per stop, with columns `km`, `station`,
            `train_id`, `time`, `hr`, `min` and `scenario`. The input frame
            is not modified.
    Returns:
        (DataFrame): One row per (departure stop, later arrival stop) pair.
        The stop columns appear twice, prefixed `departure_` and `arrival_`,
        followed by `trip_duration` (arrival_time - departure_time) and
        `trip_length` (arrival_km - departure_km). The index is reset.
    """
    join_keys = ['scenario', 'train_id']

    # Two views of the same stops: one in the role of origin, one as destination.
    # `rename` already returns a new frame, so no explicit copy is needed.
    departures = df.rename(columns={"station": "departure_station",
                                    "time": "departure_time",
                                    "hr": "departure_hour",
                                    "min": "departure_minute",
                                    "km": "departure_km"})

    arrivals = df.rename(columns={"station": "arrival_station",
                                  "time": "arrival_time",
                                  "hr": "arrival_hour",
                                  "min": "arrival_minute",
                                  "km": "arrival_km"})

    # Join the two on train ID and scenario: every stop paired with every stop
    pairs = departures.merge(arrivals, on=join_keys)

    # Throw out any journeys that do not go forwards in time
    pairs = pairs[pairs['arrival_time'] > pairs['departure_time']]

    # Derive the new columns with `assign`, which builds a fresh frame rather
    # than writing into the filtered slice (avoids SettingWithCopyWarning).
    # NOTE(review): trip_length is signed; if km decreases along a direction
    # of travel it will be negative -- confirm whether abs() is wanted.
    pairs = pairs.assign(
        trip_duration=pairs['arrival_time'] - pairs['departure_time'],
        trip_length=pairs['arrival_km'] - pairs['departure_km'],
    )

    return pairs.reset_index(drop=True)

# Expand the stop-level timetable into one row per (departure stop, later
# arrival stop) journey for each train.
df_pairs = stops_to_durations(df)

In [20]:
df_pairs.head()

Unnamed: 0,departure_km,departure_station,train_id,departure_time,departure_hour,departure_minute,scenario,arrival_km,arrival_station,arrival_time,arrival_hour,arrival_minute,trip_duration,trip_length
0,78,SAN JOSE,IC (HSR) 701_single_1,348,5,48,Baseline SB,126,GILROY (HSR) $,373,6,13,25,48
1,0,SALESFORCE TRANSIT CENTER (STC) $,HSR 801_single_2,310,5,10,Baseline SB,24,MILLBRAE,330,5,30,20,24
2,0,SALESFORCE TRANSIT CENTER (STC) $,HSR 801_single_2,310,5,10,Baseline SB,78,SAN JOSE,365,6,5,55,78
3,0,SALESFORCE TRANSIT CENTER (STC) $,HSR 801_single_2,310,5,10,Baseline SB,126,GILROY (HSR) $,391,6,31,81,126
4,24,MILLBRAE,HSR 801_single_2,330,5,30,Baseline SB,78,SAN JOSE,365,6,5,35,54


In [26]:
# Summarise in-vehicle trip duration per scenario, origin/destination pair and
# departure hour. `trip_duration` is selected *before* aggregating so that only
# that column is reduced: aggregating every column first would also try to take
# the mean of string columns such as `train_id`, which raises a TypeError on
# pandas >= 2.0 and is wasted work on older versions.
df_journey_times = (
    df_pairs
    .groupby(['scenario', 'departure_station', 'arrival_station', 'departure_hour'])['trip_duration']
    .agg(['count', 'mean', 'min', 'max'])
    .reset_index()
)

In [27]:
df_journey_times.head()

Unnamed: 0,scenario,departure_station,arrival_station,departure_hour,count,mean,min,max
0,Baseline NB,22nd STREET,SALESFORCE TRANSIT CENTER (STC) $,6,3,8.666667,8,9
1,Baseline NB,22nd STREET,SALESFORCE TRANSIT CENTER (STC) $,7,4,8.5,8,9
2,Baseline NB,22nd STREET,SALESFORCE TRANSIT CENTER (STC) $,8,4,8.5,8,9
3,Baseline NB,22nd STREET,SALESFORCE TRANSIT CENTER (STC) $,9,4,8.5,8,9
4,Baseline NB,22nd STREET,SALESFORCE TRANSIT CENTER (STC) $,10,4,8.5,8,9


In [19]:
def cartesian_product( lsts ):
    """
    Returns Pandas DataFrame containing cartesian product of lists. Rows come
    out in the same order as itertools.product (first list varies slowest).

    Args:
        lsts (iterable): The list-likes to combine; each must be accepted by
            `pd.DataFrame(...)` (e.g. a list or 1-D numpy array).
    Returns:
        (DataFrame): One row per combination. Columns are labelled by
        position (0, 1, 2, ...), one per input list. An empty `lsts` gives an
        empty DataFrame.
    """

    join_key = "key"
    product = None
    next_label = 0

    for lst in lsts:
        subtable = pd.DataFrame(lst)

        # Give every input its own positional column label. Without this each
        # subtable's column is called 0 and the merge has to disambiguate with
        # "_x"/"_y" suffixes, which collide (MergeError on pandas >= 2.0) as
        # soon as a fourth list is merged in.
        width = subtable.shape[1]
        subtable.columns = range(next_label, next_label + width)
        next_label += width

        # Constant join key: merging on it pairs every row with every row
        subtable[join_key] = 1

        if product is None:
            product = subtable
        else:
            product = product.merge(subtable, on=join_key)

    if product is None:
        # Nothing to combine
        return pd.DataFrame()

    # the 'key' column was just a trick to get a set product; it's no longer needed
    return product.drop(columns=join_key)

In [None]:
def _fit_gamma(samples):
    """Fit a gamma distribution with location fixed at 0 to the positive samples.

    Returns (shape, scale), or (nan, nan) when there are fewer than two
    distinct positive samples or the fit fails.
    """
    positive = samples[samples > 0]

    # With a single distinct value the sample has zero spread and the gamma
    # maximum-likelihood estimate is undefined.
    if np.unique(positive).size < 2:
        return np.nan, np.nan

    try:
        shape, _loc, scale = st.gamma.fit(positive, floc=0)
    except (ValueError, RuntimeError) as err:
        # Best effort: report the offending group and carry on without it.
        print(err)
        print(samples)
        return np.nan, np.nan

    return shape, scale


def durations_to_distributions(df, verbose=True):
    """
    Finds parameter estimates for the distribution of total journey times
    (wait for the next departure + in-vehicle time) for every
    (scenario, departure_hour, departure_station, arrival_station) present in
    the input dataframe.

    For each origin/destination pair, every minute of every hour in which any
    departure occurs is treated as a possible moment of arrival at the origin
    station. Each such minute is matched with the next departure at or after
    it, and the total journey time is the arrival time of that train minus
    the minute. Within each hour a gamma distribution (location fixed at 0)
    is fitted to those per-minute totals.

    Args:
        df (DataFrame): DataFrame in format returned by `stops_to_durations`.
            Uses the columns `scenario`, `departure_station`,
            `arrival_station`, `departure_time` and `arrival_time`; the two
            time columns are taken to be integer minutes after midnight.
        verbose (bool): Print progress messages.
    Returns:
        (DataFrame): Columns `scenario`, `departure_hour`,
        `departure_station`, `arrival_station`, `shape`, `scale` and `mean`
        (shape * scale, in minutes). Groups whose fit is undefined or failed
        are omitted.
    """

    od_keys = ['scenario', 'departure_station', 'arrival_station']
    group_keys = ['scenario', 'departure_hour', 'departure_station', 'arrival_station']
    out_cols = group_keys + ['shape', 'scale', 'mean']

    # Observed journeys, sorted by departure. When several trains leave the
    # same origin for the same destination in the same minute, keep the one
    # that arrives first so the match below is deterministic.
    journeys = (
        df[od_keys + ['departure_time', 'arrival_time']]
        .dropna()
        .astype({'departure_time': 'int64', 'arrival_time': 'int64'})
        .sort_values(['departure_time', 'arrival_time'])
        .drop_duplicates(subset=od_keys + ['departure_time'])
    )

    if journeys.empty:
        return pd.DataFrame(columns=out_cols)

    # Build one row per (origin/destination pair, minute) for every minute of
    # every hour in which some train departs. Only pairs that actually occur
    # are expanded: a pair with no journeys could never be matched with a
    # departure and would be dropped again below.
    if verbose: print( "finding all time slices..." )
    hours = np.sort((journeys['departure_time'] // 60).unique())
    minute_grid = (hours[:, None] * 60 + np.arange(60)).ravel().astype('int64')
    pairs = journeys[od_keys].drop_duplicates().reset_index(drop=True)
    slices = pairs.loc[pairs.index.repeat(minute_grid.size)].reset_index(drop=True)
    slices['slice_minute'] = np.tile(minute_grid, len(pairs))

    # Attach to each minute the next departure at or after it for the same
    # scenario and origin/destination pair. merge_asof needs both sides sorted
    # on the time key.
    if verbose: print( "matching time slices with next journey..." )
    matched = pd.merge_asof(
        slices.sort_values('slice_minute', kind='mergesort'),
        journeys,
        left_on='slice_minute',
        right_on='departure_time',
        by=od_keys,
        direction='forward',
    )

    # Minutes after the last departure of a pair have no next train; drop them.
    # Within each origin-destination pair the total journey time is a sawtooth
    # in time: it falls by one minute per minute until a train departs, then
    # jumps up to the wait for the following train.
    matched = matched.dropna(subset=['arrival_time']).assign(
        total_journey_time=lambda d: d['arrival_time'] - d['slice_minute'],
        departure_hour=lambda d: d['slice_minute'] // 60,
    )

    grouped = matched.groupby(group_keys)['total_journey_time']
    n_groups = grouped.ngroups

    if verbose:
        print( f"Fitting distribution to {n_groups} groups" )

    # Calculate shape and scale parameters for every group
    records = []
    for i, (key, times) in enumerate(grouped, start=1):
        if verbose and i % 1000 == 0:
            print( "%s/%s"%(i, n_groups) )
        shape, scale = _fit_gamma(times.to_numpy(dtype=float))
        records.append(tuple(key) + (shape, scale))

    result = pd.DataFrame.from_records(records, columns=group_keys + ['shape', 'scale'])

    # Drop groups without a usable fit
    result = result.dropna(subset=['shape', 'scale'])

    # The mean of a gamma distribution is shape * scale
    result = result.assign(mean=result['shape'] * result['scale'])

    return result[out_cols].reset_index(drop=True)


In [None]:
'''
Hour
origin_id
origin_name
dest_id
scenario
Min Wait at Origin Station
Average Wait at Origin Station
Max Wait at Origin Station
Min Length of Journey
Average Length of Journey
Max Length of Journey
CO2 Emissions
Ridership Number
'''