In [135]:
import pandas as pd
import datetime as dt
import dask.dataframe as dd
import dateparser
import swifter
from fastparquet import ParquetFile
from datetime import timedelta
import time
import datetime
from multiprocessing import Pool, cpu_count
import numpy as np

In [136]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Read raw APC and append

The raw APC data files for 2019-2020 are saved in Teams under general > datasets > wego-occupancy > APCData > Raw APC 2019-2020 data. Change the classpaths when reading in files throughout this notebook.

In [137]:
#read raw data files as string. 
jan_mar = pd.read_csv('rawdata/Jan-Mar2019.txt', dtype = str)
apr_jun = pd.read_csv('rawdata/apr-jun 2019.txt', dtype = str)
jul_sept = pd.read_csv('rawdata/jul-sept 2019.txt', dtype = str)
oct_dec = pd.read_csv('rawdata/oct-dec 2019.txt', dtype = str)

In [138]:
#append data
apc_df = jan_mar.append(apr_jun)
apc_df = apc_df.append(jul_sept)
apc_df = apc_df.append(oct_dec)
apc_df = apc_df.reset_index(drop=True)

In [139]:
#rename columns and filter for ride_check_mode = 2
apc_df.columns = ['initial_load', 'apc_stop_id', 'stop_number', 'e_time', 'actual_arrival_time', 'actual_depart_time',
                 'scheduled_arrival_time', 'scheduled_departure_time', 'sequence', 'board_count', 'alight_count',
                 'ride_check_type', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern',
                 'pattern_id', 'apc_trip_id', 'apc_lat', 'apc_lon', 'stop_abbr', 'apc_stop_name', 'ride_check_mode']
apc_df = apc_df.loc[apc_df['ride_check_mode'] == '2'] 

In [140]:
apc_df.shape[0]

3205142

# Clean APC data
step 1: cast strings to appropriate types
step 2: delete ride check type column
step 3: convert dates to datetime
step 4: convert times from seconds past midnight to HH/MM/SS
**for 2019 only: remove data from dates 10/17/19 and 10/18/19 (these represent faulty records in the data)

These processing steps have been run on the raw 2019 and 2020 APC data and both files are saved in Teams under general > datasets > wego-occupancy > APCData > Cleaned APC 2019-2020 data. The 2020 cleaned data file is called apc_cleaned_jan_through_oct15_2020parquet.gz and the 2019 cleaned data file is called apc_cleaned_jan_through_dec_2019parquet.gz

In [141]:
#convert initial load, ride check mode, sequence, line abbr to int 18
apc_df['initial_load'] = apc_df['initial_load'].astype('int8')
apc_df['ride_check_mode'] = apc_df['ride_check_mode'].astype('int8')
apc_df['sequence'] = apc_df['sequence'].astype('int8')
apc_df['line'] = apc_df['line'].astype('int8')

In [142]:
#convert stop id, stop number to int 16
apc_df['apc_stop_id']=apc_df['apc_stop_id'].astype('int16')
apc_df['stop_number']=apc_df['stop_number'].astype('int16')

In [143]:
#convert board count, alight count to int
apc_df['board_count']=apc_df['board_count'].astype('int64')
apc_df['alight_count']=apc_df['alight_count'].astype('int64')

In [146]:
#convert  apc_trip_id, lat, long, pattern id, bus number, service id to int 
apc_df['apc_trip_id']=apc_df['apc_trip_id'].astype(int)
apc_df['apc_lat']=apc_df['apc_lat'].astype(int)
apc_df['apc_lon']=apc_df['apc_lon'].astype(int)
apc_df['pattern_id']=apc_df['pattern_id'].astype(int)
apc_df['bus_number']=apc_df['bus_number'].astype(int)
apc_df['service_id']=apc_df['service_id'].astype(int)

In [148]:
#convert stop abbr, stop name, block name to string
apc_df['stop_abbr'] = apc_df['stop_abbr'].astype(str)
apc_df['apc_stop_name'] = apc_df['apc_stop_name'].astype(str)
apc_df['block_name'] = apc_df['block_name'].astype(str)

In [149]:
#delete ride_check_type column
apc_df = apc_df.drop('ride_check_type', 1)

In [152]:
#FOR 2019 ONLY: drop dates 10/17/19 and 10/18/19
apc_df = apc_df[(apc_df.ride_check_date != '20191017') & (apc_df.ride_check_date != '20191018')]

In [157]:
apc_df.head()

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,2,661,2,35160,35806,35806,-1,-1,11,0,2,10,1505,135,7,20190101,3,12381,178755,36153524,-86800747,CHU20AWN,CHURCH ST & 20TH AVE N WB,2
1,0,411,14,31233,53131,-1,-1,-1,6,1,0,15,1505,135,7,20190101,12,12397,178858,36059458,-86641035,BELHHINN,BELL RD & HICKORY HIGHLANDS DR NB,2
2,0,2598,51,32701,61148,-1,-1,-1,43,1,0,15,1505,135,7,20190101,12,12397,178858,36140803,-86738128,MURTRAWN,MURFREESBORO PIKE & TRANSIT AVE WB,2
3,0,5277,61,22797,28519,-1,-1,-1,8,1,0,17,1700,716,7,20190101,7,12399,178877,36153661,-86784135,11ALAUSF,11TH AVE & LAUREL ST SB,2
4,0,4663,62,22826,30399,-1,-1,-1,9,1,0,17,1700,716,7,20190101,7,12399,178877,36152044,-86784228,11A12ASN,11TH AVE & 12TH AVE NB,2


In [158]:
#convert dates to datetime 
apc_df['ride_check_date'] = apc_df['ride_check_date'].apply(lambda x: datetime.datetime.strptime(x,'%Y%m%d'))

In [163]:
def fix_times (time) :
    """
    :param time: time in seconds past midnight 
    :return: time converted from seconds past midnight to HH:MM:SS
    """
    time = int(time)
    if time == -1 :
        return None 
    else: 
        fixed_time = dt.timedelta(seconds = time) 
        return fixed_time

In [167]:
#convert all times from seconds past midnight to HH:MM:SS
for l in ['e_time','actual_arrival_time','actual_depart_time','scheduled_arrival_time','scheduled_departure_time']:
    print(l)
    apc_df[l]= apc_df.swifter.set_npartitions(20).apply(lambda x: fix_times(x[l]),axis=1)

e_time
actual_arrival_time
actual_depart_time
scheduled_arrival_time
scheduled_departure_time


In [171]:
apc_df.shape[0]

3204725

In [172]:
apc_df.to_parquet('data/apc_cleaned_jan_through_dec_2019.parquet', engine='fastparquet', compression='gzip')

# Create the joined trips stops file (run this once).
This is saved as 'trips_gtfs_surrogate.parquet.gzip' in Teams under general > datasets > wego-occupancy > APCData > Trips and Stops sequences

In [None]:
trips=pd.read_excel('data/TripsFile.xlsx',sheet_name="Sheet1")

In [None]:
trips.columns = ['route_id', 'version', 'trip_start_time', 'pattern_id','trip_id', 'direction']

In [None]:
trips['trip_start_time'] = pd.to_datetime(trips["trip_start_time"], unit='s').dt.time

In [None]:
stops=pd.read_excel('StopsFile.xlsx',sheet_name="update-pattern-id")

In [None]:
stops.columns = ['pattern_id', 'stop_id', 'lat', 'lon','stop_seq', 'direction','version','activation_date','deactivation_date','route_id']

In [None]:
trip_stop_sequence = pd.merge(stops, trips,  how='left', on=['version','route_id','direction','pattern_id'])

In [None]:
#check no duplicates
dups = trip_stop_sequence.loc[trip_stop_sequence.duplicated(subset = ['version','stop_id','trip_id','route_id' ,'pattern_id','stop_seq','activation_date'], keep=False)]
dups.shape[0]

In [None]:
trip_stop_sequence.to_parquet('data/trips_gtfs_surrogate.parquet.gzip',engine='pyarrow',compression='gzip')

# Create version ID column of APC data.

This is necessary to be able to merge the APC with the trip-stops information. The APC with version for 2019 and 2020 are saved in Teams under general > datasets > wego-occupancy > APCData > Cleaned APC 2019-2020 data. The 2019 data set with version is called apc_df_2019_jan_dec_with_version.parquet.gz and the 2020 data set with version is called apc_df_2020_jan_oct_with_version.parquet.gz. 

In [8]:
#read joined trip-stops information file AND APC file
trip_stop_sequence = pd.read_parquet('cleandata/trips_gtfs_surrogate.parquet.gzip',engine='pyarrow')
#replace with 2020 apc
apc_df=pd.read_parquet('cleandata/apc_cleaned_jan_through_dec_2019.parquet', engine='fastparquet')

In [9]:
trip_date=trip_stop_sequence[['apc_trip_id', 'version', 'activation_date']].drop_duplicates(['apc_trip_id','version', 'activation_date'],keep='first')
trip_date=trip_date[['apc_trip_id', 'version', 'activation_date']]

In [10]:
apc_df_trip_dates=apc_df[['apc_trip_id','ride_check_date']]
apc_df_trip_dates=apc_df_trip_dates.drop_duplicates(['apc_trip_id','ride_check_date'],keep='first',ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
apc_df_trip_dates=apc_df_trip_dates.reset_index()
trip_date=trip_date.reset_index()

In [12]:
def find_version_id(apc_trip_id, ride_check_date) :
    temp = trip_date.loc[(trip_date['apc_trip_id'] == apc_trip_id)]
    if (temp is None or temp.size==0):
        return 0
    temp['ride_check_date'] = ride_check_date
    temp['days'] = (temp['ride_check_date'] - temp['activation_date']).dt.days
    df3 = temp.loc[temp['days'] >= 0]
    if (df3 is None or df3.size==0):
        return 0
    df3=df3.reset_index()
    df3 = df3.sort_values(['days'], ascending = True)    
    return df3['version'].iloc[0]

In [13]:
apc_df_trip_dates['version']= apc_df_trip_dates.swifter.set_npartitions(20).apply(lambda x: find_version_id(x['apc_trip_id'], x['ride_check_date'] ),axis=1)

HBox(children=(FloatProgress(value=0.0, description='Dask Apply', max=20.0, style=ProgressStyle(description_wi…




In [14]:
apc_df_trip_dates.head()

Unnamed: 0,index,apc_trip_id,ride_check_date,version
0,0,193712,2020-01-01,58
1,1,193713,2020-01-01,58
2,2,193714,2020-01-01,58
3,3,193716,2020-01-01,58
4,4,193718,2020-01-01,58


In [15]:
apc_df_trip_dates = apc_df_trip_dates.loc[ : , ('apc_trip_id', 'ride_check_date', 'version')]

In [16]:
apc_df = apc_df.merge(apc_df_trip_dates, on=['apc_trip_id', 'ride_check_date'], how='left')

In [17]:
#check that there are no null version Ids (!!)
apc_df_trip_dates.loc[(apc_df_trip_dates.version==0)].shape[0]

0

In [19]:
apc_df=pd.read_parquet('cleandata/apc_df_2019_jan_dec_with_version.parquet', engine='fastparquet')

In [21]:
apc_df.shape[0]

1735230

# Merge APC with version WITH full trips-stops information

The WeGo APC data contains data of non-zero boardings and alightings. We want to create a full data set that includes both non-zero and zero boardings and alightings. This is done by augmenting the APC data with the joined trips-stops information.

These merged files for 2019-2020 are saved in Teams under general > datasets > wego-occupancy > APCData > Merged APC 2019-2020 data. They are saved in csv files for every 2 months.

For 2020, only run for months Jan through August.

In [None]:
apc_df=pd.read_parquet('cleandata/apc_df_2019_jan_dec_with_version.parquet', engine='fastparquet')
trip_stop_sequence = pd.read_parquet('cleandata/trips_gtfs_surrogate.parquet.gzip',engine='pyarrow')

In [30]:
#If running month by month:
jan = apc_df[apc_df['ride_check_date'].dt.month == 1]
#feb = apc_df[apc_df['ride_check_date'].dt.month == 2]
#mar = apc_df[apc_df['ride_check_date'].dt.month == 3]
#apr = apc_df[apc_df['ride_check_date'].dt.month == 4]
#may = apc_df[apc_df['ride_check_date'].dt.month == 5]
#jun = apc_df[apc_df['ride_check_date'].dt.month == 6]
#jul = apc_df[apc_df['ride_check_date'].dt.month == 7]
#aug = apc_df[apc_df['ride_check_date'].dt.month == 8]
#sep = apc_df[apc_df['ride_check_date'].dt.month == 9]
#octo = apc_df[apc_df['ride_check_date'].dt.month == 10]
#nov = apc_df[apc_df['ride_check_date'].dt.month == 11]
#dec = apc_df[apc_df['ride_check_date'].dt.month == 12]

In [None]:
jan = jan.sort_values(['ride_check_date', 'apc_trip_id', 'sequence'], ascending= True)
jan['ride_check_date'] = jan['ride_check_date'].astype(str)

In [None]:
#Change the renaming of the trips_stop_sequence file. 
trip_stop_sequence.columns = ['pattern_id', 'stop_abbr', 'lat', 'lon', 'sequence', 'direction',
                 'version', 'activation_date', 'deactivation_date', 'line', 'trip_start_time',
                 'apc_trip_id']

In [None]:
#fix lat and lon values
trip_stop_sequence['lat']=trip_stop_sequence.lat/1e7
trip_stop_sequence['lon']=trip_stop_sequence.lon/1e7

In [None]:
tripsdata=trip_stop_sequence[['apc_trip_id','trip_start_time','pattern_id','version','line','stop_abbr','lat','lon','sequence','direction']]

In [88]:
def fill_nan_values(df):
    """
    fill nan values in the dataframe that result from the left join
    
    :param df: the pandas DataFrame after APC and trips/stops data have been combined
    """
    filled = df.copy()

    # missing board and alight counts are all 0
    filled[['board_count', 'alight_count']] = filled[['board_count', 'alight_count']].fillna(0)

    # otherwise, fill in missing information from existing apc rows
    filled[['initial_load', 'line', 'block_name', 'bus_number',
            'service_id', 'ride_check_date', 'pattern', 'apc_trip_id']] = filled[
        ['initial_load', 'line', 'block_name', 'bus_number',
         'service_id', 'ride_check_date', 'pattern', 'apc_trip_id']].fillna(method='ffill', axis=0).fillna(method='bfill',
                                                                                                         axis=0)
    return filled

In [89]:
def calc_bus_occupancy(df):
    """
    calculate occupancy at each stop along a route

    :param df: dataframe with board/alight values for all stops along a single trip
    """
    tmp = df.copy()

    tmp['initial_load'] = pd.to_numeric(tmp['initial_load'], errors='coerce')
    tmp['board_count'] = pd.to_numeric(tmp['board_count'], errors='coerce')
    tmp['alight_count'] = pd.to_numeric(tmp['alight_count'], errors='coerce')

    # calc occupancy net change
    tmp['occupancy_net_change'] = tmp['board_count'] - tmp['alight_count']

    # calc cumulative sum in occupancy net change as an intermediate step
    tmp_sum_df = pd.DataFrame(tmp['occupancy_net_change'].cumsum())
    tmp_sum_df.columns = ['tmp_sum']

    # merge tmp sum (cumulative sum) into tmp
    tmp = tmp.merge(tmp_sum_df, left_index=True, right_index=True)

    # calc occupancy for a particular stop
    tmp['occupancy'] = tmp['tmp_sum'] + tmp['initial_load']

    return tmp.drop(columns=['tmp_sum', 'occupancy_net_change'])

In [91]:
def stops_merge_for_trip (group) :
    
    """
    Merge APC group with trips/stops information, and calculate occupancy 

    :param: group of APC dataframe based on apc_trip_id, pattern_id, line, ride_check_date, version
    """
    
    apc_stops = group.reset_index()
    trip = apc_stops.apc_trip_id[0]
    date = apc_stops.ride_check_date[0]
    version = apc_stops.version[0]
    line = apc_stops.line[0]
    
    #get expected stops from joined file
    all_stops = tripsdata[(tripsdata.apc_trip_id == trip) & (tripsdata.version == version) & (tripsdata.line == line)]
    all_stops['ride_check_date'] = date
    all_stops.sort_values(by=['sequence'])

    #left join with trips on the left, apc on the right 
    merge = pd.merge(all_stops, apc_stops, how='left', on=['apc_trip_id', 'ride_check_date', 'sequence', 'stop_abbr',
                                                           'version', 'line']).sort_values(by=['sequence']).reset_index(drop=True)

    #fill NaN values and calculate occupancy 
    filled = fill_nan_values(merge)
    occupancy_df = calc_bus_occupancy(filled)
    
    #drop apc columns 
    occupancy_df = occupancy_df.drop(columns=['apc_stop_id', 'stop_number','block_name','apc_stop_name','apc_lat','pattern_id_y','apc_lon', 'index'])

    #rename trips table pattern ID
    occupancy_df = occupancy_df.rename(columns={'pattern_id_x' : 'pattern_id'})

    return occupancy_df

In [92]:
def apply_parallel(df_grouped, stops_merge_for_trip):
    with Pool(cpu_count()) as p:
        ret_list=p.map(stops_merge_for_trip, [group for name, group in df_grouped])
    p.close()
    p.join()
    return pd.concat(ret_list)

In [93]:
def main(): 
    #change file name here
    df_grouped = jan.groupby(['apc_trip_id', 'pattern_id','line','ride_check_date', 'version'])

    start = datetime.datetime.now()
    parallel_result = apply_parallel(df_grouped, stops_merge_for_trip)
    end = datetime.datetime.now()
    print("time elapsed:", end - start)

    #rename saved file
    parallel_result.to_csv('jan19_merged.csv')

In [None]:
if __name__ == '__main__':
    main()

# Append all month data frames and reset index

In [None]:
#the steps are just show from Jul - Dec 2019.
jul = pd.read_csv('jul19_merged.csv', index_col=0)
aug = pd.read_csv('aug19_merged.csv', index_col=0)
sep = pd.read_csv('sep19_merged.csv', index_col=0)
octo = pd.read_csv('oct19_merged.csv', index_col=0)
nov = pd.read_csv('nov19_merged.csv', index_col=0)
dec = pd.read_csv('dec19_merged.csv', index_col=0)

In [None]:
jul = jul.reset_index()
jul = jul.drop(columns=['index'])

In [None]:
aug = aug.reset_index()
aug = aug.drop(columns=['index'])

In [None]:
sep = sep.reset_index()
sep = sep.drop(columns=['index'])

In [None]:
octo = octo.reset_index()
octo = octo.drop(columns=['index'])

In [None]:
nov = nov.reset_index()
nov = nov.drop(columns=['index'])

In [None]:
dec = dec.reset_index()
dec = dec.drop(columns=['index'])

In [None]:
#append data
jul_aug = jul.append(aug)
jul_aug = jul_aug.reset_index(drop=True)

sep_oct = sep.append(octo)
sep_oct = sep_oct.reset_index(drop=True)

nov_dec = nov.append(dec)
nov_dec = nov_dec.reset_index(drop=True)

In [None]:
jul_aug.to_csv('wego_jul_aug_2019_merged_apc.csv')

In [None]:
sep_oct.to_csv('wego_sep_oct_2019_merged_apc.csv')

In [None]:
nov_dec.to_csv('wego_nov_dec_2019_merged_apc.csv')