# WeGo APC Data Processing

This notebook processes and formats the raw WeGo APC data. Outputs wegoapc_dashboard.csv.

In [1]:
import pandas as pd
import os
import zipfile
import dateparser
import swifter
import datetime
import time
import numpy as np
from multiprocessing import Pool

In [2]:
file_path = os.path.join(os.getcwd(), 'data.zip')
with zipfile.ZipFile(file_path,"r") as zip_ref:
    zip_ref.extractall(os.getcwd())
    
file_path = os.path.join(os.getcwd(), 'output.zip')
with zipfile.ZipFile(file_path,"r") as zip_ref:
    zip_ref.extractall(os.getcwd())

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [46]:
# load raw data

apc_path = os.path.join(os.getcwd(), 'data', 'apc')
dfs = []
for file in os.listdir(apc_path):
    df_temp = pd.read_csv(os.path.join(apc_path, file), dtype = str)
    dfs.append(df_temp)
apc_df = pd.concat(dfs, ignore_index=True)
print(f"number of rows in raw apc data: {apc_df.shape[0]}")
apc_df.head(2)

number of rows in raw apc data: 5019806


Unnamed: 0,initialload,stopid,stopnum,etime,actualarrivetime,actualdeparttime,schedarrivetime,scheddeparttime,sequence,ridboardcount,ridalightcount,ridchecktype,lineabbr,blockname,busnum,serviceid,ridcheckdate,pattern,patternid,tripid,lat,lon,stopabbr,stopname,ridcheckmode
0,0,4418,1,51300,50620,51304,-1,-1,1,2,0,0,14,1400,708,3,20200101,2,13955,193712,36166545,-86781895,MCC4_20,CENTRAL 4TH AVE - BAY 20,2
1,0,4418,1,54900,54437,55240,-1,-1,1,7,0,0,14,1400,708,3,20200101,2,13955,193713,36166545,-86781895,MCC4_20,CENTRAL 4TH AVE - BAY 20,2


In [25]:
#rename columns and filter for ride_check_mode = 2
apc_df.columns = ['initial_load', 'apc_stop_id', 'stop_number', 'e_time', 'actual_arrival_time', 'actual_depart_time',
                 'scheduled_arrival_time', 'scheduled_departure_time', 'sequence', 'board_count', 'alight_count',
                 'ride_check_type', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern',
                 'pattern_id', 'apc_trip_id', 'apc_lat', 'apc_lon', 'stop_abbr', 'apc_stop_name', 'ride_check_mode']

print(f"number of rows in apc before dropping ride_check_mode=1: {apc_df.shape[0]}")
apc_df = apc_df.loc[apc_df['ride_check_mode'] == '2'] 
print(f"number of rows in apc after dropping ride_check_mode=1: {apc_df.shape[0]}")

print(f"number of readings before dropping nan values: {apc_df.shape[0]}")
apc_df = apc_df.dropna()
print(f"number of readings after dropping nan values: {apc_df.shape[0]}")
apc_df.head(2)

number of rows in apc before dropping ride_check_mode=1: 5019806
number of rows in apc after dropping ride_check_mode=1: 4940372
number of readings before dropping nan values: 4940372
number of readings after dropping nan values: 4939701


Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,ride_check_type,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,22680,22026,22668,-1,-1,1,4,0,0,1,100,650,1,20190401,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,23760,23504,23504,-1,-1,26,0,4,0,1,100,650,1,20190401,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [26]:
# set column types

apc_df['initial_load'] = apc_df['initial_load'].astype(int)
apc_df['ride_check_mode'] = apc_df['ride_check_mode'].astype(int)
apc_df['sequence'] = apc_df['sequence'].astype(int)
apc_df['line'] = apc_df['line'].astype(int)

apc_df['apc_stop_id']=apc_df['apc_stop_id'].astype(int)
apc_df['stop_number']=apc_df['stop_number'].astype(int)

apc_df['board_count']=apc_df['board_count'].astype(int)
apc_df['alight_count']=apc_df['alight_count'].astype(int)

apc_df['apc_trip_id']=apc_df['apc_trip_id'].astype(int)
apc_df['apc_lat']=apc_df['apc_lat'].astype(int)
apc_df['apc_lon']=apc_df['apc_lon'].astype(int)
apc_df['pattern_id']=apc_df['pattern_id'].astype(int)
apc_df['bus_number']=apc_df['bus_number'].astype(int)
apc_df['service_id']=apc_df['service_id'].astype(int)

apc_df['stop_abbr'] = apc_df['stop_abbr'].astype(str)
apc_df['apc_stop_name'] = apc_df['apc_stop_name'].astype(str)
apc_df['block_name'] = apc_df['block_name'].astype(str)

apc_df = apc_df.drop('ride_check_type', 1)

apc_df = apc_df[(apc_df.ride_check_date != '20191017') & (apc_df.ride_check_date != '20191018')]

#convert dates to datetime 
apc_df['ride_check_date'] = apc_df['ride_check_date'].apply(lambda x: datetime.datetime.strptime(x,'%Y%m%d'))


apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,22680,22026,22668,-1,-1,1,4,0,1,100,650,1,20190401,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,23760,23504,23504,-1,-1,26,0,4,1,100,650,1,20190401,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [29]:
def fix_times (time) :
    """
    :param time: time in seconds past midnight 
    :return: time converted from seconds past midnight to HH:MM:SS
    """
    time = int(time)
    if time == -1 :
        return None 
    else: 
        #fixed_time = datetime.timedelta(seconds = time) 
        #return fixed_time
        minu, sec = divmod(time, 60)
        hour, minu = divmod(minu, 60) 
        if hour == 24:
            hour = 0
        return datetime.time(hour=hour, minute=minu, second=sec)
    
#convert all times from seconds past midnight to HH:MM:SS
for l in ['e_time','actual_arrival_time','actual_depart_time','scheduled_arrival_time','scheduled_departure_time']:
    print(l)
    apc_df[l]= apc_df.swifter.set_npartitions(20).apply(lambda x: fix_times(x[l]),axis=1)

In [32]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned.parquet')
apc_df.to_parquet(file_path, compression='gzip')

# Add GTFS version 

In [3]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned.parquet')
apc_df = pd.read_parquet(file_path, engine='pyarrow')
apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [10]:
file_path = os.path.join(os.getcwd(), 'data', 'trips_gtfs_surrogate.parquet.gzip')
trip_stop_sequence = pd.read_parquet(file_path, engine='pyarrow')
trip_stop_sequence['apc_trip_id'] = trip_stop_sequence['trip_id']
trip_stop_sequence.head(2)

Unnamed: 0,pattern_id,stop_id,lat,lon,stop_seq,direction,version,activation_date,deactivation_date,route_id,trip_start_time,trip_id,apc_trip_id
0,14729,6AOAKSN,361480730,-867723030,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,15:15:00,163534,163534
1,14729,6AOAKSN,361480730,-867723030,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,16:20:00,163535,163535


In [11]:
# add trip version

def find_version_id(apc_trip_id, ride_check_date) :
    temp = trip_date.loc[(trip_date['apc_trip_id'] == apc_trip_id)]
    if (temp is None or temp.size==0):
        return 0
    temp['ride_check_date'] = ride_check_date
    temp['days'] = (temp['ride_check_date'] - temp['activation_date']).dt.days
    df3 = temp.loc[temp['days'] >= 0]
    if (df3 is None or df3.size==0):
        return 0
    #df3=df3.reset_index()
    #df3 = df3.sort_values(['days'], ascending = True)  
    #return df3['version'].iloc[0]
    ind = df3[['days']].idxmin()['days']
    result = df3.loc[ind, 'version']
    return result

trip_date=trip_stop_sequence[['apc_trip_id', 'version', 'activation_date']].drop_duplicates(['apc_trip_id','version', 'activation_date'],keep='first')
trip_date=trip_date[['apc_trip_id', 'version', 'activation_date']]

apc_df_trip_dates=apc_df[['apc_trip_id','ride_check_date']]
apc_df_trip_dates=apc_df_trip_dates.drop_duplicates(['apc_trip_id','ride_check_date'],keep='first',ignore_index=True)

apc_df_trip_dates=apc_df_trip_dates.reset_index()
trip_date=trip_date.reset_index()

print(f"starting to apply the version number at time {time.time()}")
apc_df_trip_dates['version']= apc_df_trip_dates.apply(lambda x: find_version_id(x['apc_trip_id'], x['ride_check_date'] ),axis=1)
print(f"done the version number at time {time.time()}")

apc_df_trip_dates = apc_df_trip_dates.loc[ : , ('apc_trip_id', 'ride_check_date', 'version')]

TypeError: drop_duplicates() got an unexpected keyword argument 'ignore_index'

In [None]:
#apc_df_trip_dates
#df = pd.merge_asof(apc_df_trip_dates, trip_date, left_on='ride_check_date', right_on='activation_date', by='apc_trip_id', direction='backward', allow_exact_matches=True)

In [None]:
print(f"length of apc_df before merge: {apc_df.shape[0]}")
apc_df = apc_df.merge(apc_df_trip_dates, on=['apc_trip_id', 'ride_check_date'], how='left')
print(f"length of apc_df after merge: {apc_df.shape[0]}")

apc_df_trip_dates.loc[(apc_df_trip_dates.version==0)].shape[0]

In [None]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version.parquet')
apc_df.to_parquet(file_path, compression='gzip')

# Merge APC data with GTFS

In [3]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version.parquet')
apc_df = pd.read_parquet(file_path, engine='pyarrow')
print(f"number of rows in apc_df before dropping nan {apc_df.shape[0]}")
apc_df = apc_df.dropna(subset=['e_time', 'actual_arrival_time'])
print(f"number of rows in apc_df after dropping nan {apc_df.shape[0]}")
apc_df.head(2)

number of rows in apc_df before dropping nan 4939284
number of rows in apc_df after dropping nan 4860423


Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode,version
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2,53
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2,53


In [4]:
# add time_diff
def add_time_diff(actual_arrival_time, e_time):
    actual_arrival_time_dt = datetime.datetime.combine(datetime.date.today(), actual_arrival_time)
    e_time_dt = datetime.datetime.combine(datetime.date.today(), e_time)
    diff = actual_arrival_time_dt - e_time_dt
    return abs(diff.total_seconds())

apc_df['time_diff'] = apc_df.apply(lambda row: add_time_diff(row['actual_arrival_time'], row['e_time']), axis=1)
apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode,version,time_diff
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2,53,654.0
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2,53,256.0


In [5]:
#apc_df = apc_df[(apc_df['ride_check_date']>=pd.Timestamp(year=2019, month=3, day=1, hour=0)) & (apc_df['ride_check_date']<=pd.Timestamp(year=2019, month=4, day=1, hour=0))]      
#len(apc_df)

In [6]:
file_path = os.path.join(os.getcwd(), 'data', 'trips_gtfs_surrogate.parquet.gzip')
trip_stop_sequence = pd.read_parquet(file_path, engine='pyarrow')
trip_stop_sequence['apc_trip_id'] = trip_stop_sequence['trip_id']
trip_stop_sequence['lat']=trip_stop_sequence.lat/1e7
trip_stop_sequence['lon']=trip_stop_sequence.lon/1e7
trip_stop_sequence.head(2)

Unnamed: 0,pattern_id,stop_id,lat,lon,stop_seq,direction,version,activation_date,deactivation_date,route_id,trip_start_time,trip_id,apc_trip_id
0,14729,6AOAKSN,36.148073,-86.772303,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,15:15:00,163534,163534
1,14729,6AOAKSN,36.148073,-86.772303,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,16:20:00,163535,163535


In [7]:
tripdata = trip_stop_sequence[['route_id', 'trip_start_time', 'stop_seq', 'lat', 'lon', 'direction', 'apc_trip_id', 'version', 'stop_id']]
tripdata.head(2)

Unnamed: 0,route_id,trip_start_time,stop_seq,lat,lon,direction,apc_trip_id,version,stop_id
0,1,15:15:00,8,36.148073,-86.772303,FROM DOWNTOWN,163534,51,6AOAKSN
1,1,16:20:00,8,36.148073,-86.772303,FROM DOWNTOWN,163535,51,6AOAKSN


In [8]:
df_grouped = apc_df.groupby(['apc_trip_id', 'ride_check_date', 'version']).size().reset_index().rename(columns={0:'count'})
df_grouped.head(2)

Unnamed: 0,apc_trip_id,ride_check_date,version,count
0,163531,2019-01-11,52,3
1,163531,2019-01-14,52,3


In [9]:
apc_df.isna().sum()

initial_load                      0
apc_stop_id                       0
stop_number                       0
e_time                            0
actual_arrival_time               0
actual_depart_time          3358920
scheduled_arrival_time      4860423
scheduled_departure_time    4860423
sequence                          0
board_count                       0
alight_count                      0
line                              0
block_name                        0
bus_number                        0
service_id                        0
ride_check_date                   0
pattern                           0
pattern_id                        0
apc_trip_id                       0
apc_lat                           0
apc_lon                           0
stop_abbr                         0
apc_stop_name                     0
ride_check_mode                   0
version                           0
time_diff                         0
dtype: int64

In [10]:
def calc_bus_occupancy(df):
    """
    calculate occupancy at each stop along a route

    :param df: dataframe with board/alight values for all stops along a single trip
    """
    tmp = df.copy()

    tmp['initial_load'] = pd.to_numeric(tmp['initial_load'], errors='coerce')
    tmp['board_count'] = pd.to_numeric(tmp['board_count'], errors='coerce')
    tmp['alight_count'] = pd.to_numeric(tmp['alight_count'], errors='coerce')

    # calc occupancy net change
    tmp['occupancy_net_change'] = tmp['board_count'] - tmp['alight_count']

    # calc cumulative sum in occupancy net change as an intermediate step
    tmp_sum_df = pd.DataFrame(tmp['occupancy_net_change'].cumsum())
    tmp_sum_df.columns = ['tmp_sum']

    # merge tmp sum (cumulative sum) into tmp
    tmp = tmp.merge(tmp_sum_df, left_index=True, right_index=True)

    # calc occupancy for a particular stop
    tmp['occupancy'] = tmp['tmp_sum'] + tmp['initial_load']

    return tmp.drop(columns=['tmp_sum', 'occupancy_net_change'])


def remove_duplicates(df):
    df = df.sort_values(by=['time_diff'])
    df_result = df.drop_duplicates(subset=['stop_seq'], keep='first')
    return df_result

In [12]:
def process_group(group_index):
    if (group_index % 10000) == 0:
        print(group_index)
        print(time.time())
        print("......")
    #print(group_index)
    v = df_grouped.loc[group_index]
    apc_trip_id, ride_check_date, version = v['apc_trip_id'], v['ride_check_date'], v['version']
    tripdata_slice = tripdata[(tripdata['apc_trip_id']==apc_trip_id) & (tripdata['version']==version)]
    apc_df_slice = apc_df[(apc_df['apc_trip_id']==apc_trip_id) & (apc_df['ride_check_date']==ride_check_date) & (apc_df['version']==version)]
    df = tripdata_slice.merge(apc_df_slice, how='left', left_on=['apc_trip_id', 'version', 'stop_id'], right_on=['apc_trip_id', 'version', 'stop_abbr'], suffixes=['_gtfs', None], indicator=True).sort_values(by=['stop_seq'])
    df = df.sort_values('stop_seq').reset_index(drop=True)
    
    # fill nan
    df[['board_count', 'alight_count']] = df[['board_count', 'alight_count']].fillna(0.0)
    
    # propagate missing values
    df[['route_id', 'trip_start_time', 'direction', 'apc_trip_id', 'version', 'initial_load', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern_id', 'ride_check_mode']] = df[['route_id', 'trip_start_time', 'direction', 'apc_trip_id', 'version', 'initial_load', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern_id', 'ride_check_mode']].fillna(method='ffill').fillna(method='bfill')
    
    # remove duplicates (if stop appears more than once on this trip)
    df = remove_duplicates(df)
    df = df.sort_values('stop_seq').reset_index(drop=True)
    
    # calculate occupancy 
    df = calc_bus_occupancy(df)
    #print(df.head())
    return df


def run_single_thread():
    result = []
    print(f"There are {df_grouped.shape[0]} total rows in df_grouped")
    start_time = time.time()
    count_val = 0
    for group_index in list(df_grouped.index):
        df = process_group(group_index)
        result.append(df)
        if (count_val % 10000) == 0:
            end_time = time.time() - start_time
            print(f"Done with {count_val}, took {end_time} seconds")
            start_time = time.time()
        count_val += 1
    return result


def run_parallel():
    print(f"There are {df_grouped.shape[0]} total rows in df_grouped")
    start_time = time.time()
    with Pool(16) as p:
        group_indexes = list(df_grouped.index)
        result = p.map(process_group, group_indexes)
    p.close()
    p.join()
    end_time = time.time() - start_time
    print(f"done processing in parallel, took {end_time} seconds")
    return result

In [13]:
if __name__ == '__main__':
    #result = run_single_thread()
    result = run_parallel()
    df1 = pd.concat(result, ignore_index=True)
    file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version_merged.parquet')
    df1.to_parquet(file_path, compression='gzip')

There are 573398 total rows in df_grouped
0
1609872293.5031316
......
90000
1609872327.460201
......
10000
1609872382.0160742
......
100000
1609872415.381473
......
20000
1609872469.745559
......
110000
1609872504.7747653
......
30000
1609872561.8743365
......
120000
1609872594.5421267
......
40000
1609872648.505417
......
130000
1609872680.7137113
......
50000
1609872734.5530028
......
140000
1609872769.70928
......
60000
1609872823.750342
......
70000
1609872915.7892792
......
80000
1609873002.4414597
......
180000
1609873131.715516
......
260000
1609873206.6194763
......
190000
1609873225.1982017
......
270000
1609873314.2211957
......
200000
1609873321.3391895
......
280000
1609873419.8797388
......
210000
1609873423.866094
......
220000
1609873537.450685
......
150000
1609873578.5543554
......
230000
1609873643.6429536
......
160000
1609873675.7640173
......
240000
1609873757.2312138
......
170000
1609873783.5435236
......
250000
1609873862.7702305
......
350000
1609873937.9780772

# Reformat columns and generate final CSV file

In [34]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version_merged.parquet')
df = pd.read_parquet(file_path, engine='pyarrow')
print(f"Number of rows in df: {df.shape[0]}")
print(f"number of rows in apc_df: {apc_df.shape[0]}")

Number of rows in df: 20122541


In [19]:
x = df.isna().sum()
for k, v in x.iteritems():
    r = v / df.shape[0]
    print(f"{k}: {r}")

route_id: 0.0
trip_start_time: 0.0
stop_seq: 0.0
lat: 0.0
lon: 0.0
direction: 0.0
apc_trip_id: 0.0
version: 0.0
stop_id: 0.0
initial_load: 4.4229006664714956e-05
apc_stop_id: 0.7612871058381742
stop_number: 0.7612871058381742
e_time: 0.7612871058381742
actual_arrival_time: 0.7612871058381742
actual_depart_time: 0.9262527530693067
scheduled_arrival_time: 1.0
scheduled_departure_time: 1.0
sequence: 0.7612871058381742
board_count: 0.0
alight_count: 0.0
line: 4.4229006664714956e-05
block_name: 4.4229006664714956e-05
bus_number: 4.4229006664714956e-05
service_id: 4.4229006664714956e-05
ride_check_date: 4.4229006664714956e-05
pattern: 0.7612871058381742
pattern_id: 4.4229006664714956e-05
apc_lat: 0.7612871058381742
apc_lon: 0.7612871058381742
stop_abbr: 0.7612871058381742
apc_stop_name: 0.7612871058381742
ride_check_mode: 4.4229006664714956e-05
time_diff: 0.7612871058381742
_merge: 0.0
occupancy: 4.4229006664714956e-05


In [36]:
df = df[['apc_trip_id', 'actual_arrival_time', 'lat', 'lon', 'route_id', 'direction', 'stop_seq', 'board_count', 'alight_count', 'occupancy', 'stop_id', 'initial_load', 'trip_start_time', '_merge', 'version', 'ride_check_date']]

df = df.rename(columns={'apc_trip_id': 'trip_id',
                       'actual_arrival_time': 'actual_arrival_time',
                       'lat': 'stop_lat',
                       'lon': 'stop_lon',
                       'route_id': 'route_id',
                       'direction': 'direction_id',
                       'stop_seq': 'stop_sequence',
                       'board_count': 'board_count',
                       'alight_count': 'alight_count',
                       'occupancy': 'occupancy',
                       'stop_id': 'stop_id',
                       'initial_load': 'initial_load',
                       'trip_start_time': 'trip_start_time', 
                       '_merge': 'from_apc',
                       'version': 'version',
                       'ride_check_date': 'date'})

x = df.isna().sum()
for k, v in x.iteritems():
    r = v / df.shape[0]
    print(f"{k}: {r}")

trip_id: 0.0
actual_arrival_time: 0.7612871058381742
stop_lat: 0.0
stop_lon: 0.0
route_id: 0.0
direction_id: 0.0
stop_sequence: 0.0
board_count: 0.0
alight_count: 0.0
occupancy: 4.4229006664714956e-05
stop_id: 0.0
initial_load: 4.4229006664714956e-05
trip_start_time: 0.0
from_apc: 0.0
version: 0.0
date: 4.4229006664714956e-05


In [37]:
df = df.dropna(subset=['date', 'occupancy', 'initial_load'])
x = df.isna().sum()
for k, v in x.iteritems():
    r = v / df.shape[0]
    print(f"{k}: {r}")

trip_id: 0.0
actual_arrival_time: 0.7612765473369953
stop_lat: 0.0
stop_lon: 0.0
route_id: 0.0
direction_id: 0.0
stop_sequence: 0.0
board_count: 0.0
alight_count: 0.0
occupancy: 0.0
stop_id: 0.0
initial_load: 0.0
trip_start_time: 0.0
from_apc: 0.0
version: 0.0
date: 0.0


In [38]:
df.head(2)

Unnamed: 0,trip_id,actual_arrival_time,stop_lat,stop_lon,route_id,direction_id,stop_sequence,board_count,alight_count,occupancy,stop_id,initial_load,trip_start_time,from_apc,version,date
0,163531,14:30:06,36.167091,-86.781923,1,FROM DOWNTOWN,1,6.0,0.0,7.0,MCC5_11,1.0,06:18:00,both,52,2019-01-11
1,163531,14:34:08,36.165,-86.78406,1,FROM DOWNTOWN,2,0.0,1.0,6.0,CHA7AWN,1.0,06:18:00,both,52,2019-01-11


In [39]:
def fix_time(t):
    try:
        h, m, s = t.split(":")
        if h == "24":
            hh = '00'
        else:
            hh = h
        return f"{hh}:{m}:{s}"
    except:
        return t

def fix_merge(x):
    if x == 'both':
        return 1
    else:
        return 0

df['trip_id'] = df['trip_id'].astype(int)
df['stop_lat'] = df['stop_lat'].astype(float)
df['stop_lon'] = df['stop_lon'].astype(float)
df['route_id'] = df['route_id'].astype(str)
df['direction_id'] = df['direction_id'].astype(str)
df['stop_sequence'] = df['stop_sequence'].astype(int)
df['board_count'] = df['board_count'].astype(int)
df['alight_count'] = df['alight_count'].astype(int)
df['occupancy'] = df['occupancy'].astype(int)
df['stop_id'] = df['stop_id'].astype(str)
df['initial_load'] = df['initial_load'].astype(int)
df['trip_start_time'] = df['trip_start_time'].apply(lambda x: fix_time(x))
df['from_apc'] = df['from_apc'].apply(lambda x: fix_merge(x))
df['from_apc'] = df['from_apc'].astype(int)
df['date'] = pd.to_datetime(df['date'])
df['day_of_week'] = df['date'].dt.dayofweek

In [40]:
df.head(5)

Unnamed: 0,trip_id,actual_arrival_time,stop_lat,stop_lon,route_id,direction_id,stop_sequence,board_count,alight_count,occupancy,stop_id,initial_load,trip_start_time,from_apc,version,date,day_of_week
0,163531,14:30:06,36.167091,-86.781923,1,FROM DOWNTOWN,1,6,0,7,MCC5_11,1,06:18:00,1,52,2019-01-11,4
1,163531,14:34:08,36.165,-86.78406,1,FROM DOWNTOWN,2,0,1,6,CHA7AWN,1,06:18:00,1,52,2019-01-11,4
2,163531,,36.164406,-86.785498,1,FROM DOWNTOWN,3,0,0,6,CHA8AWN,1,06:18:00,0,52,2019-01-11,4
3,163531,,36.16232,-86.790405,1,FROM DOWNTOWN,4,0,0,6,CXONGULC,1,06:18:00,0,52,2019-01-11,4
4,163531,14:34:56,36.106851,-86.762639,1,FROM DOWNTOWN,5,0,5,1,100OAKS,1,06:18:00,1,52,2019-01-11,4


In [41]:
out_path = os.path.join(os.getcwd(), 'output', f"wegoapc_dashboard.csv")
df.to_csv(out_path, index=False)