In [1]:
import pandas as pd
import os
import zipfile
import dateparser
import swifter
import datetime
import time
import numpy as np

In [2]:
file_path = os.path.join(os.getcwd(), 'data.zip')
with zipfile.ZipFile(file_path,"r") as zip_ref:
    zip_ref.extractall(os.getcwd())
    
file_path = os.path.join(os.getcwd(), 'output.zip')
with zipfile.ZipFile(file_path,"r") as zip_ref:
    zip_ref.extractall(os.getcwd())

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [46]:
# load raw data

apc_path = os.path.join(os.getcwd(), 'data', 'apc')
dfs = []
for file in os.listdir(apc_path):
    df_temp = pd.read_csv(os.path.join(apc_path, file), dtype = str)
    dfs.append(df_temp)
apc_df = pd.concat(dfs, ignore_index=True)
print(f"number of rows in raw apc data: {apc_df.shape[0]}")
apc_df.head(2)

number of rows in raw apc data: 5019806


Unnamed: 0,initialload,stopid,stopnum,etime,actualarrivetime,actualdeparttime,schedarrivetime,scheddeparttime,sequence,ridboardcount,ridalightcount,ridchecktype,lineabbr,blockname,busnum,serviceid,ridcheckdate,pattern,patternid,tripid,lat,lon,stopabbr,stopname,ridcheckmode
0,0,4418,1,51300,50620,51304,-1,-1,1,2,0,0,14,1400,708,3,20200101,2,13955,193712,36166545,-86781895,MCC4_20,CENTRAL 4TH AVE - BAY 20,2
1,0,4418,1,54900,54437,55240,-1,-1,1,7,0,0,14,1400,708,3,20200101,2,13955,193713,36166545,-86781895,MCC4_20,CENTRAL 4TH AVE - BAY 20,2


In [25]:
#rename columns and filter for ride_check_mode = 2
apc_df.columns = ['initial_load', 'apc_stop_id', 'stop_number', 'e_time', 'actual_arrival_time', 'actual_depart_time',
                 'scheduled_arrival_time', 'scheduled_departure_time', 'sequence', 'board_count', 'alight_count',
                 'ride_check_type', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern',
                 'pattern_id', 'apc_trip_id', 'apc_lat', 'apc_lon', 'stop_abbr', 'apc_stop_name', 'ride_check_mode']

print(f"number of rows in apc before dropping ride_check_mode=1: {apc_df.shape[0]}")
apc_df = apc_df.loc[apc_df['ride_check_mode'] == '2'] 
print(f"number of rows in apc after dropping ride_check_mode=1: {apc_df.shape[0]}")

print(f"number of readings before dropping nan values: {apc_df.shape[0]}")
apc_df = apc_df.dropna()
print(f"number of readings after dropping nan values: {apc_df.shape[0]}")
apc_df.head(2)

number of rows in apc before dropping ride_check_mode=1: 5019806
number of rows in apc after dropping ride_check_mode=1: 4940372
number of readings before dropping nan values: 4940372
number of readings after dropping nan values: 4939701


Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,ride_check_type,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,22680,22026,22668,-1,-1,1,4,0,0,1,100,650,1,20190401,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,23760,23504,23504,-1,-1,26,0,4,0,1,100,650,1,20190401,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [26]:
# set column types

apc_df['initial_load'] = apc_df['initial_load'].astype(int)
apc_df['ride_check_mode'] = apc_df['ride_check_mode'].astype(int)
apc_df['sequence'] = apc_df['sequence'].astype(int)
apc_df['line'] = apc_df['line'].astype(int)

apc_df['apc_stop_id']=apc_df['apc_stop_id'].astype(int)
apc_df['stop_number']=apc_df['stop_number'].astype(int)

apc_df['board_count']=apc_df['board_count'].astype(int)
apc_df['alight_count']=apc_df['alight_count'].astype(int)

apc_df['apc_trip_id']=apc_df['apc_trip_id'].astype(int)
apc_df['apc_lat']=apc_df['apc_lat'].astype(int)
apc_df['apc_lon']=apc_df['apc_lon'].astype(int)
apc_df['pattern_id']=apc_df['pattern_id'].astype(int)
apc_df['bus_number']=apc_df['bus_number'].astype(int)
apc_df['service_id']=apc_df['service_id'].astype(int)

apc_df['stop_abbr'] = apc_df['stop_abbr'].astype(str)
apc_df['apc_stop_name'] = apc_df['apc_stop_name'].astype(str)
apc_df['block_name'] = apc_df['block_name'].astype(str)

apc_df = apc_df.drop('ride_check_type', 1)

apc_df = apc_df[(apc_df.ride_check_date != '20191017') & (apc_df.ride_check_date != '20191018')]


apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,22680,22026,22668,-1,-1,1,4,0,1,100,650,1,20190401,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,23760,23504,23504,-1,-1,26,0,4,1,100,650,1,20190401,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [27]:
#convert dates to datetime 
apc_df['ride_check_date'] = apc_df['ride_check_date'].apply(lambda x: datetime.datetime.strptime(x,'%Y%m%d'))


In [29]:
def fix_times (time) :
    """
    :param time: time in seconds past midnight 
    :return: time converted from seconds past midnight to HH:MM:SS
    """
    time = int(time)
    if time == -1 :
        return None 
    else: 
        #fixed_time = datetime.timedelta(seconds = time) 
        #return fixed_time
        minu, sec = divmod(time, 60)
        hour, minu = divmod(minu, 60) 
        if hour == 24:
            hour = 0
        return datetime.time(hour=hour, minute=minu, second=sec)

In [30]:
#convert all times from seconds past midnight to HH:MM:SS
for l in ['e_time','actual_arrival_time','actual_depart_time','scheduled_arrival_time','scheduled_departure_time']:
    print(l)
    apc_df[l]= apc_df.swifter.set_npartitions(20).apply(lambda x: fix_times(x[l]),axis=1)

e_time
actual_arrival_time
actual_depart_time
scheduled_arrival_time
scheduled_departure_time


In [32]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned.parquet')

apc_df.to_parquet(file_path, compression='gzip')


# Add GTFS version 

In [3]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned.parquet')
apc_df = pd.read_parquet(file_path, engine='pyarrow')
apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2


In [10]:
file_path = os.path.join(os.getcwd(), 'data', 'trips_gtfs_surrogate.parquet.gzip')
trip_stop_sequence = pd.read_parquet(file_path, engine='pyarrow')
trip_stop_sequence['apc_trip_id'] = trip_stop_sequence['trip_id']
trip_stop_sequence.head(2)

Unnamed: 0,pattern_id,stop_id,lat,lon,stop_seq,direction,version,activation_date,deactivation_date,route_id,trip_start_time,trip_id,apc_trip_id
0,14729,6AOAKSN,361480730,-867723030,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,15:15:00,163534,163534
1,14729,6AOAKSN,361480730,-867723030,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,16:20:00,163535,163535


In [11]:
# add trip version

def find_version_id(apc_trip_id, ride_check_date) :
    temp = trip_date.loc[(trip_date['apc_trip_id'] == apc_trip_id)]
    if (temp is None or temp.size==0):
        return 0
    temp['ride_check_date'] = ride_check_date
    temp['days'] = (temp['ride_check_date'] - temp['activation_date']).dt.days
    df3 = temp.loc[temp['days'] >= 0]
    if (df3 is None or df3.size==0):
        return 0
    #df3=df3.reset_index()
    #df3 = df3.sort_values(['days'], ascending = True)  
    #return df3['version'].iloc[0]
    ind = df3[['days']].idxmin()['days']
    result = df3.loc[ind, 'version']
    return result

trip_date=trip_stop_sequence[['apc_trip_id', 'version', 'activation_date']].drop_duplicates(['apc_trip_id','version', 'activation_date'],keep='first')
trip_date=trip_date[['apc_trip_id', 'version', 'activation_date']]

apc_df_trip_dates=apc_df[['apc_trip_id','ride_check_date']]
apc_df_trip_dates=apc_df_trip_dates.drop_duplicates(['apc_trip_id','ride_check_date'],keep='first',ignore_index=True)

apc_df_trip_dates=apc_df_trip_dates.reset_index()
trip_date=trip_date.reset_index()

print(f"starting to apply the version number at time {time.time()}")
apc_df_trip_dates['version']= apc_df_trip_dates.apply(lambda x: find_version_id(x['apc_trip_id'], x['ride_check_date'] ),axis=1)
print(f"done the version number at time {time.time()}")

apc_df_trip_dates = apc_df_trip_dates.loc[ : , ('apc_trip_id', 'ride_check_date', 'version')]

TypeError: drop_duplicates() got an unexpected keyword argument 'ignore_index'

In [None]:
#apc_df_trip_dates
#df = pd.merge_asof(apc_df_trip_dates, trip_date, left_on='ride_check_date', right_on='activation_date', by='apc_trip_id', direction='backward', allow_exact_matches=True)

In [None]:
print(f"length of apc_df before merge: {apc_df.shape[0]}")
apc_df = apc_df.merge(apc_df_trip_dates, on=['apc_trip_id', 'ride_check_date'], how='left')
print(f"length of apc_df after merge: {apc_df.shape[0]}")

apc_df_trip_dates.loc[(apc_df_trip_dates.version==0)].shape[0]

In [None]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version.parquet')
apc_df.to_parquet(file_path, compression='gzip')

# GTFS merge

In [11]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version.parquet')
apc_df = pd.read_parquet(file_path, engine='pyarrow')
print(f"number of rows in apc_df before dropping nan {apc_df.shape[0]}")
apc_df = apc_df.dropna(subset=['e_time', 'actual_arrival_time'])
print(f"number of rows in apc_df after dropping nan {apc_df.shape[0]}")
apc_df.head(2)

number of rows in apc_df before dropping nan 4939284
number of rows in apc_df after dropping nan 4860423


Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode,version
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2,53
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2,53


In [12]:
# add time_diff
def add_time_diff(actual_arrival_time, e_time):
    actual_arrival_time_dt = datetime.datetime.combine(datetime.date.today(), actual_arrival_time)
    e_time_dt = datetime.datetime.combine(datetime.date.today(), e_time)
    diff = actual_arrival_time_dt - e_time_dt
    return abs(diff.total_seconds())

apc_df['time_diff'] = apc_df.apply(lambda row: add_time_diff(row['actual_arrival_time'], row['e_time']), axis=1)
apc_df.head(2)

Unnamed: 0,initial_load,apc_stop_id,stop_number,e_time,actual_arrival_time,actual_depart_time,scheduled_arrival_time,scheduled_departure_time,sequence,board_count,alight_count,line,block_name,bus_number,service_id,ride_check_date,pattern,pattern_id,apc_trip_id,apc_lat,apc_lon,stop_abbr,apc_stop_name,ride_check_mode,version,time_diff
0,1,4431,1,06:18:00,06:07:06,06:17:48,,,1,4,0,1,100,650,1,2019-04-01,2,12375,163531,36166590,-86781996,MCC5_11,CENTRAL 5TH AVE - BAY 11,2,53,654.0
1,1,1788,3,06:36:00,06:31:44,06:31:44,,,26,0,4,1,100,650,1,2019-04-01,2,12375,163531,36106851,-86762639,100OAKS,100 OAKS MALL,2,53,256.0


In [13]:
#apc_df = apc_df[(apc_df['ride_check_date']>=pd.Timestamp(year=2019, month=3, day=1, hour=0)) & (apc_df['ride_check_date']<=pd.Timestamp(year=2019, month=4, day=1, hour=0))]      
#len(apc_df)

In [14]:
file_path = os.path.join(os.getcwd(), 'data', 'trips_gtfs_surrogate.parquet.gzip')
trip_stop_sequence = pd.read_parquet(file_path, engine='pyarrow')
trip_stop_sequence['apc_trip_id'] = trip_stop_sequence['trip_id']
trip_stop_sequence['lat']=trip_stop_sequence.lat/1e7
trip_stop_sequence['lon']=trip_stop_sequence.lon/1e7
trip_stop_sequence.head(2)

Unnamed: 0,pattern_id,stop_id,lat,lon,stop_seq,direction,version,activation_date,deactivation_date,route_id,trip_start_time,trip_id,apc_trip_id
0,14729,6AOAKSN,36.148073,-86.772303,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,15:15:00,163534,163534
1,14729,6AOAKSN,36.148073,-86.772303,8,FROM DOWNTOWN,51,2018-10-14,2018-12-25 23:59:59,1,16:20:00,163535,163535


In [8]:
tripdata = trip_stop_sequence[['route_id', 'trip_start_time', 'stop_seq', 'lat', 'lon', 'direction', 'apc_trip_id', 'version', 'stop_id']]
tripdata.head(2)

Unnamed: 0,route_id,trip_start_time,stop_seq,lat,lon,direction,apc_trip_id,version,stop_id
0,1,15:15:00,8,36.148073,-86.772303,FROM DOWNTOWN,163534,51,6AOAKSN
1,1,16:20:00,8,36.148073,-86.772303,FROM DOWNTOWN,163535,51,6AOAKSN


In [61]:
df_grouped = apc_df.groupby(['apc_trip_id', 'ride_check_date', 'version']).size().reset_index().rename(columns={0:'count'})
df_grouped.head(2)

Unnamed: 0,apc_trip_id,ride_check_date,version,count
0,163531,2019-01-11,52,3
1,163531,2019-01-14,52,3


In [9]:
apc_df.isna().sum()

initial_load                      0
apc_stop_id                       0
stop_number                       0
e_time                            0
actual_arrival_time               0
actual_depart_time          3358920
scheduled_arrival_time      4860423
scheduled_departure_time    4860423
sequence                          0
board_count                       0
alight_count                      0
line                              0
block_name                        0
bus_number                        0
service_id                        0
ride_check_date                   0
pattern                           0
pattern_id                        0
apc_trip_id                       0
apc_lat                           0
apc_lon                           0
stop_abbr                         0
apc_stop_name                     0
ride_check_mode                   0
version                           0
time_diff                         0
dtype: int64

In [10]:
def calc_bus_occupancy(df):
    """
    calculate occupancy at each stop along a route

    :param df: dataframe with board/alight values for all stops along a single trip
    """
    tmp = df.copy()

    tmp['initial_load'] = pd.to_numeric(tmp['initial_load'], errors='coerce')
    tmp['board_count'] = pd.to_numeric(tmp['board_count'], errors='coerce')
    tmp['alight_count'] = pd.to_numeric(tmp['alight_count'], errors='coerce')

    # calc occupancy net change
    tmp['occupancy_net_change'] = tmp['board_count'] - tmp['alight_count']

    # calc cumulative sum in occupancy net change as an intermediate step
    tmp_sum_df = pd.DataFrame(tmp['occupancy_net_change'].cumsum())
    tmp_sum_df.columns = ['tmp_sum']

    # merge tmp sum (cumulative sum) into tmp
    tmp = tmp.merge(tmp_sum_df, left_index=True, right_index=True)

    # calc occupancy for a particular stop
    tmp['occupancy'] = tmp['tmp_sum'] + tmp['initial_load']

    return tmp.drop(columns=['tmp_sum', 'occupancy_net_change'])


def remove_duplicates(df):
    df = df.sort_values(by=['time_diff'])
    df_result = df.drop_duplicates(subset=['stop_seq'], keep='first')
    return df_result
    
        

result = []

print(f"There are {df_grouped.shape[0]} total rows in df_grouped")
start_time = time.time()
for k, v in df_grouped.iterrows():
    apc_trip_id, ride_check_date, version = v['apc_trip_id'], v['ride_check_date'], v['version']
    tripdata_slice = tripdata[(tripdata['apc_trip_id']==apc_trip_id) & (tripdata['version']==version)]
    apc_df_slice = apc_df[(apc_df['apc_trip_id']==apc_trip_id) & (apc_df['ride_check_date']==ride_check_date) & (apc_df['version']==version)]
    df = tripdata_slice.merge(apc_df_slice, how='left', left_on=['apc_trip_id', 'version', 'stop_id'], right_on=['apc_trip_id', 'version', 'stop_abbr'], suffixes=['_gtfs', None], indicator=True).sort_values(by=['stop_seq'])
    df = df.sort_values('stop_seq').reset_index(drop=True)
    
    # fill nan
    df[['board_count', 'alight_count']] = df[['board_count', 'alight_count']].fillna(0.0)
    
    # propagate missing values
    df[['route_id', 'trip_start_time', 'direction', 'apc_trip_id', 'version', 'initial_load', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern_id', 'ride_check_mode']] = df[['route_id', 'trip_start_time', 'direction', 'apc_trip_id', 'version', 'initial_load', 'line', 'block_name', 'bus_number', 'service_id', 'ride_check_date', 'pattern_id', 'ride_check_mode']].fillna(method='ffill').fillna(method='bfill')
    
    # remove duplicates (if stop appears more than once on this trip)
    df = remove_duplicates(df)
    df = df.sort_values('stop_seq').reset_index(drop=True)
    
    # calculate occupancy 
    df = calc_bus_occupancy(df)
    result.append(df)
    
    if (k % 10000) == 0:
        end_time = time.time() - start_time
        print(f"Done with {k}, took {end_time} seconds")
        start_time = time.time()

NameError: name 'df_grouped' is not defined

In [None]:
df1 = pd.concat(result, ignore_index=True)
df1.head(5)

In [None]:
file_path = os.path.join(os.getcwd(), 'output', 'apc_cleaned_version_merged.parquet')
df1.to_parquet(file_path, compression='gzip')

# VOID

In [5]:
def fix_times(time):
    """
    :param time: time in seconds past midnight 
    :return: time converted from seconds past midnight to HH:MM:SS
    """
    time = int(time)
    if time == -1 :
        return None 
    else: 
        fixed_time = datetime.timedelta(seconds = time) 
        dt = datetime.datetime(year=2020, month=1, day=1, hour=0, minute=0, second=0)
        result = dt + fixed_time
        return result.time().isoformat(timespec='seconds')

def fix_dates(d):
    year, month, day = d[0:4], d[4:6], d[6:]
    return f"{year}-{month}-{day}"

def get_date_time(row):
    return f"{row['date_apc']} {row['arrival_time_apc']}"

apc_df['arrival_time_apc'] = apc_df['arrival_time_apc'].apply(lambda x: fix_times(x))
apc_df['date_apc'] = apc_df['date_apc'].apply(lambda x: fix_dates(x))
apc_df['date_time_apc'] = apc_df.apply(lambda row: get_date_time(row), axis=1)
apc_df.head(2)

Unnamed: 0,initial_load_apc,stop_id_apc,stop_number_apc,arrival_time_apc,stop_sequence_apc,board_count_apc,alight_count_apc,ridecheck_type_apc,date_apc,trip_id_apc,stop_lat_apc,stop_lon_apc,ridecheck_mode_apc,date_time_apc
0,1,MCC5_11,1,06:07:06,1,4,0,0,2019-04-01,163531,36166590.0,-86781996.0,2,2019-04-01 06:07:06
1,1,100OAKS,3,06:31:44,26,0,4,0,2019-04-01,163531,36106851.0,-86762639.0,2,2019-04-01 06:31:44


In [6]:
# load GTFS (df_stop_times, df_stops, df_trips)

def load_gtfs(file_name, gtfs_path):
    dfs = []
    for gtfs_dir_name in os.listdir(gtfs_path):
        if '-' in gtfs_dir_name:
            file_path = os.path.join(gtfs_path, gtfs_dir_name, file_name)
            temp = pd.read_csv(file_path, index_col=False)
            temp['gtfs_start_date'] = gtfs_dir_name
            dfs.append(temp)
    df = pd.concat(dfs, ignore_index=True)
    df['gtfs_start_date_dt'] = pd.to_datetime(df['gtfs_start_date'])
    df = df.sort_values(by=['gtfs_start_date_dt'])
    if 'trip_id' in df.columns:
        df['trip_id'] = df['trip_id'].astype(int)
    df = df.drop(columns=['gtfs_start_date_dt', 'gtfs_start_date'])
    return df

gtfs_path = os.path.join(os.getcwd(), 'data', 'gtfs')
df_stop_times = load_gtfs('stop_times.txt', gtfs_path)
df_stops = load_gtfs('stops.txt', gtfs_path)
df_trips = load_gtfs('trips.txt', gtfs_path)

df_stop_times = df_stop_times.drop_duplicates(subset=['trip_id', 'stop_id', 'stop_sequence'], keep='last')
df_stops = df_stops.drop_duplicates(subset=['stop_id'], keep='last')
df_trips = df_trips.drop_duplicates(subset=['trip_id'], keep='last')


In [7]:
# join with stops.txt

df = apc_df.merge(df_stops, left_on=['stop_id_apc'], right_on=['stop_id'], how='left', suffixes=[None, "_gtfs_stops"],validate='many_to_one')

y = len(df)
x = len(df[~df['stop_id'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of trips missing: {per_mis}")

df = df[~df['stop_id'].isnull()]
df.head(2)

4939701 4939651
Precentage of trips missing: 1.012207014149237e-05


Unnamed: 0,initial_load_apc,stop_id_apc,stop_number_apc,arrival_time_apc,stop_sequence_apc,board_count_apc,alight_count_apc,ridecheck_type_apc,date_apc,trip_id_apc,stop_lat_apc,stop_lon_apc,ridecheck_mode_apc,date_time_apc,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code
0,1,MCC5_11,1,06:07:06,1,4,0,0,2019-04-01,163531,36166590.0,-86781996.0,2,2019-04-01 06:07:06,MCC5_11,MCC5_11,CENTRAL 5TH AVE - BAY 11,MCC - BAY 11 & 5TH AVE N,36.16659,-86.781996,,,0.0,MTAMCC,,0.0,
1,1,100OAKS,3,06:31:44,26,0,4,0,2019-04-01,163531,36106851.0,-86762639.0,2,2019-04-01 06:31:44,100OAKS,100OAKS,100 OAKS MALL,ARMORY OAKS DR & WILBUR DR,36.106851,-86.762639,,,,,,,


In [8]:
# merge with gtfs trips

df = df.merge(df_trips, left_on=['trip_id_apc'], right_on=['trip_id'], how='left', suffixes=[None, "_gtfs_trips"],validate='many_to_one')

y = len(df)
x = len(df[~df['trip_id'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of trips missing: {per_mis}")

df = df[~df['trip_id'].isnull()]
df.head(2)

4939651 4939013
Precentage of trips missing: 0.00012915892236111418


Unnamed: 0,initial_load_apc,stop_id_apc,stop_number_apc,arrival_time_apc,stop_sequence_apc,board_count_apc,alight_count_apc,ridecheck_type_apc,date_apc,trip_id_apc,stop_lat_apc,stop_lon_apc,ridecheck_mode_apc,date_time_apc,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed
0,1,MCC5_11,1,06:07:06,1,4,0,0,2019-04-01,163531,36166590.0,-86781996.0,2,2019-04-01 06:07:06,MCC5_11,MCC5_11,CENTRAL 5TH AVE - BAY 11,MCC - BAY 11 & 5TH AVE N,36.16659,-86.781996,,,0.0,MTAMCC,,0.0,,1_merged_193260,1_merged_193341,163531.0,100 OAKS EXPRESS,,0.0,a_17704,12375.0,0.0,0.0
1,1,100OAKS,3,06:31:44,26,0,4,0,2019-04-01,163531,36106851.0,-86762639.0,2,2019-04-01 06:31:44,100OAKS,100OAKS,100 OAKS MALL,ARMORY OAKS DR & WILBUR DR,36.106851,-86.762639,,,,,,,,1_merged_193260,1_merged_193341,163531.0,100 OAKS EXPRESS,,0.0,a_17704,12375.0,0.0,0.0


In [9]:
df_stop_times.head(2)

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
377264,165078,6:59:52,6:59:52,UNI3AWF,61,,0.0,0.0,15.555,0.0
366381,164027,10:21:30,10:21:30,12ADIVSF,10,,0.0,0.0,2.4237,0.0


In [17]:
df['trip_id'] = df['trip_id'].astype(int)
df['stop_sequence_apc'] = df['stop_sequence_apc'].astype(int)
df_stop_times['trip_id'] = df_stop_times['trip_id'].astype(int)
df_stop_times['stop_sequence'] = df_stop_times['stop_sequence'].astype(int)
df1 = df.merge(df_stop_times, left_on=['trip_id', 'stop_id', 'stop_sequence_apc'], right_on=['trip_id', 'stop_id', 'stop_sequence'], how='left', validate="many_to_one", suffixes=(None, '_right'))



In [18]:
df1.head(2)

Unnamed: 0,initial_load_apc,stop_id_apc,stop_number_apc,arrival_time_apc,stop_sequence_apc,board_count_apc,alight_count_apc,ridecheck_type_apc,date_apc,trip_id_apc,stop_lat_apc,stop_lon_apc,ridecheck_mode_apc,date_time_apc,stop_id,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,platform_code,route_id,service_id,trip_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,arrival_time,departure_time,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
0,1,MCC5_11,1,06:07:06,1,4,0,0,2019-04-01,163531,36166590.0,-86781996.0,2,2019-04-01 06:07:06,MCC5_11,MCC5_11,CENTRAL 5TH AVE - BAY 11,MCC - BAY 11 & 5TH AVE N,36.16659,-86.781996,,,0.0,MTAMCC,,0.0,,1_merged_193260,1_merged_193341,163531,100 OAKS EXPRESS,,0.0,a_17704,12375.0,0.0,0.0,06:18:00,06:18:00,1.0,,,1.0,,
1,1,100OAKS,3,06:31:44,26,0,4,0,2019-04-01,163531,36106851.0,-86762639.0,2,2019-04-01 06:31:44,100OAKS,100OAKS,100 OAKS MALL,ARMORY OAKS DR & WILBUR DR,36.106851,-86.762639,,,,,,,,1_merged_193260,1_merged_193341,163531,100 OAKS EXPRESS,,0.0,a_17704,12375.0,0.0,0.0,,,,,,,,


In [21]:
df_stop_times[(df_stop_times['trip_id']==163531)]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
1423158,163531,06:18:00,06:18:00,MCC5_11,1,,,1.0,,
1423159,163531,06:19:23,06:19:23,CHA7AWN,2,,,,0.3178,
1423160,163531,06:20:00,06:20:00,CHA8AWN,3,,,,0.4601,
1423161,163531,06:22:13,06:22:13,CXONGULC,4,,,,0.9683,
1423162,163531,06:36:00,06:36:00,100OAKS,5,,1.0,,10.4874,


In [19]:
y = len(df1)
x = len(df1[~df1['stop_sequence'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of trips missing: {per_mis}")

#df1 = df1[~df1['trip_id'].isnull()]
#df1.head(2)

4939013 2606336
Precentage of trips missing: 0.4722961854929315


In [None]:
drop_cols = []
for col in df_dup.columns:
    if (col not in df1.columns) or ("_right" in col):
        drop_cols.append(col)
df_dup = df_dup.drop(drop_cols, axis=1)
df1 = df1.drop(drop_cols, axis=1)
df1 = pd.concat([df1, df_dup], ignore_index=True)


print(f"final df length: {df1.shape[0]}")

y = len(df1)
x = len(df1[~df1['stop_sequence'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of stop_times missing: {per_mis}")

df1 = df1[~df1['stop_sequence'].isnull()]

In [None]:
df_stop_times_dup_first = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=['trip_id', 'stop_id'], keep='first')]
df_dup_first = df_dup_first.merge(df_stop_times_dup_first, left_on=['trip_id', 'stop_id'], right_on=['trip_id', 'stop_id'], how='left', validate="many_to_one", suffixes=(None, '_right'))

In [13]:
df_stop_times_dup_first[df_stop_times_dup_first['trip_id']==165065].sort_values(['stop_sequence'])

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
1379425,165065,08:00:00,08:00:00,MCC4_21,71,,1.0,,16.5492,


In [14]:
df_stop_times_dup_last[df_stop_times_dup_last['trip_id']==165065].sort_values(['stop_sequence'])

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
1379355,165065,07:15:00,07:15:00,MCC4_21,1,,,1.0,,


In [16]:
df_stop_times_dup[df_stop_times_dup['trip_id']==165065]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
1379355,165065,07:15:00,07:15:00,MCC4_21,1,,,1.0,,
1379425,165065,08:00:00,08:00:00,MCC4_21,71,,1.0,,16.5492,


In [17]:
df_stop_times_dup_first = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=['trip_id', 'stop_id'], keep='first')]

df_stop_times_dup_first[df_stop_times_dup_first['trip_id']==165065]

Unnamed: 0,trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
1379425,165065,08:00:00,08:00:00,MCC4_21,71,,1.0,,16.5492,
