# CARTA APC Dataprep

Goal: clean and process CARTA APC data.

Directory structure:

    * data
        * cartaapc2019: all carta 2019 APC data
        * cartaapc2020: all carta 2020 APC data
        * cartagtfs: carta static GTFS for all years
        * ridecheckstops: stop information from ridecheck (used as a substitute because the GTFS stops.txt does not include all stops in the APC data)
    * output: output data files
    
This notebook generates 3 files:

* cartaapc_cleaned_{year}.csv: temporary file
* cartaapc_merged_{year}.csv: temporary file
* cartaapc_dashboard_{year}.csv: final output file

In [32]:
import pandas as pd
import os
import zipfile
import dateparser
import swifter

In [33]:
# Unpack data.zip (expected next to the notebook) into the working directory.
working_dir = os.getcwd()
file_path = os.path.join(working_dir, 'data.zip')
with zipfile.ZipFile(file_path, mode="r") as archive:
    archive.extractall(working_dir)

In [34]:
# Lift pandas' display limits so frames are shown with every column and row.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# 1. Clean up the raw APC Data

We need to deal with some duplicates and erroneous data in the raw APC file. 

Inputs: raw APC data

Outputs: 'cartaapc_cleaned_{year}.csv'

In [35]:
# Data year to process; the cells below branch on '2019' and '2020'.
year = "2019"

In [36]:
# Load the raw APC export(s) for the selected year into a single frame.
if year == '2019':
    # 2019 is delivered as several files in one folder. Read them in sorted
    # order: os.listdir() returns entries in arbitrary order, and the row order
    # decides which copy survives the later keep='first' de-duplication.
    apc_path = os.path.join(os.getcwd(), 'data', f"cartaapc{year}")
    apc_df = pd.concat(
        [pd.read_csv(os.path.join(apc_path, file), index_col=0)
         for file in sorted(os.listdir(apc_path))],
        ignore_index=True,
    )
elif year == '2020':
    # 2020 is a single export file.
    apc_path = os.path.join(os.getcwd(), 'data', f"cartaapc{year}", 'chattanooga_apc_jan20_through_jun20.csv')
    apc_df = pd.read_csv(apc_path, index_col=0)
else:
    # Fail loudly instead of continuing with an undefined (or stale) apc_df.
    raise ValueError(f"unsupported year {year!r}: expected '2019' or '2020'")
print(f"shape of apc_df: {apc_df.shape}")



shape of apc_df: (9690269, 71)


In [37]:
def fix_times(time):
    """
    Reduce a raw timestamp to its HH:MM:SS part by removing the date prefix
    (e.g. '12/30/1899') in front of non-null times.

    :param time: a raw time value, e.g. '12/30/1899 08:51:00', or a missing value
    :return: the time-of-day text without surrounding whitespace; missing values
             (NaN/None) and the literal text 'NaN' are returned unchanged
    """

    # Test for missing values before converting to text: str(float('nan')) is
    # 'nan', so comparing the converted text with 'NaN' alone never matches a
    # real NaN and would turn it into an empty string.
    if pd.isna(time) or str(time) == 'NaN':
        return time

    # Characters 10-18 hold the separator plus HH:MM:SS (the date prefix is ten
    # characters wide); strip the separator so no leading space is kept.
    return str(time)[10:19].strip()


# Strip the date prefix from every time column.
# fix_times is applied to the column itself rather than row-wise over the whole
# frame: the function only needs the one cell, and a row-wise (axis=1) apply
# materialises every wide row just to read a single value from it.
for time_col in ['TRIP_START_TIME','TIME_SCHEDULED','TIME_ACTUAL_ARRIVE','TIME_ACTUAL_DEPART']:
    print(time_col)
    apc_df[time_col] = apc_df[time_col].swifter.set_npartitions(20).apply(fix_times)

TRIP_START_TIME
TIME_SCHEDULED
TIME_ACTUAL_ARRIVE
TIME_ACTUAL_DEPART


In [38]:
# Normalise DIRECTION_NAME: map the misspelling and the '0'/'1' codes found in
# the raw data onto the two expected labels, printing the values before/after.
print(f"unique DIRECTION_NAME vals: {apc_df['DIRECTION_NAME'].unique()}")

print("DIRECTION_NAME should be either INBOUND or OUTBOUND")

direction_fixes = {'OUTYBOUND': 'OUTBOUND', '0': 'OUTBOUND', '1': 'INBOUND'}
for bad_value, good_value in direction_fixes.items():
    needs_fix = apc_df['DIRECTION_NAME'] == bad_value
    apc_df.loc[needs_fix, ['DIRECTION_NAME']] = good_value

print(f"unique DIRECTION_NAME vals: {apc_df['DIRECTION_NAME'].unique()}")

unique DIRECTION_NAME vals: ['OUTBOUND' 'INBOUND' 'OUTYBOUND' '0' '1']
DIRECTION_NAME should be either INBOUND or OUTBOUND
unique DIRECTION_NAME vals: ['OUTBOUND' 'INBOUND']


In [39]:
# Parse SURVEY_DATE into datetimes and store the result in a new 'DATE' column.
survey_dates = apc_df['SURVEY_DATE']
apc_df['DATE'] = pd.to_datetime(survey_dates)

In [40]:
# Drop repeated readings, then remove routes 33, 34 and 14, reporting the row
# count after each step.
print(f"Number of readings in raw apc_df: {len(apc_df)}")

# A reading is identified by trip, date, direction, stop and its sort order;
# the first copy of each is kept.
reading_key = ['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','SORT_ORDER']
apc_df = apc_df.drop_duplicates(subset=reading_key, keep='first')
print(f"Number of readings after dropping duplicates: {len(apc_df)}")

excluded_routes = [33, 34, 14]
on_excluded_route = apc_df['ROUTE_NUMBER'].isin(excluded_routes)
apc_df = apc_df[~on_excluded_route]
print(f"Number of readings after dropping routes 33, 34, 14: {len(apc_df)}")

Number of readings in raw apc_df: 9690269
Number of readings after dropping duplicates: 9683633
Number of readings after dropping routes 33, 34, 14: 8238702


In [41]:
# Save the cleaned readings as the step-1 intermediate file.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_cleaned_{year}.csv")
# Create the output folder when it is missing so the write cannot fail on a
# fresh checkout after the long cleaning step has already run.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
apc_df.to_csv(out_path, index=False)

# 2. GTFS Join

Join the CARTA APC data with GTFS and ridecheck stops

Inputs: cartaapc_cleaned_{year}.csv, GTFS (trips.txt, stop_times.txt), ridecheck

Outputs: cartaapc_merged_{year}.csv

In [42]:
# Reload the cleaned readings from disk so this section can start from the
# saved file; DATE comes back as text and is parsed into datetimes again.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_cleaned_{year}.csv")
apc_df = pd.read_csv(out_path)
apc_df['DATE'] = pd.to_datetime(apc_df['DATE'])
print(f"Number of rows in cartaapc_cleaned_{year}.csv: {len(apc_df)}")



Number of rows in cartaapc_cleaned_2019.csv: 8238702


In [43]:
# load GTFS (df_stop_times, df_stops, df_trips)

def load_gtfs(file_name, gtfs_list, gtfs_path):
    """
    Read one GTFS table from several feed folders and stack the results.

    :param file_name: name of the GTFS file to read from every feed folder (e.g. 'stop_times.txt')
    :param gtfs_list: sequence of (start_date, end_date) pairs; the start date doubles as the feed's folder name under gtfs_path
    :param gtfs_path: directory containing one sub-folder per feed
    :return: the concatenated table with gtfs_start_date / gtfs_end_date (text) and
             gtfs_start_date_dt / gtfs_end_date_dt (datetime) columns added; when present,
             trip_id is returned as text without its last three characters and stop_id
             as integer-formatted text
    """
    frames = []
    for feed in gtfs_list:
        start_date, end_date = feed[0], feed[1]
        table = pd.read_csv(os.path.join(gtfs_path, start_date, file_name), index_col=False)
        # Tag every row with the validity window of the feed it came from.
        frames.append(table.assign(gtfs_start_date=start_date, gtfs_end_date=end_date))
    combined = pd.concat(frames, ignore_index=True)

    for bound in ('gtfs_start_date', 'gtfs_end_date'):
        combined[f'{bound}_dt'] = pd.to_datetime(combined[bound])

    if 'trip_id' in combined.columns:
        # Keep everything but the trailing three characters of the id.
        combined['trip_id'] = combined['trip_id'].astype(str).str[:-3]
    if 'stop_id' in combined.columns:
        # Going through int first normalises the textual form of the id.
        combined['stop_id'] = combined['stop_id'].astype(int).astype(str)
    return combined

gtfs_path = os.path.join(os.getcwd(), 'data', 'cartagtfs')

# (start, end) date pairs of the static GTFS feeds used for each data year;
# load_gtfs reads each feed from the folder named after its start date.
gtfs_feeds_by_year = {
    '2019': [('2018-08-19', '2019-05-05'), ('2019-05-05', '2019-08-18'), ('2019-08-18', '2020-01-02')],
    '2020': [('2019-08-18', '2020-04-13'), ('2020-04-13', '2020-08-16'), ('2020-08-16', '2021-01-02')],
}
if year not in gtfs_feeds_by_year:
    # Fail loudly instead of continuing with an undefined (or stale) gtfs_list.
    raise ValueError(f"unsupported year {year!r}: expected '2019' or '2020'")
gtfs_list = gtfs_feeds_by_year[year]

df_stop_times = load_gtfs('stop_times.txt', gtfs_list, gtfs_path=gtfs_path)
df_stops = load_gtfs('stops.txt', gtfs_list, gtfs_path=gtfs_path)
df_trips = load_gtfs('trips.txt', gtfs_list, gtfs_path=gtfs_path)

# Only the parsed *_dt columns are kept; the textual date columns are dropped.
df_stop_times = df_stop_times.drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)
df_stops = df_stops.drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)
df_trips = df_trips.drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)

# The same record can appear in several feeds; keep='last' keeps the copy from
# the feed listed last in gtfs_list.
df_stop_times = df_stop_times.drop_duplicates(subset=['trip_id','arrival_time','departure_time','stop_id','stop_sequence'], keep='last')
df_stops = df_stops.drop_duplicates(subset=['stop_id'], keep='last')
df_trips = df_trips.drop_duplicates(subset=['trip_id'], keep='last')

# Give the APC join keys the same textual form as the GTFS keys.
apc_df['STOP_ID'] = apc_df['STOP_ID'].astype(int)
apc_df['STOP_ID'] = apc_df['STOP_ID'].astype(str)
apc_df['TRIP_KEY'] = apc_df['TRIP_KEY'].astype(str)

In [44]:
# join apc with ridecheck stops
# Attach street names and coordinates to every APC reading, then report how
# many readings found no matching stop.

file_path = os.path.join(os.getcwd(), 'data', 'ridecheckstops', 'STOPS.xlsx')
stop_cols = ['STOP_ID', 'MAIN_STREET', 'CROSS_STREET', 'LATITUDE', 'LONGITUDE']
apc_stops_df = pd.read_excel(file_path)[stop_cols]
# Same int -> str normalisation as the APC STOP_ID so the keys compare equal.
apc_stops_df['STOP_ID'] = apc_stops_df['STOP_ID'].astype(int).astype(str)

df = apc_df.merge(
    apc_stops_df,
    left_on='STOP_ID',
    right_on='STOP_ID',
    how='left',
    validate='many_to_one',
)

# y = all readings, x = readings that received coordinates from the join.
y = len(df)
x = int(df['LATITUDE'].notna().sum())
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of stops missing: {per_mis}")

8238702 8238702
Precentage of stops missing: 0.0


In [45]:
# join apc data with GTFS trips.txt
# Readings whose TRIP_KEY has no GTFS trip are counted, reported and removed.

df = df.merge(
    df_trips,
    left_on=['TRIP_KEY'],
    right_on=['trip_id'],
    how='left',
    validate='many_to_one',
)

# y = all readings, x = readings that matched a GTFS trip.
has_trip = df['trip_id'].notna()
y = len(df)
x = int(has_trip.sum())
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of trips missing: {per_mis}")

df = df[has_trip]

8238702 8230391
Precentage of trips missing: 0.0010087754114665149


In [46]:
# join apc data with GTFS stop_times.txt
# Note that this code block handles the fact that a stop can appear more than once in a single GTFS trip
#
# Approach: readings / stop_times rows whose (trip, stop) is unique are joined
# directly. Repeated (trip, stop) pairs are split into an "earlier" and a
# "later" half on both sides (ordered by SORT_ORDER for APC and by
# stop_sequence for GTFS) and the matching halves are joined with each other.

apc_key = ['SURVEY_DATE', 'TRIP_START_TIME', 'trip_id', 'STOP_ID']
gtfs_key = ['trip_id', 'stop_id']

# APC readings whose stop is visited more than once within the same trip run.
df_dup = df.loc[df.duplicated(subset=apc_key, keep=False)]
df_dup = df_dup.sort_values(by=['SORT_ORDER'])
# NOTE: duplicated(keep='first') flags every occurrence EXCEPT the first, so
# df_dup_first actually holds the later visit(s) and df_dup_last the earlier
# one(s). The stop_times split below follows the same convention, so the
# halves still line up.
df_dup_first = df_dup.loc[df_dup.duplicated(subset=apc_key, keep='first')]
df_dup_last = df_dup.loc[df_dup.duplicated(subset=apc_key, keep='last')]
print(f"len df_dup: {df_dup.shape[0]}, len df_dup_first: {df_dup_first.shape[0]}, len df_dup_last: {df_dup_last.shape[0]}")

df_stop_times_dup = df_stop_times.loc[df_stop_times.duplicated(subset=gtfs_key, keep=False)]
df_stop_times_nodup = df_stop_times.drop_duplicates(subset=gtfs_key, keep=False)
# sort_values returns a new frame; the result must be assigned, otherwise the
# first/last split below follows file order instead of stop_sequence.
df_stop_times_dup = df_stop_times_dup.sort_values(by=['stop_sequence'])
df_stop_times_dup_first = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=gtfs_key, keep='first')]
df_stop_times_dup_last = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=gtfs_key, keep='last')]
print(f"len df_stop_times_dup: {df_stop_times_dup.shape[0]} len df_stop_times_dup_first: {df_stop_times_dup_first.shape[0]}, len df_stop_times_dup_last: {df_stop_times_dup_last.shape[0]}, df_stop_times_nodup: {df_stop_times_nodup.shape[0]}")

# Straightforward case: (trip, stop) is unique on both sides.
print(f"length of df: {df.shape[0]}")
df1 = df.drop_duplicates(subset=apc_key, keep=False)
print(f"length of df after dropping all duplicates: {df1.shape[0]}")
df1 = df1.merge(df_stop_times_nodup, left_on=['TRIP_KEY', 'STOP_ID'], right_on=gtfs_key, how='left', validate="many_to_one", suffixes=(None, '_right'))
print(f"length of df after merging with df_stop_times: {df1.shape[0]}")

# Repeated stops: join each APC half with the corresponding stop_times half.
df_dup_first = df_dup_first.merge(df_stop_times_dup_first, left_on=['TRIP_KEY', 'STOP_ID'], right_on=gtfs_key, how='left', validate="many_to_one", suffixes=(None, '_right'))
df_dup_last = df_dup_last.merge(df_stop_times_dup_last, left_on=['TRIP_KEY', 'STOP_ID'], right_on=gtfs_key, how='left', validate="many_to_one", suffixes=(None, '_right'))
df_dup = pd.concat([df_dup_first, df_dup_last], ignore_index=True)
print(f"len df_dup_first: {df_dup_first.shape[0]} len df_dup_last: {df_dup_last.shape[0]} len df_dup: {df_dup.shape[0]}")

# Drop the '_right' copies created for columns present on both sides, plus any
# column that exists only in the repeated-stop frame, so both frames share the
# same columns before stacking.
drop_cols = []
for col in df_dup.columns:
    if (col not in df1.columns) or ("_right" in col):
        drop_cols.append(col)
df_dup = df_dup.drop(drop_cols, axis=1)
# errors='ignore': drop_cols may name columns that df1 does not have.
df1 = df1.drop(drop_cols, axis=1, errors='ignore')
df1 = pd.concat([df1, df_dup], ignore_index=True)


print(f"final df length: {df1.shape[0]}")

# y = all readings, x = readings that matched a stop_times row.
y = len(df1)
x = len(df1[~df1['stop_sequence'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of stop_times missing: {per_mis}")

# Readings without a stop_times match are removed.
df1 = df1[~df1['stop_sequence'].isnull()]

len df_dup: 968, len df_dup_first: 484, len df_dup_last: 484
len df_stop_times_dup: 42 len df_stop_times_dup_first: 21, len df_stop_times_dup_last: 21, df_stop_times_nodup: 255592
length of df: 8230391
length of df after dropping all duplicates: 8229423
length of df after merging with df_stop_times: 8229423
len df_dup_first: 484 len df_dup_last: 484 len df_dup: 968
final df length: 8230391
8230391 7678358
Precentage of stop_times missing: 0.06707251210786948


In [47]:
# Save the GTFS-joined readings as the step-2 intermediate file.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_merged_{year}.csv")
# Create the output folder when it is missing so the write cannot fail after
# the joins have already run.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df1.to_csv(out_path, index=False)

# 3. Reformat column names for final dataset

Inputs: cartaapc_merged_{year}.csv

Outputs: cartaapc_dashboard_{year}.csv

In [48]:
# Reload the merged readings from disk so this section can start from the
# saved file.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_merged_{year}.csv")
apc_df = pd.read_csv(out_path)
print(f"Number of rows in cartaapc_merged_{year}.csv: {len(apc_df)}")



Number of rows in cartaapc_merged_2019.csv: 7678358


In [49]:
# Keep only the columns used in the final dataset and give the APC fields
# lower-case names; rows with any missing value are then removed.
dashboard_cols = [
    'trip_id', 'arrival_time', 'STOP_ID', 'stop_sequence',
    'LATITUDE', 'LONGITUDE', 'route_id', 'direction_id', 'SURVEY_DATE',
    'PASSENGERS_ON', 'PASSENGERS_OFF', 'PASSENGERS_IN', 'DIRECTION_NAME', 'SERVICE_PERIOD',
]
new_names = {
    'PASSENGERS_ON': 'board_count',
    'PASSENGERS_OFF': 'alight_count',
    'PASSENGERS_IN': 'occupancy',
    'DIRECTION_NAME': 'direction_desc',
    'SERVICE_PERIOD': 'service_period',
    'LATITUDE': 'stop_lat',
    'LONGITUDE': 'stop_lon',
    'STOP_ID': 'stop_id',
}
apc_df = apc_df[dashboard_cols].rename(columns=new_names)

print(f"apc_df length before dropping null values: {len(apc_df)}")
apc_df = apc_df.dropna()
print(f"apc_df length after dropping null values: {len(apc_df)}")

# Displayed as the cell result: remaining null count per column.
apc_df.isna().sum()

apc_df length before dropping null values: 7678358
apc_df length after dropping null values: 7678358


trip_id           0
arrival_time      0
stop_id           0
stop_sequence     0
stop_lat          0
stop_lon          0
route_id          0
direction_id      0
SURVEY_DATE       0
board_count       0
alight_count      0
occupancy         0
direction_desc    0
service_period    0
dtype: int64

In [50]:
# format date and time fields and derive per-trip labels
apc_df['date'] = pd.to_datetime(apc_df['SURVEY_DATE'])
apc_df = apc_df.drop(columns=['SURVEY_DATE'])

# 'date' is turned into text only to build the combined date_time string; it is
# converted back to datetime at the end of this cell.
apc_df['date'] = apc_df['date'].astype(str)
apc_df['date_time'] = apc_df['date'] + " " + apc_df['arrival_time']

# Trip start time = smallest arrival_time recorded for the trip_id. The first
# row per trip must be taken from the frame sorted by arrival_time; taking it
# from the unsorted frame returns whichever reading happens to be stored first.
# arrival_time is text, so this ordering assumes zero-padded HH:MM:SS values.
sorted_by_time = apc_df.sort_values('arrival_time')
trip_start_time = sorted_by_time.drop_duplicates('trip_id', keep='first')
trip_start_time = trip_start_time[['trip_id', 'arrival_time']].rename(columns={'arrival_time': 'trip_start_time'})
apc_df = apc_df.merge(trip_start_time, on='trip_id', how='left', validate='many_to_one')
apc_df['trip_name'] = apc_df['trip_start_time'] + ' (trip ID: ' + apc_df['trip_id'].astype(str) + ')'

apc_df['date'] = pd.to_datetime(apc_df['date'])
# pandas numbering: Monday=0 ... Sunday=6
apc_df['day_of_week'] = apc_df['date'].dt.dayofweek


In [51]:
# Preview the first two rows of the final table.
apc_df.head(n=2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,alight_count,occupancy,direction_desc,service_period,date,date_time,trip_start_time,trip_name,day_of_week
0,139145,08:51:00,354,1.0,35.056167,-85.268713,16,0.0,0,0,0,OUTBOUND,Weekday,2019-11-01,2019-11-01 08:51:00,08:51:00,08:51:00 (trip ID: 139145),4
1,139145,08:54:59,505,2.0,35.056017,-85.28108,16,0.0,0,0,0,OUTBOUND,Weekday,2019-11-01,2019-11-01 08:54:59,08:51:00,08:51:00 (trip ID: 139145),4


In [52]:
# Save the final dashboard dataset.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_dashboard_{year}.csv")
# Create the output folder when it is missing so the final write cannot fail.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
apc_df.to_csv(out_path, index=False)