# CARTA APC Dataprep

Goal: clean and process CARTA APC data.

Directory structure:

    * data
        * cartaapc2019: all carta 2019 APC data
        * cartaapc2020: all carta 2020 APC data
        * cartagtfs: carta static GTFS for all years
        * ridecheckstops: stop information from ridecheck (used as a substitute because the GTFS stops.txt does not include all stops in the APC data)
    * output: output data files
    
This notebook generates 4 files:

* cartaapc_cleaned_{year}.csv: temporary file
* cartaapc_merged_{year}.csv: temporary file
* cartaapc_dashboard_{year}.csv: final output file
* cartaapc_dashboard.csv: final output (combined for 2019 and 2020)

In [1]:
import pandas as pd
import os
import zipfile
import dateparser
import swifter
import datetime

In [33]:
#file_path = os.path.join(os.getcwd(), 'data.zip')
#with zipfile.ZipFile(file_path,"r") as zip_ref:
#    zip_ref.extractall(os.getcwd())

In [2]:
# Display configuration: None removes pandas' column and row truncation limits,
# so displayed DataFrames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 1. Clean up the raw APC Data

We need to deal with some duplicates and erroneous data in the raw APC file.

Inputs: raw APC data

Outputs: 'cartaapc_cleaned_{year}.csv'

In [3]:
# Dataset year to process; later cells branch on the string values '2019' and '2020'.
year = '2020'

In [None]:
def load_raw_2019(year='2019'):
    """
    load and concatenate every raw APC file found in data/cartaapc{year}

    :param year: dataset year used to build the directory name (defaults to '2019',
                 so the function no longer depends on the notebook-level `year`)
    :return: a single DataFrame holding the rows of all files, re-indexed from 0
    :raises ValueError: if the directory contains no data files
    """
    apc_path = os.path.join(os.getcwd(), 'data', f"cartaapc{year}")

    # os.listdir returns entries in arbitrary order; sort them so the concatenation
    # (and the later keep='first' de-duplication) is reproducible between runs.
    # Hidden entries and sub-folders (e.g. .ipynb_checkpoints) are not data files.
    file_names = sorted(
        name for name in os.listdir(apc_path)
        if not name.startswith('.') and os.path.isfile(os.path.join(apc_path, name))
    )
    if not file_names:
        raise ValueError(f"no APC files found in {apc_path}")

    dfs = [pd.read_csv(os.path.join(apc_path, name), index_col=0) for name in file_names]
    apc_df = pd.concat(dfs, ignore_index=True)
    return apc_df


def load_raw_2020(year='2020'):
    """
    load the single raw APC export for January-June 2020

    :param year: dataset year used to build the directory name (defaults to '2020',
                 so the function no longer depends on the notebook-level `year`)
    :return: DataFrame read from chattanooga_apc_jan20_through_jun20.csv, using the
             file's first column as the index
    """
    apc_path = os.path.join(os.getcwd(), 'data', f"cartaapc{year}", 'chattanooga_apc_jan20_through_jun20.csv')
    apc_df = pd.read_csv(apc_path, index_col=0)
    return apc_df

# Load the raw APC data for the configured year.
if year == '2019':
    apc_df = load_raw_2019()
elif year == '2020':
    apc_df = load_raw_2020()
else:
    # Without this branch an unexpected year leaves apc_df undefined (or stale from an
    # earlier run), and the print below either fails or reports the wrong data.
    raise ValueError(f"unsupported year: {year!r} (expected '2019' or '2020')")
print(f"shape of apc_df: {apc_df.shape}")

  interactivity=interactivity, compiler=compiler, result=result)


Time and date values are in an improper format; the next cell block fixes the formatting of times and dates.

In [None]:
def fix_times(time):
    """
    convert time to HH:MM:SS format - removing the date (e.g. 12/30/1899) in front of non-null times

    :param time: a time value; expected to be a 10 character date followed by the time of day
    :return: the time of day as an HH:MM:SS string, or the input unchanged when it is null
    """
    # str() of a float NaN is 'nan', so the old comparison with 'NaN' never matched and
    # null times were turned into empty strings; test for nulls directly instead.
    # The literal 'NaN' is still passed through for backward compatibility.
    if pd.isna(time) or str(time) == 'NaN':
        return time

    # Drop the 10 character date prefix, then the separator whitespace (the old slice
    # [10:19] kept it as a leading space), and keep hours:minutes:seconds.
    fixed_time = str(time)[10:].strip()[:8]
    return fixed_time


# Strip the date prefix from every time column. Series.apply touches only the column
# being fixed, instead of building a row object for every record as
# DataFrame.apply(axis=1) does; the resulting values are the same.
for time_col in ['TRIP_START_TIME','TIME_SCHEDULED','TIME_ACTUAL_ARRIVE','TIME_ACTUAL_DEPART']:
    print(time_col)
    apc_df[time_col] = apc_df[time_col].apply(fix_times)

#convert survey_date to a datetime object and create a new 'DATE' column.
apc_df['DATE'] = pd.to_datetime(apc_df['SURVEY_DATE'])

There are some improperly formatted names for DIRECTION_NAME. Fix this so that DIRECTION_NAME is either OUTBOUND or INBOUND.

In [None]:
# Show the raw labels first so unexpected spellings are visible in the output.
print(f"unique DIRECTION_NAME vals: {apc_df['DIRECTION_NAME'].unique()}")

print("DIRECTION_NAME should be either INBOUND or OUTBOUND")

# Known bad labels and the value each one is rewritten to.
direction_fixes = {'OUTYBOUND': 'OUTBOUND', '0': 'OUTBOUND', '1': 'INBOUND'}
for bad_label, good_label in direction_fixes.items():
    apc_df.loc[apc_df['DIRECTION_NAME'] == bad_label, ['DIRECTION_NAME']] = good_label

# Show the labels again to confirm only the two expected values remain.
print(f"unique DIRECTION_NAME vals: {apc_df['DIRECTION_NAME'].unique()}")

Drop all duplicate rows. Duplicates are defined by: ['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','SORT_ORDER']. Also remove shuttle routes.

In [None]:
print(f"Number of readings in raw apc_df: {apc_df.shape[0]}")

# Columns that together identify one reading; only the first row per key is kept.
reading_key = ['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','SORT_ORDER']
apc_df = apc_df.drop_duplicates(subset=reading_key, keep='first')
print(f"Number of readings after dropping duplicates: {apc_df.shape[0]}")

# Routes removed from the dataset.
excluded_routes = [33, 34, 14]
apc_df = apc_df[~apc_df['ROUTE_NUMBER'].isin(excluded_routes)]
print(f"Number of readings after dropping routes 33, 34, 14: {apc_df.shape[0]}")

In [None]:
# Write the cleaned APC data. The output folder is created first so that a fresh
# checkout without an 'output' directory does not fail on to_csv.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_cleaned_{year}.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
apc_df.to_csv(out_path, index=False)

# 2. GTFS Join

Join the CARTA APC data with GTFS and ridecheck stops

Inputs: cartaapc_cleaned_{year}.csv, GTFS (trips.txt, stop_times.txt), ridecheck

Outputs: cartaapc_merged_{year}.csv

In [None]:
# Re-load cartaapc_cleaned_{year}.csv and restore DATE as a datetime column
# (the CSV round trip stores it as text).
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_cleaned_{year}.csv")
apc_df = pd.read_csv(out_path).assign(DATE=lambda frame: pd.to_datetime(frame['DATE']))
print(f"Number of rows in cartaapc_cleaned_{year}.csv: {len(apc_df)}")

Load trips.txt, stop_times.txt and stops.txt from GTFS.

In [None]:
# load GTFS (df_stop_times, df_stops, df_trips)

def load_gtfs(file_name, gtfs_list, gtfs_path):
    """
    read one GTFS file from several feed folders and stack the results

    :param file_name: name of the GTFS file to read from each feed folder (e.g. 'trips.txt')
    :param gtfs_list: sequence of (start_date, end_date) pairs; the start date is also the
                      name of the feed's folder under gtfs_path
    :param gtfs_path: directory that contains one folder per feed
    :return: concatenated DataFrame with gtfs_start_date / gtfs_end_date columns (as given)
             and *_dt datetime versions; trip_id (if present) is a string with its last three
             characters removed, stop_id (if present) is an integer rendered as a string
    """
    frames = [
        pd.read_csv(os.path.join(gtfs_path, period[0], file_name), index_col=False)
          .assign(gtfs_start_date=period[0], gtfs_end_date=period[1])
        for period in gtfs_list
    ]
    combined = pd.concat(frames, ignore_index=True)

    # datetime copies of the validity bounds
    for bound in ('gtfs_start_date', 'gtfs_end_date'):
        combined[f"{bound}_dt"] = pd.to_datetime(combined[bound])

    columns = combined.columns
    if 'trip_id' in columns:
        # cut the trailing three characters off the textual trip id
        combined['trip_id'] = combined['trip_id'].astype(str).str[:-3]
    if 'stop_id' in columns:
        combined['stop_id'] = combined['stop_id'].astype(int).astype(str)
    return combined

gtfs_path = os.path.join(os.getcwd(), 'data', 'cartagtfs')

# Each tuple is (start, end); load_gtfs uses the start value as the feed's folder name.
if year == '2019':
    gtfs_list = [('2018-08-19', '2019-05-05'), ('2019-05-05', '2019-08-18'), ('2019-08-18', '2020-01-02')]
elif year == '2020':
    gtfs_list = [('2019-08-18', '2020-04-13'), ('2020-04-13', '2020-08-16'), ('2020-08-16', '2021-01-02')]
else:
    # Previously an unexpected year fell through and failed later with a NameError on gtfs_list.
    raise ValueError(f"unsupported year: {year!r} (expected '2019' or '2020')")

# For each table: load all feeds, drop the textual validity columns (the *_dt versions stay),
# then keep the row from the last listed feed whenever the same key appears in several feeds.
df_stop_times = (
    load_gtfs('stop_times.txt', gtfs_list, gtfs_path=gtfs_path)
    .drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)
    .drop_duplicates(subset=['trip_id','arrival_time','departure_time','stop_id','stop_sequence'], keep='last')
)
df_stops = (
    load_gtfs('stops.txt', gtfs_list, gtfs_path=gtfs_path)
    .drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)
    .drop_duplicates(subset=['stop_id'], keep='last')
)
df_trips = (
    load_gtfs('trips.txt', gtfs_list, gtfs_path=gtfs_path)
    .drop(['gtfs_start_date', 'gtfs_end_date'], axis=1)
    .drop_duplicates(subset=['trip_id'], keep='last')
)

# Bring the APC join keys to the same string form as the GTFS keys produced by load_gtfs.
apc_df['STOP_ID'] = apc_df['STOP_ID'].astype(int).astype(str)
apc_df['TRIP_KEY'] = apc_df['TRIP_KEY'].astype(str)

We found many issues with stops that were in the APC data but not the stops.txt. We had Philip give us a file called 'STOPS.xlsx' which was generated from APC and includes all stops. Therefore rather than joining the APC data with stops.txt, we merge it with STOPS.xlsx.

In [None]:
# join apc with ridecheck stops: attach street names and coordinates to every APC row

file_path = os.path.join(os.getcwd(), 'data', 'ridecheckstops', 'STOPS.xlsx')
stop_columns = ['STOP_ID', 'MAIN_STREET', 'CROSS_STREET', 'LATITUDE', 'LONGITUDE']
apc_stops_df = pd.read_excel(file_path)[stop_columns]
# same integer-then-string key format as apc_df['STOP_ID']
apc_stops_df['STOP_ID'] = apc_stops_df['STOP_ID'].astype(int).astype(str)

df = apc_df.merge(apc_stops_df, on='STOP_ID', how='left', validate='many_to_one')

# rows without a LATITUDE found no matching stop
y = len(df)
x = int(df['LATITUDE'].notna().sum())
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of stops missing: {per_mis}")

Now join APC data with GTFS trips.txt

In [None]:
# join apc data with GTFS trips.txt (APC TRIP_KEY <-> GTFS trip_id)

df = df.merge(df_trips, left_on=['TRIP_KEY'], right_on=['trip_id'], how='left', validate='many_to_one')

# rows whose trip_id is still null found no matching GTFS trip
has_trip = df['trip_id'].notna()
y = len(df)
x = int(has_trip.sum())
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of trips missing: {per_mis}")

# keep only the rows that matched a GTFS trip
df = df[has_trip]

Now join APC with stop_times.txt. One issue is that a straight many-to-one left join (APC & stop_times.txt) will not work because for some trips the same stop is visited twice. Therefore we have to do some processing to ensure we have a valid join.

In [None]:
# join apc data with GTFS stop_times.txt
# Note that this code block handles the fact that a stop can appear more than once in a single GTFS trip

# APC rows whose (date, trip start time, trip, stop) key occurs more than once.
df_dup = df.loc[df.duplicated(subset=['SURVEY_DATE', 'TRIP_START_TIME', 'trip_id', 'STOP_ID'], keep=False)]
# Order by SORT_ORDER so that "first"/"last" below follow the APC ordering of the visits.
df_dup = df_dup.sort_values(by=['SORT_ORDER'])
# duplicated(keep='last') flags every occurrence except the last one; keep='first' flags every
# occurrence except the first one. With exactly two visits these are the first and the second visit.
# NOTE(review): with three or more visits the middle rows would land in both frames -- confirm this cannot occur.
df_dup_first = df_dup.loc[df_dup.duplicated(subset=['SURVEY_DATE', 'TRIP_START_TIME', 'trip_id', 'STOP_ID'], keep='last')]
df_dup_last = df_dup.loc[df_dup.duplicated(subset=['SURVEY_DATE', 'TRIP_START_TIME', 'trip_id', 'STOP_ID'], keep='first')]
print(f"len df_dup: {df_dup.shape[0]}, len df_dup_first: {df_dup_first.shape[0]}, len df_dup_last: {df_dup_last.shape[0]}")

# Same split for stop_times: (trip_id, stop_id) pairs that occur once (nodup) versus more than
# once (dup), the latter ordered by stop_sequence and divided into "first" and "last" visits.
df_stop_times_dup = df_stop_times.loc[df_stop_times.duplicated(subset=['trip_id', 'stop_id'], keep=False)]
df_stop_times_nodup = df_stop_times.drop_duplicates(subset=['trip_id', 'stop_id'], keep=False)
df_stop_times_dup = df_stop_times_dup.sort_values(by=['stop_sequence'])
df_stop_times_dup_first = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=['trip_id', 'stop_id'], keep='last')]
df_stop_times_dup_last = df_stop_times_dup.loc[df_stop_times_dup.duplicated(subset=['trip_id', 'stop_id'], keep='first')]
print(f"len df_stop_times_dup: {df_stop_times_dup.shape[0]} len df_stop_times_dup_first: {df_stop_times_dup_first.shape[0]}, len df_stop_times_dup_last: {df_stop_times_dup_last.shape[0]}, df_stop_times_nodup: {df_stop_times_nodup.shape[0]}")

# APC rows with a unique key are joined against the stop_times rows whose (trip_id, stop_id) is unique.
# how='left' keeps unmatched APC rows (their stop_times columns are null); overlapping column names
# coming from stop_times get the '_right' suffix.
print(f"length of df: {df.shape[0]}")
df1 = df.drop_duplicates(subset=['SURVEY_DATE', 'TRIP_START_TIME', 'trip_id', 'STOP_ID'], keep=False)
print(f"length of df after dropping all duplicates: {df1.shape[0]}")
df1 = df1.merge(df_stop_times_nodup, left_on=['TRIP_KEY', 'STOP_ID'], right_on=['trip_id', 'stop_id'], how='left', validate="many_to_one", suffixes=(None, '_right'))
print(f"length of df after merging with df_stop_times: {df1.shape[0]}")

# Repeated visits: first APC visit <-> first stop_times visit, last <-> last.
# validate="many_to_one" makes pandas raise if a (trip_id, stop_id) pair is repeated on the stop_times side.
df_dup_first = df_dup_first.merge(df_stop_times_dup_first, left_on=['TRIP_KEY', 'STOP_ID'], right_on=['trip_id', 'stop_id'], how='left', validate="many_to_one", suffixes=(None, '_right'))
df_dup_last = df_dup_last.merge(df_stop_times_dup_last, left_on=['TRIP_KEY', 'STOP_ID'], right_on=['trip_id', 'stop_id'], how='left', validate="many_to_one", suffixes=(None, '_right'))
df_dup = pd.concat([df_dup_first, df_dup_last], ignore_index=True)
print(f"len df_dup_first: {df_dup_first.shape[0]} len df_dup_last: {df_dup_last.shape[0]} len df_dup: {df_dup.shape[0]}")

# Collect the columns to discard: anything df_dup has that df1 lacks, plus the '_right' duplicates.
# NOTE(review): a column that is missing from df1 would make df1.drop(...) below raise KeyError;
# this presumably never triggers because both frames come from merges with the same column sets.
drop_cols = []
for col in df_dup.columns:
    if (col not in df1.columns) or ("_right" in col):
        drop_cols.append(col)
df_dup = df_dup.drop(drop_cols, axis=1)
df1 = df1.drop(drop_cols, axis=1)
# Recombine single-visit and repeated-visit rows into one frame.
df1 = pd.concat([df1, df_dup], ignore_index=True)


print(f"final df length: {df1.shape[0]}")

# Rows with a null stop_sequence found no stop_times match; report their share, then drop them.
y = len(df1)
x = len(df1[~df1['stop_sequence'].isnull()])
print(y, x)
per_mis = (y-x)/y
print(f"Precentage of stop_times missing: {per_mis}")

df1 = df1[~df1['stop_sequence'].isnull()]

In [None]:
# Write the merged APC/GTFS data. The output folder is created first so that a fresh
# checkout without an 'output' directory does not fail on to_csv.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_merged_{year}.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
df1.to_csv(out_path, index=False)

# 3. Reformat column names for final dataset

Inputs: cartaapc_merged_{year}.csv

Outputs: cartaapc_dashboard_{year}.csv

In [None]:
# Re-load the merged APC/GTFS file for this year.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_merged_{year}.csv")
apc_df = pd.read_csv(out_path)
print(f"Number of rows in cartaapc_merged_{year}.csv: {len(apc_df)}")

Reformat column names, drop unnecessary columns and drop null values. Note that at this stage there should not be any null values.

In [None]:
# change column names and drop null values

# Columns carried into the dashboard file, in output order.
dashboard_columns = [
    'trip_id', 'arrival_time', 'STOP_ID', 'stop_sequence',
    'LATITUDE', 'LONGITUDE', 'route_id', 'direction_id', 'SURVEY_DATE',
    'PASSENGERS_ON', 'PASSENGERS_OFF', 'PASSENGERS_IN', 'DIRECTION_NAME', 'SERVICE_PERIOD',
]

# APC column name -> dashboard column name
new_names = {
    'PASSENGERS_ON': 'board_count',
    'PASSENGERS_OFF': 'alight_count',
    'PASSENGERS_IN': 'occupancy',
    'DIRECTION_NAME': 'direction_desc',
    'SERVICE_PERIOD': 'service_period',
    'LATITUDE': 'stop_lat',
    'LONGITUDE': 'stop_lon',
    'STOP_ID': 'stop_id',
}

apc_df = apc_df[dashboard_columns].rename(columns=new_names)

print(f"apc_df length before dropping null values: {apc_df.shape[0]}")
apc_df = apc_df.dropna()
print(f"apc_df length after dropping null values: {apc_df.shape[0]}")

# remaining null count per column
apc_df.isna().sum()

Format date and time fields, add in day_of_week.

In [None]:
# format date and time fields
apc_df['date'] = pd.to_datetime(apc_df['SURVEY_DATE'])
apc_df = apc_df.drop(columns=['SURVEY_DATE'])

# 'YYYY-MM-DD' text so it can be concatenated with the arrival time
apc_df['date'] = apc_df['date'].astype(str)
apc_df['date_time'] = apc_df['date'] + " " + apc_df['arrival_time']

# Trip start time = earliest arrival_time recorded for the trip_id.
# The sorted frame was previously built but never used, so the start time came from
# whichever row of the trip happened to be first in file order.
# NOTE(review): the sort is on the text value, so it assumes zero-padded HH:MM:SS times -- confirm.
sorted_by_time = apc_df.sort_values('arrival_time')
trip_start_time = sorted_by_time.drop_duplicates('trip_id', keep='first')
trip_start_time = trip_start_time[['trip_id', 'arrival_time']]
trip_start_time.columns = ['trip_id', 'trip_start_time']
apc_df = apc_df.merge(trip_start_time, on='trip_id', how='left')
apc_df['trip_name'] = apc_df['trip_start_time'] + ' (trip ID: ' + apc_df['trip_id'].astype(str) + ')'

# back to datetime to derive the weekday number (Monday=0 ... Sunday=6)
apc_df['date'] = pd.to_datetime(apc_df['date'])
apc_df['day_of_week'] = apc_df['date'].dt.dayofweek


In [None]:
def fix_date_time(date_time):
    """
    normalise a 'YYYY-MM-DD HH:MM:SS' string whose hour is 24 or more

    Hours of 24 and above are rolled over onto the following calendar day(s),
    e.g. '2019-11-01 25:10:00' becomes '2019-11-02 01:10:00'. Previously only the
    hour "24" was handled, so 25:xx:xx and later produced invalid timestamps.
    Values with an hour below 24 are returned unchanged.

    :param date_time: string made of a date and a time separated by a single space
    :return: string in the same layout with the hour in the range 00-23
    :raises ValueError: if the string does not have the expected layout
    """
    d, t = date_time.split(" ")
    h, m, s = t.split(":")
    year, month, day = d.split("-")

    extra_days, hour = divmod(int(h), 24)
    if extra_days == 0:
        # regular time of day: keep the text exactly as it came in
        return f"{d} {h}:{m}:{s}"

    rolled = datetime.date(int(year), int(month), int(day)) + datetime.timedelta(days=extra_days)
    return f"{rolled.strftime('%Y-%m-%d')} {hour:02d}:{m}:{s}"

# trip_name is not part of the dashboard file
apc_df = apc_df.drop(['trip_name'], axis=1)
# keep the survey date before 'date' is overwritten with the normalised value
apc_df['trip_date'] = apc_df['date']

# normalise the timestamp text, then split it back into its date and time parts
apc_df['date_time'] = apc_df['date_time'].apply(fix_date_time)
date_and_clock = apc_df['date_time'].str.split(" ")
apc_df['arrival_time'] = date_and_clock.str[1]
apc_df['date'] = date_and_clock.str[0]
apc_df['hour'] = apc_df['arrival_time'].str.split(":").str[0].astype(int)

# final column types
apc_df = apc_df.astype({
    'trip_id': int,
    'stop_id': int,
    'stop_lat': float,
    'stop_lon': float,
    'stop_sequence': int,
    'direction_id': int,
    'board_count': int,
    'alight_count': int,
    'occupancy': int,
    'day_of_week': int,
    'hour': int,
})

In [None]:
# Preview the first two rows of the per-year dashboard frame.
apc_df.head(2)

In [None]:
# Write the per-year dashboard file. The output folder is created first so that a fresh
# checkout without an 'output' directory does not fail on to_csv.
out_path = os.path.join(os.getcwd(), 'output', f"cartaapc_dashboard_{year}.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
apc_df.to_csv(out_path, index=False)

# 4. Combine 2019 and 2020 data into one CSV

In [7]:
# Read both per-year dashboard files and stack them, 2019 first.
file_path = os.path.join(os.getcwd(), 'output', "cartaapc_dashboard_2019.csv")
df_2019 = pd.read_csv(file_path)

file_path = os.path.join(os.getcwd(), 'output', "cartaapc_dashboard_2020.csv")
df_2020 = pd.read_csv(file_path)

apc_df = pd.concat([df_2019, df_2020], ignore_index=True)

print(f"Length of 2019: {len(df_2019)}, 2020: {len(df_2020)} Total: {len(apc_df)}")

In [6]:
# Re-apply the column types after the CSV round trip and concatenation.
integer_columns = ['trip_id', 'stop_id', 'stop_sequence', 'direction_id',
                   'board_count', 'alight_count', 'occupancy', 'day_of_week', 'hour']
float_columns = ['stop_lat', 'stop_lon']

column_types = {name: int for name in integer_columns}
column_types.update({name: float for name in float_columns})
apc_df = apc_df.astype(column_types)

In [47]:
# Preview the first two rows of the combined 2019 + 2020 frame.
apc_df.head(2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_lat,stop_lon,route_id,direction_id,board_count,alight_count,occupancy,direction_desc,service_period,date,date_time,trip_start_time,day_of_week,trip_date,hour
0,139145,08:51:00,354,1,35.056167,-85.268713,16,0,0,0,0,OUTBOUND,Weekday,2019-11-01,2019-11-01 08:51:00,08:51:00,4,2019-11-01,8
1,139145,08:54:59,505,2,35.056017,-85.28108,16,0,0,0,0,OUTBOUND,Weekday,2019-11-01,2019-11-01 08:54:59,08:51:00,4,2019-11-01,8


In [48]:
# Write the combined dashboard file. The output folder is created first so that a fresh
# checkout without an 'output' directory does not fail on to_csv.
out_path = os.path.join(os.getcwd(), 'output', "cartaapc_dashboard.csv")
os.makedirs(os.path.dirname(out_path), exist_ok=True)
apc_df.to_csv(out_path, index=False)