In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Load Chattanooga APC Data 

This data comes from Teams. You can find it at: General > APC > CARTA > chattanooga_apc_jan20_through_jun20.csv


In [2]:
apc_df = pd.read_csv('chattanooga_apc_jan20_through_jun20.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [3]:
apc_df.shape[0]

3471268

# Raw APC data processing

We need to deal with some duplicates and erroneous data in the raw APC file. 

In [4]:
#change 'outybound' to outbound (typo here)
apc_df.loc[apc_df['DIRECTION_NAME'] == 'OUTYBOUND', ['DIRECTION_NAME']] = 'OUTBOUND'

In [5]:
#drop all duplicates based on trip_key, survey_date, direction_name, stop_id, and trip_start_time
#later append this with subset of dups we want to keep
apc_no_dups = apc_df.drop_duplicates(['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','TRIP_START_TIME'],keep=False)

In [6]:
#get all duplicates
apc_dups = apc_df.loc[apc_df.duplicated(subset=['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','TRIP_START_TIME'], keep=False)]

In [7]:
apc_dups.shape[0]

4942

In [8]:
#drop routes 33, 34, 14
apc_dups_dropped_routes = apc_dups[(apc_dups.ROUTE_NUMBER != 33) & (apc_dups.ROUTE_NUMBER != 34)
                                   & (apc_dups.ROUTE_NUMBER != 14)]

In [9]:
apc_dups_dropped_routes.shape[0]

2774

In [10]:
#convert survey_date to a datetime object and create a new 'Date' column. 
apc_no_dups['DATE'] = pd.to_datetime(apc_no_dups['SURVEY_DATE'])
apc_dups_dropped_routes['DATE'] = pd.to_datetime(apc_dups_dropped_routes['SURVEY_DATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
def does_trip_date_match (trip_key, date) :
    """
    check if data matches APC data based on trip_key and survey_date
    
    :param: trip_key, date of trip
    :return: false if no match, true if matched
    """
    df = apc_no_dups.loc[(apc_no_dups['TRIP_KEY'] == trip_key) & (apc_no_dups['DATE'] == date)]
    return df.shape[0] != 0

In [12]:
apc_dups_dropped_routes['trip_date_match'] = apc_dups_dropped_routes.apply(lambda row: does_trip_date_match(row['TRIP_KEY'], 
                                                            row['DATE']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
#need to drop dups which are unique trip - date combinations that don't occur in the APC data without duplicates.
dups_notin_apc = apc_dups_dropped_routes.loc[apc_dups_dropped_routes['trip_date_match'] == False]

In [14]:
dups_notin_apc.shape[0]

2570

In [15]:
#get the data frame of duplicated values trip-date combinations that match the APC without duplicates.
dups_in_apc = apc_dups_dropped_routes.loc[apc_dups_dropped_routes['trip_date_match'] == True]

After some examination of these duplicates, we want to keep all of these duplicates even though this trip id has
duplicates on stop id. On this trip, stops 292-293-294 are visited twice (sort order differs for each set of repeated
stops), so we must keep all these duplicates. 

In [16]:
dups_in_apc.shape[0]

204

# Extra processing steps

These commands were not run, but serve as extra processing steps to further eliminate duplicates. 
1)Split the dups_in_apc frame into 2 parts - one where board and alight counts are the same, and one where board-alight is different. For duplicates where alight and board are the same, drop the duplicates (reduce to 1).
2)For the data frame where board and alight are different, check which duplicate to keep depending on which one of the combination produces the least amount of negative calculated occupancies.

In [17]:
#for duplicates where alight and board are the same, drop dups. later will merge this with apc_no_dups.
board_alight_same = dups_in_apc.loc[dups_in_apc.duplicated(subset=['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME',
                                                                   'STOP_ID','TRIP_START_TIME', 'PASSENGERS_ON', 
                                                                   'PASSENGERS_OFF'], keep='first')]

In [19]:
#get all duplicates where board and alight are not the same. we want to see what is happening with negative occupancies.
board_alight_diff = dups_in_apc.drop_duplicates(['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID',
                                                 'TRIP_START_TIME', 'PASSENGERS_ON', 'PASSENGERS_OFF'],keep=False)

In [21]:
def calc_occ (df) :
    
    """
    calculate occupancy at each stop along a route
    
    :param: dataframe with board/alight values for all stops along a single trip
    :return: data frame with calculated occupancies column
    """
    
    tmp = df.copy()
    # calc initial load as the passengers in at the first stop
    tmp['initial_load'] = tmp['PASSENGERS_IN'].iloc[0]
    
    tmp['initial_load'] = pd.to_numeric(tmp['initial_load'], errors='coerce')
    tmp['PASSENGERS_ON'] = pd.to_numeric(tmp['PASSENGERS_ON'], errors='coerce')
    tmp['PASSENGERS_OFF'] = pd.to_numeric(tmp['PASSENGERS_OFF'], errors='coerce')

    # calc occupancy net change
    tmp['occupancy_net_change'] = tmp['PASSENGERS_ON'] - tmp['PASSENGERS_OFF']

    # calc cumulative sum in occupancy net change as an intermediate step
    tmp_sum_df = pd.DataFrame(tmp['occupancy_net_change'].cumsum())
    tmp_sum_df.columns = ['tmp_sum']

    # merge tmp sum (cumulative sum) into tmp
    tmp = tmp.merge(tmp_sum_df, left_index=True, right_index=True)

    # calc occupancy for a particular stop
    tmp['calc_occupancy'] = tmp['tmp_sum'] + tmp['initial_load']

    return tmp.drop(columns=['tmp_sum', 'occupancy_net_change'])

In [22]:
def number_neg_occ (trip_key, survey_date, stop_id, sort_order, passengers_on, passengers_off, passengers_in) :
    
    
    """
    calculate number of negative occupancies for a given row
    
    :param: trip_key, date, stop_id, stop_order, passengers_on, passengers_off, passengers_in for a row
    :return: number of negative occupancies
    """
        
    #df = all stops on a trip. matching to APC data set without any dups on trip_key and survey_date.
    df = apc_no_dups.loc[(apc_no_dups['TRIP_KEY'] == trip_key) & (apc_no_dups['DATE'] == survey_date)]     
    
    #drop all columns except alight, board, stop sequence, stop id 
    df1 = df.loc[ : , ('PASSENGERS_ON', 'PASSENGERS_OFF', 'PASSENGERS_IN', 'SORT_ORDER', 'STOP_ID', 'DATE')]
    
    #df2 = data frame of row 
    d = {'PASSENGERS_ON' : [passengers_on], 'PASSENGERS_OFF' : [passengers_off], 'PASSENGERS_IN' : [passengers_in],
         'SORT_ORDER' : [sort_order], 'STOP_ID': [stop_id], 'DATE': [survey_date]}
    df2 = pd.DataFrame(data = d)
    
    #append 
    df3 = df1.append(df2)
    df4 = df3.sort_values(['SORT_ORDER'],ascending=True)
    
    #calc occupancy
    calc_occ_df = calc_occ(df4)
    
    #count how many negatives are in the calculated occupancy column
    return calc_occ_df.loc[calc_occ_df['calc_occupancy'] < 0].shape[0]

In [23]:
board_alight_diff['number_neg_occ'] = board_alight_diff.apply(lambda row: number_neg_occ(row['TRIP_KEY'], 
                                                            row['DATE'],                            
                                                            row['STOP_ID'], row['SORT_ORDER'], 
                                                            row['PASSENGERS_ON'], row['PASSENGERS_OFF'], 
                                                            row['PASSENGERS_IN']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [24]:
def is_equal (number_neg_occ, minimum) :
    
    """
    check if number of neg occupancies is equal to the minimum of this for a particular trip-date-stopID combination
    
    :param: number of negative occupancy, minimum negative occupancy of a row
    :return: boolean if parameters are equal
    """
    
    return number_neg_occ == minimum

In [25]:
def drop_highest_neg_occ (df) :
    
    """
    keep row of a trip-date-stopID combination with the lowest number of negative occupancies
    
    :param: data frame to drop the highest negative occupancies of per trip-date-stopID combination
    :return: data frame with entries that have the lowest number of neg occupancies per trip-date-stopID combination
    """

    grouped = df.groupby(['TRIP_KEY', 'STOP_ID', 'DATE'])['number_neg_occ']

    df1 = df.assign(min=grouped.transform(min))

    #mark true if the min matches number_neg_occ, else false
    df1['keep'] = df1.apply(lambda row: is_equal(row['number_neg_occ'], row['min']), axis=1)
    
    #filter data frame where keep=true
    df2 = df1.loc[df1['keep'] == True]
    
    #remove duplicates
    return df2.drop_duplicates(['TRIP_KEY','DATE', 'STOP_ID'],keep='first')


In [26]:
board_alight_diff2 = drop_highest_neg_occ (board_alight_diff)

In [27]:
board_alight_diff2.shape[0]

2

In [28]:
#drop columns
board_alight_diff2 = board_alight_diff2.drop(columns=['DATE', 'trip_date_match', 'number_neg_occ', 'min', 'keep'])

In [31]:
board_alight_same = board_alight_same.drop(columns=['DATE', 'trip_date_match'])

In [32]:
#append data frame (1) dropped duplicates where board and alight counts were the same, and where trip-dates matched to APC
#with (2) cleaned duplicates where board and alight counts were different, and where trip-dates matched to APC
a1 = board_alight_same.append(board_alight_diff2)

# Append the non-duplicated APC data with the duplicates we want to keep

In [19]:
#drop 'Date' column
apc_no_dups = apc_no_dups.drop(columns=['DATE'])
dups_in_apc = dups_in_apc.drop(columns=['DATE'])

In [20]:
#Append 1) APC data without any duplicates with 2) duplicates we want to keep
apc_cleaned = apc_no_dups.append(dups_in_apc)

In [21]:
apc_cleaned.shape[0]

3466530

In [22]:
apc_no_dups.shape[0] + dups_in_apc.shape[0]

3466530

In [23]:
#Get the difference of the cleaned version and original version with rows that are not in the cleaned version but are in the full apc data frame.
diff = apc_cleaned.merge(apc_df, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']

In [24]:
diff.shape[0]

4738

In [25]:
apc_df.shape[0] - apc_cleaned.shape[0] == diff.shape[0]

True

In [29]:
diff = diff.drop(columns=['_merge', 'trip_date_match'])

In [30]:
diff.head()

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,BRANCH,SERVICE_CODE,SERVICE_TYPE,SERVICE_CLASS,SERVICE_MODE,TRIP_START_TIME,TIME_PERIOD,SERVICE_PERIOD,TRIP_NUMBER,TRIP_KEY,BLOCK_NUMBER,BLOCK_KEY,BLOCK_ID,BLOCK_NAME,RUN_NUMBER,RUN_KEY,VEHICLE_NUMBER,VEHICLE_DESCRIPTION,VEHICLE_SEATS,REVENUE_START,REVENUE_END,REVENUE_NET,ODOM_START,ODOM_END,ODOM_NET,CONDITION_NUMBER,CHECKER_NAME,GARAGE_NAME,DIVISION_NAME,OPERATOR_ID,FAREBOX,MATCH_COUNT,COMMENTS,SORT_ORDER,STOP_ID,MAIN_CROSS_STREET,TRAVEL_DIRECTION,TIMEPOINT,SEGMENT_MILES,TIME_SCHEDULED,TIME_ACTUAL_ARRIVE,TIME_ACTUAL_DEPART,DWELL_TIME,RUNNING_TIME_ACTUAL,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,PASSENGERS_SPOT,WHEELCHAIRS,BICYCLES,MATCH_DISTANCE,TIMEPOINT_MILES,NON_STUDENT_FARE,CHILD,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP
3466530,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,10,100075,SPN,X,-1,,1/0/00 20:22,1/0/00 20:23,1/0/00 20:23,,10.1,0,0,0,,0,0,323.0,1.56,,,,,0,,,1
3466531,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,20,1566,SHUTTLE PARK NORTH - INTERNAL,E,0,0.08,,1/0/00 20:23,1/0/00 20:23,,,0,0,0,,0,0,323.0,,,,,,0,,,1
3466532,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,30,793,BROAD/3RD,S,0,0.1,,1/0/00 20:23,1/0/00 20:23,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2
3466533,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,40,1537,BROAD/4TH,S,0,0.09,,1/0/00 20:25,1/0/00 20:25,0.0,,0,0,0,,0,0,1.0,,,,,,0,,,2
3466534,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,50,794,BROAD/5TH,S,0,0.09,,1/0/00 20:26,1/0/00 20:26,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2


In [31]:
#sort cleaned data
cleaned_chattanooga_apc_jan20_through_jun20 = apc_cleaned.sort_values(['TRIP_KEY', 'SURVEY_DATE', 'STOP_ID', 'SORT_ORDER'], ascending=False)

In [32]:
#sort difference data
diff_chattanooga_apc_jan20_through_jun20 = diff.sort_values(['TRIP_KEY', 'SURVEY_DATE', 'STOP_ID', 'SORT_ORDER'], ascending=False)

In [33]:
cleaned_chattanooga_apc_jan20_through_jun20.to_csv('cleaned_chattanooga_apc_jan20_through_jun20.csv')

In [34]:
diff_chattanooga_apc_jan20_through_jun20.to_csv('diff_chattanooga_apc_jan20_through_jun20.csv')

# Start the GTFS Join - load GTFS Data

This data comes from our GitHub repository: https://github.com/hdemma/transit-hub/tree/develop/data-connect/mongo-connect/data-fusion/carta-data-join/data/GTFS. 

In [39]:
apc_df = pd.read_csv('cleaned_chattanooga_apc_jan20_through_jun20.csv', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  mask |= (ar1 == a)


In [35]:
# This feed went into effect sometime after 2019-08-18
aug19_trips_df = pd.read_csv('carta_gtfs_august_2019_trips.txt')
aug19_stops_df = pd.read_csv('carta_gtfs_august_2019_stops.txt')
aug19_stop_times_df = pd.read_csv('carta_gtfs_august_2019_stop_times.txt')

aug19_gtfs_df = aug19_trips_df.merge(aug19_stop_times_df)
aug19_gtfs_df = aug19_gtfs_df.merge(aug19_stops_df)
aug19_gtfs_df['gtfs_start_date'] = '2019-08-18' # add gtfs_start_date so it can be joined with APC data

aug19_gtfs_df.head(2)

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,block_name,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,gtfs_start_date
0,138337020,1,1,ALTON PARK,,0,102,shp-1-03,2,2,101,04:55:00,04:55:00,1351,1,,0,0,0.0,1,2184,MARKET + 4TH,,35.052658,-85.309722,,,,,,2,2019-08-18
1,138338020,1,1,ALTON PARK,,0,1002,shp-1-04,2,2,104,05:15:00,05:15:00,1351,10,,0,0,4340.75,1,2184,MARKET + 4TH,,35.052658,-85.309722,,,,,,2,2019-08-18


In [36]:
# This feed went into effect sometime after 2020-04-13
apr20_trips_df = pd.read_csv('carta_gtfs_may_2020_trips.txt')
apr20_stops_df = pd.read_csv('carta_gtfs_may_2020_stops.txt')
apr20_stop_times_df = pd.read_csv('carta_gtfs_may_2020_stop_times.txt')

apr20_gtfs_df = apr20_trips_df.merge(apr20_stop_times_df)
apr20_gtfs_df = apr20_gtfs_df.merge(apr20_stops_df)
apr20_gtfs_df['gtfs_start_date'] = '2020-04-13' # add gtfs_start_date so it can be joined with APC data

apr20_gtfs_df.head(2)

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,block_name,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,gtfs_start_date
0,149213010,1,1,ALTON PARK,,0,101,shp-1-04,2,2,101,08:51:00,08:51:00,354,1,,0,0,0.0,1,2942,SHOLAR + CARTA,,35.056167,-85.268713,,,,,,2,2020-04-13
1,149317010,10A,1,AVONDALE,,0,1101,shp-10A-04,2,2,152,12:21:00,12:21:00,354,1,,0,0,0.0,1,2942,SHOLAR + CARTA,,35.056167,-85.268713,,,,,,2,2020-04-13


In [37]:
# combine gtfs feeds into single df
gtfs_df = aug19_gtfs_df.append(apr20_gtfs_df)

# Create GTFS trip_key field

The software that generates GTFS feeds for CARTA appends 3 digits to the end of the APC TRIP_KEY to create a GTFS trip_id. In order to join these two datasets, we need to create a new column called trip_key in the GTFS dataset that removes the last 3 digits of trip_id.

In [38]:
gtfs_df['trip_id'] = gtfs_df['trip_id'].astype(str)
gtfs_df['trip_key'] = gtfs_df['trip_id'].str.slice(0, -3)

# Join APC and GTFS datasets. 

For full details and extra processing steps, chattanooga_bus_occupancy_jan20_through_jun20.ipynb file has more information. I skipped some steps that compare the CARTA APC to the Nashville APC. I am also joining on trip_key and stop_id, since we know that trip_ids do not repeat across GTFS feed updates.

In [40]:
# convert to str; data types need to match for join to work
gtfs_df['stop_id'] = gtfs_df['stop_id'].astype(str)
gtfs_df['trip_key'] = gtfs_df['trip_key'].astype(str)

apc_df['STOP_ID'] = apc_df['STOP_ID'].astype(str)
apc_df['TRIP_KEY'] = apc_df['TRIP_KEY'].astype(str)

In [41]:
test = apc_df.merge(gtfs_df, left_on=['TRIP_KEY', 'STOP_ID'], right_on=['trip_key', 'stop_id'], how='left')
test.head(2)

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,BRANCH,SERVICE_CODE,SERVICE_TYPE,SERVICE_CLASS,SERVICE_MODE,TRIP_START_TIME,TIME_PERIOD,SERVICE_PERIOD,TRIP_NUMBER,TRIP_KEY,BLOCK_NUMBER,BLOCK_KEY,BLOCK_ID,BLOCK_NAME,RUN_NUMBER,RUN_KEY,VEHICLE_NUMBER,VEHICLE_DESCRIPTION,VEHICLE_SEATS,REVENUE_START,REVENUE_END,REVENUE_NET,ODOM_START,ODOM_END,ODOM_NET,CONDITION_NUMBER,CHECKER_NAME,GARAGE_NAME,DIVISION_NAME,OPERATOR_ID,FAREBOX,MATCH_COUNT,COMMENTS,SORT_ORDER,STOP_ID,MAIN_CROSS_STREET,TRAVEL_DIRECTION,TIMEPOINT,SEGMENT_MILES,TIME_SCHEDULED,TIME_ACTUAL_ARRIVE,TIME_ACTUAL_DEPART,DWELL_TIME,RUNNING_TIME_ACTUAL,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,PASSENGERS_SPOT,WHEELCHAIRS,BICYCLES,MATCH_DISTANCE,TIMEPOINT_MILES,NON_STUDENT_FARE,CHILD,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP,trip_date_match,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,block_name,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,gtfs_start_date,trip_key
0,4927964,114,Apr20 (Weekday),April 12 2020,6/9/20,2,1,3,988,4.0,Route #4,INBOUND,[4]HAMMALL >> SHOLAR + CARTA,Route #4,,,Bus,1/0/00,AM Peak,Weekday,2,152324,3502,4094,0,409.0,401,401,111,Gillig HF 2002,30.0,,,,,,,0,,WILCOX GARAGE,CARTA,161006.0,,90.0,Trip starts at 5:50a at HamMall and ends at 7:...,10,100255,HamMall,X,-1,,1/0/00,1/0/00,1/0/00,,24.0,0,0,0,,0,0,336.0,7.185,,,,,0,,,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,4927964,114,Apr20 (Weekday),April 12 2020,6/9/20,2,1,3,988,4.0,Route #4,INBOUND,[4]HAMMALL >> SHOLAR + CARTA,Route #4,,,Bus,1/0/00,AM Peak,Weekday,2,152324,3502,4094,0,409.0,401,401,111,Gillig HF 2002,30.0,,,,,,,0,,WILCOX GARAGE,CARTA,161006.0,,90.0,Trip starts at 5:50a at HamMall and ends at 7:...,280,100251,GreWal-1,X,-1,,1/0/00,1/0/00,1/0/00,5.28,,0,0,2,,0,0,295.0,3.9733,,,,,0,,,2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [42]:
test.shape[0] - apc_df.shape[0]


204

# Getting extra 204 during join
We are getting 204 extra rows in our join. This is because there are duplicate combinations of trip_key and stop_id in GTFS. In other words, on a particular trip, a single stop_id can appear more than once. We need to deal with these duplicates.

In [43]:
# look at all duplicates
gtfs_df.loc[gtfs_df.duplicated(subset=['trip_id', 'stop_id'], keep=False)]

Unnamed: 0,trip_id,route_id,service_id,trip_headsign,trip_short_name,direction_id,block_id,shape_id,wheelchair_accessible,bikes_allowed,block_name,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint,stop_code,stop_name,stop_desc,stop_lat,stop_lon,zone_id,stop_url,location_type,parent_station,stop_timezone,wheelchair_boarding,gtfs_start_date,trip_key
29219,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:20:05,19:20:05,292,2,,0,0,132.9,0,1481,CHAMBERLAIN + CUSHMAN,,35.069305,-85.246922,,,,,,2,2019-08-18,138668
29220,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:30:30,19:30:30,292,32,,0,0,15385.34,0,1481,CHAMBERLAIN + CUSHMAN,,35.069305,-85.246922,,,,,,2,2019-08-18,138668
29272,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:20:11,19:20:11,293,3,,0,0,232.28,0,1471,CHAMBERLAIN + BOONE,,35.068513,-85.247398,,,,,,2,2019-08-18,138668
29273,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:30:36,19:30:36,293,33,,0,0,15484.72,0,1471,CHAMBERLAIN + BOONE,,35.068513,-85.247398,,,,,,2,2019-08-18,138668
29325,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:20:17,19:20:17,294,4,,0,0,390.78,0,1485,CHAMBERLAIN + FAIRLEIGH,,35.06721,-85.248115,,,,,,2,2019-08-18,138668
29326,138668020,10G,1,DOWNTOWN VIA HWY 58 STUART,,1,3302,shp-10G-52,2,2,1519,19:30:42,19:30:42,294,34,,0,0,15643.22,0,1485,CHAMBERLAIN + FAIRLEIGH,,35.06721,-85.248115,,,,,,2,2019-08-18,138668


It looks like all the extra rows result from duplicate stop_ids on trip_id: 138668020
To remove these duplicates, we want to:

keep the FIRST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 2, 3, 4

keep the LAST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 32, 33, 34

We will accomplish this by:

divide the dataframe into 2 parts

      where trip_id == 138668020
      where trip_id != 138668020
      for the dataframe where trip_id == 138668020

keep the FIRST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 2, 3, 4

keep the LAST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 32, 33, 34

combine the two dataframes back together

In [44]:
# 1. divide the dataframe into 2 parts
no_duplicates = test.loc[test['trip_id'] != '138668020']
duplicates = test.loc[test['trip_id'] == '138668020']

# check how many entries are in the duplicates dataframe (3400)
duplicates.shape[0]

3400

In [45]:
# 2. For the dataframe where trip_id == 138668020:

# keep FIRST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 2, 3, 4
keep_first = duplicates.loc[duplicates['stop_sequence'].isin([2, 3, 4])]
keep_first = keep_first.drop_duplicates(subset=['SURVEY_DATE','stop_id'], keep='first')

# keep LAST occurence of date, trip_id = 138668020, gtfs_start_date, stop_id when stop_sequence = 32, 33, 34
keep_last = duplicates.loc[duplicates['stop_sequence'].isin([32, 33, 34])]
keep_last = keep_last.drop_duplicates(subset=['SURVEY_DATE','stop_id'],keep='last')

# for all other entries where trip_id == 138668020, do nothing
keep_all = duplicates.loc[~duplicates['stop_sequence'].isin([2, 3, 4, 32, 33, 34])]

# merge everything together
duplicates_removed = keep_first.append(keep_last)
duplicates_removed = duplicates_removed.append(keep_all)

In [46]:
# we should have removed 204 entries
duplicates.shape[0] - duplicates_removed.shape[0]

204

In [47]:
# 3. combine the 2 dataframes back together
test2 = no_duplicates.append(duplicates_removed)

In [48]:
# confirm that there are 3,466,530 rows
test2.shape[0]

3466530

# Load RideCheck Stops data
Since GTFS wasn't able to provide the lat/lon of all stops, there is additional information that we can pull in straight from RideCheck (the software that produces the APC data). All stop_ids should be present, so we don't have to worry about missing data.

This data comes from Teams. You can find it under General > datasets > data-archive-to-be-cleaned > Datasets > STOPS.xlsx.

In [50]:
apc_stops_df = pd.read_excel('STOPS.xlsx')[['STOP_ID', 'MAIN_STREET', 'CROSS_STREET', 'LATITUDE', 'LONGITUDE']]

In [51]:
apc_stops_df.head(2)


Unnamed: 0,STOP_ID,MAIN_STREET,CROSS_STREET,LATITUDE,LONGITUDE
0,5,STOP4,,35.060145,-85.26493
1,12,MARKET,FAMILY DOLLAR,35.050703,-85.309532


# Join RideCheck Stop data

We perform a LEFT join with RideCheck stop data (from STOPS.xlsx) on STOP_ID. There should be no null values.

In [52]:
test2['STOP_ID'] = test2['STOP_ID'].astype(str)
apc_stops_df['STOP_ID'] = apc_stops_df['STOP_ID'].astype(str)

test3 = test2.merge(apc_stops_df, left_on='STOP_ID', right_on='STOP_ID', how='left')

In [53]:
# check for null values
test3.loc[test3['LATITUDE'].isnull()].shape[0]

0

In [54]:
# check that no rows were added
test3.shape[0] - test2.shape[0]

0

# Save final dataset

In [55]:
test3.to_csv('chattanooga_bus_occupancy_jan20_through_jun20.csv')

# Create dataset for Chattanooga dashboard 
A few more things need to be done for the dataset to be used in the Chattanooga occupancy dashboard:

Dashboard dataset should have the following columns:

'trip_id', 'arrival_time', 'stop_id', 'stop_sequence', 'stop_name', 'stop_lat', 'stop_lon', 'route_id', 'direction_id', 'date', 'board_count', 'alight_count', 'occupancy', 'direction_desc', 'date_time', 'trip_start_time', 'trip_name', 'day_of_week', 'service_period'

1.Drop rows with null values (where GTFS did not get matched)

2.Calculate additional fields used in the dataset if they do not already exist

3.Change column names (column names used in the dashboard should be the same between Nashville and Chattanooga)

In [57]:
# change column names
chattanooga_dashboard_df = test3[['trip_id', 'arrival_time', 'stop_id', 'stop_sequence', 'stop_name',
                                    'stop_lat', 'stop_lon', 'route_id', 'direction_id', 'SURVEY_DATE', 
                                    'PASSENGERS_ON', 'PASSENGERS_OFF', 'PASSENGERS_IN', 'DIRECTION_NAME', 'SERVICE_PERIOD']]
print("num rows:", chattanooga_dashboard_df.shape[0])
chattanooga_dashboard_df.head(2)

num rows: 3466530


Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,route_id,direction_id,SURVEY_DATE,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,DIRECTION_NAME,SERVICE_PERIOD
0,,,,,,,,,,6/9/20,0,0,0,INBOUND,Weekday
1,,,,,,,,,,6/9/20,0,0,2,INBOUND,Weekday


In [58]:
# drop null values
chattanooga_dashboard_df = chattanooga_dashboard_df.dropna()
print("num rows after null values are dropped:", chattanooga_dashboard_df.shape[0])

num rows after null values are dropped: 3150191


In [59]:
# calculate date field and drop SURVEY_DATE
chattanooga_dashboard_df['date'] = pd.to_datetime(chattanooga_dashboard_df['SURVEY_DATE'])
chattanooga_dashboard_df['date'].sample(5)

1481276   2020-01-09
2731942   2020-01-30
2722671   2020-01-30
766546    2020-06-11
2092807   2020-03-04
Name: date, dtype: datetime64[ns]

In [60]:
# drop SURVEY_DATE (will use 'date' field in dashboard instead)
chattanooga_dashboard_df = chattanooga_dashboard_df.drop(columns=['SURVEY_DATE'])

In [61]:
# add date_time field
chattanooga_dashboard_df['date'] = chattanooga_dashboard_df['date'].astype(str)
chattanooga_dashboard_df['date_time'] = chattanooga_dashboard_df['date'] + " " + chattanooga_dashboard_df['arrival_time']
chattanooga_dashboard_df['date_time'].sample(5)

1130791    2020-03-13 19:17:32
2093002    2020-03-23 07:18:47
1281366    2020-02-20 22:26:48
2087538    2020-01-31 06:48:56
2502393    2020-02-19 15:40:00
Name: date_time, dtype: object

In [62]:
chattanooga_dashboard_df.head(2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,route_id,direction_id,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,DIRECTION_NAME,SERVICE_PERIOD,date,date_time
6,152324020,06:22:37,2245,38.0,Brainerd & United Methodist Church,35.016055,-85.2355,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:22:37
7,152324020,06:23:06,2223,40.0,Brainerd Rd & Old Food City-1,35.017769,-85.238315,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:23:06


In [63]:
# add trip_start_time
sorted_by_time = chattanooga_dashboard_df.sort_values('arrival_time')
trip_start_time = chattanooga_dashboard_df.drop_duplicates('trip_id', keep='first')
trip_start_time = trip_start_time[['trip_id', 'arrival_time']]
trip_start_time.columns = ['trip_id', 'trip_start_time']
chattanooga_dashboard_df = chattanooga_dashboard_df.merge(trip_start_time, on='trip_id', how='left')

chattanooga_dashboard_df[['trip_id', 'date_time', 'trip_start_time', 'arrival_time', 'stop_sequence']].sample(5)

Unnamed: 0,trip_id,date_time,trip_start_time,arrival_time,stop_sequence
2792646,138620020,2020-03-23 16:29:11,16:03:41,16:29:11,85.0
1962966,139643020,2020-01-02 06:03:53,06:15:00,06:03:53,101.0
296230,151998020,2020-06-18 18:55:57,18:51:13,18:55:57,59.0
679060,151754020,2020-06-05 13:12:38,13:45:00,13:12:38,69.0
1584338,139740020,2020-04-01 19:24:48,19:20:00,19:24:48,32.0


In [64]:
# add trip name
chattanooga_dashboard_df['trip_name'] = chattanooga_dashboard_df['trip_start_time'] + ' (trip ID: ' + chattanooga_dashboard_df['trip_id'] + ')'
chattanooga_dashboard_df['trip_name'].sample(5)

1322170    05:30:00 (trip ID: 140097020)
438690     15:40:00 (trip ID: 151922020)
1815181    15:25:00 (trip ID: 139678020)
1665024    11:10:00 (trip ID: 139719020)
2416293    15:12:29 (trip ID: 138977020)
Name: trip_name, dtype: object

In [65]:
# add day of week
chattanooga_dashboard_df['date'] = pd.to_datetime(chattanooga_dashboard_df['date'])
chattanooga_dashboard_df['day_of_week'] = chattanooga_dashboard_df['date'].dt.dayofweek

In [66]:
# check final columns
chattanooga_dashboard_df.head(2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,route_id,direction_id,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,DIRECTION_NAME,SERVICE_PERIOD,date,date_time,trip_start_time,trip_name,day_of_week
0,152324020,06:22:37,2245,38.0,Brainerd & United Methodist Church,35.016055,-85.2355,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:22:37,06:22:37,06:22:37 (trip ID: 152324020),1
1,152324020,06:23:06,2223,40.0,Brainerd Rd & Old Food City-1,35.017769,-85.238315,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:23:06,06:22:37,06:22:37 (trip ID: 152324020),1


In [67]:
chattanooga_dashboard_df =  chattanooga_dashboard_df.rename(columns={'PASSENGERS_ON' : 'board_count',
                                                                     'PASSENGERS_OFF' : 'alight_count',
                                                                     'PASSENGERS_IN' : 'occupancy',
                                                                     'DIRECTION_NAME' : 'direction_desc',
                                                                     'SERVICE_PERIOD' : 'service_period'})
chattanooga_dashboard_df.head(2)

Unnamed: 0,trip_id,arrival_time,stop_id,stop_sequence,stop_name,stop_lat,stop_lon,route_id,direction_id,board_count,alight_count,occupancy,direction_desc,service_period,date,date_time,trip_start_time,trip_name,day_of_week
0,152324020,06:22:37,2245,38.0,Brainerd & United Methodist Church,35.016055,-85.2355,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:22:37,06:22:37,06:22:37 (trip ID: 152324020),1
1,152324020,06:23:06,2223,40.0,Brainerd Rd & Old Food City-1,35.017769,-85.238315,4,1.0,0,0,7,INBOUND,Weekday,2020-06-09,2020-06-09 06:23:06,06:22:37,06:22:37 (trip ID: 152324020),1


In [68]:
chattanooga_dashboard_df.to_csv('chattanooga_bus_occupancy_dashboard_20200925.csv')