In [1]:
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
#Read raw chattoonooga APC data. This csv file is located in teams under datasets > apc > carta > chattanooga_apc_jan20_through_jun20.csv.zip
apc_df = pd.read_csv('chattanooga_apc_jan20_through_jun20.csv', index_col=0)

In [3]:
apc_df.shape[0]

3471268

In [4]:
#change 'outybound' to outbound (typo here)
apc_df.loc[apc_df['DIRECTION_NAME'] == 'OUTYBOUND', ['DIRECTION_NAME']] = 'OUTBOUND'

In [5]:
#drop all duplicates based on trip_key, survey_date, direction_name, stop_id, and trip_start_time
#later append this with subset of dups we want to keep
apc_no_dups = apc_df.drop_duplicates(['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','TRIP_START_TIME'],keep=False)

In [6]:
#get all duplicates
apc_dups = apc_df.loc[apc_df.duplicated(subset=['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID','TRIP_START_TIME'], keep=False)]

In [7]:
#drop routes 33, 34, 14
apc_dups_dropped_routes = apc_dups[(apc_dups.ROUTE_NUMBER != 33) & (apc_dups.ROUTE_NUMBER != 34)
                                   & (apc_dups.ROUTE_NUMBER != 14)]

In [8]:
apc_dups_dropped_routes.shape[0]

2774

In [9]:
#convert survey_date to a datetime object and create a new 'Date' column. 
apc_no_dups['DATE'] = pd.to_datetime(apc_no_dups['SURVEY_DATE'])
apc_dups_dropped_routes['DATE'] = pd.to_datetime(apc_dups_dropped_routes['SURVEY_DATE'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
def does_trip_date_match (trip_key, date) :
    """
    check if data matches APC data based on trip_key and survey_date
    
    :param: trip_key, date of trip
    :return: false if no match, true if matched
    """
    df = apc_no_dups.loc[(apc_no_dups['TRIP_KEY'] == trip_key) & (apc_no_dups['DATE'] == date)]
    return df.shape[0] != 0

In [11]:
apc_dups_dropped_routes['trip_date_match'] = apc_dups_dropped_routes.apply(lambda row: does_trip_date_match(row['TRIP_KEY'], 
                                                            row['DATE']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
#need to drop these which are unique trip - date combinations that don't occur in the APC data without duplicates.
dups_notin_apc = apc_dups_dropped_routes.loc[apc_dups_dropped_routes['trip_date_match'] == False]

In [13]:
dups_notin_apc.shape[0]

2570

In [14]:
#get the data frame of duplicated values trip-date combinations match the APC data.
dups_in_apc = apc_dups_dropped_routes.loc[apc_dups_dropped_routes['trip_date_match'] == True]

In [15]:
dups_in_apc.shape[0]

204

In [16]:
#for duplicates where alight and board are the same, drop dups. later will merge this with apc_no_dups.
board_alight_same = dups_in_apc.loc[dups_in_apc.duplicated(subset=['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME',
                                                                   'STOP_ID','TRIP_START_TIME', 'PASSENGERS_ON', 
                                                                   'PASSENGERS_OFF'], keep='first')]

In [17]:
board_alight_same.shape[0]

100

In [18]:
#get all duplicates where board and alight are not the same. we want to see what is happening with negative occupancies.
board_alight_diff = dups_in_apc.drop_duplicates(['TRIP_KEY','SURVEY_DATE','DIRECTION_NAME','STOP_ID',
                                                 'TRIP_START_TIME', 'PASSENGERS_ON', 'PASSENGERS_OFF'],keep=False)

In [19]:
board_alight_diff.shape[0]

4

In [20]:
def calc_occ (df) :
    
    """
    calculate occupancy at each stop along a route
    
    :param: dataframe with board/alight values for all stops along a single trip
    :return: data frame with calculated occupancies column
    """
    
    tmp = df.copy()
    # calc initial load as the passengers in at the first stop
    tmp['initial_load'] = tmp['PASSENGERS_IN'].iloc[0]
    
    tmp['initial_load'] = pd.to_numeric(tmp['initial_load'], errors='coerce')
    tmp['PASSENGERS_ON'] = pd.to_numeric(tmp['PASSENGERS_ON'], errors='coerce')
    tmp['PASSENGERS_OFF'] = pd.to_numeric(tmp['PASSENGERS_OFF'], errors='coerce')

    # calc occupancy net change
    tmp['occupancy_net_change'] = tmp['PASSENGERS_ON'] - tmp['PASSENGERS_OFF']

    # calc cumulative sum in occupancy net change as an intermediate step
    tmp_sum_df = pd.DataFrame(tmp['occupancy_net_change'].cumsum())
    tmp_sum_df.columns = ['tmp_sum']

    # merge tmp sum (cumulative sum) into tmp
    tmp = tmp.merge(tmp_sum_df, left_index=True, right_index=True)

    # calc occupancy for a particular stop
    tmp['calc_occupancy'] = tmp['tmp_sum'] + tmp['initial_load']

    return tmp.drop(columns=['tmp_sum', 'occupancy_net_change'])

In [21]:
def number_neg_occ (trip_key, survey_date, stop_id, sort_order, passengers_on, passengers_off, passengers_in) :
    
    
    """
    calculate number of negative occupancies for a given row
    
    :param: trip_key, date, stop_id, stop_order, passengers_on, passengers_off, passengers_in for a row
    :return: number of negative occupancies
    """
        
    #df = all stops on a trip. matching to APC data set without any dups on trip_key and survey_date.
    df = apc_no_dups.loc[(apc_no_dups['TRIP_KEY'] == trip_key) & (apc_no_dups['DATE'] == survey_date)]     
    
    #drop all columns except alight, board, stop sequence, stop id 
    df1 = df.loc[ : , ('PASSENGERS_ON', 'PASSENGERS_OFF', 'PASSENGERS_IN', 'SORT_ORDER', 'STOP_ID', 'DATE')]
    
    #df2 = data frame of row 
    d = {'PASSENGERS_ON' : [passengers_on], 'PASSENGERS_OFF' : [passengers_off], 'PASSENGERS_IN' : [passengers_in],
         'SORT_ORDER' : [sort_order], 'STOP_ID': [stop_id], 'DATE': [survey_date]}
    df2 = pd.DataFrame(data = d)
    
    #append 
    df3 = df1.append(df2)
    df4 = df3.sort_values(['SORT_ORDER'],ascending=True)
    
    #calc occupancy
    calc_occ_df = calc_occ(df4)
    
    #count how many negatives are in the calculated occupancy column
    return calc_occ_df.loc[calc_occ_df['calc_occupancy'] < 0].shape[0]

In [22]:
board_alight_diff['number_neg_occ'] = board_alight_diff.apply(lambda row: number_neg_occ(row['TRIP_KEY'], 
                                                            row['DATE'],                            
                                                            row['STOP_ID'], row['SORT_ORDER'], 
                                                            row['PASSENGERS_ON'], row['PASSENGERS_OFF'], 
                                                            row['PASSENGERS_IN']), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [23]:
def is_equal (number_neg_occ, minimum) :
    
    """
    check if number of neg occupancies is equal to the minimum of this for a particular trip-date-stopID combination
    
    :param: number of negative occupancy, minimum negative occupancy of a row
    :return: boolean if parameters are equal
    """
    
    return number_neg_occ == minimum

In [24]:
def drop_highest_neg_occ (df) :
    
    """
    keep row of a trip-date-stopID combination with the lowest number of negative occupancies
    
    :param: data frame to drop the highest negative occupancies of per trip-date-stopID combination
    :return: data frame with entries that have the lowest number of neg occupancies per trip-date-stopID combination
    """

    grouped = df.groupby(['TRIP_KEY', 'STOP_ID', 'DATE'])['number_neg_occ']

    df1 = df.assign(min=grouped.transform(min))

    #mark true if the min matches number_neg_occ, else false
    df1['keep'] = df1.apply(lambda row: is_equal(row['number_neg_occ'], row['min']), axis=1)
    
    #filter data frame where keep=true
    df2 = df1.loc[df1['keep'] == True]
    
    #remove duplicates
    return df2.drop_duplicates(['TRIP_KEY','DATE', 'STOP_ID'],keep='first')


In [25]:
board_alight_diff2 = drop_highest_neg_occ (board_alight_diff)

In [26]:
board_alight_diff2.shape[0]

2

In [27]:
#drop columns
board_alight_diff2 = board_alight_diff2.drop(columns=['DATE', 'trip_date_match', 'number_neg_occ', 'min', 'keep'])

In [28]:
board_alight_diff2

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,BRANCH,SERVICE_CODE,SERVICE_TYPE,SERVICE_CLASS,SERVICE_MODE,TRIP_START_TIME,TIME_PERIOD,SERVICE_PERIOD,TRIP_NUMBER,TRIP_KEY,BLOCK_NUMBER,BLOCK_KEY,BLOCK_ID,BLOCK_NAME,RUN_NUMBER,RUN_KEY,VEHICLE_NUMBER,VEHICLE_DESCRIPTION,VEHICLE_SEATS,REVENUE_START,REVENUE_END,REVENUE_NET,ODOM_START,ODOM_END,ODOM_NET,CONDITION_NUMBER,CHECKER_NAME,GARAGE_NAME,DIVISION_NAME,OPERATOR_ID,FAREBOX,MATCH_COUNT,COMMENTS,SORT_ORDER,STOP_ID,MAIN_CROSS_STREET,TRAVEL_DIRECTION,TIMEPOINT,SEGMENT_MILES,TIME_SCHEDULED,TIME_ACTUAL_ARRIVE,TIME_ACTUAL_DEPART,DWELL_TIME,RUNNING_TIME_ACTUAL,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,PASSENGERS_SPOT,WHEELCHAIRS,BICYCLES,MATCH_DISTANCE,TIMEPOINT_MILES,NON_STUDENT_FARE,CHILD,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP
370571,4859335,112,Aug19 (Weekday),8/18/19,1/15/20 0:00,2,1,3,921,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD HWY58 STUART IB,Route #10,,,Bus,1/0/00 19:20,PM Late,Weekday,52,138668,3302,15191,0,,29,29,135,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160856.0,,68.0,Trip starts at 7:20p at StuCha-1 and ends at 8...,80,293,CHAMBERLAIN/BOONE,S,0,0.09,,1/0/00 19:22,1/0/00 19:22,0.07,,1,0,6,,0,0,97.0,,,,,,0,,,2
479043,4862011,112,Aug19 (Weekday),8/18/19,1/17/20 0:00,2,1,3,921,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD HWY58 STUART IB,Route #10,,,Bus,1/0/00 19:20,PM Late,Weekday,52,138668,3302,15191,0,,29,29,150,Gillig LFH 2012 Diesel,32.0,,,,,,,0,,***Unknown Garage***,,160856.0,,67.0,Trip starts at 7:20p at StuCha-1 and ends at 8...,80,293,CHAMBERLAIN/BOONE,S,0,0.09,,1/0/00 19:24,1/0/00 19:24,0.05,,1,0,2,,0,0,26.0,,,,,,0,,,2


In [29]:
apc_no_dups = apc_no_dups.drop(columns=['DATE'])

In [30]:
board_alight_same = board_alight_same.drop(columns=['DATE', 'trip_date_match'])

In [31]:
#append data frame (1) dropped duplicates where board and alight counts were the same, and where trip-dates matched to APC
#with (2) cleaned duplicates where board and alight counts were different, and where trip-dates matched to APC
a1 = board_alight_same.append(board_alight_diff2)

In [32]:
#append data frame (1) cleaned duplicates with (2) apc data with no duplicates
a2 = apc_no_dups.append(a1)

In [34]:
a2.head()

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,BRANCH,SERVICE_CODE,SERVICE_TYPE,SERVICE_CLASS,SERVICE_MODE,TRIP_START_TIME,TIME_PERIOD,SERVICE_PERIOD,TRIP_NUMBER,TRIP_KEY,BLOCK_NUMBER,BLOCK_KEY,BLOCK_ID,BLOCK_NAME,RUN_NUMBER,RUN_KEY,VEHICLE_NUMBER,VEHICLE_DESCRIPTION,VEHICLE_SEATS,REVENUE_START,REVENUE_END,REVENUE_NET,ODOM_START,ODOM_END,ODOM_NET,CONDITION_NUMBER,CHECKER_NAME,GARAGE_NAME,DIVISION_NAME,OPERATOR_ID,FAREBOX,MATCH_COUNT,COMMENTS,SORT_ORDER,STOP_ID,MAIN_CROSS_STREET,TRAVEL_DIRECTION,TIMEPOINT,SEGMENT_MILES,TIME_SCHEDULED,TIME_ACTUAL_ARRIVE,TIME_ACTUAL_DEPART,DWELL_TIME,RUNNING_TIME_ACTUAL,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,PASSENGERS_SPOT,WHEELCHAIRS,BICYCLES,MATCH_DISTANCE,TIMEPOINT_MILES,NON_STUDENT_FARE,CHILD,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP
0,4850095,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,598,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD IB,Route #10,,,Bus,1/0/00 16:55,PM Peak,Weekday,44,138650,3102,15141,0,,59,59,137,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160962.0,,69.0,Trip starts at 4:55p at StuDod-1 and ends at 5...,20,100008,StuDod-1,X,-1,,1/0/00 16:55,1/0/00 16:55,1/0/00 16:55,,14.98,0,0,0,,0,0,357.0,4.17,,,,,0,,,1
1,4850095,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,598,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD IB,Route #10,,,Bus,1/0/00 16:55,PM Peak,Weekday,44,138650,3102,15141,0,,59,59,137,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160962.0,,69.0,Trip starts at 4:55p at StuDod-1 and ends at 5...,30,217,STUART/DODSON,E,0,0.06,,,,,,0,0,0,,0,0,,,,,,,0,,,1
2,4850095,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,598,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD IB,Route #10,,,Bus,1/0/00 16:55,PM Peak,Weekday,44,138650,3102,15141,0,,59,59,137,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160962.0,,69.0,Trip starts at 4:55p at StuDod-1 and ends at 5...,40,289,STUART/TAYLOR,E,0,0.08,,1/0/00 16:55,1/0/00 16:55,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2
3,4850095,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,598,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD IB,Route #10,,,Bus,1/0/00 16:55,PM Peak,Weekday,44,138650,3102,15141,0,,59,59,137,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160962.0,,69.0,Trip starts at 4:55p at StuDod-1 and ends at 5...,50,290,STUART/WHEELER,E,0,0.08,,1/0/00 16:55,1/0/00 16:55,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2
4,4850095,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,598,10.9,Route #10G 58:10G,INBOUND,[10G]GLENWOOD IB,Route #10,,,Bus,1/0/00 16:55,PM Peak,Weekday,44,138650,3102,15141,0,,59,59,137,Gillig HF 2006,30.0,,,,,,,0,,***Unknown Garage***,,160962.0,,69.0,Trip starts at 4:55p at StuDod-1 and ends at 5...,60,291,STUART/CHAMBERLAIN,S,0,0.08,,1/0/00 16:56,1/0/00 16:56,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2


In [35]:
apc_df.shape[0] - a2.shape[0]

4840

In [36]:
#Get the difference of the cleaned version and original version with rows that are not in the cleaned version but are in the full apc data frame.
diff = a2.merge(apc_df, how = 'outer' ,indicator=True).loc[lambda x : x['_merge']=='right_only']

In [37]:
diff.shape[0]

4840

In [38]:
diff = diff.drop(columns=['_merge'])

In [39]:
diff.head()

Unnamed: 0,SERIAL_NUMBER,SCHEDULE_ID,SCHEDULE_NAME,SIGNUP_NAME,SURVEY_DATE,SURVEY_STATUS,SURVEY_TYPE,SURVEY_SOURCE,PATTERN_ID,ROUTE_NUMBER,ROUTE_NAME,DIRECTION_NAME,BRANCH,SERVICE_CODE,SERVICE_TYPE,SERVICE_CLASS,SERVICE_MODE,TRIP_START_TIME,TIME_PERIOD,SERVICE_PERIOD,TRIP_NUMBER,TRIP_KEY,BLOCK_NUMBER,BLOCK_KEY,BLOCK_ID,BLOCK_NAME,RUN_NUMBER,RUN_KEY,VEHICLE_NUMBER,VEHICLE_DESCRIPTION,VEHICLE_SEATS,REVENUE_START,REVENUE_END,REVENUE_NET,ODOM_START,ODOM_END,ODOM_NET,CONDITION_NUMBER,CHECKER_NAME,GARAGE_NAME,DIVISION_NAME,OPERATOR_ID,FAREBOX,MATCH_COUNT,COMMENTS,SORT_ORDER,STOP_ID,MAIN_CROSS_STREET,TRAVEL_DIRECTION,TIMEPOINT,SEGMENT_MILES,TIME_SCHEDULED,TIME_ACTUAL_ARRIVE,TIME_ACTUAL_DEPART,DWELL_TIME,RUNNING_TIME_ACTUAL,PASSENGERS_ON,PASSENGERS_OFF,PASSENGERS_IN,PASSENGERS_SPOT,WHEELCHAIRS,BICYCLES,MATCH_DISTANCE,TIMEPOINT_MILES,NON_STUDENT_FARE,CHILD,NR_BOARD,NR_ALIGHT,KNEELS,COMMENT_NUMBER,CHECKER_TIME,FIRST_LAST_STOP
3466428,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,10,100075,SPN,X,-1,,1/0/00 20:22,1/0/00 20:23,1/0/00 20:23,,10.1,0,0,0,,0,0,323.0,1.56,,,,,0,,,1
3466429,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,20,1566,SHUTTLE PARK NORTH - INTERNAL,E,0,0.08,,1/0/00 20:23,1/0/00 20:23,,,0,0,0,,0,0,323.0,,,,,,0,,,1
3466430,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,30,793,BROAD/3RD,S,0,0.1,,1/0/00 20:23,1/0/00 20:23,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2
3466431,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,40,1537,BROAD/4TH,S,0,0.09,,1/0/00 20:25,1/0/00 20:25,0.0,,0,0,0,,0,0,1.0,,,,,,0,,,2
3466432,4850144,112,Aug19 (Weekday),8/18/19,1/2/20 0:00,2,1,3,247,33.0,Route #DTS:33,INBOUND,[DTS]DOWNTOWN SHUTTLE IB,Route #DTS,,,Bus,1/0/00 20:22,PM Late,Weekday,250,140477,6402,,0,,7,7,704,AVS Elec 1995,22.0,,,,,,,0,,***Unknown Garage***,,160969.0,,18.0,Trip starts at 8:22p at SPN and ends at 8:37p ...,50,794,BROAD/5TH,S,0,0.09,,1/0/00 20:26,1/0/00 20:26,0.0,,0,0,0,,0,0,0.0,,,,,,0,,,2


In [40]:
#sort cleaned data
cleaned_chattanooga_apc_jan20_through_jun20 = a2.sort_values(['TRIP_KEY', 'SURVEY_DATE', 'STOP_ID'], ascending=False)

In [41]:
#sort difference data
diff_chattanooga_apc_jan20_through_jun20 = diff.sort_values(['TRIP_KEY', 'SURVEY_DATE', 'STOP_ID'], ascending=False)

In [43]:
cleaned_chattanooga_apc_jan20_through_jun20.to_csv('cleaned_chattanooga_apc_jan20_through_jun20.csv')

In [44]:
diff_chattanooga_apc_jan20_through_jun20.to_csv('diff_chattanooga_apc_jan20_through_jun20.csv')