In [89]:
from geopy.distance import distance
import json
import matplotlib
matplotlib.rcParams['figure.figsize'] = [4, 3]
matplotlib.rcParams['font.size'] = 8
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [90]:
# from google.colab import drive
# drive.mount('/content/drive')
# # %cd /content/drive/My Drive/Colab Notebooks/draws

# Data merging

## data from replica survey

In [None]:
import pandas as pd
# Load the Replica trip data
replica = pd.read_csv('./data/Replica-IMD/trips-end-at-imd-2024-spring-thursday.csv')

  replica = pd.read_csv('./data/Replica-IMD/trips-end-at-imd-2024-spring-thursday.csv')


Unnamed: 0,activity_id,trip_taker_age,trip_taker_individual_income,trip_taker_household_size,trip_taker_household_income,destination_bgrp_fips_2020,destination_trct_fips_2020,trip_duration_seconds,trip_distance_meters
count,53273.0,51786.0,51786.0,51786.0,51786.0,53273.0,53273.0,53273.0,53273.0
mean,9.21068e+18,42.802823,62523.35,3.144788,133364.0,170318400000.0,17031840000.0,1530.832974,18006.82
std,5.310537e+18,16.417662,89433.78,2.112252,141545.1,394.4556,39.41493,2289.230794,65261.94
min,98881380000000.0,3.0,-8290.0,1.0,-2293.0,170318400000.0,17031840000.0,0.0,0.0
25%,4.650743e+18,30.0,13681.0,2.0,47428.0,170318400000.0,17031840000.0,575.0,3643.653
50%,9.163604e+18,40.0,40268.0,3.0,96097.5,170318400000.0,17031840000.0,1203.0,10160.31
75%,1.382392e+19,55.0,78173.0,4.0,170328.0,170318400000.0,17031840000.0,2034.0,19769.74
max,1.844626e+19,94.0,1383783.0,17.0,1557700.0,170318400000.0,17031840000.0,77411.0,2821083.0


In [92]:
# Keep only trips where household income > 0 and primary mode is not 'other_travel_mode'
replica = replica.query(
    "trip_taker_household_income > 0 and primary_mode != 'other_travel_mode'"
)

# Select relevant columns
replica = replica[[
    'activity_id',                      # Unique ID for the trip
    'primary_mode',                     # Main travel mode used
    'trip_purpose',                     # Purpose of the trip (e.g., work, shopping)
    'origin_bgrp_fips_2020',             # Origin block group FIPS code
    'destination_bgrp_fips_2020',        # Destination block group FIPS code
    'trip_taker_age',                    # Age of the person taking the trip
    'trip_taker_sex',                    # Gender of the trip taker
    'trip_taker_household_size',         # Household size of the trip taker
    'trip_taker_household_income',       # Household income of the trip taker
    'trip_taker_available_vehicles',     # Number of vehicles available to the household
    'trip_taker_education',              # Education level of the trip taker
    'transit_agency',                  # Transit agency used (if transit mode is used)
    'transit_route',                     # Transit route information (if transit mode is used)
    'trip_start_time',                   # Trip start time
    'trip_duration_seconds',             # Trip duration in seconds
    'trip_distance_meters',               # Trip distance in meters
    'trip_taker_school_grade_attending', # School grade of the trip taker (if applicable)
]]

# Rename columns for easier reference
replica = replica.rename(columns={
    'trip_taker_age': 'age',                       # Age
    'trip_taker_household_size': 'hhsize',          # Household size
    'trip_taker_household_income': 'hhinc',         # Household income
    'trip_duration_seconds': 'time',                # Trip duration (seconds)
    'trip_distance_meters': 'dist',                 # Trip distance (meters)
    'origin_bgrp_fips_2020': 'o_fips',               # Origin FIPS code
    'destination_bgrp_fips_2020': 'd_fips',           # Destination FIPS code
    'trip_purpose':'purpose'               # Trip purpose
})


In [93]:
# create a new column for if is not "not_attending_school" 
replica['student'] = replica['trip_taker_school_grade_attending'].apply(lambda x: 0 if x == 'not_attending_school' else 1)

# Create binary variable for high education
high_education_levels = {'bachelors_degree', 'some_college', 'advanced_degree'}
replica['higheduc'] = replica['trip_taker_education'].apply(lambda x: 1 if x in high_education_levels else 0)

# Create binary variable for male 
replica['male'] = replica['trip_taker_sex'].map(lambda x: 1 if x == 'male' else 0)

# Map vehicle ownership levels to numerical values
vehicle_mapping = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three_plus': 3
}
replica['numvec'] = replica['trip_taker_available_vehicles'].map(vehicle_mapping)

# Map primary modes to four kinds: auto, transit, bike, walk
mode_mapping_four_kinds = {
    'private_auto': 0,
    'auto_passenger': 0,
    'on_demand_auto': 0,
    'public_transit': 1,
    'walking': 3,
    'biking': 2
}
replica['mode_four_kinds'] = replica['primary_mode'].map(mode_mapping_four_kinds)

# Map primary modes to three kinds: auto, transit, active (bike/walk)
mode_mapping_three_kinds = {
    'private_auto': 0,
    'auto_passenger': 0,
    'on_demand_auto': 0,
    'public_transit': 1,
    'walking': 2,
    'biking': 2
}
replica['mode_three_kinds'] = replica['primary_mode'].map(mode_mapping_three_kinds)

In [94]:
# Convert FIPS codes to string type
replica['d_fips'] = replica['d_fips'].astype(str)
replica['o_fips'] = replica['o_fips'].astype(str)

# Remove "Out of Region" OD pairs
replica = replica[~replica['d_fips'].str.contains("Out of Region", na=False)]
replica = replica[~replica['o_fips'].str.contains("Out of Region", na=False)]
replica = replica[['activity_id','age','male','numvec','higheduc','hhsize','hhinc','mode_four_kinds','mode_three_kinds','purpose','o_fips','d_fips','transit_route','transit_agency','trip_start_time','time','dist','student']]
replica

Unnamed: 0,activity_id,age,male,numvec,higheduc,hhsize,hhinc,mode_four_kinds,mode_three_kinds,purpose,o_fips,d_fips,transit_route,transit_agency,trip_start_time,time,dist,student
1016,11195776143997613937,55.0,0,2,0,2.0,79041.0,0,0,maintenance,170119653001,170318382002,,,12:29:00,7327,182558.015306,0
1018,15890157103751998101,56.0,0,3,0,9.0,159615.0,0,0,eat,170316709001,170318382002,,,18:48:00,1128,17653.253722,0
1019,8544156882190917720,29.0,1,3,1,3.0,42183.0,0,0,school,170316709001,170318381001,,,04:04:00,945,11027.964554,0
1021,3359228516309820827,54.0,0,1,1,2.0,385863.0,0,0,maintenance,170310701021,170318382002,,,19:52:00,1119,9769.715614,0
1022,2269080699016021358,53.0,0,2,1,2.0,62342.0,0,0,maintenance,170310102012,170318382002,,,13:08:00,2619,23045.035253,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52797,15425404685523401736,35.0,0,1,0,4.0,65665.0,0,0,eat,551270017012,170318382001,,,18:18:00,5338,118633.370709,0
52798,615909538818080941,52.0,0,1,0,2.0,93807.0,0,0,eat,551270017012,170318382001,,,17:20:00,6017,120623.557158,0
52799,13987594770938249186,68.0,1,2,0,8.0,94039.0,0,0,maintenance,550790137001,170318382001,,,10:19:00,5638,151677.459202,0
52800,10567502182196272243,30.0,0,1,0,2.0,2671.0,0,0,eat,551332013001,170318382002,,,08:53:00,6222,167850.598172,0


## Add Transit Fare according to transit route

CTA: https://www.transitchicago.com/fares/

Metra: https://metra.com/fare-table

PACE: https://www.pacebus.com/fares

NICTD: https://mysouthshoreline.com/wp-content/uploads/2024/10/Copy-of-Fare-Chart-July2018-1.pdf

In [95]:
replica['transit_agency'].value_counts()

transit_agency
Chicago Transit Authority                                                                                  2393
Chicago Transit Authority, Chicago Transit Authority                                                       1078
Metra, Chicago Transit Authority                                                                            417
Metra                                                                                                       138
Chicago Transit Authority, Chicago Transit Authority, Chicago Transit Authority                              51
PACE                                                                                                         26
PACE, Chicago Transit Authority                                                                              24
PACE, Chicago Transit Authority, Chicago Transit Authority                                                   22
PACE, Metra, Chicago Transit Authority                                                   

In [96]:
PACE = pd.DataFrame({
    'Full':    {'Bus fare': 2.00},
    'Student': {'Bus fare': 1.00}
})

CTA = pd.DataFrame({
    'Full': {'Line train fare': 2.50, 'Bus fare': 2.25},
    'Student': {'Line train fare': 0.75, 'Bus fare': 0.75}
})

metra = pd.DataFrame({
    'Full': {
        '1-2':   3.75,
        '1-2-3': 5.50,
        '1-2-3-4': 6.75,
        '2-3-4': 3.75
    },
    'Reduced': {
        '1-2':   1.75,
        '1-2-3': 2.75,
        '1-2-3-4': 3.25,
        '2-3-4': 1.75
    }
})

NICTD = pd.DataFrame({
    'Full': {
        '1-3':   6.25,   # Zone 1→Zone 3
        '2-3':   5.50,   # Zone 2→Zone 3
        '3-4':   4.25,   # Zone 3→Zone 4
        '3-5':   6.00,   # Zone 3→Zone 5
        '3-6':   7.25,   # Zone 3→Zone 6
        '3-7':   8.00,   # Zone 3→Zone 7
        '3-8':   9.00,   # Zone 3→Zone 8
        '3-10': 12.25,  # Zone 3→Zone 10
        '4-5':   4.50,   # Zone 4→Zone 5
        '4-6':   6.50,   # Zone 4→Zone 6
        '4-7':   7.50,   # Zone 4→Zone 7
        '4-8':   8.00,   # Zone 4→Zone 8
        '4-10':11.75,  # Zone 4→Zone 10
        '5-6':   6.25,   # Zone 5→Zone 6
        '5-7':   7.25,   # Zone 5→Zone 7
        '5-8':   7.50,   # Zone 5→Zone 8
        '5-10':11.00,  # Zone 5→Zone 10
        '6-7':   6.25,   # Zone 6→Zone 7
        '6-8':   6.50,   # Zone 6→Zone 8
        '6-10':10.00,  # Zone 6→Zone 10
        '7-8':   4.50,   # Zone 7→Zone 8
        '7-10': 9.00,   # Zone 7→Zone 10
        '8-10': 7.50    # Zone 8→Zone 10
    },
    'Reduced': {
        '1-3':   3.00,
        '2-3':   2.75,
        '3-4':   2.00,
        '3-5':   3.00,
        '3-6':   3.50,
        '3-7':   4.00,
        '3-8':   4.50,
        '3-10':  6.00,
        '4-5':   2.25,
        '4-6':   3.25,
        '4-7':   3.75,
        '4-8':   4.00,
        '4-10':  5.75,
        '5-6':   3.00,
        '5-7':   3.50,
        '5-8':   3.75,
        '5-10':  5.50,
        '6-7':   3.00,
        '6-8':   3.25,
        '6-10':  5.00,
        '7-8':   2.25,
        '7-10':  4.50,
        '8-10':  3.75
    }
})

NICTD.index.name = 'Trip (Zone→Zone)'


In [97]:
replica[replica['transit_agency']=='Northern Indiana Commuter Transportation District']['transit_route'].value_counts()

transit_route
South Shore Line    10
Name: count, dtype: int64

## Add travel time from replica itself,create a new OD pairs dataset with travel time

In [98]:
OD_pairs_with_time = pd.DataFrame()
# Map numeric mode codes to readable mode names
mode_mapping = {0: 'auto', 1: 'transit', 2: 'bike', 3: 'walk'}
replica['mode_name'] = replica['mode_four_kinds'].map(mode_mapping)

# Prepare OD pairs (origin-destination combinations)
OD_pairs_with_time = replica[['o_fips', 'd_fips']].drop_duplicates().reset_index(drop=True)

# Add columns for each mode's travel time and distance
modes = ['auto', 'transit', 'bike', 'walk']
for mode in modes:
    OD_pairs_with_time[f'{mode}_time'] = np.nan
    OD_pairs_with_time[f'{mode}_dist'] = np.nan

# Create a lookup copy of the data
replica_lookup = replica.copy()

# Fill time and distance values for each OD pair
for idx, row in OD_pairs_with_time.iterrows():
    o = row['o_fips']
    d = row['d_fips']
    
    # Find all trips with the same origin and destination
    group = replica_lookup[
        (replica_lookup['o_fips'] == o) &
        (replica_lookup['d_fips'] == d)
    ]
    
    if group.empty:
        continue
    
    for mode in modes:
        mode_group = group[group['mode_name'] == mode]
        if not mode_group.empty:
            best_trip = mode_group.iloc[0]  # Pick the first available trip for this mode
            OD_pairs_with_time.at[idx, f'{mode}_time'] = best_trip['time'] / 60    # Convert seconds to minutes
            OD_pairs_with_time.at[idx, f'{mode}_dist'] = best_trip['dist'] / 1000   # Convert meters to kilometers

In [99]:
print(len(OD_pairs_with_time))
OD_pairs_with_time.describe()

8219


Unnamed: 0,auto_time,auto_dist,transit_time,transit_dist,bike_time,bike_dist,walk_time,walk_dist
count,7935.0,7935.0,1638.0,1638.0,492.0,492.0,354.0,354.0
mean,32.203619,27.454702,52.507957,15.737196,30.261009,8.640281,37.401836,3.116952
std,23.745154,35.363527,25.633275,13.103581,19.185694,5.738571,15.311911,1.275672
min,1.0,0.39378,4.616667,0.655806,2.816667,0.188858,1.116667,0.093824
25%,18.1,11.012485,34.8625,7.254229,16.208333,4.432275,27.154167,2.264133
50%,27.583333,18.699579,49.608333,12.074904,25.533333,7.13736,37.95,3.163289
75%,41.175,34.135163,65.4625,19.484764,40.554167,11.932742,48.495833,4.041541
max,461.55,697.398798,183.216667,92.837447,95.233333,28.032171,68.183333,5.681851


In [100]:
# create OD pairs for Google API to collect travel time

import geopandas as gpd
block_groups = gpd.read_file('./data/tl_2024_17_bg/tl_2024_17_bg.shp').to_crs(epsg=4326)

# Calculate centroids for each block group polygon
block_groups['centroid'] = block_groups.geometry.centroid
block_groups['lon'] = block_groups.centroid.x
block_groups['lat'] = block_groups.centroid.y

# Keep only GEOID and coordinates
fips_coords = block_groups[['GEOID', 'lon', 'lat']]

# Suppose your OD_pairs DataFrame contains 'o_fips' and 'd_fips'
# Merge origin coordinates
OD_pairs_for_google = OD_pairs_with_time.merge(
    fips_coords.rename(columns={'GEOID': 'o_fips', 'lon': 'o_lon', 'lat': 'o_lat'}),
    on='o_fips',
    how='left'
)

# Merge destination coordinates
OD_pairs_for_google = OD_pairs_for_google.merge(
    fips_coords.rename(columns={'GEOID': 'd_fips', 'lon': 'd_lon', 'lat': 'd_lat'}),
    on='d_fips',
    how='left'
)
OD_pairs_for_google.dropna(subset=['o_lon', 'o_lat', 'd_lon', 'd_lat'], inplace=True)
OD_pairs_for_google.to_csv('./data/google_api_responses/replica/OD_pairs_for_google_replica.csv', index=False)


  block_groups['centroid'] = block_groups.geometry.centroid

  block_groups['lon'] = block_groups.centroid.x

  block_groups['lat'] = block_groups.centroid.y


## Add travel time from google api

In [101]:
import geopandas as gpd
import pandas as pd
import numpy as np

# Load and prepare shapefile
gdf = gpd.read_file('./data/tl_2024_17_bg/tl_2024_17_bg.shp')
gdf = gdf.to_crs(epsg=4326)  # Ensure it's in WGS84

# Load google data
google_api = pd.read_csv('./data/google_api_responses/replica/combined_google_updated_replica.csv', low_memory=False)

# Replace False and 'false' globally
google_api.replace([False, 'false'], np.nan, inplace=True)

# Replace numeric 0s with np.nan only in selected time/distance columns
cols_to_clean = ['transit_time', 'transit_dist',
                 'auto_time', 'auto_dist',
                 'bike_time', 'bike_dist',
                 'walk_time', 'walk_dist']
for col in cols_to_clean:
    google_api[col] = pd.to_numeric(google_api[col], errors='coerce')  # Ensure numeric
    google_api[col] = google_api[col].replace(0, np.nan)  # Replace 0 with NaN


# Force columns to numeric, coercing errors to NaN
for col in cols_to_clean:
    google_api[col] = pd.to_numeric(google_api[col], errors='coerce')

# Origin matching
gdf_o = gpd.GeoDataFrame(google_api, geometry=gpd.points_from_xy(google_api['o_lon'], google_api['o_lat']), crs="EPSG:4326")
gdf_o_matched = gpd.sjoin(gdf_o, gdf[['GEOID', 'geometry']], how='left', predicate='within')
google_api['o_fips'] = gdf_o_matched['GEOID']

# Destination matching
gdf_d = gpd.GeoDataFrame(google_api, geometry=gpd.points_from_xy(google_api['d_lon'], google_api['d_lat']), crs="EPSG:4326")
gdf_d_matched = gpd.sjoin(gdf_d, gdf[['GEOID', 'geometry']], how='left', predicate='within')
google_api['d_fips'] = gdf_d_matched['GEOID']

# Group by OD and take mean
OD_pairs_with_time2 = google_api[['o_fips', 'd_fips'] + cols_to_clean].groupby(
    ['o_fips', 'd_fips'], as_index=False).mean()

In [102]:
print(len(OD_pairs_with_time2))
OD_pairs_with_time2.describe()

7765


Unnamed: 0,transit_time,transit_dist,auto_time,auto_dist,bike_time,bike_dist,walk_time,walk_dist
count,5856.0,5856.0,283.0,283.0,7380.0,7380.0,7510.0,7510.0
mean,76.527221,25.470042,25.768551,24.068905,86.884867,25.164118,315.067841,22.697027
std,35.142525,20.904722,10.491355,16.525095,83.905949,26.910971,320.539383,23.527224
min,10.15,0.751,6.85,2.652,2.583333,0.751,18.283333,1.291
25%,51.804167,12.09325,18.891667,13.3045,41.133333,10.981,145.104167,10.339
50%,70.783333,19.5675,24.65,19.257,64.275,17.939,229.983333,16.47
75%,94.520833,33.2825,31.108333,31.0035,111.670833,32.28725,400.958333,28.85575
max,526.033333,516.504,67.0,99.662,1457.2,485.613,5821.516667,430.08


## Add travel cost from mydailytravel

In [103]:
# merge place and location to get the lat/lon of each place
place = pd.read_csv('./data/Chicago_Data/MyDailyTravelData/place.csv', low_memory=False)
loc = pd.read_csv('./data/Chicago_Data/MyDailyTravelData/location.csv', low_memory=False)
pl = place[['sampno', 'perno', 'locno', 'arrtime', 
         'deptime', 'fare', 'plaza_total','pkamt']]
l = loc[['sampno', 'locno', 'latitude', 'longitude']]
pl_l = pd.merge(pl, l, left_on=['sampno', 'locno'], right_on=['sampno', 'locno'], how='left')
pl_l['arrtime']= pd.to_datetime(pl_l['arrtime'])
pl_l = pl_l.sort_values(by=['sampno', 'perno', 'arrtime'])

# create travel data of each trips
trips = pl_l.rename(columns={'latitude': 'd_lat', 'longitude': 'd_lon'})
trips = trips.sort_values(by=['sampno', 'perno', 'arrtime']).reset_index(drop=True)

for i in range(1, len(trips)):
    if trips.loc[i-1, 'sampno'] == trips.loc[i, 'sampno'] and trips.loc[i-1, 'perno'] == trips.loc[i, 'perno']:
        trips.loc[i, 'o_lat'] = trips.loc[i-1, 'd_lat']
        trips.loc[i, 'o_lon'] = trips.loc[i-1, 'd_lon']
        

    else:
        trips.loc[i, 'o_lat'] = np.nan
        trips.loc[i, 'o_lon'] = np.nan

# replace positive values with NaN
trips.loc[trips['plaza_total'] < 0, 'plaza_total'] = np.nan
trips.loc[trips['pkamt'] < 0, 'pkamt'] = np.nan
trips.loc[trips['fare'] < 0, 'fare'] = np.nan

# rename columns
trips = trips.rename(columns={
    'plaza_total': 'toll_cost', 
    'pkamt': 'parking_cost',
    'fare': 'transit_cost'
})

In [104]:
import geopandas as gpd
import pandas as pd

# Load block group shapefile and convert to WGS84
gdf = gpd.read_file('./data/tl_2024_17_bg/tl_2024_17_bg.shp').to_crs(epsg=4326)

# Extract relevant columns
OD_cols = ['o_lat', 'o_lon', 'd_lat', 'd_lon', 'toll_cost', 'transit_cost', 'parking_cost']
OD_pairs_with_cost = trips[OD_cols].copy().dropna(subset=['o_lat', 'o_lon', 'd_lat', 'd_lon'])

# Match origin FIPS
gdf_o = gpd.GeoDataFrame(OD_pairs_with_cost,
                         geometry=gpd.points_from_xy(OD_pairs_with_cost['o_lon'], OD_pairs_with_cost['o_lat']),
                         crs="EPSG:4326")
OD_pairs_with_cost['o_fips'] = gpd.sjoin(gdf_o, gdf[['GEOID', 'geometry']], how='left', predicate='within')['GEOID']

# Match destination FIPS
gdf_d = gpd.GeoDataFrame(OD_pairs_with_cost.copy(),
                         geometry=gpd.points_from_xy(OD_pairs_with_cost['d_lon'], OD_pairs_with_cost['d_lat']),
                         crs="EPSG:4326")
OD_pairs_with_cost['d_fips'] = gpd.sjoin(gdf_d, gdf[['GEOID', 'geometry']], how='left', predicate='within')['GEOID']

# Drop unmatched rows
OD_pairs_with_cost = OD_pairs_with_cost.dropna(subset=['o_fips', 'd_fips'])

# Group by OD pairs and compute mean cost
OD_pairs_with_cost = OD_pairs_with_cost.groupby(['o_fips', 'd_fips'], as_index=False)[
    ['toll_cost', 'transit_cost', 'parking_cost']
].mean()


OD_pairs_with_cost.describe()

Unnamed: 0,toll_cost,transit_cost,parking_cost
count,3922.0,1757.0,849.0
mean,1.344446,2.894723,39.138544
std,0.736907,2.613474,85.113985
min,0.2,0.0,0.0
25%,0.75,2.25,3.0
50%,1.2,2.5,10.0
75%,1.6875,2.5,27.583333
max,6.9,60.0,725.0


## merge the replica, OD_pairs_with_time and OD_pairs_with_cost

In [105]:
# Drop trips with missing key information
travel = replica[[
    'activity_id', 'age', 'male', 'numvec', 'higheduc', 'hhsize', 'hhinc',
    'mode_four_kinds', 'mode_three_kinds','purpose', 'o_fips', 'd_fips',
]].dropna().reset_index(drop=True)

# Merge OD-based cost information
travel2 = travel.merge(
    OD_pairs_with_cost[['o_fips', 'd_fips', 'transit_cost', 'toll_cost',"parking_cost"]],
    how='left',
    on=['o_fips', 'd_fips']
)

# Merge OD-based time information 
OD_pairs_time = OD_pairs_with_time.set_index(['o_fips','d_fips'])
OD_pairs_time2 = OD_pairs_with_time2.set_index(['o_fips','d_fips'])
OD_pairs_with_time_full = OD_pairs_time.combine_first(OD_pairs_time2).reset_index()
travel3 = travel2.merge(
    OD_pairs_with_time_full[
        ['o_fips', 'd_fips',
         'transit_time', 'transit_dist',
         'auto_time', 'auto_dist',
         'bike_time', 'bike_dist',
         'walk_time', 'walk_dist']
    ],
    how='left',
    on=['o_fips', 'd_fips']
)

travel4 = travel2.merge(
    OD_pairs_with_time[
        ['o_fips', 'd_fips',
         'transit_time', 'transit_dist',
         'auto_time', 'auto_dist',
         'bike_time', 'bike_dist',
         'walk_time', 'walk_dist']
    ],
    how='left',
    on=['o_fips', 'd_fips']
)

travel3.to_csv('./data/Replica_processed/processed_with_google-original-trips-end-at-imd-2024-spring-thursday.csv', index=False)
travel4.to_csv('./data/Replica_processed/processed-original-trips-end-at-imd-2024-spring-thursday.csv', index=False)
travel4.describe()


Unnamed: 0,activity_id,age,male,numvec,higheduc,hhsize,hhinc,mode_four_kinds,mode_three_kinds,transit_cost,toll_cost,parking_cost,transit_time,transit_dist,auto_time,auto_dist,bike_time,bike_dist,walk_time,walk_dist
count,48971.0,48971.0,48971.0,48971.0,48971.0,48971.0,48971.0,48971.0,48971.0,337.0,0.0,840.0,24548.0,24548.0,48466.0,48466.0,14293.0,14293.0,14412.0,14412.0
mean,9.220889e+18,43.257438,0.400564,1.596394,0.720978,3.238509,139962.1,0.427641,0.325499,2.684718,,49.254738,33.878615,8.749351,21.044955,15.599122,15.94285,4.381283,24.635799,2.053044
std,5.308069e+18,16.17373,0.490018,0.917216,0.448523,2.115697,141981.3,0.944578,0.677078,0.870206,,53.37807,24.488019,9.98149,18.048568,21.228606,16.024746,4.770845,18.78397,1.564118
min,98881380000000.0,3.0,0.0,0.0,0.0,1.0,22.0,0.0,0.0,2.25,,0.0,4.616667,0.655806,1.0,0.39378,2.816667,0.188858,1.116667,0.093824
25%,4.66147e+18,30.0,0.0,1.0,0.0,2.0,54617.0,0.0,0.0,2.25,,3.41,13.733333,1.897885,7.883333,4.184427,4.466667,1.094366,7.416667,0.619397
50%,9.173686e+18,41.0,0.0,2.0,1.0,3.0,102068.0,0.0,0.0,2.25,,6.0,27.55,5.553046,17.75,10.554904,9.933333,2.466875,20.383333,1.698823
75%,1.383357e+19,55.0,1.0,2.0,1.0,4.0,175139.0,0.0,0.0,2.5,,100.0,47.566667,11.767978,29.3,19.728495,20.45,5.740441,39.316667,3.27767
max,1.844626e+19,94.0,1.0,3.0,1.0,17.0,1557700.0,3.0,2.0,5.0,,125.0,183.216667,92.837447,461.55,697.398798,95.233333,28.032171,68.183333,5.681851
