In [3]:
# Core libraries and notebook-wide display configuration.
import pandas as pd
# Raise pandas' display limits so wide frames are not truncated when rendered.
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
# NOTE(review): this silences every warning for the whole session, including any
# pandas chained-assignment or deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
# Render floats with five decimal places.
pd.options.display.float_format = '{:.5f}'.format
import matplotlib.pyplot as plt
import geopandas as gpd

# Create venue (polygon) and shuttle (points) geometry

In [4]:
%%time

# Build the event venue footprint as a single-row GeoDataFrame.
from shapely.geometry import Polygon

# Corner coordinates of the venue outline in degrees; position i of both lists
# describes the same corner.
lat_point_list = [30.131962, 30.146337, 30.140386, 30.122629]
lon_point_list = [-97.647388, -97.635686, -97.619652, -97.631465]

# Shapely takes (x, y) pairs, i.e. (longitude, latitude).
polygon_geom = Polygon(list(zip(lon_point_list, lat_point_list)))
gdf_venue = gpd.GeoDataFrame(
    geometry=[polygon_geom],
    index=[0],
    crs='epsg:4326',
).assign(name='the Circuit of the Americas')
gdf_venue

CPU times: user 7.15 ms, sys: 0 ns, total: 7.15 ms
Wall time: 6.8 ms


Unnamed: 0,geometry,name
0,"POLYGON ((-97.64739 30.13196, -97.63569 30.14634, -97.61965 30.14039, -97.63146 30.12263, -97.64739 30.13196))",the Circuit of the Americas


In [5]:
# Build a GeoDataFrame with one point per shuttle pick-up location.
import shapely.geometry as geom

# Create Shapely point geometries
point_waterloo_park = geom.Point(-97.736285, 30.273726) # (longitude, latitude)
point_barton_creek_square = geom.Point(-97.805046, 30.257509)
point_expo_center = geom.Point(-97.622544, 30.297062)
# point_ridehailing = geom.Point(-97.614135, 30.178718)

# The order of this list must match the order of the names in 'Shuttle_Location' below.
point_shuttle_list = [point_waterloo_park,point_barton_creek_square,point_expo_center] #,point_ridehailing
gdf_shuttle = gpd.GeoDataFrame({'Shuttle_Location': ['Shuttle_Waterloo_Park', 'Shuttle_Barton_Creek_Square', 'Shuttle_Expo_Center'], #, 'Uber_DelValle_HighSchool'
                        'geometry': point_shuttle_list},
                        crs="EPSG:4326")
# gdf_shuttle = gdf_shuttle.to_crs("EPSG:3857") 
gdf_shuttle

Unnamed: 0,Shuttle_Location,geometry
0,Shuttle_Waterloo_Park,POINT (-97.73628 30.27373)
1,Shuttle_Barton_Creek_Square,POINT (-97.80505 30.25751)
2,Shuttle_Expo_Center,POINT (-97.62254 30.29706)


# Load INRIX Road network data

In [6]:
# Read the INRIX TMC segment table and keep only the columns used downstream.
df_road = pd.read_csv("data/TMC_Identification.csv", sep=',', header=0).loc[
    :,
    ['tmc_code', 'intersection', 'start_latitude', 'start_longitude',
     'end_latitude', 'end_longitude', 'miles'],
]
print('total number of road segments:', len(df_road))
df_road.head(3)

total number of road segments: 4460


Unnamed: 0,tmc_code,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles
0,112P13033,W MARTIN LUTHER KING JR BLVD,30.28172,-97.74193,30.28232,-97.74207,0.04249
1,112+08908,FM-20,30.11066,-97.41049,30.11082,-97.37323,2.22783
2,112+16538,GATTIS SCHOOL RD,30.49057,-97.67514,30.49351,-97.67573,0.20831


In [7]:
%%time
from shapely.geometry import LineString
from geopandas import GeoDataFrame


def _segment_line(row):
    """Return the straight line joining a segment's start and end coordinates."""
    start = (row['start_longitude'], row['start_latitude'])
    end = (row['end_longitude'], row['end_latitude'])
    return LineString([start, end])


# Each road segment is represented by a two-vertex (lon, lat) line.
df_road['geometry'] = df_road.apply(_segment_line, axis=1)

# Promote the table to a GeoDataFrame holding the road network.
gdf_road = GeoDataFrame(df_road, geometry='geometry',crs="EPSG:4326")

CPU times: user 126 ms, sys: 4.73 ms, total: 131 ms
Wall time: 127 ms


In [8]:
gdf_road.head(2)

Unnamed: 0,tmc_code,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles,geometry
0,112P13033,W MARTIN LUTHER KING JR BLVD,30.28172,-97.74193,30.28232,-97.74207,0.04249,"LINESTRING (-97.74193 30.28172, -97.74207 30.28232)"
1,112+08908,FM-20,30.11066,-97.41049,30.11082,-97.37323,2.22783,"LINESTRING (-97.41049 30.11066, -97.37323 30.11082)"


In [9]:
# gdf_road.explore()

### Export edge dataframe (only needs to be run once)

In [15]:
%%time

# Tolerance used to decide that two segments touch, expressed in units of the
# projected CRS below.
buffer_distance = 10

# Work on a projected copy so that the buffer is taken in linear units.
gdf_roads = gdf_road.to_crs(epsg=3857)
gdf_roads['buffered_geometry'] = gdf_roads.geometry.buffer(buffer_distance)

# Pair every buffered segment with every raw segment it intersects, then discard
# the pairs in which a segment was matched with itself.
buffered_connectivity = gpd.sjoin(
    gdf_roads.set_geometry('buffered_geometry'),
    gdf_roads,
    how='inner',
    predicate='intersects',
).loc[lambda pairs: pairs['tmc_code_left'] != pairs['tmc_code_right']]

# Keep only the two TMC codes of each pair, named as graph edge endpoints.
edge_list = buffered_connectivity[['tmc_code_left', 'tmc_code_right']].rename(
    columns=dict(tmc_code_left='source', tmc_code_right='target')
)

print("Edge List with Buffer:")
edge_list

Edge List with Buffer:
CPU times: user 358 ms, sys: 1.02 ms, total: 359 ms
Wall time: 358 ms


Unnamed: 0,source,target
0,112P13033,112P13162
2059,112-13162,112P13162
2061,112-13161,112P13162
2532,112N13033,112P13162
2980,112N13162,112P13162
...,...,...
3876,112P17123,112P17125
4186,112-19750,112-19751
4189,112-19752,112-19751
4190,112-19751,112-19752


In [17]:
edge_list.to_csv('./output/INRIX_network_edge_connectivity.csv',index=False)

# Load OSM driving road network of Austin by bounding box
## 1. Load OSM road network attributes using OSMnx

In [125]:
# %%time
# import osmnx as ox

# # pandana method does not return road attributes so use osmnx to retrieve them
# G_osm = ox.graph_from_bbox(bbox=(30.584984, 30.039047, -97.389445, -97.996048), network_type='drive') 

# # Calculate and add compass bearing attributes to all graph edges.
# G_osm = ox.bearing.add_edge_bearings(G_osm)

# nodes_osm, edges_osm = ox.graph_to_gdfs(G_osm)

CPU times: user 56.8 s, sys: 0 ns, total: 56.8 s
Wall time: 56.7 s


In [133]:
# nodes_osm.head(2) # node data seems not helpful

Unnamed: 0_level_0,y,x,street_count,highway,ref,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
151371439,30.20374,-97.98883,3,,,POINT (-97.98883 30.20374)
151371447,30.20425,-97.98779,1,turning_circle,,POINT (-97.98779 30.20425)


In [135]:
# edges_osm.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,osmid,name,highway,oneway,reversed,length,bearing,geometry,ref,bridge,lanes,maxspeed,access,tunnel,width,junction
u,v,key,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
151371439,151462706,0,15287110,Heritage Drive,residential,False,False,88.55,341.4,"LINESTRING (-97.98883 30.20374, -97.98913 30.20449)",,,,,,,,
151371439,151371447,0,15277135,Hideaway Cove,residential,False,False,116.618,60.4,"LINESTRING (-97.98883 30.20374, -97.98848 30.20385, -97.98836 30.20390, -97.98815 30.20401, -97.98793 30.20416, -97.98779 30.20425)",,,,,,,,


In [53]:
# nodes_osm.to_csv('./output/osm_nodes.csv')
# edges_osm.to_csv('./output/osm_edges.csv')

In [11]:
# Reload the OSM edge table from the CSV cache instead of re-downloading it.
# The first two CSV columns become the index; geometry is still plain text here
# and is parsed with GeoSeries.from_wkt in the OSM preprocessing cell.
edges_osm = pd.read_csv('./output/osm_edges.csv', index_col=[0,1])

## 2. Load osm network using pandana because it is faster when computing network distance compared with OSMnx

In [40]:
# %%time
# ## Wall time: 1min
from pandana.loaders import osm
# network = osm.pdna_network_from_bbox(30.039047, -97.996048, 30.584984, -97.389445, network_type='drive') # pandana method does not return road attributes so use osmnx to retrieve them
# network.nodes_df.to_csv('./output/nodes.csv')
# network.edges_df.to_csv('./output/edges.csv')
# network.edges_df.head()

In [60]:
%%time
import pandana
# load existing network data
# Nodes are indexed by the first CSV column; edges by the first two CSV columns.
nodes = pd.read_csv('./output/nodes.csv', index_col=0)
print(nodes.shape[0])
edges = pd.read_csv('./output/edges.csv', index_col=[0,1])
# Build the routing network from node coordinates, edge endpoints, and the
# 'distance' column as the single edge weight.
# NOTE(review): presumably 'distance' is in metres (later code divides path
# lengths by 1000 to get km) — confirm against the file that produced edges.csv.
network = pandana.Network(nodes['x'], nodes['y'], 
                          edges['from'], edges['to'], edges[['distance']])

64416
CPU times: user 18.4 s, sys: 181 ms, total: 18.6 s
Wall time: 2.64 s


# Feature engineering 
Dynamic features: event timing, attendance size, proximity to the event venue, proximity to shuttle services, hotel occupancy rates

Static features: population density, hotel density

## 1. Airbnb count (spatial buffers)
Spatial join airbnb locations and road segments to get number of airbnb near roads

In [10]:
# Load the Airbnb listings and report how many there are.
df_airbnb = pd.read_csv("data/airbnb_listings_202312.csv", sep=',', header=0)
print('total number of airbnb:', len(df_airbnb))

# Turn the listing table into a GeoDataFrame of points.
from shapely.geometry import Point
from geopandas import GeoDataFrame

geometry = [Point(lon, lat) for lon, lat in zip(df_airbnb.longitude, df_airbnb.latitude)]
# Coordinates are given in degrees (EPSG:4326); the result is stored in EPSG:3857.
gdf_airbnb = GeoDataFrame(df_airbnb, crs="EPSG:4326", geometry=geometry).to_crs("EPSG:3857")

total number of airbnb: 15419


In [31]:
%%time
# Count the Airbnb listings that fall within `buffer_distance` metres of each road
# segment and attach the count to the road table as `airbnb_count`.
buffer_distance = 500  # Buffer distance in meters

# Buffer in a local UTM projection: its units are true metres. EPSG:3857 (Web
# Mercator) stretches distances by roughly 1/cos(latitude), so a 500-unit buffer
# there covers only about 430 m on the ground at Austin's latitude.
metric_crs = gdf_road.estimate_utm_crs()

# to_crs returns a new frame, so adding a column does not write into a slice of gdf_airbnb.
gdf_airbnb_need = gdf_airbnb[['id', 'geometry']].to_crs(metric_crs)
gdf_airbnb_need['airbnb_count'] = 1

gdf_polyline_buffered = gdf_road.to_crs(metric_crs)
gdf_polyline_buffered["geometry"] = gdf_polyline_buffered.geometry.buffer(buffer_distance)

# Left join keeps every road; roads without a listing get NaN in `airbnb_count`,
# which count() below ignores, so they end up with a count of 0.
gdf_road_w_airbnb = gpd.sjoin(gdf_polyline_buffered, gdf_airbnb_need, predicate="intersects", how="left")
print(gdf_road_w_airbnb.tmc_code.unique().shape[0])
point_counts = gdf_road_w_airbnb.groupby("tmc_code").airbnb_count.count().reset_index()

# Merge on the explicit key and fill only the new column, so missing values in
# unrelated road attributes are not overwritten with 0.
gdf_road_merged = gdf_road.merge(point_counts, on='tmc_code', how='left')
gdf_road_merged['airbnb_count'] = gdf_road_merged['airbnb_count'].fillna(0)

4460
CPU times: user 443 ms, sys: 0 ns, total: 443 ms
Wall time: 442 ms


In [104]:
# gdf_road_merged.explore()

## 2. Network distance to venue and shuttles

In [32]:
# Attach every point of interest (three shuttle stops and the venue centroid) to the
# road table as a constant geometry column, so the network-distance helper can read
# its coordinates from df[col].x / df[col].y.
venue_centroid = gdf_venue.iloc[0].geometry.centroid

poi_points = {
    shuttle_name: gdf_shuttle[gdf_shuttle['Shuttle_Location'] == shuttle_name].geometry.iloc[0]
    for shuttle_name in ['Shuttle_Waterloo_Park', 'Shuttle_Barton_Creek_Square', 'Shuttle_Expo_Center']
}
poi_points['venue_centroid'] = venue_centroid

for poi_name, poi_point in poi_points.items():
    # Build the column directly from the geometry (no round trip through WKT text)
    # and index it like the road table, so values align whatever the table's index is.
    gdf_road_merged[poi_name] = gpd.GeoSeries(
        [poi_point] * len(gdf_road_merged), index=gdf_road_merged.index
    )

In [33]:
# first, identify the nearest node (of pandana network) to each lon lat coordinate of road segments
# Both results are plain arrays in gdf_road_merged's current row order; they must be
# recomputed if rows of gdf_road_merged are later filtered or reordered.
road_start_nodes = network.get_node_ids(gdf_road_merged.start_longitude, gdf_road_merged.start_latitude).values
road_end_nodes = network.get_node_ids(gdf_road_merged.end_longitude, gdf_road_merged.end_latitude).values

In [34]:
# Network distance: use the start and end location of each road segment to calculate the shortest path distance (km) and take the average
# Distance Unit: km
def compute_average_distance_to_poi(df, col):
    """Add the mean network distance from each road segment to a point of interest.

    For every row the shortest-path length is computed from the segment's start
    node and from its end node to the network node nearest the point stored in
    ``df[col]``; the two lengths are divided by 1000 and averaged.

    Relies on the notebook globals ``network`` (pandana network) and
    ``road_start_nodes`` / ``road_end_nodes``, which must list the segments in the
    same row order as ``df``.

    Parameters
    ----------
    df : GeoDataFrame
        Road segments; ``df[col]`` must expose ``.x`` and ``.y``.
    col : str
        Name of the point-of-interest column.

    Returns
    -------
    The same ``df`` object, with a new column ``'distance_to_' + col``.

    Raises
    ------
    ValueError
        If the number of path lengths differs from the number of rows in ``df``.
    """
    poi_nodes = network.get_node_ids(df[col].x, df[col].y).values

    # Index the results like df: a bare pd.Series(list) is labelled 0..n-1 and would
    # be aligned by label on assignment, silently producing NaN or mismatched rows
    # whenever df does not carry a default RangeIndex.
    # Dividing by 1000 converts to km, assuming the edge weights are metres — TODO confirm.
    start_km = pd.Series(network.shortest_path_lengths(road_start_nodes, poi_nodes), index=df.index) / 1000
    end_km = pd.Series(network.shortest_path_lengths(road_end_nodes, poi_nodes), index=df.index) / 1000

    # NOTE(review): pandana appears to return a large sentinel length for unreachable
    # node pairs, which may explain the >100 km "outliers" filtered later — confirm.
    df['distance_to_' + col] = (start_km + end_km) / 2
    return df

In [35]:
# Add one averaged network-distance column per point of interest.
for poi_col in ('venue_centroid',
                'Shuttle_Waterloo_Park',
                'Shuttle_Barton_Creek_Square',
                'Shuttle_Expo_Center'):
    gdf_road_merged = compute_average_distance_to_poi(gdf_road_merged, poi_col)

In [41]:
# Keep only segments whose averaged distance to the venue is below 100 (outliers
# above that are discarded), then remove two named segments far from the city.
gdf_road_merged = (
    gdf_road_merged
    .loc[lambda roads: roads['distance_to_venue_centroid'] < 100]
    .loc[lambda roads: ~roads['tmc_code'].isin(['112+53808', '112-53807'])]
)

print(len(gdf_road_merged))

4434


In [42]:
# # these are outliers, for some reason, the distance computed is wrong. Exclude this portion of data.
# vis_example = gdf_road_merged[gdf_road_merged.distance_to_venue_centroid>100]
# print(vis_example.shape[0])
# vis_example.explore()

In [9]:
gdf_road_merged.explore()

## 3. Matching with OSM road data to enrich network attribute

In [46]:
import numpy as np 

# Function to calculate bearing between two points (start and end of a road segment)
def calculate_bearing(start_lat, start_lon, end_lat, end_lon):
    """Return the initial compass bearing, in degrees, from a start to an end point.

    All four arguments are in decimal degrees and may be scalars or array-likes
    accepted by NumPy. The result lies in [0, 360).
    """
    lat1, lon1, lat2, lon2 = (
        np.deg2rad(angle) for angle in (start_lat, start_lon, end_lat, end_lon)
    )
    d_lon = lon2 - lon1

    # The two components that are passed to arctan2 to obtain the heading.
    east = np.sin(d_lon) * np.cos(lat2)
    north = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(d_lon)

    # arctan2 yields (-180, 180]; shift and wrap into [0, 360).
    return np.mod(np.rad2deg(np.arctan2(east, north)) + 360, 360)
# The bearing value represents the direction from the starting point to the destination, measured clockwise from true north (0° is north, 90° is east, etc.).

In [47]:
# Compute the bearing of every road segment in a single vectorised call:
# calculate_bearing is built only from NumPy ufuncs and arithmetic, so it accepts
# whole columns and returns a Series aligned with gdf_road_merged's index.
gdf_road_merged['segment_bearing'] = calculate_bearing(
    gdf_road_merged['start_latitude'],
    gdf_road_merged['start_longitude'],
    gdf_road_merged['end_latitude'],
    gdf_road_merged['end_longitude'],
)

In [12]:
# Turn the cached OSM edge table into a projected GeoDataFrame keyed by `osm_id`.
edges_osm_need = edges_osm.reset_index()
edges_osm_need = gpd.GeoDataFrame(
    # geometry is stored as WKT text in the cache; parse it before building the frame
    edges_osm_need.assign(geometry=gpd.GeoSeries.from_wkt(edges_osm_need['geometry'])),
    geometry='geometry',
    crs="EPSG:4326",
)

# osm_id is the plain string concatenation of osmid, u and v (no separator).
edges_osm_need['osm_id'] = (
    edges_osm_need['osmid'].astype(str)
    + edges_osm_need['u'].astype(str)
    + edges_osm_need['v'].astype(str)
)

# Drop the columns not needed downstream, rename `highway`, and reproject.
edges_osm_need = (
    edges_osm_need
    .drop(columns=['u', 'v', 'osmid', 'key', 'ref', 'reversed', 'bridge', 'access', 'tunnel'])
    .rename(columns={"highway": "road_type"})
    .to_crs("EPSG:3857")
)
edges_osm_need.head(2)

Unnamed: 0,name,road_type,oneway,length,bearing,geometry,lanes,maxspeed,width,junction,osm_id
0,Heritage Drive,residential,False,88.55,341.4,"LINESTRING (-10908067.216 3529765.696, -10908099.955 3529862.906)",,,,,15287110151371439151462706
1,Hideaway Cove,residential,False,116.618,60.4,"LINESTRING (-10908067.216 3529765.696, -10908027.363 3529780.251, -10908014.562 3529785.790, -10907990.850 3529800.731, -10907966.138 3529820.439, -10907950.887 3529831.903)",,,,,15277135151371439151371447


In [70]:
# edges_osm_need.explore()

In [307]:
%%time
# Before spatial join, find the closest direction
# First, for each INRIX link, find the OSM links with a similar angle: abs(delta angle) smaller than 5 or greater than 355 degrees
# Then, for each INRIX link, use the closest such OSM link (distance measured from the whole INRIX segment geometry, within 5 CRS units) as the matching target

# Match every INRIX segment to the nearest OSM edge that runs in (almost) the same
# direction and lies within MAX_MATCH_DISTANCE of the segment.
MAX_BEARING_DIFF = 5    # degrees; 360 - 5 covers the wrap-around at north
MAX_MATCH_DISTANCE = 5  # units of EPSG:3857
# NOTE(review): EPSG:3857 units are only approximately metres (stretched by about
# 1/cos(latitude)), so this is somewhat less than 5 m on the ground.

gdf_road_merged_copy = gdf_road_merged.copy().to_crs("EPSG:3857")
# object dtype: the ids are strings, and np.NAN no longer exists in NumPy 2.
gdf_road_merged_copy['osm_id'] = pd.Series(np.nan, index=gdf_road_merged_copy.index, dtype=object)

# Built once; lets each segment look only at OSM edges near it instead of all of them.
osm_sindex = edges_osm_need.sindex

for idx, road_row in gdf_road_merged_copy.iterrows():
    if idx % 100 == 0:
        print(f"Processing {idx}/{len(gdf_road_merged_copy)}")

    # Bounding-box prefilter. The search area is deliberately larger than the match
    # distance, so every edge within MAX_MATCH_DISTANCE is guaranteed to be included;
    # the exact distance test below makes the final decision. Positions are sorted
    # so that ties are resolved in table order, as when scanning the full table.
    candidate_pos = np.sort(osm_sindex.query(road_row.geometry.buffer(2 * MAX_MATCH_DISTANCE)))
    possible_matches = edges_osm_need.iloc[candidate_pos]

    # Filter edges with similar directions
    bearing_gap = (possible_matches['bearing'] - road_row['segment_bearing']).abs()
    possible_matches = possible_matches[
        (bearing_gap < MAX_BEARING_DIFF) | (bearing_gap > 360 - MAX_BEARING_DIFF)
    ]
    if possible_matches.empty:
        continue

    # Vectorised distance from the segment to each remaining candidate edge.
    edge_distance = possible_matches.geometry.distance(road_row.geometry)
    nearby = edge_distance[edge_distance < MAX_MATCH_DISTANCE]

    # Assign the nearest edge's osm_id if any candidate is close enough
    if not nearby.empty:
        gdf_road_merged_copy.loc[idx, 'osm_id'] = edges_osm_need.loc[nearby.idxmin(), 'osm_id']

Processing 0/4460
Processing 100/4460
Processing 200/4460
Processing 300/4460
Processing 400/4460
Processing 500/4460
Processing 600/4460
Processing 700/4460
Processing 800/4460
Processing 900/4460
Processing 1000/4460
Processing 1100/4460
Processing 1200/4460
Processing 1300/4460
Processing 1400/4460
Processing 1500/4460
Processing 1600/4460
Processing 1700/4460
Processing 1800/4460
Processing 1900/4460
Processing 2000/4460
Processing 2100/4460
Processing 2200/4460
Processing 2300/4460
Processing 2400/4460
Processing 2500/4460
Processing 2600/4460
Processing 2700/4460
Processing 2800/4460
Processing 2900/4460
Processing 3000/4460
Processing 3100/4460
Processing 3200/4460
Processing 3300/4460
Processing 3400/4460
Processing 3500/4460
Processing 3600/4460
Processing 3700/4460
Processing 3800/4460
Processing 3900/4460
Processing 4000/4460
Processing 4100/4460
Processing 4200/4460
Processing 4300/4460
Processing 4400/4460
CPU times: user 6min 43s, sys: 1.17 s, total: 6min 44s
Wall time: 6

In [48]:
# gdf_road_merged_copy.to_csv('output/temp_merge_inrix_osm.csv',index=False)
gdf_road_merged_copy = pd.read_csv('output/temp_merge_inrix_osm.csv')
gdf_road_merged_copy.head(2)

In [49]:
# Attach the OSM attributes of the matched edge to every INRIX segment.
# osm_id is a concatenation of osmid/u/v with the parallel-edge key dropped, so it
# is not guaranteed to be unique; keep one attribute row per osm_id so a road
# segment can never be duplicated by the join (validate enforces this).
# Inner join: segments without a matched osm_id are dropped.
osm_attributes = (
    edges_osm_need[['osm_id', 'road_type', 'oneway', 'lanes', 'maxspeed']]
    .drop_duplicates(subset='osm_id')
)
gdf_road_merged_copy_mergeOSM = gdf_road_merged_copy.merge(
    osm_attributes, on='osm_id', how='inner', validate='many_to_one'
)

In [55]:
import numpy as np
import ast

def process_lane(val):
    """Convert a raw OSM ``lanes`` value to a single number.

    Parameters
    ----------
    val : int, float, str or other
        A number, a numeric string such as ``"2"``, or a stringified list such
        as ``"['2', '3']"``.

    Returns
    -------
    The number itself for numeric input (NaN is passed through), the float value
    of a numeric string, the mean of a stringified list of integers, or ``None``
    when the value cannot be interpreted.
    """
    # Numbers pass through untouched; NumPy scalars are listed explicitly because
    # np.int64 is not a subclass of Python's int.
    if isinstance(val, (int, float, np.integer, np.floating)):
        return val
    if not isinstance(val, str):
        return None

    text = val.strip()
    try:
        if text.startswith("["):
            lane_counts = [int(x) for x in ast.literal_eval(text)]
            if not lane_counts:  # "[]" carries no information
                return None
            return np.mean(lane_counts)
        return float(text)
    except (ValueError, SyntaxError, TypeError):
        # Malformed list or non-numeric text: treat as missing instead of raising.
        return None

# Apply the function to clean and process the 'lane' column
# The raw 'lanes' column is kept; the parsed value goes into a new 'lanes_mean' column.
gdf_road_merged_copy_mergeOSM['lanes_mean'] = gdf_road_merged_copy_mergeOSM['lanes'].apply(process_lane)
# gdf_road_merged_copy_mergeOSM.head(10)

In [56]:
# gdf_road_merged_copy_mergeOSM.head(5)['lanes'].apply(process_lane)
# gdf_road_merged_copy_mergeOSM.maxspeed.unique()
# gdf_road_merged_copy_mergeOSM[gdf_road_merged_copy_mergeOSM['lanes_mean'].isna()].lanes.unique()

In [57]:
# import re

# def process_max_speed(value):
#     if isinstance(value, str):
#         # Extract all numeric values from the string
#         numbers = list(map(int, re.findall(r'\d+', value)))
#         if numbers:
#             return max(numbers)  # Return the maximum value
#     return None

# # Apply the function to the max_speed column
# df_traffic['max_speed'] = df_traffic['maxspeed'].apply(process_max_speed)
# df_traffic.max_speed.value_counts()

def extract_max_speed(speed):
    """Extract the highest numeric speed from a raw OSM ``maxspeed`` value.

    Parameters
    ----------
    speed : str or other
        A single value such as ``"35 mph"`` or a stringified list such as
        ``"['35 mph', '45 mph']"``.

    Returns
    -------
    int or float
        The largest speed found as an ``int``, or ``np.nan`` when ``speed`` is not
        a string (NaN, None, numbers, lists, ...) or cannot be parsed.
    """
    # Only strings are parsed. Checking the type first also avoids calling
    # pd.isna on a list, whose array result has no single truth value.
    if not isinstance(speed, str):
        return np.nan
    try:
        if speed.startswith("["):  # list formatted as a string
            speed_list = ast.literal_eval(speed)
        else:
            speed_list = [speed]  # wrap single values for uniform handling
        # str() lets numeric list items such as "[35, 45]" go through the same path.
        return max(int(str(s).replace(' mph', '').strip()) for s in speed_list)
    except (ValueError, SyntaxError, TypeError):
        # Unparsable text (e.g. "signals"), a malformed list, or an empty list.
        return np.nan

# Apply function to create new column
# The raw 'maxspeed' text is kept; the parsed number goes into 'maxspeed_value'.
gdf_road_merged_copy_mergeOSM['maxspeed_value'] = gdf_road_merged_copy_mergeOSM['maxspeed'].apply(extract_max_speed)
# Print the distinct parsed values as a sanity check (NaN marks unparsed or missing entries).
print(gdf_road_merged_copy_mergeOSM.maxspeed_value.unique())

[30. 65. 35. nan 40. 45. 55. 60. 50. 25. 70. 75. 80. 85.]


In [69]:
# gdf_road_merged_copy_mergeOSM.head()
# Default speed limit (mph) per OSM road type, used when no maxspeed was parsed.
hwy_speeds = {
    "motorway": 70,          # Typically 70 mph in rural areas, 55-65 mph in urban areas
    "motorway_link": 50,     # Lower than main motorways, often 45-55 mph
    "trunk": 65,             # Generally 55-65 mph depending on urban/rural setting
    "trunk_link": 50,        # Similar to trunk but slightly lower, around 45-55 mph
    "primary": 55,           # Typically 55 mph in rural areas, 25-45 mph in urban settings
    "primary_link": 45,      # Slightly lower than primary roads, often 25-45 mph
    "secondary": 50,         # Commonly 45-55 mph
    "secondary_link": 45,    # Slightly lower than secondary roads
    # The tertiary defaults were 60 / 55, contradicting their own "35-45 mph" note
    # and ranking tertiary roads above primary and secondary ones.
    "tertiary": 40,          # Usually around 35-45 mph
    "tertiary_link": 35,     # Slightly lower than tertiary roads
    "residential": 35,       # Standard residential speed limit
    "unclassified": 30       # Typically 25-35 mph in less developed areas
}


def fill_missing_speed(row):
    """Return the row's speed limit, falling back to the default for its road type.

    Parameters
    ----------
    row : mapping
        Must provide ``'maxspeed_value'`` and ``'road_type'``.

    Returns
    -------
    ``row['maxspeed_value']`` when it is present; otherwise the ``hwy_speeds``
    entry for ``row['road_type']``, or ``np.nan`` if the road type has no default.
    """
    # pd.isna also accepts None, on which np.isnan would raise TypeError.
    if pd.isna(row['maxspeed_value']):
        return hwy_speeds.get(row['road_type'], np.nan)
    return row['maxspeed_value']

# Apply function to fill NaN values in 'maxspeed_value'
# Runs before link types are collapsed into their parent type below, so the
# link-specific defaults in hwy_speeds are the ones applied here.
gdf_road_merged_copy_mergeOSM['maxspeed_value'] = gdf_road_merged_copy_mergeOSM.apply(fill_missing_speed, axis=1)

# Sanity check: list the road types that still have no speed (empty means all were filled).
print(gdf_road_merged_copy_mergeOSM[gdf_road_merged_copy_mergeOSM['maxspeed_value'].isna()].road_type.unique())

[]


In [71]:
# Cast road_type to plain strings so the exact-string comparisons in the following
# cells work on every row.
# NOTE(review): astype(str) would turn any missing road_type into the literal text 'nan' — confirm none exist.
gdf_road_merged_copy_mergeOSM.road_type = gdf_road_merged_copy_mergeOSM.road_type.astype(str)
gdf_road_merged_copy_mergeOSM.road_type.value_counts()

secondary         1234
tertiary          609 
motorway          482 
primary           436 
motorway_link     174 
residential       132 
trunk             85  
unclassified      38  
primary_link      18  
secondary_link    13  
trunk_link        8   
tertiary_link     2   
Name: road_type, dtype: int64

In [72]:
# Some OSM edges carry several highway tags, stored here as the text of a Python
# list. Map each such combination to the single road type chosen for it.
mixed_road_type_map = {
    "['motorway', 'motorway_link']": 'motorway',
    "['secondary_link', 'secondary']": 'secondary',
    "['trunk', 'motorway']": 'motorway',
    "['tertiary', 'motorway_link']": 'tertiary',
    "['trunk_link', 'tertiary']": 'tertiary',
    "['secondary', 'motorway_link']": 'secondary',
    "['trunk', 'primary']": 'primary',
    "['trunk', 'secondary']": 'secondary',
    "['primary', 'motorway']": 'primary',
}
gdf_road_merged_copy_mergeOSM['road_type'] = (
    gdf_road_merged_copy_mergeOSM['road_type'].replace(mixed_road_type_map)
)

In [84]:
# Fold every "*_link" road type into its parent road type.
link_to_parent_type = {
    "motorway_link": 'motorway',
    "primary_link": 'primary',
    "secondary_link": 'secondary',
    "trunk_link": 'trunk',
    "tertiary_link": 'tertiary',
}
gdf_road_merged_copy_mergeOSM['road_type'] = (
    gdf_road_merged_copy_mergeOSM['road_type'].replace(link_to_parent_type)
)
gdf_road_merged_copy_mergeOSM.road_type.value_counts()

secondary       1247
motorway        656 
tertiary        611 
primary         454 
residential     132 
trunk           93  
unclassified    38  
Name: road_type, dtype: int64

In [396]:
# gdf_road_merged_copy_mergeOSM.loc[gdf_road_merged_copy_mergeOSM.road_type=="motorway_link", 'road_type'] = 'motorway'
# gdf_road_merged_copy_mergeOSM.loc[gdf_road_merged_copy_mergeOSM.road_type=="primary_link", 'road_type'] = ''
# gdf_road_merged_copy_mergeOSM.loc[gdf_road_merged_copy_mergeOSM.road_type=="secondary_link", 'road_type'] = ''
# gdf_road_merged_copy_mergeOSM.loc[gdf_road_merged_copy_mergeOSM.road_type=="trunk_link", 'road_type'] = ''
# gdf_road_merged_copy_mergeOSM.loc[gdf_road_merged_copy_mergeOSM.road_type=="tertiary_link", 'road_type'] = ''

In [86]:
%%time
gdf_road_merged_copy_mergeOSM.to_csv('output/temp_merge_inrix_osm_final.csv',index=False)

CPU times: user 66.4 ms, sys: 3.6 ms, total: 70 ms
Wall time: 69 ms


# Final: Merge with traffic data for tree-based ML prediction model

In [87]:
# Join the 10-minute delay observations with the per-segment road features, then
# discard the geometry columns and the raw OSM text columns.
df_traffic = pd.read_csv("output/austin_2022_GP_10min_interval_delaydifference.csv", sep=',', header=0)
df_traffic_merged = (
    df_traffic
    .merge(gdf_road_merged_copy_mergeOSM, on='tmc_code')
    .drop(columns=['geometry', 'Shuttle_Waterloo_Park', 'Shuttle_Barton_Creek_Square',
                   'Shuttle_Expo_Center', 'venue_centroid', 'lanes', 'maxspeed'])
)

In [88]:
# Parse the date column and derive the weekday number from it (the recorded output
# shows values 0-6).
df_traffic_merged = df_traffic_merged.assign(
    date=lambda obs: pd.to_datetime(obs['date']),
    day_of_week=lambda obs: obs['date'].dt.weekday,
)

In [89]:
print(df_traffic_merged.day_of_week.unique())
print(df_traffic_merged.date.unique())

[0 1 2 3 4 5 6]
['2022-10-17T00:00:00.000000000' '2022-10-18T00:00:00.000000000'
 '2022-10-19T00:00:00.000000000' '2022-10-20T00:00:00.000000000'
 '2022-10-21T00:00:00.000000000' '2022-10-22T00:00:00.000000000'
 '2022-10-23T00:00:00.000000000']


In [90]:
df_traffic_merged.head(2) 

Unnamed: 0,tmc_code,hour_min,delay_baseline,delay_focus,delay_difference,date,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles,airbnb_count,distance_to_venue_centroid,distance_to_Shuttle_Waterloo_Park,distance_to_Shuttle_Barton_Creek_Square,distance_to_Shuttle_Expo_Center,segment_bearing,osm_id,road_type,oneway,lanes_mean,maxspeed_value,day_of_week
0,112+04760,00:00,-2.22226,3.38907,5.61133,2022-10-17,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,True,1.5,55.0,0
1,112+04760,00:10,-2.4344,0.28807,2.72247,2022-10-17,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,True,1.5,55.0,0


In [91]:
%%time
df_traffic_merged.to_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML.csv',index=False)

CPU times: user 56.7 s, sys: 869 ms, total: 57.5 s
Wall time: 59 s


# Merge with travel demand feature

In [9]:
df_traffic_merged = pd.read_csv("output/austin_2022_GP_10min_interval_delaydifference_with_features_forML.csv", sep=',', header=0)
print(df_traffic_merged.shape[0])

3236871


In [10]:
df_travel_demand_each10min = pd.read_csv('./output/travel_demand_inrix_road_final.csv')
print(df_travel_demand_each10min.shape[0])
df_travel_demand_each10min.head(2)

4050668


Unnamed: 0,tmc_code,hour_min,no_devices_baseline,no_devices_focus,demand_diff,date
0,112+04758,00:00,1,2,1,2022-10-17
1,112+04758,00:00,1,19,18,2022-10-17


In [11]:
# Attach the travel-demand features to every (segment, date, 10-minute slot) row.
# NOTE(review): the recorded outputs show the row count growing from 3236871 to
# 3549832 across this left join, and the demand table preview shows two rows with
# the same (tmc_code, hour_min, date). Duplicate demand keys therefore duplicate
# traffic observations — decide how demand rows should be aggregated before merging.
df_traffic_merged2 = df_traffic_merged.merge(df_travel_demand_each10min, on=['tmc_code','date','hour_min'], how='left')

# Replace NaN with 0 by assigning the result back: a chained
# `df[col].fillna(..., inplace=True)` is deprecated in pandas and leaves the frame
# unchanged under copy-on-write.
# NOTE(review): 'no_devices' is not among the columns shown in either input
# preview — confirm which input provides it.
df_traffic_merged2['no_devices'] = df_traffic_merged2['no_devices'].fillna(0)
print(df_traffic_merged2.shape[0])

3549832


In [13]:
df_traffic_merged2.head(2)

Unnamed: 0,tmc_code,hour_min,delay_baseline,delay_focus,delay_difference,date,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles,airbnb_count,distance_to_venue_centroid,distance_to_Shuttle_Waterloo_Park,distance_to_Shuttle_Barton_Creek_Square,distance_to_Shuttle_Expo_Center,segment_bearing,osm_id,road_type,oneway,lanes_mean,maxspeed_value,day_of_week,no_devices,no_devices_baseline,no_devices_focus,demand_diff
0,112+04760,00:00,-2.22226,3.38907,5.61133,2022-10-17,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,True,1.5,55.0,0,0.0,2.0,2.0,0.0
1,112+04760,00:00,-2.22226,3.38907,5.61133,2022-10-17,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,True,1.5,55.0,0,0.0,2.0,20.0,18.0


In [2]:
# import pandas as pd 
# df_traffic_merged2 = pd.read_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML_final.csv')
# print(df_traffic_merged2.shape[0]) #3549832

  df_traffic_merged2 = pd.read_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML_final.csv')


In [4]:
%%time
# df_traffic_merged2.no_devices.describe()
# Drop rows whose distance to the venue is 500 or more, report the remaining row
# count, and write the result to disk.
# NOTE(review): the road-level filter earlier in the notebook uses a cutoff of 100,
# not 500 — confirm which threshold is intended.
# NOTE(review): the output path is the same file the preceding cell reads from, so
# this overwrites its own input.
df_traffic_merged2 = df_traffic_merged2[df_traffic_merged2['distance_to_venue_centroid']<500]
print(df_traffic_merged2.shape[0])
df_traffic_merged2.to_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML_final.csv',index=False)

3547722
CPU times: user 50.1 s, sys: 0 ns, total: 50.1 s
Wall time: 51.9 s


In [49]:
# edges_osm_need = edges_osm_need.rename(columns={"geometry": "geometry_osm"})
# df_traffic_merged3 = df_traffic_merged.merge(edges_osm_need[['osm_id','geometry_osm']], on='osm_id')
# print(df_traffic_merged3.shape[0])
# # df_traffic_merged3.head(2)

In [24]:
# df_traffic_merged3 = gpd.GeoDataFrame(df_traffic_merged3,crs="EPSG:3857", geometry='geometry_osm')

In [31]:
# df_traffic_merged3[['osm_id', 'geometry_osm']].drop_duplicates().explore()

In [None]:
# gdf_road_merged_copy_mergeOSM = df_traffic_merged.merge(edges_osm_need[['osm_id','road_type','oneway','lanes','maxspeed']], on='osm_id')

# (Optional) Visualizing module

In [54]:
%%time
# congestion_colors = ["#00FF00", "#ADFF2F", "#FFFF00", "#FFA500", "#FF0000", "#8B0000"]

# Layer 1: road segments coloured by their Airbnb count on a dark basemap.
m = gdf_road_merged.explore(
    column='airbnb_count',
    cmap= "rainbow", #"rainbow",
    tiles="CartoDB dark_matter", # OpenStreetMap, CartoDB dark_matter
    categorical=False
)

# Layer 2: shuttle locations drawn onto the same map object.
m = gdf_shuttle.explore(
    m=m,
    markersize=40,
    linewidth=2,
    edgecolor="black",
)

# Layer 3: venue polygon; as the last expression of the cell its return value is displayed.
gdf_venue.explore(
    m=m
)

CPU times: user 541 ms, sys: 0 ns, total: 541 ms
Wall time: 581 ms


In [399]:
# show the matched portion of OSM network
# Keep only the OSM edges whose osm_id was assigned to at least one INRIX segment.
edges_osm_need_vis = edges_osm_need[edges_osm_need['osm_id'].isin(gdf_road_merged_copy_mergeOSM.osm_id.unique().tolist())]
print(edges_osm_need_vis.shape[0])

2813


In [69]:
# edges_osm_need_vis.explore()