In [3]:
import pandas as pd

# Pandas display limits used throughout the notebook:
# column text width, number of rows, number of columns, and console width.
pd.options.display.max_colwidth = 0
pd.options.display.max_rows = 200
pd.options.display.max_columns = 500
pd.options.display.width = 1000

import warnings

# Suppress every warning category from this point on.
warnings.filterwarnings('ignore')

# Show floats with five decimal places in DataFrame output.
pd.set_option('display.float_format', '{:.5f}'.format)

import matplotlib.pyplot as plt

# Create venue (polygon) and shuttle (points) geometry

In [36]:
%%time

# Create a polygon for event venue
from shapely.geometry import Polygon
import geopandas as gpd

lat_point_list = [30.131962, 30.146337, 30.140386, 30.122629]
lon_point_list = [-97.647388, -97.635686, -97.619652, -97.631465]
polygon_geom = Polygon(zip(lon_point_list, lat_point_list))
gdf_venue = gpd.GeoDataFrame(index=[0], crs='epsg:4326', geometry=[polygon_geom])       
gdf_venue['name']  = 'the Circuit of the Americas'
# gdf_venue = gdf_venue.to_crs("EPSG:3857") 
gdf_venue

CPU times: user 14.7 ms, sys: 0 ns, total: 14.7 ms
Wall time: 14.6 ms


Unnamed: 0,geometry,name
0,"POLYGON ((-97.64739 30.13196, -97.63569 30.14634, -97.61965 30.14039, -97.63147 30.12263, -97.64739 30.13196))",the Circuit of the Americas


In [163]:
import shapely.geometry as geom

# Shuttle pick-up locations as Shapely points, each built from (longitude, latitude)
point_waterloo_park = geom.Point(-97.736285, 30.273726)
point_barton_creek_square = geom.Point(-97.805046, 30.257509)
point_expo_center = geom.Point(-97.622544, 30.297062)
# point_ridehailing = geom.Point(-97.614135, 30.178718)

point_shuttle_list = [
    point_waterloo_park,
    point_barton_creek_square,
    point_expo_center,
    # point_ridehailing,
]

# One row per shuttle stop; the label order matches point_shuttle_list
gdf_shuttle = gpd.GeoDataFrame(
    {
        'Shuttle_Location': [
            'Shuttle_Waterloo_Park',
            'Shuttle_Barton_Creek_Square',
            'Shuttle_Expo_Center',
            # 'Uber_DelValle_HighSchool',
        ]
    },
    geometry=point_shuttle_list,
    crs="EPSG:4326",
)
# gdf_shuttle = gdf_shuttle.to_crs("EPSG:3857")
gdf_shuttle

Unnamed: 0,Shuttle_Location,geometry
0,Shuttle_Waterloo_Park,POINT (-97.73628 30.27373)
1,Shuttle_Barton_Creek_Square,POINT (-97.80505 30.25751)
2,Shuttle_Expo_Center,POINT (-97.62254 30.29706)


# Load INRIX Road network data

In [164]:
# Read the TMC road-segment table and keep only the identifier, street name,
# endpoint coordinates and segment length.
road_columns = [
    'tmc_code',
    'intersection',
    'start_latitude',
    'start_longitude',
    'end_latitude',
    'end_longitude',
    'miles',
]
df_road = pd.read_csv("data/TMC_Identification.csv")[road_columns]
print('total number of road segments:', len(df_road))
df_road.head(3)

total number of road segments: 4460


Unnamed: 0,tmc_code,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles
0,112P13033,W MARTIN LUTHER KING JR BLVD,30.28172,-97.74193,30.28232,-97.74207,0.04249
1,112+08908,FM-20,30.11066,-97.41049,30.11082,-97.37323,2.22783
2,112+16538,GATTIS SCHOOL RD,30.49057,-97.67514,30.49351,-97.67573,0.20831


In [165]:
%%time
from shapely.geometry import LineString
from geopandas import GeoDataFrame

df_road['geometry'] = df_road.apply(
    lambda row: LineString([(row['start_longitude'], row['start_latitude']),
                             (row['end_longitude'], row['end_latitude'])]),
    axis=1
)

# Create a GeoDataFrame for road network
gdf_road = GeoDataFrame(df_road, geometry='geometry',crs="EPSG:4326")

CPU times: user 187 ms, sys: 0 ns, total: 187 ms
Wall time: 141 ms


In [166]:
# Preview the first two road segments, now including their LineString geometry
gdf_road.head(2)

Unnamed: 0,tmc_code,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles,geometry
0,112P13033,W MARTIN LUTHER KING JR BLVD,30.28172,-97.74193,30.28232,-97.74207,0.04249,"LINESTRING (-97.74193 30.28172, -97.74207 30.28232)"
1,112+08908,FM-20,30.11066,-97.41049,30.11082,-97.37323,2.22783,"LINESTRING (-97.41049 30.11066, -97.37323 30.11082)"


# Load OSM driving road network of Austin by bounding box
## 1. Load OSM road network attributes using OSMnx

In [125]:
%%time
import osmnx as ox

# pandana method does not return road attributes so use osmnx to retrieve them
# NOTE(review): the bbox tuple below reads as (north, south, east, west). OSMnx 2.x
# expects (left, bottom, right, top) instead — confirm the order against the installed
# OSMnx version before re-running this download.
G_osm = ox.graph_from_bbox(bbox=(30.584984, 30.039047, -97.389445, -97.996048), network_type='drive') 

# Calculate and add compass bearing attributes to all graph edges.
G_osm = ox.bearing.add_edge_bearings(G_osm)

# Split the graph into a node table and an edge table
nodes_osm, edges_osm = ox.graph_to_gdfs(G_osm)

CPU times: user 56.8 s, sys: 0 ns, total: 56.8 s
Wall time: 56.7 s


In [133]:
# Preview the first two OSM nodes
nodes_osm.head(2)

Unnamed: 0_level_0,y,x,street_count,highway,ref,geometry
osmid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
151371439,30.20374,-97.98883,3,,,POINT (-97.98883 30.20374)
151371447,30.20425,-97.98779,1,turning_circle,,POINT (-97.98779 30.20425)


In [135]:
# Preview the first two OSM edges, including the bearing attribute
edges_osm.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,osmid,name,highway,oneway,reversed,length,bearing,geometry,ref,bridge,lanes,maxspeed,access,tunnel,width,junction
u,v,key,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
151371439,151462706,0,15287110,Heritage Drive,residential,False,False,88.55,341.4,"LINESTRING (-97.98883 30.20374, -97.98913 30.20449)",,,,,,,,
151371439,151371447,0,15277135,Hideaway Cove,residential,False,False,116.618,60.4,"LINESTRING (-97.98883 30.20374, -97.98848 30.20385, -97.98836 30.20390, -97.98815 30.20401, -97.98793 30.20416, -97.98779 30.20425)",,,,,,,,


In [136]:
# Persist the OSMnx node and edge tables as CSV files
for osm_table, csv_path in (
    (nodes_osm, './output/osm_nodes.csv'),
    (edges_osm, './output/osm_edges.csv'),
):
    osm_table.to_csv(csv_path)

## 2. Load the OSM network with pandana, which computes network distances faster than OSMnx

In [40]:
# One-off network download through pandana's OSM loader, kept commented out for reference.
# It writes ./output/nodes.csv and ./output/edges.csv; presumably these are the files the
# next cell reloads — confirm before deleting this block.
# %%time
# ## Wall time: 1min
from pandana.loaders import osm
# network = osm.pdna_network_from_bbox(30.039047, -97.996048, 30.584984, -97.389445, network_type='drive') # pandana method does not return road attributes so use osmnx to retrieve them
# network.nodes_df.to_csv('./output/nodes.csv')
# network.edges_df.to_csv('./output/edges.csv')
# network.edges_df.head()

In [13]:
%%time
import pandana
# load existing network data
nodes = pd.read_csv('./output/nodes.csv', index_col=0)
print(nodes.shape[0])
edges = pd.read_csv('./output/edges.csv', index_col=[0,1])
network = pandana.Network(nodes['x'], nodes['y'], 
                          edges['from'], edges['to'], edges[['distance']])

64416
CPU times: user 23 s, sys: 0 ns, total: 23 s
Wall time: 3.14 s


# Load Airbnb data

In [167]:
# Read the Airbnb listings and turn them into a projected point layer
from shapely.geometry import Point
from geopandas import GeoDataFrame

df_airbnb = pd.read_csv("data/airbnb_listings_202312.csv")
print('total number of airbnb:', len(df_airbnb))

# One point per listing, built from its (longitude, latitude)
geometry = [
    Point(lon, lat)
    for lon, lat in zip(df_airbnb['longitude'], df_airbnb['latitude'])
]

# Declare the coordinates as EPSG:4326, then reproject to EPSG:3857
gdf_airbnb = GeoDataFrame(df_airbnb, crs="EPSG:4326", geometry=geometry).to_crs("EPSG:3857")

total number of airbnb: 15419


# Feature engineering 
Dynamic features: event timing, attendance size, proximity to the event venue, proximity to shuttle services, hotel occupancy rates

Static features: population density, hotel density

## 1. Airbnb count (spatial buffers)
Spatially join Airbnb locations with buffered road segments to count the Airbnb listings near each road

In [171]:
%%time
# gdf_airbnb.head(20).explore()
gdf_airbnb_need = gdf_airbnb[['id','geometry']]
gdf_airbnb_need['airbnb_count'] = 1

buffer_distance = 500  # Buffer distance in meters
gdf_polyline_buffered = gdf_road.copy()
gdf_polyline_buffered = gdf_polyline_buffered.to_crs("EPSG:3857")  # because using spatial buffer, so need to use projected coordinate system

# gdf_polyline_buffered = gdf_road.head(100).copy()
gdf_polyline_buffered["geometry"] = gdf_polyline_buffered.geometry.buffer(buffer_distance)

# gdf_road_w_airbnb = gpd.sjoin(gdf_airbnb, gdf_road, predicate="intersects")
gdf_road_w_airbnb = gpd.sjoin(gdf_polyline_buffered, gdf_airbnb_need, predicate="intersects", how="left")
print(gdf_road_w_airbnb.tmc_code.unique().shape[0])
point_counts = gdf_road_w_airbnb.groupby("tmc_code").airbnb_count.count().reset_index()
gdf_road_merged = gdf_road.merge(point_counts, how='left')
gdf_road_merged = gdf_road_merged.fillna(0)

4460
CPU times: user 418 ms, sys: 0 ns, total: 418 ms
Wall time: 432 ms


In [104]:
# gdf_road_merged.explore()

## 2. Network distance to venue and shuttles

In [173]:
# Attach every point of interest to the road table as a constant geometry column, so
# that a per-segment distance to it can be computed afterwards.
n_segments = len(gdf_road_merged)

# One column per shuttle stop, named after its Shuttle_Location label. The GeoSeries is
# built directly from the Point (no text/WKT round-trip) and shares the road table's
# index so the assignment aligns row by row.
for shuttle_name, shuttle_point in zip(gdf_shuttle['Shuttle_Location'], gdf_shuttle.geometry):
    gdf_road_merged[shuttle_name] = gpd.GeoSeries(
        [shuttle_point] * n_segments, index=gdf_road_merged.index
    )

# The venue is represented by the centroid of its polygon
venue_centroid = gdf_venue.iloc[0].geometry.centroid
gdf_road_merged["venue_centroid"] = gpd.GeoSeries(
    [venue_centroid] * n_segments, index=gdf_road_merged.index
)

In [174]:
# Snap both endpoints of every road segment to their nearest pandana network node.
# get_node_ids takes the x (longitude) values first, then the y (latitude) values.
road_start_nodes = network.get_node_ids(
    gdf_road_merged['start_longitude'], gdf_road_merged['start_latitude']
).to_numpy()
road_end_nodes = network.get_node_ids(
    gdf_road_merged['end_longitude'], gdf_road_merged['end_latitude']
).to_numpy()

In [175]:
# Network distance: use the start and end location of each road segment to calculate
# the shortest-path distance to a point of interest and take the average of the two.
# Distance unit: km
def compute_average_distance_to_poi(df, col):
    """Add a ``distance_to_<col>`` column holding the mean network distance to a POI.

    For every row, the shortest-path length is computed from the segment's start
    node and from its end node to the network node nearest to the point stored in
    ``df[col]``. The two lengths are divided by 1000 and averaged.

    Relies on the notebook-level ``network``, ``road_start_nodes`` and
    ``road_end_nodes``; ``df`` must have the same rows, in the same order, as the
    frame those node arrays were derived from.

    Parameters
    ----------
    df : GeoDataFrame
        Road table. It is modified in place and also returned.
    col : str
        Name of a point-geometry column in ``df`` (its ``.x`` / ``.y`` are read).

    Returns
    -------
    GeoDataFrame
        The same ``df`` object with the new ``'distance_to_' + col`` column.
    """
    poi_nodes = network.get_node_ids(df[col].x, df[col].y).values

    # Build the results on df's own index: a bare pd.Series(list) would get a fresh
    # 0..n-1 index and silently misalign (or yield NaN) when df is indexed differently.
    # NOTE(review): the /1000 assumes edge weights are in meters — confirm. Unreachable
    # node pairs may come back as a large sentinel rather than NaN; verify and filter.
    start_km = pd.Series(network.shortest_path_lengths(road_start_nodes, poi_nodes), index=df.index) / 1000  # convert to km
    end_km = pd.Series(network.shortest_path_lengths(road_end_nodes, poi_nodes), index=df.index) / 1000

    # Only the averaged distance is stored; no temporary columns are added to df
    df['distance_to_' + col] = (start_km + end_km) / 2
    return df

In [176]:
# Add one distance_to_<name> feature per point of interest (venue first, then shuttles)
for poi_column in (
    'venue_centroid',
    'Shuttle_Waterloo_Park',
    'Shuttle_Barton_Creek_Square',
    'Shuttle_Expo_Center',
):
    gdf_road_merged = compute_average_distance_to_poi(gdf_road_merged, poi_column)

## 3. Matching with OSM road data to enrich network attributes

In [113]:
# Function to calculate bearing between two points (start and end of a road segment)
import numpy as np  # used by calculate_bearing; not imported anywhere earlier in the notebook


def calculate_bearing(start_lat, start_lon, end_lat, end_lon):
    """Return the compass bearing from a start point to an end point, in degrees.

    The bearing is the direction from the starting point to the destination,
    measured clockwise from true north (0° is north, 90° is east, etc.) and
    normalized to the range [0, 360).

    Parameters
    ----------
    start_lat, start_lon : float or array-like
        Latitude and longitude of the starting point, in degrees.
    end_lat, end_lon : float or array-like
        Latitude and longitude of the end point, in degrees.

    Returns
    -------
    float or array-like
        Bearing in degrees. Only NumPy element-wise functions are used, so
        array inputs produce one bearing per element.
    """
    # Convert degrees to radians
    start_lat = np.radians(start_lat)
    start_lon = np.radians(start_lon)
    end_lat = np.radians(end_lat)
    end_lon = np.radians(end_lon)

    # Difference in longitudes
    delta_lon = end_lon - start_lon

    # Bearing calculation: x is the east component, y the north component
    x = np.sin(delta_lon) * np.cos(end_lat)
    y = np.cos(start_lat) * np.sin(end_lat) - np.sin(start_lat) * np.cos(end_lat) * np.cos(delta_lon)
    bearing = np.arctan2(x, y)

    # Convert from radians to degrees and normalize to 0-360
    bearing = np.degrees(bearing)
    bearing = (bearing + 360) % 360
    return bearing

In [115]:
# Compute the bearing of every road segment in one vectorized call: calculate_bearing
# is built from NumPy element-wise functions, so whole columns can be passed instead of
# applying it row by row.
gdf_road_merged['segment_bearing'] = calculate_bearing(
    gdf_road_merged['start_latitude'],
    gdf_road_merged['start_longitude'],
    gdf_road_merged['end_latitude'],
    gdf_road_merged['end_longitude'],
)

In [162]:
# Flatten the (u, v, key) index into columns, then discard those identifiers together
# with the OSM attributes that are not needed for matching.
unused_edge_columns = ['u', 'v', 'key', 'ref', 'reversed', 'bridge', 'access', 'tunnel']
edges_osm_need = edges_osm.reset_index().drop(columns=unused_edge_columns)
edges_osm_need.head(2)

Unnamed: 0,osmid,name,highway,oneway,length,bearing,geometry,lanes,maxspeed,width,junction
0,15287110,Heritage Drive,residential,False,88.55,341.4,"LINESTRING (-97.98883 30.20374, -97.98913 30.20449)",,,,
1,15277135,Hideaway Cove,residential,False,116.618,60.4,"LINESTRING (-97.98883 30.20374, -97.98848 30.20385, -97.98836 30.20390, -97.98815 30.20401, -97.98793 30.20416, -97.98779 30.20425)",,,,


In [None]:
# Bring the OSM edges into EPSG:3857.
# The edge coordinates are longitude/latitude (EPSG:4326). Labelling them as EPSG:3857
# without transforming them would place every edge near (0, 0) of the projected plane,
# so the data is declared as EPSG:4326 where needed and then actually reprojected.
if not isinstance(edges_osm_need, gpd.GeoDataFrame):
    # Plain DataFrame (e.g. reloaded from CSV): the geometry column holds WKT text
    edges_osm_need['geometry'] = gpd.GeoSeries.from_wkt(edges_osm_need['geometry'])
    edges_osm_need = gpd.GeoDataFrame(edges_osm_need, geometry='geometry', crs="EPSG:4326")
elif edges_osm_need.crs is None:
    edges_osm_need = edges_osm_need.set_crs("EPSG:4326")

# No-op when the frame is already in EPSG:3857, so the cell can be re-run safely
edges_osm_need = edges_osm_need.to_crs("EPSG:3857")


# Merge with traffic data for tree-based ML prediction model

In [178]:
%%time
df_traffic = pd.read_csv("output/austin_2022_GP_10min_interval_delaydifference.csv", sep=',', header=0)
df_traffic_merged = df_traffic.merge(gdf_road_merged, left_on='tmc_code', right_on='tmc')

In [188]:
# Inspect the first two rows of the merged traffic / road-feature table
df_traffic_merged.head(2)

Unnamed: 0,tmc_code,hour_min,delay_baseline,delay_focus,delay_difference,date,intersection,start_latitude,start_longitude,end_latitude,end_longitude,miles,airbnb_count,distance_to_Shuttle_Waterloo_Park,distance_to_Shuttle_Barton_Creek_Square,distance_to_Shuttle_Expo_Center,distance_to_Uber_DelValle_HighSchool,distance_to_venue
0,112+04758,00:00,-0.93268,-0.93268,0.0,2022-10-21,51ST ST/CAMERON RD/EXIT 237,30.3033,-97.71418,30.3053,-97.71289,0.15814,87,4685.01085,11838.72611,10171.9709,19603.37763,23552.59734
1,112+04758,00:10,-0.93268,-0.93268,0.0,2022-10-21,51ST ST/CAMERON RD/EXIT 237,30.3033,-97.71418,30.3053,-97.71289,0.15814,87,4685.01085,11838.72611,10171.9709,19603.37763,23552.59734


In [186]:
# Remove the geometry / helper columns that must not go into the ML table.
# errors='ignore' skips names that are absent (for example 'tmc', which only exists if
# the merge used differently named keys, or columns already removed by an earlier run),
# so this cell does not fail with a KeyError and can be re-run.
helper_columns = [
    'geometry',
    'tmc',
    'Shuttle_Waterloo_Park',
    'Shuttle_Barton_Creek_Square',
    'Shuttle_Expo_Center',
    'venue_centroid',
]
df_traffic_merged = df_traffic_merged.drop(columns=helper_columns, errors='ignore')

In [187]:
%%time
df_traffic_merged.to_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML.csv',index=False)

CPU times: user 21.2 s, sys: 809 ms, total: 22 s
Wall time: 24 s


# (Optional) Visualizing module

In [54]:
%%time
# congestion_colors = ["#00FF00", "#ADFF2F", "#FFFF00", "#FFA500", "#FF0000", "#8B0000"]

m = gdf_road_merged.explore(
    column='airbnb_count',
    cmap= "rainbow", #"rainbow",
    tiles="CartoDB dark_matter", # OpenStreetMap, CartoDB dark_matter
    categorical=False
)

m = gdf_shuttle.explore(
    m=m,
    markersize=40,
    linewidth=2,
    edgecolor="black",
)

gdf_venue.explore(
    m=m
)

CPU times: user 541 ms, sys: 0 ns, total: 541 ms
Wall time: 581 ms
