In [3]:
# import sys
# print(sys.version)
# !/home/rongxiang/miniconda3/envs/pytorch_env/bin/pip install scikit-learn

In [4]:
import torch
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:.5f}'.format

In [108]:
%%time

df_traffic = pd.read_csv('output/austin_2022_GP_10min_interval_delaydifference_with_features_forML2.csv')
df_traffic = df_traffic.rename(columns={"miles": "segment_length"})

df_traffic['minutes_since_midnight'] = df_traffic['hour_min'].apply(lambda x: int(x[:2]) * 60 + int(x[3:]))

print(df_traffic.date.unique())
print('unique road segments (samples):',df_traffic.tmc_code.unique().shape[0])
print('total observations: ',df_traffic.shape[0])
df_traffic.head(2)

['2022-10-21' '2022-10-22' '2022-10-23']
unique road segments (samples): 3221
total observations:  1388251
CPU times: user 4.66 s, sys: 349 ms, total: 5.01 s
Wall time: 5.01 s


Unnamed: 0,tmc_code,hour_min,delay_baseline,delay_focus,delay_difference,date,intersection,start_latitude,start_longitude,end_latitude,end_longitude,segment_length,airbnb_count,distance_to_venue_centroid,distance_to_Shuttle_Waterloo_Park,distance_to_Shuttle_Barton_Creek_Square,distance_to_Shuttle_Expo_Center,segment_bearing,osm_id,road_type,oneway,lanes,maxspeed,lanes_mean,minutes_since_midnight,no_devices
0,112+04760,00:00,-1.77926,-2.22226,-0.443,2022-10-21,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary_link,True,"['1', '2']",55 mph,1.5,0,0.0
1,112+04760,00:10,-1.30941,-2.4344,-1.12499,2022-10-21,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary_link,True,"['1', '2']",55 mph,1.5,10,0.0


In [109]:
# df_traffic = df_traffic[df_traffic['minutes_since_midnight']>=540]

df_traffic.loc[df_traffic.date == '2022-10-21', 'day_of_week'] = 4
df_traffic.loc[df_traffic.date == '2022-10-22', 'day_of_week'] = 5
df_traffic.loc[df_traffic.date == '2022-10-23', 'day_of_week'] = 6
# print(df_traffic.day_of_week.unique())

# Split the 'hour_min' column into two new columns 'hour' and 'minute'
# df_traffic[['hour', 'minute']] = df_traffic['hour_min'].str.split(':', expand=True)
# # Convert the columns to integers if needed
# df_traffic['hour'] = df_traffic['hour'].astype(int)
# df_traffic['minute'] = df_traffic['minute'].astype(int)

# because the event usually begins in the morning after 10, people arrive typically around that time, so we want to only focus on after 930
df_traffic = pd.get_dummies(df_traffic, columns=['date'], prefix='date')
df_traffic.loc[df_traffic.road_type=="motorway_link", 'road_type'] = 'motorway'
df_traffic.loc[df_traffic.road_type=="primary_link", 'road_type'] = 'primary'
df_traffic.loc[df_traffic.road_type=="secondary_link", 'road_type'] = 'secondary'
df_traffic.loc[df_traffic.road_type=="trunk_link", 'road_type'] = 'trunk'
df_traffic.loc[df_traffic.road_type=="tertiary_link", 'road_type'] = 'tertiary'

# Add the one-hot encoded columns back to the GeoDataFrame
df_traffic = pd.concat([df_traffic, pd.get_dummies(df_traffic['road_type'],prefix='roadtype')], axis=1)
df_traffic['roadtype_motorway'] = df_traffic['roadtype_motorway'].astype(int)
df_traffic['roadtype_primary'] = df_traffic['roadtype_primary'].astype(int)
df_traffic['roadtype_residential'] = df_traffic['roadtype_residential'].astype(int)
df_traffic['roadtype_secondary'] = df_traffic['roadtype_secondary'].astype(int)
df_traffic['roadtype_tertiary'] = df_traffic['roadtype_tertiary'].astype(int)
df_traffic['roadtype_trunk'] = df_traffic['roadtype_trunk'].astype(int)
df_traffic['roadtype_unclassified'] = df_traffic['roadtype_unclassified'].astype(int)

df_traffic['oneway'] = df_traffic['oneway'].astype(int)
df_traffic['date_2022-10-21'] = df_traffic['date_2022-10-21'].astype(int)
df_traffic['date_2022-10-22'] = df_traffic['date_2022-10-22'].astype(int)
df_traffic['date_2022-10-23'] = df_traffic['date_2022-10-23'].astype(int)

import re

def process_max_speed(value):
    if isinstance(value, str):
        # Extract all numeric values from the string
        numbers = list(map(int, re.findall(r'\d+', value)))
        if numbers:
            return max(numbers)  # Return the maximum value
    return None

# Apply the function to the max_speed column
df_traffic['max_speed'] = df_traffic['maxspeed'].apply(process_max_speed)

In [110]:
df_traffic['timed_tmc_code'] = df_traffic['tmc_code'] + '_' + df_traffic['minutes_since_midnight'].astype(str) + '_' + df_traffic['day_of_week'].astype(str)
df_traffic.shape[0]
df_traffic.head()

Unnamed: 0,tmc_code,hour_min,delay_baseline,delay_focus,delay_difference,intersection,start_latitude,start_longitude,end_latitude,end_longitude,segment_length,airbnb_count,distance_to_venue_centroid,distance_to_Shuttle_Waterloo_Park,distance_to_Shuttle_Barton_Creek_Square,distance_to_Shuttle_Expo_Center,segment_bearing,osm_id,road_type,oneway,lanes,maxspeed,lanes_mean,minutes_since_midnight,no_devices,day_of_week,date_2022-10-21,date_2022-10-22,date_2022-10-23,roadtype_motorway,roadtype_primary,roadtype_residential,roadtype_secondary,roadtype_tertiary,roadtype_trunk,roadtype_unclassified,max_speed,timed_tmc_code
0,112+04760,00:00,-1.77926,-2.22226,-0.443,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,1,"['1', '2']",55 mph,1.5,0,0.0,4.0,1,0,0,0,0,0,1,0,0,0,55.0,112+04760_0_4.0
1,112+04760,00:10,-1.30941,-2.4344,-1.12499,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,1,"['1', '2']",55 mph,1.5,10,0.0,4.0,1,0,0,0,0,0,1,0,0,0,55.0,112+04760_10_4.0
2,112+04760,00:20,-1.30941,-1.77926,-0.46985,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,1,"['1', '2']",55 mph,1.5,20,0.0,4.0,1,0,0,0,0,0,1,0,0,0,55.0,112+04760_20_4.0
3,112+04760,00:30,-0.27878,-1.30941,-1.03063,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,1,"['1', '2']",55 mph,1.5,30,0.0,4.0,1,0,0,0,0,0,1,0,0,0,55.0,112+04760_30_4.0
4,112+04760,00:40,-2.00397,-2.22226,-0.21829,US-183/EXIT 239-240,30.32874,-97.70568,30.33272,-97.70402,0.29287,49,26.64241,7.39988,14.4158,10.33789,19.79829,"[122981397, 1012363973]66847798019339769835",secondary,1,"['1', '2']",55 mph,1.5,40,0.0,4.0,1,0,0,0,0,0,1,0,0,0,55.0,112+04760_40_4.0


In [16]:
# feature_columns = ['minutes_since_midnight', 'segment_length','segment_bearing', 'oneway', 'lanes_mean', 'max_speed', 'no_devices', 
#                    'roadtype_motorway','roadtype_primary','roadtype_residential','roadtype_secondary','roadtype_tertiary','roadtype_trunk','roadtype_unclassified',
#                    'airbnb_count',  'distance_to_venue_centroid'
#                ] 

# # Prepare input features and labels for the model
# X = df_traffic[feature_columns].copy() 
# # 'distance_to_Shuttle_Waterloo_Park', 'distance_to_Shuttle_Barton_Creek_Square', 'distance_to_Shuttle_Expo_Center',
# # 'start_latitude', 'start_longitude',

# y = df_traffic['delay_difference'].values
# # Normalize features
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

In [113]:
# node_features = np.hstack([X_scaled, time_of_day_encoded, day_of_week_encoded])
# node_features = torch.tensor(node_features, dtype=torch.float)
# node_features.size()

In [151]:
# %%time
# # check VIF
# df_cor = X.corr()
# pd.DataFrame(np.linalg.inv(X.corr().values), index = df_cor.index, columns=df_cor.columns)

# Create graph structure

In [124]:
edges = pd.read_csv('./output/INRIX_network_edge_connectivity.csv')

# Combine all unique IDs from 'source' and 'target'
unique_ids = pd.unique(edges[['source', 'target']].values.ravel())

# Create a mapping from road segment IDs to numeric indices
id_to_index = {road_id: idx for idx, road_id in enumerate(unique_ids)}

# Map the IDs to numeric indices
edges['source_numeric'] = edges['source'].map(id_to_index)
edges['target_numeric'] = edges['target'].map(id_to_index)
edges
# edge_index = torch.tensor(edges[['source_numeric', 'target_numeric']].values.T, dtype=torch.long)

Unnamed: 0,source,target,source_numeric,target_numeric
0,112P13033,112P13162,0,1
1,112-13162,112P13162,2,1
2,112-13161,112P13162,3,1
3,112N13033,112P13162,4,1
4,112N13162,112P13162,5,1
...,...,...,...,...
25785,112P17123,112P17125,3872,3885
25786,112-19750,112-19751,3806,3807
25787,112-19752,112-19751,975,3807
25788,112-19751,112-19752,3807,975


In [None]:
# df_traffic[['minutes_since_midnight','day_of_week']].drop_duplicates()

In [165]:
# # Create target tensor
# targets = torch.tensor(y, dtype=torch.float)
# data = Data(x=node_features, edge_index=edge_index, y=targets)
# print(data)

Data(x=[866449, 20], edge_index=[2, 25790], y=[866449])


# Define the GCN Model

In [173]:
_columns = ['tmc_code','day_of_week','minutes_since_midnight', 'segment_length','segment_bearing', 'oneway', 'lanes_mean', 'max_speed', 'no_devices', 
                   'roadtype_motorway','roadtype_primary','roadtype_residential','roadtype_secondary','roadtype_tertiary','roadtype_trunk','roadtype_unclassified',
                   'airbnb_count',  'distance_to_venue_centroid','delay_difference'
               ] 
df_traffic_need = df_traffic[_columns]
df_traffic_need.head(2)

Unnamed: 0,tmc_code,day_of_week,minutes_since_midnight,segment_length,segment_bearing,oneway,lanes_mean,max_speed,no_devices,roadtype_motorway,roadtype_primary,roadtype_residential,roadtype_secondary,roadtype_tertiary,roadtype_trunk,roadtype_unclassified,airbnb_count,distance_to_venue_centroid,delay_difference
0,112+04760,4.0,0,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-0.443
1,112+04760,4.0,10,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-1.12499


In [174]:
import torch
from torch_geometric.data import DataLoader, Data
from torch_geometric.nn import GCNConv
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Step 1: Generate Hypothetical Data
num_segments = df_traffic_need.tmc_code.unique().shape[0]
total_timepoints = df_traffic_need[['minutes_since_midnight','day_of_week']].drop_duplicates().shape[0]
# Convert minutes to radians

df_traffic_need['time_of_day_sin'] = np.sin(2 * np.pi * df_traffic_need['minutes_since_midnight'] / 1440)  # 1440 minutes in a day
df_traffic_need['time_of_day_cos'] = np.cos(2 * np.pi * df_traffic_need['minutes_since_midnight'] / 1440)

# # Generate random features for nodes
# np.random.seed(42)
# road_types = np.random.choice([0, 1], size=num_segments)  # 0 = residential, 1 = highway
# segment_lengths = np.random.uniform(500, 2000, num_segments)  # Random segment length between 500m and 2000m
# traffic_delays = np.random.uniform(0, 60, num_segments)  # Random traffic delays (0 to 60 minutes)

# # Repeat traffic delay and segment length for each time step
# traffic_delays_expanded = np.tile(traffic_delays, total_timepoints)
# segment_lengths_expanded = np.tile(segment_lengths, total_timepoints)

# # Create time of day (sin and cos for cyclical encoding)
# time_of_day = np.tile(np.arange(total_timepoints), num_segments)
# time_of_day_sin = np.sin(2 * np.pi * time_of_day / 1440)
# time_of_day_cos = np.cos(2 * np.pi * time_of_day / 1440)

# # Step 2: Create DataFrame for Nodes
# df_nodes = pd.DataFrame({
#     'segment_id': np.repeat(np.arange(num_segments), total_timepoints),
#     'traffic_delay': traffic_delays_expanded,
#     'segment_length': segment_lengths_expanded,
#     'time_of_day_sin': time_of_day_sin,
#     'time_of_day_cos': time_of_day_cos
# })

In [233]:
print(df_nodes.shape)
print(df_edges.shape)

(43200, 5)
(9, 2)


In [176]:
df_traffic_need['node_id'] = df_traffic_need['tmc_code'].map(id_to_index)
# df_traffic_need = df_traffic_need.set_index('node_id')
df_traffic_need.head(3)

Unnamed: 0,tmc_code,day_of_week,minutes_since_midnight,segment_length,segment_bearing,oneway,lanes_mean,max_speed,no_devices,roadtype_motorway,roadtype_primary,roadtype_residential,roadtype_secondary,roadtype_tertiary,roadtype_trunk,roadtype_unclassified,airbnb_count,distance_to_venue_centroid,delay_difference,time_of_day_sin,time_of_day_cos,node_id
0,112+04760,4.0,0,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-0.443,0.0,1.0,1167
1,112+04760,4.0,10,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-1.12499,0.04362,0.99905,1167
2,112+04760,4.0,20,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-0.46985,0.08716,0.99619,1167


In [197]:
df_traffic_need['time_point'] = df_traffic_need['day_of_week'] * 144 + df_traffic_need['minutes_since_midnight'] // 10
print(df_traffic_need['time_point'].unique().shape[0])
# df_traffic_need['time_point'].unique()

In [207]:
df_traffic_need.head(2)

Unnamed: 0,tmc_code,day_of_week,minutes_since_midnight,segment_length,segment_bearing,oneway,lanes_mean,max_speed,no_devices,roadtype_motorway,roadtype_primary,roadtype_residential,roadtype_secondary,roadtype_tertiary,roadtype_trunk,roadtype_unclassified,airbnb_count,distance_to_venue_centroid,delay_difference,time_of_day_sin,time_of_day_cos,node_id,time_point
0,112+04760,4.0,0,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-0.443,0.0,1.0,1167,576.0
1,112+04760,4.0,10,0.29287,19.79829,1,1.5,55.0,0.0,0,0,0,1,0,0,0,49,26.64241,-1.12499,0.04362,0.99905,1167,577.0


In [226]:
# from torch_geometric.data import Data
# import torch

# # Create a dictionary to store graphs by time
# time_graphs = {}

# node_feature_names = ['segment_length','segment_bearing', 'oneway',  'no_devices', 'airbnb_count',  'distance_to_venue_centroid']
# edge_index = torch.tensor(edges[['source_numeric', 'target_numeric']].values.T, dtype=torch.long)

# # Group by time
# for time_point, group in df_traffic_need.groupby('time_point'):
#     # Node features: Combine speed, delay, and length
#     node_features = torch.tensor(group[node_feature_names].values, dtype=torch.float)
    
#     # Create PyTorch Geometric Data object
#     graph = Data(x=node_features, edge_index=edge_index)
    
#     # Store the graph in the dictionary
#     time_graphs[time_point] = graph

In [235]:
df_traffic_need.shape[0]

1388251

In [None]:
# Step 2: Convert Data to PyG Format
# removed Node features: 'lanes_mean', 'max_speed',
node_feature_names = ['segment_length','segment_bearing', 'oneway',  'no_devices', 'airbnb_count',  'distance_to_venue_centroid']
node_features = torch.tensor(df_traffic_need[node_feature_names].values, dtype=torch.float)

# Edge connectivity (source, target)
edge_index = torch.tensor(edges[['source_numeric', 'target_numeric']].values.T, dtype=torch.long)

# Target: traffic delay at each time point
target_traffic_delay = torch.tensor(df_traffic_need['delay_difference'].values, dtype=torch.float)

# Create PyG Data object
data = Data(x=node_features, edge_index=edge_index, y=target_traffic_delay)

In [210]:
data

Data(x=[1388251, 9], edge_index=[2, 25790], y=[1388251])

In [191]:
print(data.is_directed())
print(data.num_nodes)
data.size()

False
1388251


(1388251, 1388251)

In [192]:
# %%time
# import torch_geometric
# import networkx as nx

# g = torch_geometric.utils.to_networkx(data[:10], to_undirected=True)
# nx.draw(g)

In [182]:
%%time
# Step 3: Define the GCN Model
class TrafficGCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(TrafficGCN, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

# Initialize the model with input feature size (4 features: segment_length, traffic_delay, sin/cos)
model = TrafficGCN(in_channels=node_features.size()[1], hidden_channels=32, out_channels=1)

# Step 4: Create DataLoader
loader = DataLoader([data], batch_size=1, shuffle=True)

# Step 5: Train the Model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = torch.nn.MSELoss()

for epoch in range(100):  # Train for 30 epochs
    model.train()
    total_loss = 0
    for batch in loader:
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index).squeeze()  # Output shape should match target
        loss = loss_fn(out, batch.y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss:.4f}")

Epoch 1, Loss: 512.4287
Epoch 2, Loss: 486.0608
Epoch 3, Loss: 460.7694
Epoch 4, Loss: 436.7087
Epoch 5, Loss: 414.0042
Epoch 6, Loss: 392.7188
Epoch 7, Loss: 372.8812
Epoch 8, Loss: 354.4917
Epoch 9, Loss: 337.5348
Epoch 10, Loss: 321.9865
Epoch 11, Loss: 307.8172
Epoch 12, Loss: 294.9960
Epoch 13, Loss: 283.4879
Epoch 14, Loss: 273.2528
Epoch 15, Loss: 264.2379
Epoch 16, Loss: 256.3814
Epoch 17, Loss: 249.6131
Epoch 18, Loss: 243.8584
Epoch 19, Loss: 239.0414
Epoch 20, Loss: 235.0843
Epoch 21, Loss: 231.9079
Epoch 22, Loss: 229.4309
Epoch 23, Loss: 227.5686
Epoch 24, Loss: 226.2353
Epoch 25, Loss: 225.3472
Epoch 26, Loss: 224.8257
Epoch 27, Loss: 224.5986
Epoch 28, Loss: 224.5987
Epoch 29, Loss: 224.7637
Epoch 30, Loss: 225.0361
Epoch 31, Loss: 225.3658
Epoch 32, Loss: 225.7110
Epoch 33, Loss: 226.0391
Epoch 34, Loss: 226.3248
Epoch 35, Loss: 226.5493
Epoch 36, Loss: 226.6995
Epoch 37, Loss: 226.7678
Epoch 38, Loss: 226.7525
Epoch 39, Loss: 226.6568
Epoch 40, Loss: 226.4876
Epoch 41,

In [183]:
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Step 5: Evaluate the Model
model.eval()
y_true = []  # To store actual values
y_pred = []  # To store predicted values

with torch.no_grad():
    for batch in loader:
        # Get model predictions
        out = model(batch.x, batch.edge_index).squeeze()
        
        # Append the predicted and actual values
        y_true.extend(batch.y.cpu().numpy())  # Convert to numpy for sklearn
        y_pred.extend(out.cpu().numpy())  # Convert to numpy for sklearn

# Convert lists to numpy arrays for evaluation
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Calculate MSE, MAE, and R²
mse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# Print evaluation results
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Squared Error (MSE): 219.3468
Mean Absolute Error (MAE): 5.5952
R-squared (R²): -0.0048
