In [1]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer

import pandas as pd

import numpy as np
import osmnx as ox

## Download Pedestrian Collision Data

In [2]:
# Traffic incidents open data from https://data.calgary.ca/Transportation-Transit/Traffic-Incidents/35ra-9556

data = pd.read_csv('Traffic_Incidents.csv')
data.head()

Unnamed: 0,INCIDENT INFO,DESCRIPTION,START_DT,MODIFIED_DT,QUADRANT,Longitude,Latitude,Count,id,Point
0,Westbound Memorial Drive and 9 Street NW,Traffic incident. Blocking the right lane,2023/11/27 08:55:49 AM,2023/11/27 08:57:14 AM,NW,-114.081269,51.053693,1,2023-11-27T08:55:4951.053693443345296-114.0812...,POINT (-114.08126874525114 51.053693443345296)
1,Eastbound Glenmore Trail and Crowchild Trail SW,Traffic incident. Blocking the right lane,2023/11/27 07:44:17 AM,2023/11/27 07:45:31 AM,SW,-114.122814,51.001318,1,2023-11-27T07:44:1751.00131795970452-114.12281...,POINT (-114.12281385934625 51.00131795970452)
2,Eastbound 17 Avenue east of Deerfoot Trail SE,Traffic incident. Blocking the right lane,2023/11/27 07:06:55 AM,2023/11/27 07:08:46 AM,SE,-114.011517,51.037175,1,2023-11-27T07:06:5551.03717516674427-114.01151...,POINT (-114.01151686241202 51.03717516674427)
3,Southbound 36 Street at 8 Avenue NE,Two vehicle incident. Blocking the left lanes,2023/11/26 05:47:39 PM,2023/11/26 06:08:18 PM,NE,-113.981869,51.060388,1,2023-11-26T17:47:3951.06038813992116-113.98186...,POINT (-113.98186891621388 51.06038813992116)
4,Tsuut'ina Trail and 130 Avenue SW,Traffic incident.,2023/11/26 05:07:18 PM,2023/11/26 05:50:25 PM,SW,-114.138711,50.936205,1,2023-11-26T17:07:1850.93620462055178-114.13871...,POINT (-114.13871128738727 50.93620462055178)


## Download Climate Data

In [3]:
climate = pd.read_csv('climate-daily.csv')
climate = climate[["LOCAL_YEAR","LOCAL_MONTH","LOCAL_DAY","MIN_TEMPERATURE","TOTAL_PRECIPITATION","SNOW_ON_GROUND"]]
climate = climate.rename(columns={"LOCAL_YEAR": "Year", "LOCAL_MONTH": "Month","LOCAL_DAY":"Day","TOTAL_PRECIPITATION":"PRECIPITATION" })
climate.head()

Unnamed: 0,Year,Month,Day,MIN_TEMPERATURE,PRECIPITATION,SNOW_ON_GROUND
0,2016,8,4,,,
1,2016,8,5,12.5,0.0,
2,2016,8,6,10.7,25.5,
3,2016,8,7,13.4,3.8,
4,2016,8,8,12.5,0.4,


## Data Cleanup and Feature Engineering

In [4]:
# Drop NA values
data = data.dropna(subset=['DESCRIPTION', 'START_DT',"Longitude",'Latitude'])

# Filter only pedestrian collisions
data= data[data["DESCRIPTION"].str.contains("ped", case=False)]

# Convert column to datetime
data['START_DT'] = pd.to_datetime(data['START_DT'])

  data['START_DT'] = pd.to_datetime(data['START_DT'])


In [5]:
df = data[["START_DT","Longitude","Latitude"]].copy()

df['Month'] = df['START_DT'].dt.month
df['DayofWeek'] = df['START_DT'].dt.dayofweek
df['Hour'] = df['START_DT'].dt.hour
df['Year'] = df['START_DT'].dt.year
df['Day'] = df['START_DT'].dt.day

# Categorical variables 
df['Season'] = pd.cut(df['Month'], bins=[-1, 5, 10, 13], labels=['Winter', 'Summer', 'Winter'],ordered=False).astype(str)
df['Workday'] = pd.cut(df['DayofWeek'], bins=[-1, 5, 7], labels=['Weekday', 'Weekend'],ordered=False).astype(str)

df = df.drop(["START_DT"], axis=1)
df.head()

Unnamed: 0,Longitude,Latitude,Month,DayofWeek,Hour,Year,Day,Season,Workday
5,-114.068607,51.00251,11,6,16,2023,26,Winter,Weekend
40,-114.16971,51.018934,11,4,18,2023,24,Winter,Weekday
41,-114.161009,51.021726,11,4,17,2023,24,Winter,Weekday
63,-114.062535,51.059772,11,3,20,2023,23,Winter,Weekday
64,-114.111903,51.005006,11,3,20,2023,23,Winter,Weekday


### Merge Climate and Collision Data

In [6]:
df_final = pd.merge(df, climate, on=["Year","Month","Day"])
df_final = df_final[["Longitude","Latitude","Month","Season","DayofWeek","Workday","Hour","PRECIPITATION","MIN_TEMPERATURE"]]
df_final['PRECIPITATION'] = df_final['PRECIPITATION'].apply(lambda x: 1 if x>0 else 0)
df_final

Unnamed: 0,Longitude,Latitude,Month,Season,DayofWeek,Workday,Hour,PRECIPITATION,MIN_TEMPERATURE
0,-114.169710,51.018934,11,Winter,4,Weekday,18,0,-11.3
1,-114.161009,51.021726,11,Winter,4,Weekday,17,0,-11.3
2,-114.062535,51.059772,11,Winter,3,Weekday,20,1,-6.3
3,-114.111903,51.005006,11,Winter,3,Weekday,20,1,-6.3
4,-114.091637,51.038428,11,Winter,2,Weekday,18,1,-3.2
...,...,...,...,...,...,...,...,...,...
1504,-114.014025,51.067037,12,Winter,0,Weekday,12,0,-20.3
1505,-113.937759,50.914367,12,Winter,3,Weekday,20,0,-26.3
1506,-114.090045,51.037818,12,Winter,3,Weekday,9,0,-26.3
1507,-114.076439,51.002463,12,Winter,2,Weekday,18,0,-25.2


## Negative Synthetic Data

The pedestrian collision data above represents the positive labels in our classifier. We therfore need to generate negative labels as well. I'll use the `sdv` python library to train a GAN to generate synthetic data and `osmnx` to generate a random sample of points along the road network

### Using `osmnx` to sample negative lable locations

In [7]:
# G = ox.graph_from_place('Calgary, AB, Canada', network_type='drive')
# ox.save_graphml(G, 'calgary.graphml')
G = ox.load_graphml('calgary.graphml')

In [8]:
points = ox.utils_geo.sample_points(G, len(df))

  points = ox.utils_geo.sample_points(G, len(df))

  return lines.interpolate(np.random.rand(n), normalized=True)


### Using `sdv` to generate synthetic tabular data

In [9]:
# Define table metadata excluding lat lon. Since we don't want the negative locations 
# to have the same distribution as the positive locations

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=df_final.drop(["Longitude","Latitude"], axis=1))
metadata

{
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
    "columns": {
        "Month": {
            "sdtype": "numerical"
        },
        "Season": {
            "sdtype": "categorical"
        },
        "DayofWeek": {
            "sdtype": "categorical"
        },
        "Workday": {
            "sdtype": "categorical"
        },
        "Hour": {
            "sdtype": "numerical"
        },
        "PRECIPITATION": {
            "sdtype": "categorical"
        },
        "MIN_TEMPERATURE": {
            "sdtype": "numerical"
        }
    }
}

In [10]:
synthesizer = CTGANSynthesizer(
    metadata, 
    enforce_min_max_values=True,
    enforce_rounding=True,
    epochs=500,
    verbose=True
)

In [11]:
synthesizer.fit(df_final.drop(["Longitude","Latitude"], axis=1))

Gen. (-0.55) | Discrim. (-0.16): 100%|████████| 500/500 [00:30<00:00, 16.31it/s]


In [12]:
synthetic_data = synthesizer.sample(
    num_rows=df.shape[0]
)

df_final["Target"]=1
synthetic_data["Target"]=0

long_lat = list(zip(points.geometry.x.to_list(),points.geometry.y.to_list()))
synthetic_data = pd.concat([pd.DataFrame(long_lat, columns=['Longitude','Latitude']),synthetic_data], axis=1)

In [13]:
df_all = pd.concat([synthetic_data, df_final], axis=0).sample(frac=1)
df_all = df_all.dropna()

In [14]:
df_all.to_csv("pedestrian-collisions-final.csv")