In [19]:
import pandas as pd

# Load the uploaded dataset
file_path = 'collisions.csv'
collision_data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
collision_data.head()


Unnamed: 0,OBJECTID,COLLISION_SK,Case File Number,Accident Date and Time,Latitude WGS84,Longitude WGS84,Road Location,Intersecting Road Location,Road Configuration,Collision Configuration,...,Weather Condition,Road Surface,Road Condition,Unusual Environmental Circumstances 1,Unusual Environmental Circumstances 2,Road Alignment,Road Grade,School Purpose Related,x,y
0,1,91836,103255,7/31/2024 2:47:00 AM,44.630666,-63.622756,COWIE HILL RD,RIDGE VALLEY RD,Non-intersection,Single vehicle - off road to the right,...,Clear,Dry - normal,Normal,,,Straight,Slope,,-7082452.496,5563565.034
1,2,91510,103193,7/30/2024 11:34:00 PM,44.642959,-63.571918,BARRINGTON ST,BISHOP ST,Non-intersection,Multiple vehicle - hit parked vehicle,...,Clear,Dry - normal,Normal,,,Straight,Level,,-7076793.345,5565488.255
2,3,91527,104626,7/30/2024 8:00:00 PM,44.67166,-63.496656,COLE HARBOUR RD,,Non-intersection,Multiple vehicle - approaching sideswipe,...,Clear,Dry - normal,Normal,,,Straight,Level,,-7068415.111,5569979.849
3,4,91509,103045,7/30/2024 6:15:00 PM,44.672792,-63.531234,WOODLAWN RD,BRUCE ST,Intersection - two or more public roads,Multiple vehicle - left turn against traffic,...,Overcast or cloudy,Wet,Potholes or bumps,,,Curved,Slope,,-7072264.398,5570157.006
4,5,91537,103044,7/30/2024 6:13:00 PM,44.647794,-63.606797,Chebucto,,Non-intersection,Multiple vehicle - rear end,...,Clear,Dry - normal,Normal,,,Straight,Level,,-7080676.004,5566244.646


In [None]:
# Step 1: Preprocess the data

# Define a collision score based on fatal and non-fatal injuries
def calculate_collision_score(row):
    # Assign weights to fatal and non-fatal injuries
    fatal_score = 10 if row['Fatal Injury'] == 'Yes' else 0
    non_fatal_score = 5 if row['Non Fatal Injury'] == 'Yes' else 0
    return fatal_score + non_fatal_score

# Apply collision score to the dataset
collision_data['Collision Score'] = collision_data.apply(calculate_collision_score, axis=1)

# Select relevant features and handle missing values
selected_features = ['Road Location', 'Road Condition', 'Weather Condition', 
                     'Road Surface', 'Light Condition', 'Collision Score']
processed_data = collision_data[selected_features].dropna()

# Encode categorical variables
encoded_data = pd.get_dummies(processed_data, columns=[
    'Road Condition', 'Weather Condition', 'Road Surface', 'Light Condition'
])

# Separate features and target
X = encoded_data.drop(['Road Location', 'Collision Score'], axis=1)
y = encoded_data['Collision Score']

# Display the processed data for modeling
processed_data.head()


Unnamed: 0,Road Location,Road Condition,Weather Condition,Road Surface,Light Condition,CSI
0,COWIE HILL RD,Normal,Clear,Dry - normal,Dusk,0
1,BARRINGTON ST,Normal,Clear,Dry - normal,Dusk,0
2,COLE HARBOUR RD,Normal,Clear,Dry - normal,Daylight,0
3,WOODLAWN RD,Potholes or bumps,Overcast or cloudy,Wet,Daylight,0
4,Chebucto,Normal,Clear,Dry - normal,Daylight,0


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Display model performance
mae, r2


(0.02647320536520768, -0.026858642556691237)

In [22]:
# Step 1: Aggregate data by road location
road_group = collision_data.groupby('Road Location').agg({
    'Collision Score': 'mean',  # Average collision score per road
    'Weather Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Surface': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Light Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
}).reset_index()

# Encode categorical variables for modeling
encoded_road_data = pd.get_dummies(road_group, columns=[
    'Weather Condition', 'Road Surface', 'Light Condition', 'Road Condition'
])

# Separate features and target
X_road = encoded_road_data.drop(['Road Location', 'Collision Score'], axis=1)
y_road = encoded_road_data['Collision Score']

# Train-test split for model training
X_train_road, X_test_road, y_train_road, y_test_road = train_test_split(X_road, y_road, test_size=0.2, random_state=42)

# Train a Random Forest model
road_model = RandomForestRegressor(n_estimators=100, random_state=42)
road_model.fit(X_train_road, y_train_road)

# Evaluate the model
y_pred_road = road_model.predict(X_test_road)
road_mae = mean_absolute_error(y_test_road, y_pred_road)
road_r2 = r2_score(y_test_road, y_pred_road)

# Predict for all roads
road_group['Predicted Collision Score'] = road_model.predict(X_road)


# Display results
road_mae, road_r2


KeyError: "Column(s) ['Collision Score'] do not exist"

In [None]:
from dash import Dash, html, dcc, Input, Output, State
import dash_bootstrap_components as dbc
import plotly.graph_objects as go

# Prepare road data for Dash
road_data_display = road_group[['Road Location', 'Predicted Collision Score']]

# Initialize Dash app
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define layout
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H1("Road Collision Score Explorer", className="text-center mt-4"))),
    dbc.Row([
        dbc.Col([
            html.Label("Select Road:"),
            dcc.Dropdown(
                id="road-selector",
                options=[
                    {"label": road, "value": road}
                    for road in road_data_display['Road Location']
                ],
                placeholder="Select a road"
            ),
            dbc.Button("Get Collision Score", id="submit-button", color="primary", className="mt-3")
        ], width=4),
        dbc.Col([
            dcc.Graph(id="collision-score-gauge", style={"height": "400px"})
        ], width=8)
    ]),
    dbc.Row([
        dbc.Col(html.Div(id="road-details", className="mt-4"))
    ])
])

# Define callback to update the collision score visualization
@app.callback(
    Output("collision-score-gauge", "figure"),
    Output("road-details", "children"),
    Input("submit-button", "n_clicks"),
    State("road-selector", "value")
)
def update_road_score(n_clicks, selected_road):
    if not selected_road:
        return go.Figure(), "Please select a road to view its collision score."

    # Get predicted score for the selected road
    selected_row = road_data_display[road_data_display['Road Location'] == selected_road]
    predicted_score = selected_row['Predicted Collision Score'].values[0]*10

    # Create gauge figure
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=predicted_score,
        title={"text": "Predicted Collision Score"},
        gauge={
            "axis": {"range": [0, 10]},
            "bar": {"color": "darkblue"},
        }
    ))

    details = f"The predicted collision score for '{selected_road}' is {predicted_score:.2f}."
    return fig, details

# Run Dash app
app.run_server(debug=False, port=8051)


In [None]:
import re

# Step 1: Clean road location by removing numeric prefixes/suffixes
collision_data['Road Location'] = collision_data['Road Location'].apply(
    lambda x: re.sub(r'^\d+\s*|\s*\d+$', '', x).strip() if isinstance(x, str) else x
)

# Step 2: Re-aggregate data by cleaned road location
road_group_cleaned = collision_data.groupby('Road Location').agg({
    'Collision Score': 'mean',  # Average collision score per road
    'Weather Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Surface': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Light Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
}).reset_index()

# Step 3: Encode categorical variables for modeling
encoded_road_data_cleaned = pd.get_dummies(road_group_cleaned, columns=[
    'Weather Condition', 'Road Surface', 'Light Condition', 'Road Condition'
])

# Separate features and target
X_road_cleaned = encoded_road_data_cleaned.drop(['Road Location', 'Collision Score'], axis=1)
y_road_cleaned = encoded_road_data_cleaned['Collision Score']

# Train-test split for model training
X_train_road_cleaned, X_test_road_cleaned, y_train_road_cleaned, y_test_road_cleaned = train_test_split(
    X_road_cleaned, y_road_cleaned, test_size=0.2, random_state=42
)

# Train a new Random Forest model
road_model_cleaned = RandomForestRegressor(n_estimators=100, random_state=42)
road_model_cleaned.fit(X_train_road_cleaned, y_train_road_cleaned)

# Evaluate the new model
y_pred_road_cleaned = road_model_cleaned.predict(X_test_road_cleaned)
road_mae_cleaned = mean_absolute_error(y_test_road_cleaned, y_pred_road_cleaned)
road_r2_cleaned = r2_score(y_test_road_cleaned, y_pred_road_cleaned)

# Predict for all cleaned roads
road_group_cleaned['Predicted Collision Score'] = road_model_cleaned.predict(X_road_cleaned)



# Display model performance
road_mae_cleaned, road_r2_cleaned


(0.93949817697927, -0.04234142463143975)

In [None]:
# Prepare the cleaned road data for Dash
road_data_cleaned_display = road_group_cleaned[['Road Location', 'Predicted Collision Score']]

# Initialize Dash app
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define layout
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H1("Road Collision Score Explorer (Cleaned)", className="text-center mt-4"))),
    dbc.Row([
        dbc.Col([
            html.Label("Select Road:"),
            dcc.Dropdown(
                id="road-selector-cleaned",
                options=[
                    {"label": road, "value": road}
                    for road in road_data_cleaned_display['Road Location']
                ],
                placeholder="Select a road"
            ),
            dbc.Button("Get Collision Score", id="submit-button-cleaned", color="primary", className="mt-3")
        ], width=4),
        dbc.Col([
            dcc.Graph(id="collision-score-gauge-cleaned", style={"height": "400px"})
        ], width=8)
    ]),
    dbc.Row([
        dbc.Col(html.Div(id="road-details-cleaned", className="mt-4"))
    ])
])

# Define callback to update the collision score visualization
@app.callback(
    Output("collision-score-gauge-cleaned", "figure"),
    Output("road-details-cleaned", "children"),
    Input("submit-button-cleaned", "n_clicks"),
    State("road-selector-cleaned", "value")
)
def update_road_score_cleaned(n_clicks, selected_road):
    if not selected_road:
        return go.Figure(), "Please select a road to view its collision score."

    # Get predicted score for the selected road
    selected_row = road_data_cleaned_display[road_data_cleaned_display['Road Location'] == selected_road]
    predicted_score = selected_row['Predicted Collision Score'].values[0]

    # Create gauge figure
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=predicted_score,
        title={"text": "Predicted Collision Score"},
        gauge={
            "axis": {"range": [0, 10]},
            "bar": {"color": "darkblue"},
        }
    ))

    details = f"The predicted collision score for '{selected_road}' is {predicted_score:.2f}."
    return fig, details

# Run Dash app
app.run_server(debug=False, port=8051)


In [None]:
import dash
from dash import Dash, dcc, html, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Load data
data = pd.read_csv('cleaned_collisions.csv')

# Define the composite severity index for each collision
data['CSI'] = (
    (data['Fatal Injury'].astype(int) * 5)
    + (data['Pedestrian Collision'] * 3)
    + (data['Bicycle Collision'] * 1)
    + (data['Impaired Driving'] * 3)
    + (data['Aggressive Driving'] * 3)
    + (data['Distracted Driving'] * 2)
    + (data['Intersection Collision'] * 4)
)

# Features for predicting CSI (adjust as needed)
feature_cols = [
    'Light Condition_Dawn', 'Light Condition_Daylight', 'Light Condition_Dusk',
    'Artificial Light Condition_Street lights on',
    'Weather Condition_Dust or smoke', 'Weather Condition_Fog, mist or smog',
    'Weather Condition_Freezing rain', 'Weather Condition_Overcast or cloudy',
    'Weather Condition_Rain', 'Weather Condition_Snow', 'Weather Condition_Strong wind',
    'Pedestrian Collision', 'Aggressive Driving', 'Distracted Driving', 
    'Impaired Driving', 'Bicycle Collision', 'Intersection Collision'
]

X = data[feature_cols]
y = data['CSI']

# Train regression model to predict CSI
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X, y)

# Predict CSI for each collision
data['Predicted_CSI'] = reg_model.predict(X)

# Spatial clustering
coords = data[['Latitude WGS84', 'Longitude WGS84']].values
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

dbscan = DBSCAN(eps=0.3, min_samples=10)
clusters = dbscan.fit_predict(coords_scaled)
data['Cluster'] = clusters

# Filter out noise
data_clusters = data[data['Cluster'] != -1]

# Compute average predicted CSI per cluster
cluster_risk = data_clusters.groupby('Cluster')['Predicted_CSI'].mean().reset_index()
cluster_risk.rename(columns={'Predicted_CSI':'Avg_Predicted_CSI'}, inplace=True)

# Merge cluster coordinates (just for reference)
cluster_coords = data_clusters.groupby('Cluster')[['Latitude WGS84', 'Longitude WGS84']].mean().reset_index()
cluster_info = pd.merge(cluster_coords, cluster_risk, on='Cluster')

# Dash App
app = Dash(__name__, external_stylesheets=[dbc.themes.LUX])

app.layout = dbc.Container([
    html.H3("High-Risk Collision Areas Using Composite Severity Index (Density Heatmap)", className="text-center my-4 text-primary"),
    dbc.Row([
        dbc.Col([
            html.Label("Min Avg CSI Threshold:", className="fw-bold"),
            dcc.Slider(
                id="csi-threshold-slider",
                min=0, max=10, step=1, value=3,
                marks={i: str(i) for i in range(0,11)}
            ),
        ], width=3),
        dbc.Col([
            dcc.Graph(id='cluster-csi-map', style={"height":"600px"})
        ], width=9)
    ], className="mb-4"),

    dbc.Row([
        dbc.Col([
            html.Div(id="cluster-csi-details", className="mt-4")
        ], width=12)
    ])
], fluid=True)

@app.callback(
    [Output('cluster-csi-map', 'figure'),
     Output('cluster-csi-details', 'children')],
    [Input('csi-threshold-slider', 'value')]
)
def update_map(csi_threshold):
    # Filter clusters that meet the CSI threshold
    high_risk_clusters = cluster_info[cluster_info['Avg_Predicted_CSI'] >= csi_threshold]
    if high_risk_clusters.empty:
        fig = px.scatter_mapbox(
            lat=[], lon=[],
            mapbox_style="carto-positron",
            zoom=10,
            title="No clusters meet the criteria"
        )
        details = "No clusters have an average CSI above the selected threshold."
        return fig, details

    # Filter original collision data to only include collisions from these high-risk clusters
    filtered_data_points = data_clusters[data_clusters['Cluster'].isin(high_risk_clusters['Cluster'])]

    # Create a density heatmap (density_mapbox) weighted by Predicted_CSI
    # The 'z' argument allows weighting the density by the predicted CSI values.
    # Adjust radius as needed for smoothness.
    fig = px.density_mapbox(
        filtered_data_points,
        lat='Latitude WGS84',
        lon='Longitude WGS84',
        z='Predicted_CSI',
        radius=10,
        center=dict(lat=data['Latitude WGS84'].mean(), lon=data['Longitude WGS84'].mean()),
        zoom=10,
        mapbox_style="carto-positron",
        color_continuous_scale=px.colors.sequential.YlGnBu,
        title="High-Risk Collision Density (Predicted CSI Weighted)"
    )
    fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
    
    # Show details about the highest-risk clusters
    top_clusters = high_risk_clusters.nlargest(10, 'Avg_Predicted_CSI')
    details_table = html.Table([
        html.Thead(html.Tr([html.Th("Cluster"), html.Th("Avg CSI")]))
    ] + [
        html.Tr([html.Td(row['Cluster']), html.Td(f"{row['Avg_Predicted_CSI']:.2f}")]) 
        for _, row in top_clusters.iterrows()
    ])
    
    details = [
        html.H5("Highest-Risk Clusters"),
        details_table
    ]

    return fig, details

if __name__ == "__main__":
    app.run_server(debug=True, port=8053)


In [None]:
import dash
from dash import Dash, dcc, html, Input, Output
import dash_bootstrap_components as dbc
import plotly.express as px
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Load data
data = pd.read_csv('cleaned_collisions.csv')

# Define the composite severity index for each collision
data['CSI'] = (
    (data['Fatal Injury'].astype(int) * 5)
    + (data['Pedestrian Collision'] * 3)
    + (data['Bicycle Collision'] * 3)
    + (data['Impaired Driving'] * 2)
    + (data['Aggressive Driving'] * 2)
    + (data['Distracted Driving'] * 2)
    + (data['Intersection Collision'] * 1)
)

# Features for predicting CSI (adjust as needed)
feature_cols = [
    'Light Condition_Dawn', 'Light Condition_Daylight', 'Light Condition_Dusk',
    'Artificial Light Condition_Street lights on',
    'Weather Condition_Dust or smoke', 'Weather Condition_Fog, mist or smog',
    'Weather Condition_Freezing rain', 'Weather Condition_Overcast or cloudy',
    'Weather Condition_Rain', 'Weather Condition_Snow', 'Weather Condition_Strong wind',
    'Pedestrian Collision', 'Aggressive Driving', 'Distracted Driving', 
    'Impaired Driving', 'Bicycle Collision', 'Intersection Collision'
]

X = data[feature_cols]
y = data['CSI']

# Train regression model to predict CSI
reg_model = RandomForestRegressor(n_estimators=100, random_state=42)
reg_model.fit(X, y)

# Predict CSI for each collision
data['Predicted_CSI'] = reg_model.predict(X)

# Spatial clustering
coords = data[['Latitude WGS84', 'Longitude WGS84']].values
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords)

dbscan = DBSCAN(eps=0.3, min_samples=10)
clusters = dbscan.fit_predict(coords_scaled)
data['Cluster'] = clusters

# Filter out noise
data_clusters = data[data['Cluster'] != -1]

# Compute average predicted CSI per cluster
cluster_risk = data_clusters.groupby('Cluster')['Predicted_CSI'].mean().reset_index()
cluster_risk.rename(columns={'Predicted_CSI':'Avg_Predicted_CSI'}, inplace=True)

# Get cluster coordinates (mean location of collisions in the cluster)
cluster_coords = data_clusters.groupby('Cluster')[['Latitude WGS84', 'Longitude WGS84']].mean().reset_index()
cluster_info = pd.merge(cluster_coords, cluster_risk, on='Cluster')

# Dash App
app = Dash(__name__, external_stylesheets=[dbc.themes.LUX])

app.layout = dbc.Container([
    html.H3("High-Risk Collision Areas Using Composite Severity Index", className="text-center my-4 text-primary"),
    dbc.Row([
        dbc.Col([
            html.Label("Min Avg CSI Threshold:", className="fw-bold"),
            dcc.Slider(
                id="csi-threshold-slider",
                min=0, max=10, step=1, value=3,
                marks={i: str(i) for i in range(0,11)}
            ),
        ], width=3),
        dbc.Col([
            dcc.Graph(id='cluster-csi-map', style={"height":"600px"})
        ], width=9)
    ], className="mb-4"),

    dbc.Row([
        dbc.Col([
            html.Div(id="cluster-csi-details", className="mt-4")
        ], width=12)
    ])
], fluid=True)

@app.callback(
    [Output('cluster-csi-map', 'figure'),
     Output('cluster-csi-details', 'children')],
    [Input('csi-threshold-slider', 'value')]
)
def update_map(csi_threshold):
    filtered = cluster_info[cluster_info['Avg_Predicted_CSI'] >= csi_threshold]
    
    if filtered.empty:
        fig = px.scatter_mapbox(
            lat=[], lon=[],
            mapbox_style="carto-positron",
            zoom=10,
            title="No clusters meet the criteria"
        )
        details = "No clusters have an average CSI above the selected threshold."
        return fig, details

    fig = px.scatter_mapbox(
        filtered,
        lat='Latitude WGS84',
        lon='Longitude WGS84',
        size='Avg_Predicted_CSI',
        color='Avg_Predicted_CSI',
        color_continuous_scale=px.colors.sequential.OrRd,
        mapbox_style="carto-positron",
        zoom=10,
        title="High-Risk Collision Clusters (Predicted CSI)"
    )
    fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
    
    top_clusters = filtered.nlargest(10, 'Avg_Predicted_CSI')
    details_table = html.Table([
        html.Thead(html.Tr([html.Th("Cluster"), html.Th("Avg CSI")]))
    ] + [
        html.Tr([html.Td(row['Cluster']), html.Td(f"{row['Avg_Predicted_CSI']:.2f}")]) 
        for _, row in top_clusters.iterrows()
    ])
    
    details = [
        html.H5("Highest-Risk Clusters"),
        details_table
    ]

    return fig, details


if __name__ == "__main__":
    app.run_server(debug=True, port=8095)


In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from dash import Dash, html, dcc, Input, Output, State
import dash_bootstrap_components as dbc
import plotly.graph_objects as go

# Define a Composite Severity Index (CSI) function
def calculate_csi(row):
    # Example weighting scheme - adjust as needed
    fatal = 5 if 'Fatal Injury' in row and row['Fatal Injury'] == 'Yes' else 0
    non_fatal = 2 if 'Non Fatal Injury' in row and row['Non Fatal Injury'] == 'Yes' else 0
    ped = 3 if 'Pedestrian Collision' in row and row['Pedestrian Collision'] == 'Yes' else 0
    bike = 3 if 'Bicycle Collision' in row and row['Bicycle Collision'] == 'Yes' else 0
    impaired = 2 if 'Impaired Driving' in row and row['Impaired Driving'] == 'Yes' else 0
    aggressive = 1 if 'Aggressive Driving' in row and row['Aggressive Driving'] == 'Yes' else 0
    distracted = 1 if 'Distracted Driving' in row and row['Distracted Driving'] == 'Yes' else 0
    intersection = 1 if 'Intersection Collision' in row and row['Intersection Collision'] == 'Yes' else 0
    
    return fatal + non_fatal + ped + bike + impaired + aggressive + distracted + intersection

# Compute CSI for each collision in the dataset
collision_data['CSI'] = collision_data.apply(calculate_csi, axis=1)

# Select relevant features and handle missing values
selected_features = ['Road Location', 'Road Condition', 'Weather Condition', 
                     'Road Surface', 'Light Condition', 'CSI']
processed_data = collision_data[selected_features].dropna()

# Encode categorical variables
encoded_data = pd.get_dummies(processed_data, columns=[
    'Road Condition', 'Weather Condition', 'Road Surface', 'Light Condition'
])

# Separate features and target for the initial model
X = encoded_data.drop(['Road Location', 'CSI'], axis=1)
y = encoded_data['CSI']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print("MAE:", mae, "R2:", r2)

# Step 2: Aggregate data by road location using CSI
road_group = collision_data.groupby('Road Location').agg({
    'CSI': 'mean',  # Average CSI per road
    'Weather Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Surface': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Light Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
    'Road Condition': lambda x: x.mode()[0] if not x.mode().empty else 'Unknown',
}).reset_index()

# Encode categorical variables for road-level modeling
encoded_road_data = pd.get_dummies(road_group, columns=[
    'Weather Condition', 'Road Surface', 'Light Condition', 'Road Condition'
])

# Separate features and target
X_road = encoded_road_data.drop(['Road Location', 'CSI'], axis=1)
y_road = encoded_road_data['CSI']

# Train-test split for road-level model
X_train_road, X_test_road, y_train_road, y_test_road = train_test_split(X_road, y_road, test_size=0.2, random_state=42)

# Train a Random Forest model at the road level
road_model = RandomForestRegressor(n_estimators=100, random_state=42)
road_model.fit(X_train_road, y_train_road)

# Evaluate the model at the road level
y_pred_road = road_model.predict(X_test_road)
road_mae = mean_absolute_error(y_test_road, y_pred_road)
road_r2 = r2_score(y_test_road, y_pred_road)
print("Road-Level MAE:", road_mae, "Road-Level R2:", road_r2)

# Predict CSI for all roads
road_group['Predicted CSI'] = road_model.predict(X_road)

# Prepare road data for Dash
road_data_display = road_group[['Road Location', 'Predicted CSI']]

# Initialize Dash app
app = Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Define layout
app.layout = dbc.Container([
    dbc.Row(dbc.Col(html.H1("Road CSI Explorer", className="text-center mt-4"))),
    dbc.Row([
        dbc.Col([
            html.Label("Select Road:"),
            dcc.Dropdown(
                id="road-selector",
                options=[
                    {"label": road, "value": road}
                    for road in road_data_display['Road Location']
                ],
                placeholder="Select a road"
            ),
            dbc.Button("Get CSI", id="submit-button", color="primary", className="mt-3")
        ], width=4),
        dbc.Col([
            dcc.Graph(id="csi-gauge", style={"height": "400px"})
        ], width=8)
    ]),
    dbc.Row([
        dbc.Col(html.Div(id="road-details", className="mt-4"))
    ])
])

# Define callback to update the CSI visualization
@app.callback(
    [Output("csi-gauge", "figure"),
     Output("road-details", "children")],
    [Input("submit-button", "n_clicks")],
    [State("road-selector", "value")]
)
def update_road_csi(n_clicks, selected_road):
    if not selected_road:
        return go.Figure(), "Please select a road to view its CSI."

    # Get predicted CSI for the selected road
    selected_row = road_data_display[road_data_display['Road Location'] == selected_road]
    predicted_csi = selected_row['Predicted CSI'].values[0]*10

    # Determine a suitable range for the gauge
    # Adjust based on your data distribution; here we assume a max CSI of around 20
    max_csi = 20  
    fig = go.Figure(go.Indicator(
        mode="gauge+number",
        value=predicted_csi,
        title={"text": "Predicted CSI"},
        gauge={
            "axis": {"range": [0, max_csi]},
            "bar": {"color": "darkblue"},
        }
    ))

    details = f"The predicted CSI for '{selected_road}' is {predicted_csi:.2f}."
    return fig, details

# Run Dash app
app.run_server(debug=False, port=8057)


MAE: 0.4958881069504263 R2: -0.007160340189543701
Road-Level MAE: 0.38111545729063007 Road-Level R2: -0.037607560302231446
