In [1]:
# Remove the placeholder and ensure pandas is imported only once
# Since pandas is already imported in other cells, you can remove the placeholder entirely
import pandas as pd
import os

# Define the path to the dataset
data_folder = os.path.join(os.getcwd(), './data')
dataset_path = os.path.join(data_folder, 'Collisions Dataset.csv')

# Read the dataset
try:
    collisions_df = pd.read_csv(dataset_path)
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print(f"Error: The file 'collisions.csv' was not found in the 'data' folder.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

Dataset loaded successfully.


In [3]:
# Convert the 'Date & Time' column to datetime format
collisions_df["Date & Time"] = pd.to_datetime(collisions_df["Date & Time"], format='%m/%d/%Y %I:%M %p')

# Extract the hour from the time portion
collisions_df["Hour"] = collisions_df["Date & Time"].dt.hour


In [4]:
# Create a function to categorize the time of day
def categorize_time_of_day(hour):
    if 6 <= hour < 12:
        return "Morning"
    elif 12 <= hour < 18:
        return "Afternoon"
    elif 18 <= hour < 24:
        return "Evening"
    elif 0 <= hour < 6:
        return "Night"
    else:
        return "NaN"

# Apply the function to categorize based on the hour
collisions_df["Time of Day"] = collisions_df["Hour"].apply(categorize_time_of_day)

In [5]:
print("New dataframe with time of day:")
print(collisions_df[["Date & Time", "Hour", "Time of Day"]])
time_of_day = collisions_df["Time of Day"].value_counts()
print("Counts of each time of day category:")
print(time_of_day)

New dataframe with time of day:
                Date & Time  Hour Time of Day
0       2014-06-16 05:28:00     5       Night
1       2015-07-09 13:18:00    13   Afternoon
2       2014-08-22 18:22:00    18     Evening
3       2014-01-01 01:09:00     1       Night
4       2014-01-01 01:47:00     1       Night
...                     ...   ...         ...
3804262 2023-06-02 23:54:00    23     Evening
3804263 2023-06-07 23:43:00    23     Evening
3804264 2023-02-21 09:19:00     9     Morning
3804265 2023-08-11 19:56:00    19     Evening
3804266 2023-07-01 17:37:00    17   Afternoon

[3804267 rows x 3 columns]
Counts of each time of day category:
Time of Day
Afternoon    1640235
Morning       981954
Evening       904177
Night         277901
Name: count, dtype: int64


In [6]:
age_df = collisions_df["Driver Age (Crash Level) "].value_counts()
print("Counts of each driver age category:")
print(age_df)

Counts of each driver age category:
Driver Age (Crash Level) 
0                   41602
[0,0]               38761
19                  23918
21                  23742
22                  23608
                    ...  
[25,27,38,59]           1
[46,56,94]              1
[0,52,55,64]            1
[28,54,54,58,65]        1
[22,46,66,69]           1
Name: count, Length: 82494, dtype: int64


In [2]:
dtype_types = {
    'Collision ID': 'float32',
    'Posted Speed (Crash Level)': 'float32',  # Assuming numeric; may adjust if needed
    'Speed Limit': 'float32',
    'Speed Related': 'category',              # Likely 'Y'/'N' or boolean
    'Distracted Driver (Suspected)': 'category',
    'Distracted Driver Related (Confirmed)': 'category',
    'Suspected At Fault': 'category',
    'Aggressive Driving Related': 'category',
    'Red Light Running T/F': 'category',      # 'T'/'F'
    'Severity Score': 'float32',
    '# of Fatalities per Crash': 'int16',
    '# of Injuries per crash': 'int16',
    'Date & Time': 'object',                 # load as string initially (will parse later if needed)
    'Driver Condition (Crash Level)': 'category',
    'Driver Age (Crash Level)': 'float32',
    'Hit & Run Related': 'category',
    'Impaired Driving (Confirmed)': 'category',
    'Lat': 'float32',
    'Long': 'float32',
    'Person ID': 'int32',
    'Roadway Contributing Factors': 'category',
    'Surface Condition (Crash Level)': 'category',
    'Weather Conditions (Crash Level)': 'category',
    'Light Conditions (Crash Level)': 'category',
    'Traffic Control': 'category',
    'KABCO Severity': 'category',
    'Vehicle Contributing Factor (Crash Level)': 'category'
}

def fill_na_int(val):
    if val in ('', 'NA', 'NaN'):
        return -1  # some sentinel
    return int(val)

df = pd.read_csv(dataset_path, converters={'Collision ID': fill_na_int}, parse_dates=['Date & Time'], 
                 low_memory=False)
print("Data loaded. Shape:", df.shape)
print(df.head(5))

Data loaded. Shape: (3804267, 31)
   Collision ID  Posted Speed (Crash Level)   Speed Limit  \
0       4512922                          0.0          NaN   
1       4527603                          0.0          NaN   
2       4547444                         35.0          NaN   
3       4691880                         45.0         35.0   
4       4691882                         25.0         30.0   

              Speed Related  Distracted Driver (Suspected)  \
0  [false,false,false,true]                          False   
1              [false,true]                          False   
2       [false,false,false]                          False   
3                     false                           True   
4             [false,false]                          False   

  Distracted Driver Related (Confirmed) Suspected At Fault  \
0             [false,false,false,false]                Yes   
1                         [false,false]                Yes   
2                   [false,false,false] 

In [3]:
# import pandas as pd

def fill_na_int(val):
    if val in ('', 'NA', 'NaN'):
        return -1  # sentinel
    return int(val)

# CSV path (replace with your actual file path)
dataset_path = "./data/Collisions Dataset.csv"

# 1) Load data, using a converter for 'Collision ID', parse 'Date & Time'
df = pd.read_csv(
    dataset_path, 
    converters={'Collision ID': fill_na_int},  # fill NA with -1
    parse_dates=['Date & Time'], 
    low_memory=False
)

print("Data loaded. Shape:", df.shape)
print(df.head(5))

# 2) If you want to treat Collision ID with missing (-1) as a real 'missing' integer, you can convert it:
df['Collision ID'] = df['Collision ID'].replace(-1, pd.NA)    # turn sentinel -1 into real missing
df['Collision ID'] = df['Collision ID'].astype('Int32')       # Pandas extension type that supports NA


Data loaded. Shape: (3804267, 31)
   Collision ID  Posted Speed (Crash Level)   Speed Limit  \
0       4512922                          0.0          NaN   
1       4527603                          0.0          NaN   
2       4547444                         35.0          NaN   
3       4691880                         45.0         35.0   
4       4691882                         25.0         30.0   

              Speed Related  Distracted Driver (Suspected)  \
0  [false,false,false,true]                          False   
1              [false,true]                          False   
2       [false,false,false]                          False   
3                     false                           True   
4             [false,false]                          False   

  Distracted Driver Related (Confirmed) Suspected At Fault  \
0             [false,false,false,false]                Yes   
1                         [false,false]                Yes   
2                   [false,false,false] 

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

def fill_na_int(val):
    if val in ('', 'NA', 'NaN'):
        return -1
    return int(val)

# 1) Load data with converter for Collision ID
df = pd.read_csv(
    dataset_path,
    converters={'Collision ID': fill_na_int},
    parse_dates=['Date & Time'],
    low_memory=False
)
print("Initial load - shape:", df.shape)

# Replace sentinel -1 with NA and convert to a nullable integer type
df['Collision ID'] = df['Collision ID'].replace(-1, pd.NA).astype('Int32')

# Example dropping or reassigning columns you won't use:
# df.drop(columns=['Person ID', 'Latitude', 'Longitude'], errors='ignore', inplace=True)

# 2) Fill missing for categorical columns
categorical_cols = [
    col for col in df.columns 
    if (df[col].dtype.name == 'category') or (df[col].dtype == object)
]
for col in categorical_cols:
    df[col] = df[col].fillna('Unknown')  # assign back to df[col]

# 3) Fill numeric columns with median
numeric_cols = [col for col in df.columns if pd.api.types.is_numeric_dtype(df[col])]
for col in numeric_cols:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].median())  # assign back

# 4) Convert boolean-like columns to 0/1 if they exist
bool_cols = [
    'Speed Related', 'Distracted Driver (Suspected)', 
    'Distracted Driver Related (Confirmed)', 'Suspected At Fault',
    'Aggressive Driving Related', 'Red Light Running T/F',
    'Hit & Run Related', 'Impaired Driving (Confirmed)'
]
for col in bool_cols:
    if col in df.columns:
        df[col] = df[col].astype(str).str.upper()
        df[col] = df[col].map({'Y':1, 'YES':1, 'T':1, 'N':0, 'NO':0, 'F':0})
        df[col] = df[col].fillna(0).astype('int8')

# 5) Handle high-cardinality columns
remaining_cat_cols = []
for col in categorical_cols:
    if col in df.columns and col not in bool_cols and col != 'Date & Time':
        unique_count = df[col].nunique(dropna=True)
        if unique_count > 100:
            # We'll do label encoding
            print(f"[Label Encoding] '{col}' has {unique_count} unique categories.")
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].astype(str))
        else:
            remaining_cat_cols.append(col)  # keep for one-hot if you want

# 6) One-hot encode only the remaining moderate-cardinality columns
if remaining_cat_cols:
    print("One-hot encoding these columns:", remaining_cat_cols)
    df = pd.get_dummies(df, columns=remaining_cat_cols, drop_first=True)

print("Shape after encoding:", df.shape)

# 7) Extract optional time-based features from 'Date & Time'
if 'Date & Time' in df.columns:
    df['Hour'] = df['Date & Time'].dt.hour
    df['Weekday'] = df['Date & Time'].dt.dayofweek
    df = df.drop(columns=['Date & Time'])

print("Preprocessing complete. Final shape:", df.shape)


Initial load - shape: (3804267, 31)
[Label Encoding] 'Suspected At Fault.1' has 158 unique categories.
[Label Encoding] 'Driver Condition (Crash Level) ' has 312 unique categories.
[Label Encoding] 'Driver Age (Crash Level) ' has 82495 unique categories.
[Label Encoding] 'Person ID' has 1715 unique categories.
[Label Encoding] 'Roadway Contributing Factors' has 574 unique categories.
[Label Encoding] 'Traffic Control' has 824 unique categories.
[Label Encoding] 'Vehicle Contributing Factor (Crash Level)' has 336 unique categories.
One-hot encoding these columns: ['Drug Test Results (Crash Level) ', 'Surface Condition (Crash Level) ', 'Weather Conditions (Crash Level)', 'Light Conditions (Crash Level)', 'KABCO Severity']
Shape after encoding: (3804267, 70)
Preprocessing complete. Final shape: (3804267, 71)


In [5]:
# 3.1 If 'Severity Score' was dropped or changed, ensure it's still in df or re-merge it
# For example, if you originally stored it in another df or dropped it, you can re-attach it here
# df = df.merge(severity_df, on='Collision ID')   # only if needed

# 3.2 Define thresholds for low, medium, high:
def label_risk(sev):
    if sev >= 8.0:        # e.g., 8–10 => High
        return 'High Risk'
    elif sev >= 4.0:      # 4–7.99 => Medium
        return 'Medium Risk'
    else:                 # 1–3.99 => Low
        return 'Low Risk'

df['Risk Label'] = df['Severity Score'].apply(label_risk)

# 3.3 Create a normalized 0–100 risk score (optional)
min_score = df['Severity Score'].min()
max_score = df['Severity Score'].max()
df['Risk Score (0–100)'] = ((df['Severity Score'] - min_score) / (max_score - min_score) * 100).round(1)

# Inspect the distribution of classes
print(df['Risk Label'].value_counts())
print(df[['Severity Score','Risk Label','Risk Score (0–100)']].head(10))
print(df.shape)


Risk Label
Low Risk       3491492
Medium Risk     297800
High Risk        14975
Name: count, dtype: int64
   Severity Score Risk Label  Risk Score (0–100)
0             1.0   Low Risk                 0.0
1             1.0   Low Risk                 0.0
2             1.0   Low Risk                 0.0
3             1.0   Low Risk                 0.0
4             1.0   Low Risk                 0.0
5             1.0   Low Risk                 0.0
6             1.0   Low Risk                 0.0
7             1.0   Low Risk                 0.0
8             1.0   Low Risk                 0.0
9             1.0   Low Risk                 0.0
(3804267, 73)


In [6]:
from sklearn.model_selection import train_test_split

# 4.1 Separate features (X) and the classification target (y)
# drop the columns you don't want to feed into the model
X = df.drop(columns=['Severity Score','Risk Score (0–100)','Risk Label', 'Work Zone Indicator']) 
y = df['Risk Label']

# 4.2 Train/test split (stratify by Risk Label to preserve class proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X, 
    y, 
    test_size=0.2, 
    stratify=y, 
    random_state=42
)
print("Train size:", X_train.shape, "Test size:", X_test.shape)

# 4.3 Train Logistic Regression as an interpretable baseline
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(
    multi_class='ovr', 
    solver='saga',        # good for large datasets
    penalty='l2', 
    C=1.0, 
    max_iter=1000, 
    n_jobs=-1, 
    random_state=42
)

log_reg.fit(X_train, y_train)

# 4.4 Evaluate the logistic model
from sklearn.metrics import classification_report
y_pred_lr = log_reg.predict(X_test)
print("Logistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))


KeyboardInterrupt: 

In [39]:
y.value_counts()

Risk Label
Low Risk       3491492
Medium Risk     297800
High Risk        14975
Name: count, dtype: int64

In [42]:
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(
    class_weight='balanced_subsample',
    n_estimators=100, 
    max_depth=None, 
    min_samples_leaf=3, 
    n_jobs=-1, 
    random_state=42
)
rf_clf.fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

report = classification_report(y_test, y_pred_rf, output_dict=True)
report_df = pd.DataFrame(report).transpose()

print(report)

Random Forest Classification Report:
              precision    recall  f1-score   support

   High Risk       1.00      1.00      1.00      2995
    Low Risk       1.00      1.00      1.00    698299
 Medium Risk       1.00      1.00      1.00     59560

    accuracy                           1.00    760854
   macro avg       1.00      1.00      1.00    760854
weighted avg       1.00      1.00      1.00    760854

{'High Risk': {'precision': 1.0, 'recall': 1.0, 'f1-score': 1.0, 'support': 2995.0}, 'Low Risk': {'precision': 1.0, 'recall': 0.999994271794747, 'f1-score': 0.9999971358891704, 'support': 698299.0}, 'Medium Risk': {'precision': 0.9999328453428246, 'recall': 1.0, 'f1-score': 0.9999664215439374, 'support': 59560.0}, 'accuracy': 0.9999947427495945, 'macro avg': {'precision': 0.9999776151142749, 'recall': 0.999998090598249, 'f1-score': 0.9999878524777026, 'support': 760854.0}, 'weighted avg': {'precision': 0.9999947431026434, 'recall': 0.9999947427495945, 'f1-score': 0.9999947428

In [43]:
import numpy as np

feature_names = X_train.columns

# 5.1 Random Forest Feature Importances
importances = rf_clf.feature_importances_
indices = np.argsort(importances)[::-1]
top_n = 10  # top 10
print("Top 10 RF Feature Importances:")
for idx in indices[:top_n]:
    print(f"  {feature_names[idx]} = {importances[idx]:.4f}")

# 5.2 Logistic Regression Coefficients (one-vs-rest)
classes = log_reg.classes_  # e.g. ['High Risk','Low Risk','Medium Risk'] in alphabetical order
print("\nLogistic Regression Coefficients for 'High Risk':")
high_idx = list(classes).index('High Risk')
coefs_high = log_reg.coef_[high_idx]
coef_sorted_idx = np.argsort(coefs_high)[::-1]
for idx in coef_sorted_idx[:top_n]:
    print(f"  {feature_names[idx]}: {coefs_high[idx]:.4f}")


Top 10 RF Feature Importances:
  # of Fatalities per Crash = 0.2068
  KABCO Severity_(K) Fatal Injury = 0.2057
  KABCO Severity_(O) No Injury = 0.1640
  KABCO Severity_(B) Suspected Minor/Visible Injury = 0.1501
  # of Injuries per crash  = 0.1184
  KABCO Severity_(C) Possible Injury / Complaint = 0.0859
  Driver Age (Crash Level)  = 0.0134
  Driver Condition (Crash Level)  = 0.0109
  Vehicle Contributing Factor (Crash Level) = 0.0054
  Suspected At Fault.1 = 0.0051

Logistic Regression Coefficients for 'High Risk':
  Long: 0.0000
  Posted Speed (Crash Level) : 0.0000
  # of Fatalities per Crash: 0.0000
  KABCO Severity_(K) Fatal Injury: 0.0000
  # of Injuries per crash : 0.0000
  Light Conditions (Crash Level)_Dark-Not Lighted: 0.0000
  Drug Test Results (Crash Level) _Pending: 0.0000
  Drug Test Results (Crash Level) _["Pending","Pending"]: 0.0000
  Drug Test Results (Crash Level) _Marijuana: 0.0000
  Weather Conditions (Crash Level)_Fog: 0.0000


In [47]:
# logistic probabilities on the test set
probs_lr = log_reg.predict_proba(X_test)
high_risk_index = list(log_reg.classes_).index('High Risk')
high_risk_prob = probs_lr[:, high_risk_index]  # probability of High Risk

# convert to 0-100
pred_risk_score = (high_risk_prob * 100).round(1)

# show a few examples
results_df = pd.DataFrame({
    "Predicted Risk Score": pred_risk_score[:10],
    "Predicted Label": y_pred_lr[:10],
    "Actual Label": y_test.iloc[:10].values
})
print(results_df)


   Predicted Risk Score Predicted Label Actual Label
0                   0.1        Low Risk     Low Risk
1                   0.3        Low Risk     Low Risk
2                   0.0        Low Risk     Low Risk
3                   0.1        Low Risk     Low Risk
4                   0.3        Low Risk     Low Risk
5                   0.0        Low Risk     Low Risk
6                   0.5        Low Risk     Low Risk
7                   0.0        Low Risk     Low Risk
8                   1.6        Low Risk     Low Risk
9                   0.4        Low Risk     Low Risk
