In [None]:
import pandas as pd
import numpy as np
import sqlite3
import altair as alt
import scipy.interpolate as interpolate
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import Lasso
from sklearn.impute import SimpleImputer

# Turn off Altair's max-rows check so charts can be built from larger frames.
alt.data_transformers.disable_max_rows()

In [None]:
# Load every collision recorded for Los Angeles county from switrs.sqlite.
con = sqlite3.connect("switrs.sqlite")

query = """
    SELECT * FROM collisions WHERE county_location = 'los angeles'
    """

df = pd.read_sql_query(query, con, parse_dates = ["collision_date"])
df["year"] = df["collision_date"].dt.year
# Parse the clock time with the same explicit format used for the "minute"
# feature later in the notebook, instead of relying on format inference.
df["hour"] = pd.to_datetime(df["collision_time"], format = "%H:%M:%S").dt.hour
# .copy() gives an independent frame, so the column assignment below does not
# write into a filtered view (avoids SettingWithCopyWarning).
df = df.query("year < 2021").copy() # remove incomplete 2021 data
df["alcohol_involved"] = df["alcohol_involved"].fillna(0) # convert NaN to 0 in alcohol use column

# Narrow frame with only the columns used by the exploratory plots.
dfc = df[["case_id", "county_location", "alcohol_involved", "collision_severity", "injured_victims", "collision_date", "year", "collision_time", "hour", "party_count"]]

#### Distribution of the proportion of crashes involving alcohol by time of day

In [None]:
# We will attempt to use multiple models to fit this distribution. We will use polynomial splines and B-splines but it appears a skewed-normal may be a possible fit as well.

# One entry per hour of the day that occurs in the data (index and value are both the hour).
x = dfc.groupby("hour")["hour"].mean()

# Share of each group's collisions that falls in each hour: per-hour count
# divided by the total number of collisions in that group.
ya = dfc[dfc["alcohol_involved"] == 1].groupby("hour")["alcohol_involved"].count() / len(dfc[dfc["alcohol_involved"] == 1])
yn = dfc[dfc["alcohol_involved"] == 0].groupby("hour")["alcohol_involved"].count() / len(dfc[dfc["alcohol_involved"] == 0])

# Shift the clock so the axis starts at 8am (hour 8 -> 0, ..., hour 7 -> 23).
# Deriving the shift from the hours actually present keeps the column the same
# length as x even if some hour has no collisions.
adj_hour = ((x + 16) % 24).astype(int).tolist()
adjlabel = "{0: '8am', 1: '9am', 2: '10am', 3: '11am', 4: '12pm', 5: '1pm', 6: '2pm', 7: '3pm', 8: '4pm', 9: '5pm', 10: '6pm', 11: '7pm', 12: '8pm', 13: '9pm', 14: '10pm', 15: '11pm', 16: '12am', 17: '1am', 18: '2am', 19: '3am', 20: '4am', 21: '5am', 22: '6am', 23: '7am'}[datum.value]"

dfah = pd.DataFrame({"hour": x, "adj_hour": adj_hour, "collisions_a": ya, "collisions_n": yn})

# The plotted values are shares of each group's collisions, not raw counts,
# so the axis titles say so.
pla = alt.Chart(dfah).mark_bar(width = 20, color = "orange", opacity = 0.75).encode(
    x = alt.X("adj_hour:O", title = "Hour of the day", axis = alt.Axis(labelExpr = adjlabel)),
    y = alt.Y("collisions_a", title = "Share of collisions (alcohol involved)").stack(None)
).properties(width = 600, height = 200)

pln = alt.Chart(dfah).mark_bar(width = 20, opacity = 0.75).encode(
    x = alt.X("adj_hour:O", title = "Hour of the day", axis = alt.Axis(labelExpr = adjlabel)),
    y = alt.Y("collisions_n", title = "Share of collisions (alcohol not involved)").stack(None)
).properties(width = 600, height = 200)

# Overlay both bar series; each keeps its own y axis.
(pln + pla).resolve_axis(y = "independent").display()

In [None]:
# Smoothing cubic spline through the hourly proportion of collisions that
# involve alcohol, drawn over the observed proportions.

# Hour labels are shifted so that the axis starts at 8am.
adj_hour = list(range(16, 24)) + list(range(0, 16))
adjlabel = "{0: '8am', 1: '9am', 2: '10am', 3: '11am', 4: '12pm', 5: '1pm', 6: '2pm', 7: '3pm', 8: '4pm', 9: '5pm', 10: '6pm', 11: '7pm', 12: '8pm', 13: '9pm', 14: '10pm', 15: '11pm', 16: '12am', 17: '1am', 18: '2am', 19: '3am', 20: '4am', 21: '5am', 22: '6am', 23: '7am'}[datum.value]"
adjscale = alt.Scale(domain = [-0.5, 23.5])

# Observed data: one point per hour.
by_hour = dfc.groupby("hour")
x = by_hour["hour"].mean()
y = by_hour["alcohol_involved"].mean()

# 24 evenly spaced evaluation points between the first and last hour.
xx = np.linspace(x.min(), x.max(), 24)

# Fit the spline (degree 3, smoothing factor 1e-4) and evaluate it on xx.
tck = interpolate.splrep(x, y, k = 3, s = 0.0001)
t, c, k = tck
ypred = interpolate.splev(xx, (t, c, k))

dfspl = pd.DataFrame({"hour": x, "adj_hour": adj_hour, "p": y, "p_fit": ypred})

# spl_rmse holds the mean squared error; the square root is taken when printing.
spl_rmse = mean_squared_error(dfspl["p"], dfspl["p_fit"])
print("Root mean squared error: %.5f" % np.sqrt(spl_rmse))

# Bars: observed proportion per hour.
bar_x = alt.X("adj_hour", axis = alt.Axis(labelExpr = adjlabel))
bar_y = alt.Y("p")
spl_base = (
    alt.Chart(dfspl)
    .mark_bar(width = 20, color = "orange")
    .encode(x = bar_x, y = bar_y)
    .properties(width = 600, height = 200)
)

# Line: fitted spline values.
line_x = alt.X("adj_hour", title = "Hour of the day", scale = adjscale)
line_y = alt.Y("p_fit", title = "Probability of alcohol involvement")
spl = (
    alt.Chart(dfspl)
    .mark_line(color = "red")
    .encode(x = line_x, y = line_y)
    .properties(width = 600, height = 200)
)

(spl_base + spl).display()

#### Distributions of the number of injuries per collision, alcohol vs. no alcohol

In [None]:
# These appear to follow exponential or Weibull distributions. We will determine which is the best fit and then compare to see if alcohol has an effect on the distribution.

#### Timeline for number of collisions

In [None]:
# Train on data through 2019, predict 2020, and compare the predictions with the actual counts. This will be done using a time series model.

#### Timeline for number of collisions (alcohol vs no alcohol)

In [None]:
# Train on data through 2019 and predict 2020, but separately for alcohol-involved and non-alcohol collisions, comparing each prediction with the actual counts. This will also be done using a time series model.

#### Feature selection using LASSO on fitted GLM against several labels

In [None]:
# Lasso regression of the number of injured victims on the one-hot encoded
# features; the absolute coefficients are used as a rough feature ranking.
#
# NOTE: this cell reads `X` and `dfrf`, which are built in the feature
# preparation cell of the Random Forest section further down; that cell has to
# be run before this one.

# One-hot encode categorical variables
Xoh = pd.get_dummies(X)

# Define the target variable
y = dfrf["injured_victims"].fillna(0)

# Pick the train/test rows BEFORE imputing, so the column means are learned
# from training rows only and nothing about the test rows leaks into training.
# Splitting row positions with the same random_state selects the same rows as
# splitting the frames themselves.
rows_tr, rows_te = train_test_split(np.arange(len(Xoh)), random_state=13)

# Impute missing values with the training-set mean of each column
imputer = SimpleImputer(strategy='mean')
imputer.fit(Xoh.iloc[rows_tr])

# Transform every row and convert back to a DataFrame, keeping the original
# row index so the features stay aligned with y by label as well as position.
Xoh_imputed = pd.DataFrame(imputer.transform(Xoh), columns=Xoh.columns, index=Xoh.index)

# Split the data into training and testing sets
X_tr, X_te = Xoh_imputed.iloc[rows_tr], Xoh_imputed.iloc[rows_te]
y_tr, y_te = y.iloc[rows_tr], y.iloc[rows_te]

# Initialize and fit the Lasso model
# NOTE(review): the features are not standardized, so coefficient magnitudes
# depend on each column's units - consider scaling before ranking features.
lasso = Lasso(alpha=0.1, random_state=13)
lasso.fit(X_tr, y_tr)

# Predict on the test set
lasso_pred = lasso.predict(X_te)

# Calculate and print RMSE
rmse_lasso = np.sqrt(mean_squared_error(y_te, lasso_pred))
print(f"Lasso RMSE: {rmse_lasso:.3f}")

# Show the coefficients of the features to understand feature importance
feature_importance_lasso = pd.DataFrame({
    "feature": X_tr.columns,
    "coefficient": np.abs(lasso.coef_)
}).sort_values("coefficient", ascending=False)

# Print the top 20 most important features
print("Top Features (by absolute coefficient value):")
print(feature_importance_lasso.head(20))

In [None]:
# Same Lasso ranking as for the injured-victim count, but the label is the
# number of injured victims per party involved, so party_count is removed
# from the features.
#
# NOTE: this cell reads `X` and `dfrf`, which are built in the feature
# preparation cell of the Random Forest section further down; that cell has to
# be run before this one.
Xoh2 = pd.get_dummies(X.drop("party_count", axis = 1))
y2 = dfrf["injured_victims"].fillna(0) / dfrf["party_count"].fillna(1)

# Pick the train/test rows BEFORE imputing, so the column means are learned
# from training rows only and nothing about the test rows leaks into training.
# Splitting row positions with the same random_state selects the same rows as
# splitting the frames themselves.
rows_tr, rows_te = train_test_split(np.arange(len(Xoh2)), random_state=13)

# A fresh imputer, so this cell does not depend on one created elsewhere.
imputer = SimpleImputer(strategy='mean')
imputer.fit(Xoh2.iloc[rows_tr])

# Transform every row and convert back to a DataFrame, keeping the original
# row index so the features stay aligned with y2 by label as well as position.
Xoh2_imputed = pd.DataFrame(imputer.transform(Xoh2), columns=Xoh2.columns, index=Xoh2.index)

# Split the data into training and testing sets
X_tr, X_te = Xoh2_imputed.iloc[rows_tr], Xoh2_imputed.iloc[rows_te]
y_tr, y_te = y2.iloc[rows_tr], y2.iloc[rows_te]

# Initialize and fit the Lasso model
# NOTE(review): the features are not standardized, so coefficient magnitudes
# depend on each column's units - consider scaling before ranking features.
lasso = Lasso(alpha=0.1, random_state=13)
lasso.fit(X_tr, y_tr)

# Predict on the test set
lasso_pred = lasso.predict(X_te)

# Calculate and print RMSE
rmse_lasso = np.sqrt(mean_squared_error(y_te, lasso_pred))
print(f"Lasso RMSE: {rmse_lasso:.3f}")

# Show the coefficients of the features to understand feature importance
feature_importance_lasso = pd.DataFrame({
    "feature": X_tr.columns,
    "coefficient": np.abs(lasso.coef_)
}).sort_values("coefficient", ascending=False)

# Print the top 20 most important features
print("Top Features (by absolute coefficient value):")
print(feature_importance_lasso.head(20))

#### Feature selection using Random Forest feature importance against several labels

In [None]:
# Build the feature matrix X shared by the feature-selection models:
# add time features, drop label-like / identifier / mostly-missing columns,
# and treat every column outside a short numeric whitelist as categorical.
dfrf = df.copy()

dfrf["minute"] = pd.to_datetime(dfrf["collision_time"], format = "%H:%M:%S").dt.minute
# Day of the year comes from the collision date. collision_time is parsed as a
# bare clock time, so taking day_of_year from it gave the same value for every
# non-missing row.
dfrf["day"] = dfrf["collision_date"].dt.day_of_year


# Outcome columns (they would leak the label), identifiers, raw date/time
# columns and a few other columns excluded by hand.
drop_feats = ["collision_severity", "killed_victims", "injured_victims", "severe_injury_count",
              "other_visible_injury_count", "complaint_of_pain_injury_count", "pedestrian_killed_count", "pedestrian_injured_count",
              "bicyclist_killed_count", "bicyclist_injured_count", "motorcyclist_killed_count", "motorcyclist_injured_count",
              "case_id", "process_date", "hour", "collision_date", "collision_time",
              "city_division_lapd", "caltrans_county", "caltrans_district", "state_route", "postmile"]

# Share of missing values per column, in ascending order.
nan_share = dfrf.isna().mean().sort_values()
dfnan = pd.DataFrame({"predictor": nan_share.index, "p_nan": nan_share.values})

drop_nans = dfnan.query("p_nan > 0.8")["predictor"] # drop features that are more than 80 % nan

# errors="ignore": a mostly-missing column may already have been removed via
# drop_feats, which would otherwise raise a KeyError on the second drop.
X = dfrf.drop(drop_feats, axis = 1).drop(drop_nans, axis = 1, errors = "ignore").convert_dtypes()

# Columns that convert_dtypes did not turn into strings...
numcols = [column for column in X if X[column].dtype != "string[python]"]
# ...are recast to strings (i.e. treated as categorical) unless whitelisted.
badnumcols = [column for column in numcols if column not in ["distance", "party_count", "latitude", "longitude", "year", "minute", "day"]] # only keep these ones as numeric
X[badnumcols] = X[badnumcols].astype("string[python]")

badcats = [column for column in X if X[column].nunique() > 100 and X[column].dtype == "string[python]"]
X = X.drop(badcats, axis = 1) # drop categorical features with more than 100 unique groups

In [None]:
# Count the distinct (non-missing) values of every categorical feature in X.

# Categorical features are the columns stored with the python string dtype.
strcolumns = [name for name in X if X[name].dtype == "string[python]"]

# Parallel lists: column name and its number of distinct values.
columns = list(strcolumns)
uniques = [len(X[name].value_counts()) for name in strcolumns]

opdf = pd.DataFrame({"column": columns, "unique": uniques})

# Same table, largest cardinality first.
uns = opdf.sort_values("unique", ascending = False)

In [None]:
# Random forest regression of the number of injured victims on the one-hot
# encoded features, followed by the 20 largest feature importances.
Xoh = pd.get_dummies(X)
y = dfrf["injured_victims"].fillna(0)

X_tr, X_te, y_tr, y_te = train_test_split(Xoh, y, random_state = 13)

rf = RandomForestRegressor(n_estimators = 10, random_state = 13)

rff = rf.fit(X_tr, y_tr)

rfpred = rff.predict(X_te)

print("RMSE: %.3f" % (np.sqrt(mean_squared_error(y_te, rfpred))))
# Round to the nearest whole number before the exact-match check; a plain
# astype(int) truncates, so e.g. a prediction of 1.9 would be scored as 1.
print("Proportion correct: %.3f "% ((y_te == np.rint(rfpred).astype(int)).mean()))

pd.DataFrame({"feature": rff.feature_names_in_, "importance": rff.feature_importances_}).sort_values("importance", ascending = False).head(20)

In [None]:
# Random forest regression of injuries per party involved; party_count is
# removed from the features because it is the label's denominator.
Xoh2 = pd.get_dummies(X.drop("party_count", axis = 1))
y2 = dfrf["injured_victims"].fillna(0) / dfrf["party_count"].fillna(1) # repeating this time using the injuries per party involved

X2_tr, X2_te, y2_tr, y2_te = train_test_split(Xoh2, y2, random_state = 13)

rf2 = RandomForestRegressor(n_estimators = 10, random_state = 13)

rff2 = rf2.fit(X2_tr, y2_tr)

rfpred2 = rff2.predict(X2_te)

print("RMSE: %.3f" % (np.sqrt(mean_squared_error(y2_te, rfpred2))))
# Round to the nearest whole number before the exact-match check; a plain
# astype(int) truncates, so e.g. a prediction of 0.9 would be scored as 0.
# NOTE(review): the label is a ratio, so only rows whose ratio is a whole
# number can ever count as "correct" here - RMSE is the more meaningful figure.
print("Proportion correct: %.3f "% ((y2_te == np.rint(rfpred2).astype(int)).mean()))

pd.DataFrame({"feature": rff2.feature_names_in_, "importance": rff2.feature_importances_}).sort_values("importance", ascending = False).head(20)

In [None]:
# Random forest classification of collision severity on the one-hot encoded
# features, with a confusion matrix and the 20 largest feature importances.
Xoh3 = pd.get_dummies(X)
y3 = dfrf["collision_severity"] # repeating this time using categorical label

X3_tr, X3_te, y3_tr, y3_te = train_test_split(Xoh3, y3, random_state = 13)

rf3 = RandomForestClassifier(n_estimators = 10, random_state = 13)

rff3 = rf3.fit(X3_tr, y3_tr)

rfpred3 = rff3.predict(X3_te)

print("Proportion correct: %.3f "% ((y3_te == rfpred3).mean()))

# Fix the row/column order to the classifier's own class order so the tick
# labels below are guaranteed to match the matrix.
cf3 = confusion_matrix(y3_te, rfpred3, labels=rff3.classes_)

# display_labels must be the class labels themselves (a boolean is not a valid
# set of tick labels).
cmp3 = ConfusionMatrixDisplay(confusion_matrix=cf3, display_labels=rff3.classes_)

# Let the display create its own figure and resize it afterwards; this avoids
# needing a `plt` name, which this notebook never imports.
cmp3.plot()
fig, ax = cmp3.figure_, cmp3.ax_
fig.set_size_inches(10, 10)

pd.DataFrame({"feature": rff3.feature_names_in_, "importance": rff3.feature_importances_}).sort_values("importance", ascending = False).head(20)