In [1]:
import pandas as pd
import numpy as np
import sqlite3
import altair as alt
import scipy.interpolate as interpolate
from scipy.stats import skewnorm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

# Turn off Altair's maximum-row check so charts can be built from data frames of any size.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Load every Los Angeles County collision from the local SWITRS SQLite database.
# NOTE(review): the connection is left open on purpose in case later cells reuse `con`.
con = sqlite3.connect("switrs.sqlite")

query = """
    SELECT * FROM collisions WHERE county_location = 'los angeles'
    """

df = pd.read_sql_query(query, con, parse_dates = ["collision_date"])
df["year"] = df["collision_date"].dt.year
# An explicit format avoids pandas' per-element format inference (and the warning it emits);
# it is the same format the feature-engineering cell uses for this column.
df["hour"] = pd.to_datetime(df["collision_time"], format = "%H:%M:%S").dt.hour
# .copy() makes the filtered frame independent of the unfiltered one, so the column
# assignment below cannot trigger a SettingWithCopyWarning.
df = df.query("year < 2021").copy() # remove incomplete 2021 data
df["alcohol_involved"] = df["alcohol_involved"].fillna(0) # convert NaN to 0 in alcohol use column

# Compact frame holding only the columns used by the distribution / timeline analyses.
dfc = df[["case_id", "county_location", "alcohol_involved", "collision_severity", "injured_victims", "collision_date", "year", "collision_time", "hour", "party_count"]].copy()

  df["hour"] = pd.to_datetime(df["collision_time"]).dt.hour


#### Distribution of the proportion of crashes involving alcohol by time of day

In [244]:
# We will try several models to fit this distribution: polynomial splines and B-splines, and also a skew-normal distribution, which appears to be a plausible fit.



#### Distributions of the number of injuries per collision, alcohol vs. no alcohol

In [None]:
# These appear to follow exponential or Weibull distributions. We will determine which is the best fit and then compare to see if alcohol has an effect on the distribution.

#### Timeline for number of collisions

In [None]:
# Train on data up to 2019, predict 2020, and compare the prediction to the actual counts. This will be done using a time series model.

#### Timeline for number of collisions (alcohol vs no alcohol)

In [None]:
# Train on data up to 2019 and predict 2020, but separately for collisions with and without alcohol, comparing each prediction to the actual counts. This will also be done using a time series model.

#### Feature selection using LASSO on fitted GLM against several labels

#### Feature selection using Random Forest feature importance against several labels

In [None]:
# Build the predictor matrix X for the random-forest models from a copy of the collision data.
dfrf = df.copy()

# Extra time features: minute within the hour (from the clock time) and day of the year.
dfrf["minute"] = pd.to_datetime(dfrf["collision_time"], format = "%H:%M:%S").dt.minute
# The day of year has to come from the collision date: a time-only string parses to
# 1900-01-01, which would make this feature the constant 1 for every row.
dfrf["day"] = dfrf["collision_date"].dt.day_of_year

# Columns excluded from the predictors: the severity / injury / fatality counts (two of them
# are used as labels in the modelling cells), the case identifier, raw date and time fields,
# and several location-coding columns.
drop_feats = ["collision_severity", "killed_victims", "injured_victims", "severe_injury_count",
              "other_visible_injury_count", "complaint_of_pain_injury_count", "pedestrian_killed_count", "pedestrian_injured_count",
              "bicyclist_killed_count", "bicyclist_injured_count", "motorcyclist_killed_count", "motorcyclist_injured_count",
              "case_id", "process_date", "hour", "collision_date", "collision_time",
              "city_division_lapd", "caltrans_county", "caltrans_district", "state_route", "postmile"]

# Share of missing values per column, sorted ascending (computed once).
p_nan_by_col = dfrf.isna().mean().sort_values()
dfnan = pd.DataFrame({"predictor": p_nan_by_col.index, "p_nan": p_nan_by_col.values})

drop_nans = dfnan.query("p_nan > 0.8")["predictor"] # drop features that are more than 80 % nan

# errors="ignore" on the second drop: a mostly-NaN column may already have been removed
# through drop_feats, which would otherwise raise a KeyError.
X = dfrf.drop(columns = drop_feats).drop(columns = drop_nans, errors = "ignore").convert_dtypes()

# Every non-string column that is not one of the genuinely numeric features below is a
# coded category, so it is re-typed as a string and later one-hot encoded.
keep_numeric = ["distance", "party_count", "latitude", "longitude", "year", "minute", "day"] # only keep these ones as numeric
numcols = [column for column in X if X[column].dtype != "string[python]"]
badnumcols = [column for column in numcols if column not in keep_numeric]
X[badnumcols] = X[badnumcols].astype("string[python]")

badcats = [column for column in X if X[column].nunique() > 100 and X[column].dtype == "string[python]"]
X = X.drop(badcats, axis = 1) # drop categorical features with more than 100 unique groups

In [None]:
# Tabulate how many distinct (non-missing) values each categorical feature of X has.

# Categorical features are the string-typed columns of X.
strcolumns = [name for name in X if X[name].dtype == "string[python]"]

columns = list(strcolumns)
uniques = [X[name].nunique() for name in columns]

opdf = pd.DataFrame({"column": columns, "unique": uniques})

# Features with more than 100 groups, and the full table ordered from most to fewest groups.
badcats = opdf.query("unique > 100")["column"].values
uns = opdf.sort_values("unique", ascending = False)

In [None]:
# Random-forest regression of the number of injured victims on the one-hot encoded features,
# followed by the 20 most important features.
Xoh = pd.get_dummies(X)
y = dfrf["injured_victims"].fillna(0) # a missing injury count is treated as zero injuries

X_tr, X_te, y_tr, y_te = train_test_split(Xoh, y, random_state = 13)

rf = RandomForestRegressor(n_estimators = 10, random_state = 13)

rff = rf.fit(X_tr, y_tr)

rfpred = rff.predict(X_te)

print("RMSE: %.3f" % (np.sqrt(mean_squared_error(y_te, rfpred))))
# Round to the nearest integer before comparing with the integer labels; a plain
# astype(int) truncates toward zero (1.9 -> 1) and biases the predictions downward.
print("Proportion correct: %.3f "% ((y_te == np.rint(rfpred).astype(int)).mean()))

pd.DataFrame({"feature": rff.feature_names_in_, "importance": rff.feature_importances_}).sort_values("importance", ascending = False).head(20)

In [None]:
# Same random-forest regression, but the label is the number of injuries per party involved,
# so party_count is removed from the predictors.
Xoh2 = pd.get_dummies(X.drop("party_count", axis = 1))
# NOTE(review): a party_count of 0 would give an infinite label here - confirm none exist.
y2 = dfrf["injured_victims"].fillna(0) / dfrf["party_count"].fillna(1) # repeating this time using the injuries per party involved

X2_tr, X2_te, y2_tr, y2_te = train_test_split(Xoh2, y2, random_state = 13)

rf2 = RandomForestRegressor(n_estimators = 10, random_state = 13)

rff2 = rf2.fit(X2_tr, y2_tr)

rfpred2 = rff2.predict(X2_te)

print("RMSE: %.3f" % (np.sqrt(mean_squared_error(y2_te, rfpred2))))
# Round to the nearest integer instead of truncating toward zero with astype(int).
# Only whole-number ratios can match, so this figure is a rough indicator at best.
print("Proportion correct: %.3f "% ((y2_te == np.rint(rfpred2).astype(int)).mean()))

pd.DataFrame({"feature": rff2.feature_names_in_, "importance": rff2.feature_importances_}).sort_values("importance", ascending = False).head(20)

In [None]:
# Random-forest classification of collision severity on the one-hot encoded features,
# with a confusion matrix and the 20 most important features.
Xoh3 = pd.get_dummies(X)
y3 = dfrf["collision_severity"] # repeating this time using categorical label

X3_tr, X3_te, y3_tr, y3_te = train_test_split(Xoh3, y3, random_state = 13)

rf3 = RandomForestClassifier(n_estimators = 10, random_state = 13)

rff3 = rf3.fit(X3_tr, y3_tr)

rfpred3 = rff3.predict(X3_te)

print("Proportion correct: %.3f "% ((y3_te == rfpred3).mean()))

# Pin the row/column order to the classifier's class order so that the tick labels
# passed to the display line up with the matrix.
cf3 = confusion_matrix(y3_te, rfpred3, labels = rff3.classes_)

# display_labels must be the sequence of class labels, not a boolean.
cmp3 = ConfusionMatrixDisplay(confusion_matrix=cf3, display_labels=rff3.classes_)

# plot() creates its own figure and axes, so no pyplot import is needed in this notebook;
# the figure is resized afterwards.
cmp3.plot()
fig, ax = cmp3.figure_, cmp3.ax_
fig.set_size_inches(10, 10)

pd.DataFrame({"feature": rff3.feature_names_in_, "importance": rff3.feature_importances_}).sort_values("importance", ascending = False).head(20)