In [2]:
import pandas as pd
import numpy as np
import sqlite3
import altair as alt
import matplotlib.pyplot as plt

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
con = sqlite3.connect("switrs.sqlite")

query = """
    SELECT * FROM collisions WHERE county_location = 'los angeles'
    """

df = pd.read_sql_query(query, con, parse_dates = ["collision_date"])
df["year"] = df["collision_date"].dt.year
df["hour"] = pd.to_datetime(df["collision_time"]).dt.hour
df = df.query("year < 2021") # remove incomplete 2021 data
df["alcohol_involved"] = df["alcohol_involved"].fillna(0) # convert NaN to 0 in alcohol use column

dfc = df[["case_id", "county_location", "alcohol_involved", "collision_severity", "injured_victims", "collision_date", "year", "collision_time", "hour", "party_count"]]

  df["hour"] = pd.to_datetime(df["collision_time"]).dt.hour


#### Distribution of the proportion of crashes involving alcohol by time of day

In [25]:
# We will attempt to use multiple models to fit this distribution. We will use polynomial splines and B-splines but it appears a skewed-normal may be a possible fit as well.

dftod = pd.DataFrame()

dftod["hour"] = range(0, 24)

dftod["alc"] = dfc.query("alcohol_involved == 1").groupby("hour")["case_id"].count()
dftod["nalc"] = dfc.query("alcohol_involved == 0").groupby("hour")["case_id"].count()
dftod["p"] = dftod["alc"] / dftod["nalc"]

dftod["adj_hour"] = [16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]
dftod["adj_label"] = ["8am", "9am", "10am", "11am", "12pm", "1pm", "2pm", "3pm", "4pm", "5pm", "6pm", "7pm", "8pm", "9pm", "10pm", "11pm", "12am", "1am", "2am", "3am", "4am", "5am", "6am", "7am"]
adj_labels = dict(zip(dftod["adj_hour"], dftod["adj_label"]))
print(adj_labels)

palc_base = alt.Chart(dftod).mark_bar(width = 20).encode(
    x = alt.X("adj_hour", title = "Hour of day (adjusted, 0 = 8:00 am)", scale = alt.Scale(domain = [-0.5, 23.5]),
               axis = alt.Axis(labelExpr = "{0: '8am', 1: '9am', 2: '10am', 3: '11am', 4: '12pm', 5: '1pm', 6: '2pm', 7: '3pm', 8: '4pm', 9: '5pm', 10: '6pm', 11: '7pm', 12: '8pm', 13: '9pm', 14: '10pm', 15: '11pm', 16: '12am', 17: '1am', 18: '2am', 19: '3am', 20: '4am', 21: '5am', 22: '6am', 23: '7am'}[datum.value]")),
    y = alt.Y("p", title = "Proportion of accidents with alcohol involved")
).properties(width = 600, height = 200)

palc_base.display()

{16: '8am', 17: '9am', 18: '10am', 19: '11am', 20: '12pm', 21: '1pm', 22: '2pm', 23: '3pm', 0: '4pm', 1: '5pm', 2: '6pm', 3: '7pm', 4: '8pm', 5: '9pm', 6: '10pm', 7: '11pm', 8: '12am', 9: '1am', 10: '2am', 11: '3am', 12: '4am', 13: '5am', 14: '6am', 15: '7am'}


#### Distributions of the number of injuries per collision, alcohol vs. no alcohol

In [None]:
# These appear to follow exponential or Weibull distributions. We will determine which is the best fit and then compare to see if alcohol has an effect on the distribution.

#### Timeline for number of collisions

In [None]:
# train on up to 2019, predict 2020, compare to actual. This will be done using a time series model.

#### Timeline for number of collisions (alcohol vs no alcohol)

In [None]:
# trained on up to 2019, predicting 2020, but separating by alcohol vs. no alcohol and comparing to actuals. This will also be completed using a time series model

#### Feature selection using LASSO on fitted GLM against several labels

#### Feature selection using Random Forest feature importance against several labels