In [None]:
import sqlite3

import altair as alt
import numpy as np
import pandas as pd
import scipy.interpolate as interpolate
from scipy.stats import chi2
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from statsmodels.tsa.statespace.sarimax import SARIMAX

# Remove Altair's cap on the number of rows a chart dataset may contain.
alt.data_transformers.disable_max_rows()

In [None]:
# Load every collision recorded for Los Angeles county from switrs.sqlite.
con = sqlite3.connect("switrs.sqlite")

query = """
    SELECT * FROM collisions WHERE county_location = 'los angeles'
    """

try:
    df = pd.read_sql_query(query, con, parse_dates = ["collision_date"])
finally:
    con.close() # the frame is fully in memory, so release the database handle

df["year"] = df["collision_date"].dt.year
# explicit format, matching how collision_time is parsed in the feature-selection section
df["hour"] = pd.to_datetime(df["collision_time"], format = "%H:%M:%S").dt.hour
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice (avoids SettingWithCopyWarning)
df = df.query("year < 2021").copy() # remove incomplete 2021 data
df["alcohol_involved"] = df["alcohol_involved"].fillna(0) # convert NaN to 0 in alcohol use column

# Narrow view holding only the columns used by the distribution and time-series sections.
dfc = df[["case_id", "county_location", "alcohol_involved", "collision_severity", "injured_victims", "collision_date", "year", "collision_time", "hour", "party_count"]]

#### Distribution of the proportion of crashes involving alcohol by time of day

In [None]:
# We will attempt to use multiple models to fit this distribution. We will use polynomial splines and B-splines but it appears a skewed-normal may be a possible fit as well.

x = dfc.groupby("hour")["hour"].mean()

# Share of each group's collisions that fall in each hour (hourly count / group total).
alcohol_rows = dfc[dfc["alcohol_involved"] == 1]
no_alcohol_rows = dfc[dfc["alcohol_involved"] == 0]
ya = alcohol_rows.groupby("hour")["alcohol_involved"].count() / len(alcohol_rows)
yn = no_alcohol_rows.groupby("hour")["alcohol_involved"].count() / len(no_alcohol_rows)

# Rotate the clock so the axis starts at 8am (hour 8 -> 0, ..., hour 7 -> 23).
# Deriving it from the observed hours keeps it aligned even if an hour is absent.
adj_hour = ((x - 8) % 24).astype(int)
adjlabel = "{0: '8am', 1: '9am', 2: '10am', 3: '11am', 4: '12pm', 5: '1pm', 6: '2pm', 7: '3pm', 8: '4pm', 9: '5pm', 10: '6pm', 11: '7pm', 12: '8pm', 13: '9pm', 14: '10pm', 15: '11pm', 16: '12am', 17: '1am', 18: '2am', 19: '3am', 20: '4am', 21: '5am', 22: '6am', 23: '7am'}[datum.value]"

dfah = pd.DataFrame({"hour": x, "adj_hour": adj_hour, "collisions_a": ya, "collisions_n": yn})

# The plotted values are proportions, so the axis titles say so.
pla = alt.Chart(dfah).mark_bar(width = 20, color = "orange", opacity = 0.75).encode(
    x = alt.X("adj_hour:O", title = "Hour of the day", axis = alt.Axis(labelExpr = adjlabel)),
    y = alt.Y("collisions_a", title = "Proportion of collisions (alcohol involved)").stack(None)
).properties(width = 600, height = 200)

pln = alt.Chart(dfah).mark_bar(width = 20, opacity = 0.75).encode(
    x = alt.X("adj_hour:O", title = "Hour of the day", axis = alt.Axis(labelExpr = adjlabel)),
    y = alt.Y("collisions_n", title = "Proportion of collisions (alcohol not involved)").stack(None)
).properties(width = 600, height = 200)

# Overlay both bar series, each with its own y axis.
(pln + pla).resolve_axis(y = "independent").display()

In [None]:
# Smoothing cubic B-spline through the hourly probability of alcohol involvement.
x = dfc.groupby("hour")["hour"].mean()
y = dfc.groupby("hour")["alcohol_involved"].mean()
# Evaluate the fit at the observed hours so that p and p_fit are compared point for point.
xx = x.to_numpy()

t, c, k = interpolate.splrep(x, y, k = 3, s = 0.0001)
ypred = interpolate.splev(xx, (t, c, k))

# Rotate the clock so the axis starts at 8am (hour 8 -> 0, ..., hour 7 -> 23).
adj_hour = ((x - 8) % 24).astype(int)
dfspl = pd.DataFrame({"hour": x, "adj_hour": adj_hour, "p": y, "p_fit": ypred})

# spl_rmse now really is the root of the mean squared error.
spl_rmse = np.sqrt(mean_squared_error(dfspl["p"], dfspl["p_fit"]))
print("Root mean squared error: %.5f" % spl_rmse)

adjlabel = "{0: '8am', 1: '9am', 2: '10am', 3: '11am', 4: '12pm', 5: '1pm', 6: '2pm', 7: '3pm', 8: '4pm', 9: '5pm', 10: '6pm', 11: '7pm', 12: '8pm', 13: '9pm', 14: '10pm', 15: '11pm', 16: '12am', 17: '1am', 18: '2am', 19: '3am', 20: '4am', 21: '5am', 22: '6am', 23: '7am'}[datum.value]"
adjscale = alt.Scale(domain = [-0.5, 23.5])

# Observed probabilities as bars ...
spl_base = alt.Chart(dfspl).mark_bar(width = 20, color = "orange").encode(
    x = alt.X("adj_hour", axis = alt.Axis(labelExpr = adjlabel)),
    y = alt.Y("p")
).properties(width = 600, height = 200)

# ... with the fitted spline drawn on top.
spl = alt.Chart(dfspl).mark_line(color = "red").encode(
    x = alt.X("adj_hour", title = "Hour of the day", scale = adjscale),
    y = alt.Y("p_fit", title = "Probability of alcohol involvement")
).properties(width = 600, height = 200)

(spl_base + spl).display()

#### Distributions of the number of injuries per collision, alcohol vs. no alcohol

In [None]:
# These appear to follow exponential or Weibull distributions. We will determine which is the best fit and then compare to see if alcohol has an effect on the distribution.

# One row per (alcohol_involved, injured_victims) pair with the number of collisions in it.
injury_distribution = (
    dfc.groupby(['alcohol_involved', 'injured_victims'])
    .size()
    .reset_index(name='accident_count')
)

# Total collisions in each alcohol category, broadcast back onto every row of that category.
total_accidents_by_alcohol = injury_distribution.groupby('alcohol_involved')['accident_count'].transform('sum')

# Within-category share of collisions for each injury count.
injury_distribution['proportion'] = injury_distribution['accident_count'] / total_accidents_by_alcohol

# Side-by-side bar charts, one column per alcohol category.
chart = alt.Chart(injury_distribution).mark_bar().encode(
    x=alt.X('injured_victims:O', title='Number of Injuries'),
    y=alt.Y('proportion:Q', title='Proportion of Accidents'),
    color=alt.Color('alcohol_involved:N', title='Alcohol Involved', scale=alt.Scale(domain=[0, 1], range=['red', 'blue'])),
    column=alt.Column('alcohol_involved:N', header=alt.Header(title='Alcohol Involvement'))
).properties(
    width=250,
    height=300
)

# .display() renders inline, consistent with the charts in the earlier cells.
chart.display()


In [None]:
from scipy.stats import expon, weibull_min

def _fit_with_loglik(distribution, data):
    """Fit a scipy.stats distribution to `data`; return (fitted params, log-likelihood of `data`)."""
    params = distribution.fit(data)
    # logpdf stays finite where np.log(pdf) would return -inf after the density underflows to 0
    return params, np.sum(distribution.logpdf(data, *params))


# Split the frequency table by alcohol involvement.
alcohol_df = injury_distribution[injury_distribution['alcohol_involved'] == 1]
no_alcohol_df = injury_distribution[injury_distribution['alcohol_involved'] == 0]

# NOTE(review): the fits below run on `accident_count`, i.e. the number of
# collisions observed for each injury count, not on the per-collision
# `injured_victims` values. Confirm this is the intended sample.
accident_data_alcohol = alcohol_df['accident_count']
accident_data_no_alcohol = no_alcohol_df['accident_count']

# Exponential and Weibull fits for each group, with their log-likelihoods.
exponential_params_alcohol, expon_loglik_alcohol = _fit_with_loglik(expon, accident_data_alcohol)
weibull_params_alcohol, weibull_loglik_alcohol = _fit_with_loglik(weibull_min, accident_data_alcohol)
exponential_params_no_alcohol, expon_loglik_no_alcohol = _fit_with_loglik(expon, accident_data_no_alcohol)
weibull_params_no_alcohol, weibull_loglik_no_alcohol = _fit_with_loglik(weibull_min, accident_data_no_alcohol)

# Fitted parameters
print(f"Exponential Distribution Parameters (Alcohol Involved): {exponential_params_alcohol}")
print(f"Weibull Distribution Parameters (Alcohol Involved): {weibull_params_alcohol}")
print(f"Exponential Distribution Parameters (No Alcohol): {exponential_params_no_alcohol}")
print(f"Weibull Distribution Parameters (No Alcohol): {weibull_params_no_alcohol}")

# Log-likelihoods (higher means a better fit to the same data)
print(f"Log-Likelihood for Exponential (Alcohol Involved): {expon_loglik_alcohol}")
print(f"Log-Likelihood for Weibull (Alcohol Involved): {weibull_loglik_alcohol}")
print(f"Log-Likelihood for Exponential (No Alcohol): {expon_loglik_no_alcohol}")
print(f"Log-Likelihood for Weibull (No Alcohol): {weibull_loglik_no_alcohol}")



In [None]:
# Evaluate both fitted Weibull densities on a grid spanning the observed injury counts.
x_values = np.linspace(0, injury_distribution['injured_victims'].max(), 100)

weibull_pdf_alcohol = weibull_min.pdf(x_values, *weibull_params_alcohol)
weibull_pdf_no_alcohol = weibull_min.pdf(x_values, *weibull_params_no_alcohol)

weibull_df = pd.DataFrame({
    'injured_victims': x_values,
    'weibull_pdf_alcohol': weibull_pdf_alcohol,
    'weibull_pdf_no_alcohol': weibull_pdf_no_alcohol,
})

# Blue = alcohol involved, red = no alcohol (same colours as the bar chart of proportions).
# Both layers carry the same axis titles so the layered chart shows a single labelled axis.
line_chart_alcohol = alt.Chart(weibull_df).mark_line(color='blue').encode(
    x=alt.X('injured_victims:Q', title='Number of Injuries'),
    y=alt.Y('weibull_pdf_alcohol:Q', title='Weibull density')
)

line_chart_no_alcohol = alt.Chart(weibull_df).mark_line(color='red').encode(
    x=alt.X('injured_victims:Q', title='Number of Injuries'),
    y=alt.Y('weibull_pdf_no_alcohol:Q', title='Weibull density')
)

final_chart = line_chart_alcohol + line_chart_no_alcohol

# .display() renders inline, consistent with the charts in the earlier cells.
final_chart.display()

In [None]:
# Likelihood-ratio test: one shared Weibull (null) versus separate Weibulls for
# alcohol and non-alcohol collisions (alternative).

# Null model: pool both samples and fit a single Weibull.
combined_data = np.concatenate([accident_data_alcohol, accident_data_no_alcohol])
weibull_params_combined = weibull_min.fit(combined_data)

# Log-likelihood of the pooled data under the shared parameters
log_likelihood_combined = np.sum(weibull_min.logpdf(combined_data, *weibull_params_combined))

# Alternative model: sum of the two separately fitted log-likelihoods
log_likelihood_full = weibull_loglik_no_alcohol + weibull_loglik_alcohol

# Likelihood Ratio statistic
lrt_statistic = -2 * (log_likelihood_combined - log_likelihood_full)

# Degrees of freedom = extra parameters in the alternative model. Each fit
# estimates every entry of its parameter tuple (shape, loc, scale), so two
# fits minus one fit leaves len(params) extra parameters.
# Kept out of `df`, which holds the collisions DataFrame used later on.
lrt_dof = len(weibull_params_combined)

# Survival function is more accurate than 1 - cdf for very small p-values.
p_value = chi2.sf(lrt_statistic, lrt_dof)

print(f"LRT Statistic: {lrt_statistic}")
print(f"P-value: {p_value}")


#### Timeline for number of collisions

In [None]:
# Train on data through 2019, forecast 2020, and compare the forecast with the actual 2020 counts. This will be done using a time series model.

In [None]:
# Label every collision with its calendar month (as a monthly Period), then
# count the collisions falling in each month.
dfm = dfc.assign(year_month = dfc["collision_date"].dt.to_period('M'))
all_monthly_accidents = dfm.groupby('year_month').size()
all_monthly_accidents.info()

In [None]:
#sarima model
def fit_sarima_and_forecast(time_series_data, order=(1,1,1), seasonal_order=(1,1,1,12), steps=12):
    """Fit a SARIMAX model to a series and forecast beyond its end.

    Parameters
    ----------
    time_series_data : the observations to train on, passed straight to SARIMAX.
    order : non-seasonal (p, d, q) order handed to SARIMAX.
    seasonal_order : seasonal (P, D, Q, s) order handed to SARIMAX.
    steps : number of periods to forecast; defaults to 12, the horizon the
        notebook uses for the 2020 forecasts.

    Returns
    -------
    Whatever the fitted model's `forecast(steps=steps)` returns.
    """
    model = SARIMAX(time_series_data, order=order, seasonal_order=seasonal_order)
    model_fit = model.fit(disp=False)  # disp=False silences the optimizer's console output
    return model_fit.forecast(steps=steps)


In [None]:
#fit model for all accidents to forecast 2020
forecast_2020_monthly = fit_sarima_and_forecast(all_monthly_accidents[:'2019-12'])
actual_2020_monthly = all_monthly_accidents['2020-01':'2020-12']

#full monthly series, labelled 'Historical'
# Stored as df_historical rather than `df`, so the collisions DataFrame that
# the feature-selection section copies from `df` is not overwritten.
df_historical = pd.DataFrame({
    'Date': all_monthly_accidents.index,
    'Accidents': all_monthly_accidents.values,
    'Type': 'Historical'
})

#forecast data for 2020
df_forecast = pd.DataFrame({
    'Date': forecast_2020_monthly.index,
    'Accidents': forecast_2020_monthly.values,
    'Type': 'Forecasted'
})

#actual data for 2020
df_actual = pd.DataFrame({
    'Date': actual_2020_monthly.index,
    'Accidents': actual_2020_monthly.values,
    'Type': 'Actual'
})

#combine the dataframes
df_combined = pd.concat([df_historical, df_forecast, df_actual])

#convert the monthly Period values in 'Date' to timestamps for plotting
df_combined['Date'] = df_combined['Date'].dt.to_timestamp()
df_actual['Date'] = df_actual['Date'].dt.to_timestamp()

In [None]:
#chart forecasted vs actual vs historical for all accidents
chart = alt.Chart(df_combined).mark_line().encode(
    x='Date:T',
    y='Accidents:Q',
    color='Type:N',
    tooltip=['Date:T', 'Accidents:Q', 'Type:N']
).properties(
    title="Accident Forecast vs Actual (2020) - Monthly Data",
    width=800
)

#scatter points for actual 2020 accidents
scatter = alt.Chart(df_actual).mark_point(filled=True, size=10).encode(
    x='Date:T',
    y='Accidents:Q',
    color=alt.value('blue'),
    tooltip=['Date:T', 'Accidents:Q']
)

# configure_view returns a new chart instead of modifying the one it is called
# on, so its result has to be kept for strokeWidth=0 to take effect.
final_chart = (chart + scatter).configure_view(
    strokeWidth=0
)

# .display() renders inline, consistent with the charts in the earlier cells.
final_chart.display()

#### Timeline for number of collisions (alcohol vs no alcohol)

In [None]:
# Train on data through 2019 and forecast 2020 as before, but fit alcohol-involved and non-alcohol collisions separately and compare each forecast with its actual 2020 counts. This will also be done using a time series model.

In [None]:
# Label every collision with its calendar month, then build one monthly count
# series for alcohol-involved collisions and one for the rest.
dfa = dfc.assign(year_month = dfc["collision_date"].dt.to_period('M'))

alcohol_rows = dfa[dfa['alcohol_involved'] == 1]
no_alcohol_rows = dfa[dfa['alcohol_involved'] == 0]

monthly_alcohol_accidents = alcohol_rows.groupby('year_month').size()
monthly_no_alcohol_accidents = no_alcohol_rows.groupby('year_month').size()

In [None]:
# Print the summary of both monthly series, alcohol-involved first.
for monthly_series in (monthly_alcohol_accidents, monthly_no_alcohol_accidents):
    monthly_series.info()

In [None]:
# Alcohol-related collisions: train on everything through December 2019,
# forecast 2020, and keep the observed 2020 months for comparison.
training_alcohol = monthly_alcohol_accidents[:'2019-12']
forecast_2020_monthly_alcohol = fit_sarima_and_forecast(training_alcohol)
actual_2020_monthly_alcohol = monthly_alcohol_accidents['2020-01':'2020-12']

# Same procedure for collisions without alcohol involvement.
training_no_alcohol = monthly_no_alcohol_accidents[:'2019-12']
forecast_2020_monthly_no_alcohol = fit_sarima_and_forecast(training_no_alcohol)
actual_2020_monthly_no_alcohol = monthly_no_alcohol_accidents['2020-01':'2020-12']

In [None]:
def _labelled_frame(series, label):
    """Turn a monthly series into a Date / Accidents / Type frame, with `label` repeated in Type."""
    return pd.DataFrame({
        'Date': series.index,
        'Accidents': series.values,
        'Type': label
    })


# Full monthly alcohol series, the 2020 forecast, and the observed 2020 months.
df_alcohol = _labelled_frame(monthly_alcohol_accidents, 'Historical')
df_forecast_alcohol = _labelled_frame(forecast_2020_monthly_alcohol, 'Forecasted')
df_actual_alcohol = _labelled_frame(actual_2020_monthly_alcohol, 'Actual')

# Stack the three pieces into one long frame for plotting.
df_combined_alcohol = pd.concat([df_alcohol, df_forecast_alcohol, df_actual_alcohol])

# Convert the monthly Period values in 'Date' to timestamps.
df_combined_alcohol['Date'] = df_combined_alcohol['Date'].dt.to_timestamp()
df_actual_alcohol['Date'] = df_actual_alcohol['Date'].dt.to_timestamp()

In [None]:
# Line per series type (Historical / Forecasted / Actual) for alcohol-related collisions.
alcohol_lines = alt.Chart(df_combined_alcohol).mark_line()
chart_alcohol = alcohol_lines.encode(
    x='Date:T',
    y='Accidents:Q',
    color='Type:N',
    tooltip=['Date:T', 'Accidents:Q', 'Type:N']
).properties(
    title="Alcohol-related Accidents (2020): Forecasted vs. Actual",
    width=800
)

# Small filled blue points marking the observed 2020 months.
alcohol_points = alt.Chart(df_actual_alcohol).mark_point(filled=True, size=10)
chart_actual_alcohol = alcohol_points.encode(
    x='Date:T',
    y='Accidents:Q',
    color=alt.value('blue'),
    tooltip=['Date:T', 'Accidents:Q']
)

# The layered chart is the cell's output.
chart_alcohol + chart_actual_alcohol

In [None]:
#df for historical non-alcohol accidents
df_no_alcohol = pd.DataFrame({
    'Date': monthly_no_alcohol_accidents.index,
    'Accidents': monthly_no_alcohol_accidents.values,
    'Type': ['Historical'] * len(monthly_no_alcohol_accidents)
})

#forecasted 2020 data for non-alcohol accidents
df_forecast_no_alcohol = pd.DataFrame({
    'Date': forecast_2020_monthly_no_alcohol.index,
    'Accidents': forecast_2020_monthly_no_alcohol.values,
    'Type': ['Forecasted'] * len(forecast_2020_monthly_no_alcohol)
})

#actual 2020 data for non-alcohol accidents
df_actual_no_alcohol = pd.DataFrame({
    'Date': actual_2020_monthly_no_alcohol.index,
    'Accidents': actual_2020_monthly_no_alcohol.values,
    'Type': ['Actual'] * len(actual_2020_monthly_no_alcohol)
})

#combine df for all no-alcohol data
df_combined_no_alcohol = pd.concat([df_no_alcohol, df_forecast_no_alcohol, df_actual_no_alcohol])

#convert the monthly Period values in 'Date' to timestamps for plotting
df_combined_no_alcohol['Date'] = df_combined_no_alcohol['Date'].dt.to_timestamp()
df_actual_no_alcohol['Date'] = df_actual_no_alcohol['Date'].dt.to_timestamp()

#chart for non-alcohol data, mirroring the alcohol chart so the two can be compared
chart_no_alcohol = alt.Chart(df_combined_no_alcohol).mark_line().encode(
    x='Date:T',
    y='Accidents:Q',
    color='Type:N',
    tooltip=['Date:T', 'Accidents:Q', 'Type:N']
).properties(
    title="Non-alcohol Accidents (2020): Forecasted vs. Actual",
    width=800
)

#scatter for actual 2020 data
chart_actual_no_alcohol = alt.Chart(df_actual_no_alcohol).mark_point(filled=True, size=10).encode(
    x='Date:T',
    y='Accidents:Q',
    color=alt.value('blue'),
    tooltip=['Date:T', 'Accidents:Q']
)

chart_no_alcohol + chart_actual_no_alcohol

#### Feature selection using Random Forest feature importance against several labels

In [None]:
# Build the feature matrix X for the feature-selection models.
# NOTE(review): relies on `df` still being the collisions frame loaded at the
# top of the notebook - confirm no earlier cell has rebound that name.
dfrf = df.copy()

collision_clock = pd.to_datetime(dfrf["collision_time"], format = "%H:%M:%S")
dfrf["minute"] = collision_clock.dt.minute
# collision_time carries only a clock reading, so the day of the year has to
# come from collision_date.
dfrf["day"] = dfrf["collision_date"].dt.day_of_year

# Outcome columns (they would leak the label), identifiers, raw date/time
# columns and a few location columns that are excluded from the predictors.
drop_feats = ["collision_severity", "killed_victims", "injured_victims", "severe_injury_count",
              "other_visible_injury_count", "complaint_of_pain_injury_count", "pedestrian_killed_count", "pedestrian_injured_count",
              "bicyclist_killed_count", "bicyclist_injured_count", "motorcyclist_killed_count", "motorcyclist_injured_count",
              "case_id", "process_date", "hour", "collision_date", "collision_time",
              "city_division_lapd", "caltrans_county", "caltrans_district", "state_route", "postmile"]

# Fraction of missing values per column, ascending.
nan_fraction = dfrf.isna().mean().sort_values()
dfnan = pd.DataFrame({"predictor": nan_fraction.index, "p_nan": nan_fraction.values})

drop_nans = dfnan.query("p_nan > 0.8")["predictor"] # drop features that are more than 80 % nan

# errors="ignore": a mostly-NaN column may already have been removed through drop_feats.
X = dfrf.drop(columns = drop_feats).drop(columns = drop_nans, errors = "ignore").convert_dtypes()

# Treat every non-string column as categorical unless it is one of the true numerics below.
numcols = [column for column in X if X[column].dtype != "string[python]"]
badnumcols = [column for column in numcols if column not in ["distance", "party_count", "latitude", "longitude", "year", "minute", "day"]] # only keep these ones as numeric
X[badnumcols] = X[badnumcols].astype("string[python]")

badcats = [column for column in X if X[column].nunique() > 100 and X[column].dtype == "string[python]"]
X = X.drop(badcats, axis = 1) # drop categorical features with more than 100 unique groups

In [None]:
# Count the distinct (non-missing) groups in every string-typed feature of X.

strcolumns = [column for column in X if X[column].dtype == "string[python]"]

columns = list(strcolumns)
uniques = [len(X[column].value_counts()) for column in strcolumns]

opdf = pd.DataFrame({"column": columns, "unique": uniques})

# Features with the most groups first.
uns = opdf.sort_values("unique", ascending = False)

In [None]:
# Random forest regression of the number of injured victims on the one-hot encoded features.
Xoh = pd.get_dummies(X)
y = dfrf["injured_victims"].fillna(0)

X_tr, X_te, y_tr, y_te = train_test_split(Xoh, y, random_state = 13)

rf = RandomForestRegressor(n_estimators = 10, random_state = 13)

rff = rf.fit(X_tr, y_tr)

rfpred = rff.predict(X_te)

print("RMSE: %.3f" % (np.sqrt(mean_squared_error(y_te, rfpred))))
# Round to the nearest count before the exact-match comparison; astype(int)
# truncates, so a prediction of 1.9 would be scored as 1.
print("Proportion correct: %.3f "% ((y_te == np.rint(rfpred)).mean()))

# Twenty most important features according to the fitted forest.
pd.DataFrame({"feature": rff.feature_names_in_, "importance": rff.feature_importances_}).sort_values("importance", ascending = False).head(20)

In [None]:
# Repeat the regression with injuries per involved party as the label;
# party_count is removed from the predictors because it defines the label.
features_without_party = X.drop(columns = "party_count")
Xoh2 = pd.get_dummies(features_without_party)

injured = dfrf["injured_victims"].fillna(0)
parties = dfrf["party_count"].fillna(1)
# NOTE(review): a party_count of 0 would make this ratio infinite - confirm none occur.
y2 = injured / parties # repeating this time using the injuries per party involved

X2_tr, X2_te, y2_tr, y2_te = train_test_split(Xoh2, y2, random_state = 13)

rf2 = RandomForestRegressor(n_estimators = 10, random_state = 13)
rff2 = rf2.fit(X2_tr, y2_tr)
rfpred2 = rff2.predict(X2_te)

rmse2 = np.sqrt(mean_squared_error(y2_te, rfpred2))
exact_match2 = (y2_te == rfpred2.astype(int)).mean()

print("RMSE: %.3f" % (rmse2))
print("Proportion correct: %.3f "% (exact_match2))

# Twenty most important features according to the fitted forest.
importance2 = pd.DataFrame({"feature": rff2.feature_names_in_, "importance": rff2.feature_importances_})
importance2.sort_values("importance", ascending = False).head(20)

In [None]:
# Random forest classification of collision severity on the one-hot encoded features.
Xoh3 = pd.get_dummies(X)
y3 = dfrf["collision_severity"] # repeating this time using categorical label

X3_tr, X3_te, y3_tr, y3_te = train_test_split(Xoh3, y3, random_state = 13)

rf3 = RandomForestClassifier(n_estimators = 10, random_state = 13)

rff3 = rf3.fit(X3_tr, y3_tr)

rfpred3 = rff3.predict(X3_te)

print("Proportion correct: %.3f "% ((y3_te == rfpred3).mean()))

# Sorted union of the true and predicted classes. Passing the same list to
# confusion_matrix and to the display keeps the matrix rows/columns and the
# tick labels in the same order (display_labels must be a sequence of labels).
class_labels = np.union1d(y3_te, rfpred3)

cf3 = confusion_matrix(y3_te, rfpred3, labels = class_labels)

cmp3 = ConfusionMatrixDisplay(confusion_matrix=cf3, display_labels=class_labels)

# plot() creates its own figure and axes and stores them on the display, so
# the figure can be resized without importing matplotlib here.
cmp3.plot()
fig, ax = cmp3.figure_, cmp3.ax_
fig.set_size_inches(10, 10)

# Twenty most important features according to the fitted forest.
pd.DataFrame({"feature": rff3.feature_names_in_, "importance": rff3.feature_importances_}).sort_values("importance", ascending = False).head(20)

#### Feature selection using LASSO on fitted GLM against several labels

In [None]:
# One-hot encode categorical variables
Xoh = pd.get_dummies(X)

# Define the target variable
y = dfrf["injured_victims"].fillna(0)

# Split first, so the imputation statistics below come from the training rows only
X_tr, X_te, y_tr, y_te = train_test_split(Xoh, y, random_state=13)

# Impute missing values with the training-set mean of each column; the same
# fitted means are then applied to the test rows (no test data leaks into the fit)
imputer = SimpleImputer(strategy='mean')
X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=Xoh.columns, index=X_tr.index)
X_te = pd.DataFrame(imputer.transform(X_te), columns=Xoh.columns, index=X_te.index)

# Initialize the Lasso model
# NOTE(review): the features are not standardized, so coefficient magnitudes
# depend on each feature's scale - confirm that is acceptable for ranking.
lasso = Lasso(alpha=0.1, random_state=13)

# Fit the Lasso model
lasso.fit(X_tr, y_tr)

# Predict on the test set
lasso_pred = lasso.predict(X_te)

# Calculate and print RMSE
rmse_lasso = np.sqrt(mean_squared_error(y_te, lasso_pred))
print(f"Lasso RMSE: {rmse_lasso:.3f}")

# Rank features by the absolute value of their Lasso coefficient
feature_importance_lasso = pd.DataFrame({
    "feature": X_tr.columns,
    "coefficient": np.abs(lasso.coef_)
}).sort_values("coefficient", ascending=False)

# Print the top 20 most important features
print("Top Features (by absolute coefficient value):")
print(feature_importance_lasso.head(20))

In [None]:
# Repeat the Lasso fit with injuries per involved party as the label;
# party_count is removed from the predictors because it defines the label.
Xoh2 = pd.get_dummies(X.drop("party_count", axis = 1))
y2 = dfrf["injured_victims"].fillna(0) / dfrf["party_count"].fillna(1)

# Split first, so the imputation statistics below come from the training rows only
X_tr, X_te, y_tr, y_te = train_test_split(Xoh2, y2, random_state=13)

# Fresh mean imputer fitted on the training rows, then applied unchanged to
# the test rows (no test data leaks into the fit)
imputer = SimpleImputer(strategy='mean')
X_tr = pd.DataFrame(imputer.fit_transform(X_tr), columns=Xoh2.columns, index=X_tr.index)
X_te = pd.DataFrame(imputer.transform(X_te), columns=Xoh2.columns, index=X_te.index)

# Initialize the Lasso model
lasso = Lasso(alpha=0.1, random_state=13)

# Fit the Lasso model
lasso.fit(X_tr, y_tr)

# Predict on the test set
lasso_pred = lasso.predict(X_te)

# Calculate and print RMSE
rmse_lasso = np.sqrt(mean_squared_error(y_te, lasso_pred))
print(f"Lasso RMSE: {rmse_lasso:.3f}")

# Rank features by the absolute value of their Lasso coefficient
feature_importance_lasso = pd.DataFrame({
    "feature": X_tr.columns,
    "coefficient": np.abs(lasso.coef_)
}).sort_values("coefficient", ascending=False)

# Print the top 20 most important features
print("Top Features (by absolute coefficient value):")
print(feature_importance_lasso.head(20))