## Simple Model

### let's do SLR for simple modeling

In [1]:
import pandas as pd, numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import matplotlib.pyplot as plt
import scipy.stats as stats
from pathlib import Path

pd.set_option('display.float_format', lambda x: f'{x:,.4f}')

In [2]:
# load data
path ='/Users/Marcy_Student/Desktop/Marcy-Modules/Mod6/final-project/data/filtered_collisions.csv'
df = pd.read_csv(path)
df = df.drop(columns='Unnamed: 0')
df

Unnamed: 0,crash_date,number_of_persons_injured,month,season,is_electric
0,2022-01-01,0,1,Winter,0
1,2022-01-01,0,1,Winter,0
2,2022-01-01,0,1,Winter,0
3,2022-01-01,1,1,Winter,0
4,2022-01-01,0,1,Winter,0
...,...,...,...,...,...
369996,2025-12-02,1,12,Winter,0
369997,2025-12-02,0,12,Winter,0
369998,2025-12-02,1,12,Winter,0
369999,2025-12-02,1,12,Winter,0


In [3]:
# Removes rows with only 1 injured person. ONLY USE IF STRATIFY IS NEEDED FOR BETTER MODEL
#df_filtered = df[(df['number_of_persons_injured'] < 20) & (df['number_of_persons_injured']!=18)]
#df_filtered

In [4]:
# getting our dummies before train-test split
df_enc = pd.get_dummies(df, columns=['season'], drop_first=True)
# baseline of Fall

In [None]:
# feature engineering our interaction term
season_cols = ['season_Spring', 'season_Summer', 'season_Winter']
for x in season_cols:
    df_enc[f'interaction_is_electric_{x}'] = df_enc['is_electric'] * df_enc[x]

df_enc

Unnamed: 0,crash_date,number_of_persons_injured,month,is_electric,season_Spring,season_Summer,season_Winter,interaction_is_electric_season_Spring,interaction_is_electric_season_Summer,interaction_is_electric_season_Winter
0,2022-01-01,0,1,0,False,False,True,0,0,0
1,2022-01-01,0,1,0,False,False,True,0,0,0
2,2022-01-01,0,1,0,False,False,True,0,0,0
3,2022-01-01,1,1,0,False,False,True,0,0,0
4,2022-01-01,0,1,0,False,False,True,0,0,0
...,...,...,...,...,...,...,...,...,...,...
369996,2025-12-02,1,12,0,False,False,True,0,0,0
369997,2025-12-02,0,12,0,False,False,True,0,0,0
369998,2025-12-02,1,12,0,False,False,True,0,0,0
369999,2025-12-02,1,12,0,False,False,True,0,0,0


In [6]:
y = df_enc['number_of_persons_injured']

X = df_enc.drop(columns=['number_of_persons_injured', 'crash_date', 'month']).astype(int)
display(y)
display(X)

0         0
1         0
2         0
3         1
4         0
         ..
369996    1
369997    0
369998    1
369999    1
370000    1
Name: number_of_persons_injured, Length: 370001, dtype: int64

Unnamed: 0,is_electric,season_Spring,season_Summer,season_Winter,interaction_is_electric_season_Spring,interaction_is_electric_season_Summer,interaction_is_electric_season_Winter
0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0
2,0,0,0,1,0,0,0
3,0,0,0,1,0,0,0
4,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...
369996,0,0,0,1,0,0,0
369997,0,0,0,1,0,0,0
369998,0,0,0,1,0,0,0
369999,0,0,0,1,0,0,0


In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3,  random_state=42)

In [23]:
# 1. Baseline Model

# using the mean of the entire y as our prediction
y_base_pred = y_train.mean()

# filling in the mean with the same size
y_base_train = np.full_like(y_test, y_base_pred)

# calculating error
mae_base = mean_absolute_error(y_test, y_base_train)
rmse_base = np.sqrt(mean_squared_error(y_test, y_base_train))

print(f"MAE: {mae_base:.2f} ———— RMSE: {rmse_base:.2f}")

MAE: 0.55 ———— RMSE: 1.00


In [26]:
# 2. Simple Model
model = sm.OLS(y_train, x_train).fit()
display(model.summary())

pred_simple = model.predict(x_test)

mae_simple = mean_absolute_error(y_test, pred_simple)
rmse_simple = mean_squared_error(y_test, pred_simple)

print(f"MAE: {mae_simple:.2f} ———— RMSE: {rmse_simple:.2f}")

# our error is smaller than our baseline :)

0,1,2,3
Dep. Variable:,number_of_persons_injured,R-squared (uncentered):,0.228
Model:,OLS,Adj. R-squared (uncentered):,0.228
Method:,Least Squares,F-statistic:,10940.0
Date:,"Tue, 09 Dec 2025",Prob (F-statistic):,0.0
Time:,20:28:54,Log-Likelihood:,-338820.0
No. Observations:,259000,AIC:,677600.0
Df Residuals:,258993,BIC:,677700.0
Df Model:,7,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
is_electric,0.8929,0.018,49.344,0.000,0.857,0.928
season_Spring,0.5328,0.004,151.862,0.000,0.526,0.540
season_Summer,0.5723,0.004,162.598,0.000,0.565,0.579
season_Winter,0.5084,0.004,133.319,0.000,0.501,0.516
interaction_is_electric_season_Spring,-0.5433,0.026,-21.040,0.000,-0.594,-0.493
interaction_is_electric_season_Summer,-0.5412,0.025,-21.861,0.000,-0.590,-0.493
interaction_is_electric_season_Winter,-0.5409,0.030,-18.183,0.000,-0.599,-0.483

0,1,2,3
Omnibus:,205263.184,Durbin-Watson:,1.946
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19305671.868
Skew:,3.201,Prob(JB):,0.0
Kurtosis:,44.809,Cond. No.,11.8


MAE: 0.61 ———— RMSE: 0.76


In [None]:
MAE = mean_absolute_error()