### Base Model Training

In [5]:
import pandas as pd

In [6]:
df = pd.read_csv("data/accidents_cleaned.csv")
df.head()

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,County,State,Zipcode,Country,Timezone,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Duration_Minutes,Hour,DayOfWeek,Month,IsWeekend,IsDay
0,1,26.7069,-80.11936,0.0,West Palm Beach,Palm Beach,FL,33417-4638,US,US/Eastern,...,0,0,1,0,60.0,9,4,4,0,1
1,2,38.781024,-121.26582,0.045,Roseville,Placer,CA,95678-1907,US,US/Pacific,...,1,0,0,0,103.133333,10,3,4,0,1
2,3,33.985249,-84.269348,0.0,Alpharetta,Fulton,GA,30022,US,US/Eastern,...,0,0,0,0,30.0,16,4,8,0,1
3,3,47.118706,-122.556908,0.0,Tacoma,Pierce,WA,98433,US,US/Pacific,...,0,0,0,0,33.733333,15,4,9,0,1
4,2,33.451355,-111.890343,0.0,Scottsdale,Maricopa,AZ,85256,US,US/Mountain,...,0,0,0,0,76.433333,16,0,6,0,1


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960865 entries, 0 to 960864
Data columns (total 40 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Severity           960865 non-null  int64  
 1   Start_Lat          960865 non-null  float64
 2   Start_Lng          960865 non-null  float64
 3   Distance(mi)       960865 non-null  float64
 4   City               960865 non-null  object 
 5   County             960865 non-null  object 
 6   State              960865 non-null  object 
 7   Zipcode            960865 non-null  object 
 8   Country            960865 non-null  object 
 9   Timezone           960865 non-null  object 
 10  Airport_Code       960865 non-null  object 
 11  Weather_Timestamp  960865 non-null  object 
 12  Temperature(F)     960865 non-null  float64
 13  Wind_Chill(F)      960865 non-null  float64
 14  Humidity(%)        960865 non-null  float64
 15  Pressure(in)       960865 non-null  float64
 16  Vi

In [8]:
df

Unnamed: 0,Severity,Start_Lat,Start_Lng,Distance(mi),City,County,State,Zipcode,Country,Timezone,...,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Duration_Minutes,Hour,DayOfWeek,Month,IsWeekend,IsDay
0,1,26.706900,-80.119360,0.000,West Palm Beach,Palm Beach,FL,33417-4638,US,US/Eastern,...,0,0,1,0,60.000000,9,4,4,0,1
1,2,38.781024,-121.265820,0.045,Roseville,Placer,CA,95678-1907,US,US/Pacific,...,1,0,0,0,103.133333,10,3,4,0,1
2,3,33.985249,-84.269348,0.000,Alpharetta,Fulton,GA,30022,US,US/Eastern,...,0,0,0,0,30.000000,16,4,8,0,1
3,3,47.118706,-122.556908,0.000,Tacoma,Pierce,WA,98433,US,US/Pacific,...,0,0,0,0,33.733333,15,4,9,0,1
4,2,33.451355,-111.890343,0.000,Scottsdale,Maricopa,AZ,85256,US,US/Mountain,...,0,0,0,0,76.433333,16,0,6,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960860,3,40.110500,-82.977524,3.590,Columbus,Franklin,OH,43085,US,US/Eastern,...,0,0,0,0,29.350000,19,1,11,0,0
960861,4,38.647365,-78.468024,0.187,Luray,Page,VA,22835,US,US/Eastern,...,0,0,0,0,82.566667,21,4,9,0,0
960862,2,35.105789,-82.375168,0.000,Travelers Rest,Greenville,SC,29690-8116,US,US/Eastern,...,1,0,0,0,89.900000,17,2,7,0,1
960863,2,34.791351,-82.417435,0.000,Greenville,Greenville,SC,29605,US,US/Eastern,...,0,0,0,0,59.700000,8,2,2,0,1


In [9]:
df.isnull().sum()

Severity             0
Start_Lat            0
Start_Lng            0
Distance(mi)         0
City                 0
County               0
State                0
Zipcode              0
Country              0
Timezone             0
Airport_Code         0
Weather_Timestamp    0
Temperature(F)       0
Wind_Chill(F)        0
Humidity(%)          0
Pressure(in)         0
Visibility(mi)       0
Wind_Direction       0
Wind_Speed(mph)      0
Precipitation(in)    0
Weather_Condition    0
Amenity              0
Bump                 0
Crossing             0
Give_Way             0
Junction             0
No_Exit              0
Railway              0
Roundabout           0
Station              0
Stop                 0
Traffic_Calming      0
Traffic_Signal       0
Turning_Loop         0
Duration_Minutes     0
Hour                 0
DayOfWeek            0
Month                0
IsWeekend            0
IsDay                0
dtype: int64

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.impute import SimpleImputer

In [11]:
# Define target and features
target = 'Severity'
X = df.drop(columns=[target])
y = df[target]

In [12]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()

In [13]:
# Preprocessing pipelines for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),  # fill missing numerics if any
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),  # fill missing categoricals if any
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [14]:
# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [15]:
# Create pipeline with preprocessing and logistic regression
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, solver='lbfgs', multi_class='auto', n_jobs=-1))
])

In [16]:
# Split dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

X_train.shape

(768692, 39)

In [17]:
# Fit the model
clf.fit(X_train, y_train)



0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [18]:
# Predict on test
y_pred = clf.predict(X_test)

In [22]:
# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8377503603523908
Classification Report:
               precision    recall  f1-score   support

           1       0.57      0.04      0.07      1701
           2       0.86      0.95      0.91    152989
           3       0.67      0.46      0.54     32517
           4       0.44      0.06      0.11      4966

    accuracy                           0.84    192173
   macro avg       0.64      0.38      0.41    192173
weighted avg       0.82      0.84      0.82    192173



### Task:
- Train `DecisionTree` Classification model using this cleaned data and pipeline, and report the metric evaluation.

In [21]:
from sklearn.tree import DecisionTreeClassifier

clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(max_depth=10, min_samples_split=20))
])


In [23]:
# Fit the model
clf.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('classifier', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,10
,min_samples_split,20
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [24]:
# Predict on test
y_pred = clf.predict(X_test)

In [25]:
# Evaluate (Decision Tree)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8191317198565875
Classification Report:
               precision    recall  f1-score   support

           1       0.76      0.16      0.26      1701
           2       0.84      0.96      0.90    152989
           3       0.62      0.29      0.39     32517
           4       0.51      0.08      0.14      4966

    accuracy                           0.82    192173
   macro avg       0.68      0.37      0.42    192173
weighted avg       0.79      0.82      0.79    192173

