In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [24]:
df = pd.read_csv("Acc_Data_no_hotencode.csv")

In [25]:
df.shape

(635523, 9)

In [26]:
df.head()

Unnamed: 0,Accident_Severity,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,Serious,Darkness,1,2,Dry,Single_carriageway,Urban,Fine,Car
1,Serious,Daylight,1,2,Wet_or_damp,Single_carriageway,Urban,Raining,Car
2,Serious,Daylight,1,4,Dry,Single_carriageway,Urban,Fine,Passenger_Vehicles(Minibus_or_Bus)
3,Serious,Daylight,2,3,Dry,Single_carriageway,Urban,Fine,Car
4,Serious,Daylight,1,2,Dry,Dual_carriageway,Urban,Fine,Goods_Carrier


In [27]:
df['Accident_Severity'].value_counts()

Accident_Severity
Slight     541222
Serious     94301
Name: count, dtype: int64


Label Encoding

In [28]:
{column: len(df[column].unique()) for column in df.columns if df.dtypes[column] == 'object'}

{'Accident_Severity': 2,
 'Light_Conditions': 2,
 'Road_Surface_Conditions': 3,
 'Road_Type': 4,
 'Urban_or_Rural_Area': 2,
 'Weather_Conditions': 3,
 'Vehicle_Type': 5}

In [29]:
le_Light_Conditions = LabelEncoder()
le_Road_Surface_Conditions = LabelEncoder()
le_Road_Type = LabelEncoder()
le_Urban_or_Rural_Area = LabelEncoder()
le_Weather_Conditions = LabelEncoder()
le_Vehicle_Type = LabelEncoder()

In [30]:
df['Light_Conditions'] = le_Light_Conditions.fit_transform(df['Light_Conditions'])
df["Light_Conditions"].unique()

array([0, 1])

In [31]:
df['Road_Surface_Conditions'] = le_Road_Surface_Conditions.fit_transform(df['Road_Surface_Conditions'])
df["Road_Surface_Conditions"].unique()

array([0, 2, 1])

In [32]:
df['Road_Type'] = le_Road_Type.fit_transform(df['Road_Type'])
df["Road_Type"].unique()

array([3, 0, 1, 2])

In [33]:
df['Urban_or_Rural_Area'] = le_Urban_or_Rural_Area.fit_transform(df['Urban_or_Rural_Area'])
df["Urban_or_Rural_Area"].unique()

array([1, 0])

In [34]:
df['Weather_Conditions'] = le_Weather_Conditions.fit_transform(df['Weather_Conditions'])
df["Weather_Conditions"].unique()

array([0, 1, 2])

In [35]:
df['Vehicle_Type'] = le_Vehicle_Type.fit_transform(df['Vehicle_Type'])
df["Vehicle_Type"].unique()

array([1, 4, 2, 3, 0])

In [36]:
df.head()

Unnamed: 0,Accident_Severity,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,Serious,0,1,2,0,3,1,0,1
1,Serious,1,1,2,2,3,1,1,1
2,Serious,1,1,4,0,3,1,0,4
3,Serious,1,2,3,0,3,1,0,1
4,Serious,1,1,2,0,0,1,0,2


Test/Train split of data as 30/70 respectively

In [37]:
target = 'Accident_Severity'

In [58]:
y = df[target].copy()
X = df.drop(columns=[target]).copy()

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

Sampling of the train datasets

In [40]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

under = RandomUnderSampler(sampling_strategy=0.5)
X_train, y_train= under.fit_resample(X_train, y_train)

In [41]:
over = RandomOverSampler(sampling_strategy=0.8)
X_train, y_train= over.fit_resample(X_train, y_train)

In [42]:
unique, count = np.unique(y_train, return_counts=True)
y_train_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_value_count

{'Serious': 105969, 'Slight': 132462}

Logistic Regression Classifier

In [43]:
'''
#Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test= sc.transform(X_test)
'''

'\n#Scaling\nfrom sklearn.preprocessing import StandardScaler\nsc = StandardScaler()\nX_train = sc.fit_transform(X_train)\nX_test= sc.transform(X_test)\n'

In [44]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [45]:
clf = LogisticRegression(solver='saga').fit(X_train, y_train)

In [46]:
Y_Test_Pred = clf.predict(X_test)

In [48]:
print(classification_report(Y_Test_Pred, y_test))

              precision    recall  f1-score   support

     Serious       0.36      0.23      0.28     44826
      Slight       0.79      0.88      0.83    145831

    accuracy                           0.72    190657
   macro avg       0.58      0.55      0.56    190657
weighted avg       0.69      0.72      0.70    190657



In [36]:
# Create a pipeline with scaler and logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver='saga', tol=0.1))

# Create a parameter grid
param_grid = {
    'logisticregression__C': [0.1, 0.5, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}

# Create GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid, cv=5)

# Fit the model
grid_result = grid_search.fit(X_train, y_train)

In [37]:
# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'logisticregression__C': 1, 'logisticregression__penalty': 'l1'}
Best Score: 0.6005382599615288


In [50]:
grid_opt = LogisticRegression(C=0.5, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          n_jobs=None, penalty='l2', random_state=None,
          tol=0.1, verbose=0, warm_start=False).fit(X_train, y_train)

In [51]:
Y_Test_Pred = grid_opt.predict(X_test)

In [52]:
print(classification_report(Y_Test_Pred, y_test))

              precision    recall  f1-score   support

     Serious       0.36      0.23      0.28     44826
      Slight       0.79      0.88      0.83    145831

    accuracy                           0.72    190657
   macro avg       0.58      0.55      0.56    190657
weighted avg       0.69      0.72      0.70    190657



Model testing and saving

In [61]:
X

Unnamed: 0,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,0,1,2,0,3,1,0,1
1,1,1,2,2,3,1,1,1
2,1,1,4,0,3,1,0,4
3,1,2,3,0,3,1,0,1
4,1,1,2,0,0,1,0,2
...,...,...,...,...,...,...,...,...
635518,1,2,1,0,3,0,0,1
635519,0,1,1,1,3,0,0,1
635520,1,1,3,1,3,0,0,1
635521,0,1,2,2,3,0,0,3


In [62]:
X = np.array([['Darkness',7,10,'Frost_or_Snow','Dual_carriageway','Rural','Raining','Car']])

In [63]:

X[:,0] = le_Light_Conditions.transform(X[:,0])
X[:,1] = X[:,1]
X[:,2] = X[:,2]
X[:,3] = le_Road_Surface_Conditions.transform(X[:,3])
X[:,4] = le_Road_Type.transform(X[:,4])
X[:,5] = le_Urban_or_Rural_Area.transform(X[:,5])
X[:,6] = le_Weather_Conditions.transform(X[:,6])
X[:,7] = le_Vehicle_Type.transform(X[:,7])

X = X.astype(float)
X = sc.fit_transform(X)
X


array([[0., 0., 0., 0., 0., 0., 0., 0.]])

In [64]:
y_pred = clf.predict(X)



In [65]:
y_pred = grid_opt.predict(X)
y_pred



array(['Serious'], dtype=object)

In [66]:
import pickle

In [67]:
data = {"model": grid_opt, "le_Light_Conditions": le_Light_Conditions, "le_Road_Surface_Conditions": le_Road_Surface_Conditions, "le_Road_Type": le_Road_Type, "le_Urban_or_Rural_Area": le_Urban_or_Rural_Area, "le_Weather_Conditions": le_Weather_Conditions, "le_Vehicle_Type": le_Vehicle_Type}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [68]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

model_loaded = data["model"]
le_Light_Conditions = data["le_Light_Conditions"]
le_Road_Surface_Conditions = data["le_Road_Surface_Conditions"]
le_Road_Type = data["le_Road_Type"]
le_Urban_or_Rural_Area = data["le_Urban_or_Rural_Area"]
le_Weather_Conditions = data["le_Weather_Conditions"]
le_Vehicle_Type = data["le_Vehicle_Type"]


In [69]:
y_pred = model_loaded.predict(X)
y_pred



array(['Serious'], dtype=object)