In [1]:
import pandas as pd
import numpy as np


from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv('/content/drive/MyDrive/Acc_Data_no_hotencode.csv')

In [3]:
df.shape

(635523, 9)

In [4]:
df.head()

Unnamed: 0,Accident_Severity,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,Serious,Darkness,1,2,Dry,Single_carriageway,Urban,Fine,Car
1,Serious,Daylight,1,2,Wet_or_damp,Single_carriageway,Urban,Raining,Car
2,Serious,Daylight,1,4,Dry,Single_carriageway,Urban,Fine,Passenger_Vehicles(Minibus_or_Bus)
3,Serious,Daylight,2,3,Dry,Single_carriageway,Urban,Fine,Car
4,Serious,Daylight,1,2,Dry,Dual_carriageway,Urban,Fine,Goods_Carrier


In [5]:
{column: len(df[column].unique()) for column in df.columns if df.dtypes[column] == 'object'}

{'Accident_Severity': 2,
 'Light_Conditions': 2,
 'Road_Surface_Conditions': 3,
 'Road_Type': 4,
 'Urban_or_Rural_Area': 2,
 'Weather_Conditions': 3,
 'Vehicle_Type': 5}

In [7]:
from sklearn.preprocessing import LabelEncoder

le_Light_Conditions = LabelEncoder()
le_Road_Surface_Conditions = LabelEncoder()
le_Road_Type = LabelEncoder()
le_Urban_or_Rural_Area = LabelEncoder()
le_Weather_Conditions = LabelEncoder()
le_Vehicle_Type = LabelEncoder()

In [8]:
df['Light_Conditions'] = le_Light_Conditions.fit_transform(df['Light_Conditions'])
df["Light_Conditions"].unique()

array([0, 1])

In [9]:
df['Road_Surface_Conditions'] = le_Road_Surface_Conditions.fit_transform(df['Road_Surface_Conditions'])
df["Road_Surface_Conditions"].unique()

array([0, 2, 1])

In [10]:
df['Road_Type'] = le_Road_Type.fit_transform(df['Road_Type'])
df["Road_Type"].unique()

array([3, 0, 1, 2])

In [11]:
df['Urban_or_Rural_Area'] = le_Urban_or_Rural_Area.fit_transform(df['Urban_or_Rural_Area'])
df["Urban_or_Rural_Area"].unique()

array([1, 0])

In [12]:
df['Weather_Conditions'] = le_Weather_Conditions.fit_transform(df['Weather_Conditions'])
df["Weather_Conditions"].unique()

array([0, 1, 2])

In [13]:
df['Vehicle_Type'] = le_Vehicle_Type.fit_transform(df['Vehicle_Type'])
df["Vehicle_Type"].unique()

array([1, 4, 2, 3, 0])

In [14]:
df.head()

Unnamed: 0,Accident_Severity,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,Serious,0,1,2,0,3,1,0,1
1,Serious,1,1,2,2,3,1,1,1
2,Serious,1,1,4,0,3,1,0,4
3,Serious,1,2,3,0,3,1,0,1
4,Serious,1,1,2,0,0,1,0,2


In [15]:
target = 'Accident_Severity'

In [16]:
y = df[target].copy()
X = df.drop(columns=[target]).copy()

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [18]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

under = RandomUnderSampler(sampling_strategy=0.5)
X_train, y_train= under.fit_resample(X_train, y_train)

In [19]:
over = RandomOverSampler(sampling_strategy=0.8)
X_train, y_train= over.fit_resample(X_train, y_train)

In [20]:
unique, count = np.unique(y_train, return_counts=True)
y_train_value_count = { k:v for (k,v) in zip(unique, count)}
y_train_value_count

{'Serious': 105969, 'Slight': 132462}

In [21]:
#Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test= sc.transform(X_test)


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import warnings
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [23]:
clf = LogisticRegression(solver='saga').fit(X_train, y_train)
Y_Test_Pred = clf.predict(X_test)

print(classification_report(Y_Test_Pred, y_test))

              precision    recall  f1-score   support

     Serious       0.37      0.23      0.28     45976
      Slight       0.78      0.88      0.83    144681

    accuracy                           0.72    190657
   macro avg       0.58      0.55      0.55    190657
weighted avg       0.68      0.72      0.70    190657



In [24]:
# Create a pipeline with scaler and logistic regression
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, solver='saga', tol=0.1))

# Create a parameter grid
param_grid = {
    'logisticregression__C': [0.1, 0.5, 1, 10, 100],
    'logisticregression__penalty': ['l1', 'l2']
}

# Create GridSearchCV object
grid_search = GridSearchCV(pipe, param_grid, cv=5)

# Fit the model
grid_result = grid_search.fit(X_train, y_train)

In [25]:
# Print best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'logisticregression__C': 0.1, 'logisticregression__penalty': 'l1'}
Best Score: 0.6041202719426295


In [26]:
grid_opt = LogisticRegression(C=0.5, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          n_jobs=None, penalty='l2', random_state=None,
          tol=0.1, verbose=0, warm_start=False).fit(X_train, y_train)

Y_Test_Pred = grid_opt.predict(X_test)

print(classification_report(Y_Test_Pred, y_test))

              precision    recall  f1-score   support

     Serious       0.37      0.23      0.28     45976
      Slight       0.78      0.88      0.83    144681

    accuracy                           0.72    190657
   macro avg       0.58      0.55      0.55    190657
weighted avg       0.68      0.72      0.70    190657



Model testing and saving

In [27]:
X

Unnamed: 0,Light_Conditions,Number_of_Casualties,Number_of_Vehicles,Road_Surface_Conditions,Road_Type,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,0,1,2,0,3,1,0,1
1,1,1,2,2,3,1,1,1
2,1,1,4,0,3,1,0,4
3,1,2,3,0,3,1,0,1
4,1,1,2,0,0,1,0,2
...,...,...,...,...,...,...,...,...
635518,1,2,1,0,3,0,0,1
635519,0,1,1,1,3,0,0,1
635520,1,1,3,1,3,0,0,1
635521,0,1,2,2,3,0,0,3


In [28]:
X = np.array([['Darkness',7,10,'Frost_or_Snow','Dual_carriageway','Rural','Raining','Car']])

In [29]:

X[:,0] = le_Light_Conditions.transform(X[:,0])
X[:,1] = X[:,1]
X[:,2] = X[:,2]
X[:,3] = le_Road_Surface_Conditions.transform(X[:,3])
X[:,4] = le_Road_Type.transform(X[:,4])
X[:,5] = le_Urban_or_Rural_Area.transform(X[:,5])
X[:,6] = le_Weather_Conditions.transform(X[:,6])
X[:,7] = le_Vehicle_Type.transform(X[:,7])

X = X.astype(float)
X


array([[ 0.,  7., 10.,  1.,  0.,  0.,  1.,  1.]])

In [31]:
y_pred = clf.predict(X)

y_pred = grid_opt.predict(X)
y_pred

array(['Slight'], dtype=object)

In [32]:
import pickle

In [33]:
data = {"model": grid_opt, "le_Light_Conditions": le_Light_Conditions, "le_Road_Surface_Conditions": le_Road_Surface_Conditions, "le_Road_Type": le_Road_Type, "le_Urban_or_Rural_Area": le_Urban_or_Rural_Area, "le_Weather_Conditions": le_Weather_Conditions, "le_Vehicle_Type": le_Vehicle_Type}
with open('saved_steps.pkl', 'wb') as file:
    pickle.dump(data, file)

In [34]:
with open('saved_steps.pkl', 'rb') as file:
    data = pickle.load(file)

model_loaded = data["model"]
le_Light_Conditions = data["le_Light_Conditions"]
le_Road_Surface_Conditions = data["le_Road_Surface_Conditions"]
le_Road_Type = data["le_Road_Type"]
le_Urban_or_Rural_Area = data["le_Urban_or_Rural_Area"]
le_Weather_Conditions = data["le_Weather_Conditions"]
le_Vehicle_Type = data["le_Vehicle_Type"]


In [35]:
y_pred = model_loaded.predict(X)
y_pred

array(['Slight'], dtype=object)