In [1]:
import numpy as np
import pandas as pd 

# visualization data
import seaborn as sns
import matplotlib.pyplot as plt 
import plotly.express as px

# preprocess the data
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.impute import SimpleImputer, KNNImputer

# iterative operator
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# machine learning
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
# classification task
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

# metrics
from sklearn.metrics import accuracy_score, precision_score, classification_report, mean_absolute_error, mean_squared_error

# ignore warnings   
import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the data from csv file placed locally in our pc
df = pd.read_csv("Road Accident Data.csv")

df.head()

Unnamed: 0,Accident_Index,Accident Date,Day_of_Week,Junction_Control,Junction_Detail,Accident_Severity,Latitude,Light_Conditions,Local_Authority_(District),Carriageway_Hazards,...,Number_of_Casualties,Number_of_Vehicles,Police_Force,Road_Surface_Conditions,Road_Type,Speed_limit,Time,Urban_or_Rural_Area,Weather_Conditions,Vehicle_Type
0,200901BS70001,1/1/2021,Thursday,Give way or uncontrolled,T or staggered junction,Serious,51.512273,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,One way street,30,15:11,Urban,Fine no high winds,Car
1,200901BS70002,1/5/2021,Monday,Give way or uncontrolled,Crossroads,Serious,51.514399,Daylight,Kensington and Chelsea,,...,11,2,Metropolitan Police,Wet or damp,Single carriageway,30,10:59,Urban,Fine no high winds,Taxi/Private hire car
2,200901BS70003,1/4/2021,Sunday,Give way or uncontrolled,T or staggered junction,Slight,51.486668,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,14:19,Urban,Fine no high winds,Taxi/Private hire car
3,200901BS70004,1/5/2021,Monday,Auto traffic signal,T or staggered junction,Serious,51.507804,Daylight,Kensington and Chelsea,,...,1,2,Metropolitan Police,Frost or ice,Single carriageway,30,8:10,Urban,Other,Motorcycle over 500cc
4,200901BS70005,1/6/2021,Tuesday,Auto traffic signal,Crossroads,Serious,51.482076,Darkness - lights lit,Kensington and Chelsea,,...,1,2,Metropolitan Police,Dry,Single carriageway,30,17:25,Urban,Fine no high winds,Car


In [5]:
# handle missing values using Mode for categorical column
df['Weather_Conditions'] = df['Weather_Conditions'].mode()[0]
df['Road_Type'] = df['Road_Type'].mode()[0]
df['Road_Surface_Conditions'] = df['Road_Surface_Conditions'].mode()[0]

# handle using knn imputer
from sklearn.impute import KNNImputer

# Convert 'Time' column to minutes
df['Time'] = pd.to_datetime(df['Time']).dt.hour * 60 + pd.to_datetime(df['Time']).dt.minute
imputer = KNNImputer(n_neighbors=5)
df["Time"] = imputer.fit_transform(df[["Time"]])

# remove missing values
df = df.drop('Carriageway_Hazards', axis=1)
df.dropna(inplace=True)

In [6]:
df.isnull().sum()

Accident_Index                0
Accident Date                 0
Day_of_Week                   0
Junction_Control              0
Junction_Detail               0
Accident_Severity             0
Latitude                      0
Light_Conditions              0
Local_Authority_(District)    0
Longitude                     0
Number_of_Casualties          0
Number_of_Vehicles            0
Police_Force                  0
Road_Surface_Conditions       0
Road_Type                     0
Speed_limit                   0
Time                          0
Urban_or_Rural_Area           0
Weather_Conditions            0
Vehicle_Type                  0
dtype: int64

In [7]:
# split the data into X and y
X = df.drop('Vehicle_Type', axis=1)
y = df['Vehicle_Type']

In [31]:
# Assuming 'Vehicle_Type' is your target variable, you may need to convert it to numerical values if it's categorical
# For example, using label encoding
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(df['Vehicle_Type'])

In [37]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

**Standard Scaler**

In [33]:
sc = StandardScaler()
sc.fit_transform(X_train, Y_train)

array([[ 0.96470825,  0.86124258, -0.47755361, ...,  0.        ,
         0.74205956,  0.        ],
       [-0.96291766, -1.0293429 , -0.47755361, ...,  0.        ,
         0.74205956,  0.        ],
       [ 0.01248389, -0.68431105,  0.00563924, ...,  0.        ,
        -1.34760072,  0.        ],
       ...,
       [ 0.38053961, -0.82610496,  0.00563924, ...,  0.        ,
        -1.34760072,  0.        ],
       [-0.96291766,  0.07192314, -0.47755361, ...,  0.        ,
         0.74205956,  0.        ],
       [-0.96291766, -1.51616866, -1.44393932, ...,  0.        ,
         0.74205956,  0.        ]])

**ML Algorithm**
**Linear Regression**

In [55]:
from sklearn.linear_model import LinearRegression
le = LinearRegression()
le.fit(X_train, Y_train)

In [35]:
print(le.intercept_)
print(le.coef_)

1.7282634792550156
[ 4.16132985e-07  4.36604553e-05  1.80438074e-03 -2.25864372e-02
  3.69875099e-03 -2.27669006e-02  3.42755770e-02 -1.33437605e-02
 -1.78049767e-04 -8.89239114e-04 -9.42418624e-03 -1.86576227e-02
 -1.16192562e-05  0.00000000e+00  0.00000000e+00  4.04079591e-04
  0.00000000e+00  3.40470071e-02  0.00000000e+00]


In [36]:
Y_pred = le.predict(X_test)

In [49]:
from sklearn.metrics import mean_squared_error, r2_score, confusion_matrix
# evaluate the model
print('MSE = ', mean_squared_error(Y_test, Y_pred))
print('R2 = ', r2_score(Y_test, Y_pred))
print('RMSE = ', np.sqrt(mean_squared_error(Y_test, Y_pred)))

MSE =  11.126368705316604
R2 =  0.0002063418678726059
RMSE =  3.3356211873227757


In [46]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train)

In [48]:
# predict the model
y_pred = dtc.predict(X_test)