In [None]:
import numpy as np
import pickle as pkl
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report

In [None]:
# Read the accident records workbook into a DataFrame and show the first rows.
DATA_FILE = r'Modified_Data_With_Accident_Factor (2).xlsx'
data = pd.read_excel(DATA_FILE)
data.head()

Unnamed: 0,District,PS Name,FIR No,Date Report,Date Accident,Time Report,Time Accident,Sections,Accident type,Death,...,Cluster,Cluster_Latitude,Cluster_Longitude,Cluster_Place_Name,Datetime,Hour,Time Range,Accident Hour,Accident Time of Day,Accident factor
0,KOLLAM CITY,Eravipuram,5653/2023,2023-01-06,2023-09-27,17:33:33,00:00:00,279338,Grevious Injury,0,...,2109,8.865654,76.607678,"Polayathaodu, Kollam, Kerala, 691001, India",2023-09-27 00:00:00,0,0-3,0,Midnight,Visibility
1,ERNAKULAM CITY,Kalamassery,6992/2023,2023-01-20,2023-01-19,18:42:00,00:15:00,279338,Grevious Injury,0,...,2423,10.029299,76.305734,"Skyline Road, Sahrudaya Nagar, Edapally, Ernak...",2023-01-19 00:15:00,0,0-3,0,Midnight,Over speeding
2,THRISSUR RURAL,Irinjalakkuda,7869/2023,2023-01-24,2023-01-10,16:43:00,01:00:00,279338,Grevious Injury,0,...,2311,10.345686,76.216873,"Daivamakkal Prarthanalayam, Daivamakkal Road, ...",2023-01-10 01:00:00,1,0-3,1,Midnight,Drunk and drive
3,PALAKKAD,Mannarkkad,8665/2022,2023-01-06,2023-01-06,17:00:00,01:30:00,279337,Minor Injury,0,...,1237,10.985304,76.483019,"Palakkad-Malappuram-Calicut Road, Kottiyode, M...",2023-01-06 01:30:00,1,0-3,1,Midnight,Using mobile phones
4,ALAPPUZHA,Chengannur,6092/2023,2023-01-21,2023-01-16,16:30:00,01:15:00,279337338,Grevious Injury,0,...,2504,9.282013,76.654424,"Karakkad, Chengannur, Alappuzha, Kerala, 68950...",2023-01-16 01:15:00,1,0-3,1,Midnight,Over speeding


In [None]:
# Quick structural overview of the freshly loaded dataset.
print("Dataset Preview:")
print(data.head())

# Column names, dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would append a stray "None" line.
print("\nDataset Info:")
data.info()

# Number of missing values per column
print("\nMissing Values:")
print(data.isna().sum())

# Number of fully duplicated rows
print("\nDuplicate Rows:")
print(data.duplicated().sum())

Dataset Preview:
         District        PS Name     FIR No Date Report Date Accident  \
0     KOLLAM CITY     Eravipuram  5653/2023  2023-01-06    2023-09-27   
1  ERNAKULAM CITY    Kalamassery  6992/2023  2023-01-20    2023-01-19   
2  THRISSUR RURAL  Irinjalakkuda  7869/2023  2023-01-24    2023-01-10   
3        PALAKKAD     Mannarkkad  8665/2022  2023-01-06    2023-01-06   
4       ALAPPUZHA     Chengannur  6092/2023  2023-01-21    2023-01-16   

  Time Report Time Accident     Sections    Accident type  Death  ...  \
0    17:33:33      00:00:00      279,338  Grevious Injury      0  ...   
1    18:42:00      00:15:00      279,338  Grevious Injury      0  ...   
2    16:43:00      01:00:00      279,338  Grevious Injury      0  ...   
3    17:00:00      01:30:00      279,337     Minor Injury      0  ...   
4    16:30:00      01:15:00  279,337,338  Grevious Injury      0  ...   

   Cluster  Cluster_Latitude  Cluster_Longitude  \
0     2109          8.865654          76.607678   
1  

Label encoding


In [None]:
# Categorical columns (including the 'Accident type' column used later as the
# target) that are replaced by integer codes.
columns_to_encode = ['Accident Time of Day','Lanes Road','Weather','Type Road','Traffic Control','Accident type','Accident factor']

# Keep one fitted encoder per column. Re-fitting a single shared LabelEncoder
# overwrites its classes_ on every iteration, so the code -> label mapping of
# every column except the last one would be lost and encoded values (e.g.
# predicted 'Accident type' codes) could not be decoded with inverse_transform.
label_encoders = {}

for column in columns_to_encode:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder

# `label_encoder` is left bound to the encoder fitted on the last column,
# which matches the state this cell produced before.



Frequency encoding

In [None]:
# Replace each category with its relative frequency (share of all rows).
n_rows = len(data)
for col in ('District', 'PS Name'):
    freq_encoding = data.groupby(col).size().div(n_rows)
    data[col] = data[col].map(freq_encoding)


Ordinal encoding

In [None]:
# Explicit category order per column: position in the list becomes the code.
ordinal_columns = ['Type Area', 'Divider']
type_area_order = ['Rural', 'Urban']
divider_order = ['no', 'yes']

# Values outside the listed categories are encoded as -1 instead of raising.
ordinal_encoder = OrdinalEncoder(
    categories=[type_area_order, divider_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)

data[ordinal_columns] = ordinal_encoder.fit_transform(data[ordinal_columns])



In [None]:
# Keep only the columns used for modelling (the encoded features plus 'Accident type').
model_columns = [
    'Accident Time of Day', 'Lanes Road', 'Weather', 'Type Road',
    'Traffic Control', 'Accident type', 'Accident factor',
    'District', 'PS Name', 'Type Area', 'Divider',
]
data = data[model_columns]

In [None]:
# Display the first rows of the reduced frame to check the encoded values.
data.head()

Unnamed: 0,Accident Time of Day,Lanes Road,Weather,Type Road,Traffic Control,Accident type,Accident factor,District,PS Name,Type Area,Divider
0,5,2,3,4,4,1,7,0.04626,0.00386,1.0,0.0
1,5,1,2,2,1,1,3,0.0569,0.00562,1.0,1.0
2,5,0,2,4,4,1,1,0.05166,0.00426,0.0,0.0
3,5,2,0,2,4,2,6,0.05626,0.0027,0.0,0.0
4,5,0,0,5,4,1,3,0.07848,0.00536,0.0,0.0


Train/test split

In [None]:
# Separate the predictors from the target column ('Accident type').
X = data.drop(columns='Accident type')
y = data['Accident type']

# Hold out 20% of the rows for evaluation. The target classes are heavily
# imbalanced, so the split is stratified on y to keep the class proportions
# the same in the training and test sets; a plain random split can under- or
# over-represent the rare classes in the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training target shape:", y_train.shape)
print("Testing target shape:", y_test.shape)

Training features shape: (40000, 10)
Testing features shape: (10000, 10)
Training target shape: (40000,)
Testing target shape: (10000,)


Random forest

In [None]:
# Random forest with 200 trees and a fixed seed.
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_rf = rf_classifier.fit(X_train, y_train).predict(X_test)

# Overall accuracy plus per-class precision / recall / F1 on the held-out set.
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {accuracy_rf}")
print(classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.8856
              precision    recall  f1-score   support

           0       0.77      0.55      0.64       815
           1       0.91      0.95      0.93      7327
           2       0.85      0.83      0.84      1525
           3       0.73      0.52      0.61       333

    accuracy                           0.89     10000
   macro avg       0.81      0.71      0.75     10000
weighted avg       0.88      0.89      0.88     10000



Logistic regression

In [None]:
# Logistic regression baseline; max_iter is raised to 1000 to give the solver
# more iterations to converge.
logreg_classifier = LogisticRegression(random_state=42, max_iter=1000)
logreg_classifier.fit(X_train, y_train)

y_pred_logreg = logreg_classifier.predict(X_test)

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print(f"Logistic Regression Accuracy: {accuracy_logreg}")

# NOTE(review): in the recorded run this model predicted only class 1, so
# precision is undefined for the other classes. zero_division=0 reports those
# scores as 0.0 explicitly instead of emitting an UndefinedMetricWarning per
# averaged metric.
print(classification_report(y_test, y_pred_logreg, zero_division=0))

Logistic Regression Accuracy: 0.7327
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       815
           1       0.73      1.00      0.85      7327
           2       0.00      0.00      0.00      1525
           3       0.00      0.00      0.00       333

    accuracy                           0.73     10000
   macro avg       0.18      0.25      0.21     10000
weighted avg       0.54      0.73      0.62     10000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


XGBoost

In [None]:
# Gradient-boosted trees: 400 boosting rounds, fixed seed.
# NOTE(review): learning_rate=1 applies no shrinkage to each round -- confirm
# this is intentional rather than a placeholder value.
xgb_classifier = XGBClassifier(
    n_estimators=400,
    learning_rate=1,
    random_state=42,
)

# fit() returns the estimator itself, so training and prediction can be chained.
y_pred_xgb = xgb_classifier.fit(X_train, y_train).predict(X_test)

# Overall accuracy plus per-class precision / recall / F1 on the held-out set.
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {accuracy_xgb}")
print(classification_report(y_test, y_pred_xgb))

XGBoost Accuracy: 0.9013
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       815
           1       0.93      0.95      0.94      7327
           2       0.86      0.84      0.85      1525
           3       0.70      0.64      0.67       333

    accuracy                           0.90     10000
   macro avg       0.82      0.78      0.80     10000
weighted avg       0.90      0.90      0.90     10000



In [None]:
# Serialize the trained XGBoost classifier to disk with pickle.
filename = 'xgb_classifier_model.pkl'
with open(filename, mode='wb') as model_file:
    pkl.dump(xgb_classifier, model_file)