<h3>Logistic Regression</h3>

In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# ===========================
# 2. LOAD DATA
# ===========================
df = pd.read_excel("ROADCRASH_MTECH.xlsx")


In [3]:
df.shape

(49, 19)

In [4]:
df.columns.tolist()

['Sr.No.',
 'Date',
 'Time of Accident(am/pm)',
 'SIDE  (LHS/RHS)',
 'Accident Location with Name of Village/Town/Landmarks',
 'Nature of Accident/Collision Type',
 'Classification of Accident',
 'Casue of Accident',
 'Road Feature/Type',
 'Weather Condtion',
 'Vehicles Responsible/Involved(Vehicle Type & Regn. Number)',
 'Vehicle Type',
 'Fatalities(Deaths)',
 'Grievous Injury*',
 'Minor Injury',
 'Non Injuired',
 'No. of Animals Hit/Killed,if any',
 'Help Provided by Ambulance/ Patrol/Carne,Etc',
 'Remarks']

In [5]:
# Step 3: Define dependent and independent variables
y = df["Classification of Accident"]   # Target (Severity)
X = df.drop(columns=["Classification of Accident", "Sr.No.", "Remarks"], errors="ignore")

In [6]:
# Step 4: Handle missing values
X = X.fillna("Unknown")
y = y.fillna("Unknown")


In [7]:
# ‚úÖ Step 5: Convert all datetime columns to string before encoding
for col in X.columns:
    if pd.api.types.is_datetime64_any_dtype(X[col]):
        X[col] = X[col].astype(str)

In [8]:
# ‚úÖ Step 6: Convert categorical columns to numbers
le = LabelEncoder()
X = X.apply(lambda col: le.fit_transform(col.astype(str)))
y = le.fit_transform(y.astype(str))

In [9]:
# Step 7: Split the dataset
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
# Step 8: Build Logistic Regression model
model = LogisticRegression(max_iter=5000)
model.fit(X_train, y_train)


In [11]:
# Step 9: Predict severity
y_pred = model.predict(X_test)

# Step 10: Evaluate the model
print("\nüéØ Model Performance:")
print("----------------------")
#print("Accuracy:", accuracy_score(y_test, y_pred))
print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


üéØ Model Performance:
----------------------
Accuracy: 50.0 %

Classification Report:
               precision    recall  f1-score   support

           0       0.33      1.00      0.50         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.67      0.67      0.67         3
           4       0.67      0.40      0.50         5

    accuracy                           0.50        10
   macro avg       0.33      0.41      0.33        10
weighted avg       0.57      0.50      0.50        10


Confusion Matrix:
 [[1 0 0 0 0]
 [0 0 0 0 0]
 [0 0 0 0 1]
 [1 0 0 2 0]
 [1 1 0 1 2]]


In [12]:
df["Classification of Accident"].value_counts()
# But performance is weak (50%), and the metrics show imbalance and poor recall for several classes.

Classification of Accident
C4      17
C3      15
C2       9
C1       5
C1/2     1
Name: count, dtype: int64

In [15]:
# ==============================================================
# Step 11: Try Other Models for Comparison
# ==============================================================

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# ===============================
# 1Ô∏è‚É£ Decision Tree Classifier
# ===============================

# What: A tree-like model that splits data based on feature values.
# Why: Easy to interpret and captures non-linear relationships.

dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)

print("\nüå≥ Decision Tree Model Performance:")
print("-----------------------------------")
print("Accuracy:", round(accuracy_score(y_test, y_pred_dt) * 100, 2), "%")
print("Classification Report:\n", classification_report(y_test, y_pred_dt, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))




üå≥ Decision Tree Model Performance:
-----------------------------------
Accuracy: 100.0 %
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Confusion Matrix:
 [[1 0 0 0]
 [0 1 0 0]
 [0 0 3 0]
 [0 0 0 5]]


In [17]:
# ===============================
# 2Ô∏è‚É£ Random Forest Classifier
# ===============================

# What: A collection (ensemble) of many decision trees.
# Why: Reduces overfitting and usually performs better than a single tree.

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

print("\nüå≤ Random Forest Model Performance:")
print("-----------------------------------")
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf) * 100, 2), "%")
print("Classification Report:\n", classification_report(y_test, y_pred_rf, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))



üå≤ Random Forest Model Performance:
-----------------------------------
Accuracy: 100.0 %
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Confusion Matrix:
 [[1 0 0 0]
 [0 1 0 0]
 [0 0 3 0]
 [0 0 0 5]]


In [16]:
# ===============================
# 3Ô∏è‚É£ XGBoost Classifier
# ===============================

# What: A boosting model that builds trees sequentially, where each tree fixes
#       the mistakes of the previous ones.
# Why: Very powerful for structured/tabular data and handles imbalanced data well.

xgb_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

print("\n‚ö° XGBoost Model Performance:")
print("-----------------------------------")
print("Accuracy:", round(accuracy_score(y_test, y_pred_xgb) * 100, 2), "%")
print("Classification Report:\n", classification_report(y_test, y_pred_xgb, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



‚ö° XGBoost Model Performance:
-----------------------------------
Accuracy: 100.0 %
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           2       1.00      1.00      1.00         1
           3       1.00      1.00      1.00         3
           4       1.00      1.00      1.00         5

    accuracy                           1.00        10
   macro avg       1.00      1.00      1.00        10
weighted avg       1.00      1.00      1.00        10

Confusion Matrix:
 [[1 0 0 0]
 [0 1 0 0]
 [0 0 3 0]
 [0 0 0 5]]
