### CatBoost GBM Classifier Model Training

In [3]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [4]:
# Load necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from catboost import CatBoostClassifier

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
df = pd.read_csv("/content/drive/MyDrive/Road_Accident_Aalysis_project/Accidents_Cleaned.csv")
# df = df.sample(100000, random_state=42)


target = 'Severity'
X = df.drop(columns=target)
y = df[target]

In [6]:
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64', 'bool']).columns.tolist()

In [7]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [11]:
clf_catboost = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', CatBoostClassifier(
        iterations=100,
        learning_rate=0.05,
        depth=6,
        loss_function='MultiClass',
        eval_metric='Accuracy',
        random_seed=42,
        verbose=50,
        early_stopping_rounds=20,

    ))
])

clf_catboost.fit(X_train[:50000], y_train[:50000])

0:	learn: 0.6557400	total: 1.25s	remaining: 2m 3s
50:	learn: 0.7040600	total: 37.5s	remaining: 36.1s
99:	learn: 0.7230200	total: 1m 13s	remaining: 0us


In [12]:
y_pred_cb = clf_catboost.predict(X_test)

print('CatBoost Classifier Accuracy:', accuracy_score(y_test, y_pred_cb))
print('\nClassification Report:\n', classification_report(y_test, y_pred_cb))
print('\nConfusion Matrix:\n', confusion_matrix(y_test, y_pred_cb))

CatBoost Classifier Accuracy: 0.7209191310603968

Classification Report:
               precision    recall  f1-score   support

           1       0.00      0.00      0.00      5725
           2       0.73      0.92      0.81    190471
           3       0.68      0.38      0.49     95322
           4       0.00      0.00      0.00      1759

    accuracy                           0.72    293277
   macro avg       0.35      0.33      0.33    293277
weighted avg       0.70      0.72      0.69    293277



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Confusion Matrix:
 [[     0   5558    167      0]
 [     0 174774  15697      0]
 [     0  58667  36655      0]
 [     0    390   1369      0]]
