### Gradient Boosting Classifier
- Gradient Boosting is an ensemble machine learning algorithm that builds a sequence of weak learners, typically decision trees, where each subsequent model tries to correct the errors of the previous models.

- It optimizes a loss function by iteratively adding models that minimize the error, producing a strong predictive model.

- Gradient Boosting is effective for both classification and regression problems and often yields high accuracy.

- Unlike Random Forest, which builds its trees independently, Gradient Boosting builds trees sequentially; this makes it more prone to overfitting, but also capable of capturing complex patterns.

- Hyperparameters like learning rate, number of trees (n_estimators), and max depth are critical and require tuning.

- Gradient Boosting can be slower to train but usually produces more accurate models for structured data.

- It handles numerical and categorical data with appropriate preprocessing and supports custom loss functions.

In [None]:
# Load necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.impute import SimpleImputer
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the cleaned accidents dataset into memory and preview the first rows.
accidents_csv = "C:/Users/win10/Desktop/Project_Aug25/data/accidents_cleaned.csv"
df = pd.read_csv(accidents_csv)
df.head()

In [None]:
# Separate features and target variable
target = 'Severity'
X = df.drop(columns=[target])
y = df[target]

# Hold out a test set. The fit / predict / evaluate cells below rely on
# X_train, X_test, y_train and y_test, which were never created before.
# stratify=y keeps the class proportions of the target equal in both splits;
# random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, int64/float64/bool columns as numerical.
numerical_cols = list(X.select_dtypes(include=['int64', 'float64', 'bool']).columns)
categorical_cols = list(X.select_dtypes(include=['object']).columns)

In [None]:
# Numeric branch: fill missing values with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the literal 'missing', then
# one-hot encode. Categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [None]:
# End-to-end model: preprocessing followed by a GradientBoostingClassifier
# with 100 boosting stages and a fixed seed.
clf_gb = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=42)),
])

In [None]:
# Train the full pipeline (preprocessing + classifier) on the training split.
clf_gb.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the held-out test features.
y_pred_gb = clf_gb.predict(X=X_test)

In [None]:
# Compute the three test-set metrics first, then report them.
gb_accuracy = accuracy_score(y_test, y_pred_gb)
gb_report = classification_report(y_test, y_pred_gb)
gb_confusion = confusion_matrix(y_test, y_pred_gb)

print("Gradient Boosting Classifier Accuracy:", gb_accuracy)
print("Classification Report:\n", gb_report)
print("Confusion Matrix:\n", gb_confusion)

In [None]:
# Feature importance extraction after preprocessing.
#
# Fixes over the previous version of this cell:
#   * it referenced `clf`, but the fitted pipeline in this notebook is `clf_gb`;
#   * the attribute chain was split with a stray "." before the line
#     continuation, which is a syntax error;
#   * the plot title named Random Forest although the model is Gradient Boosting.

# Recover the expanded column names produced by the fitted OneHotEncoder.
fitted_preprocessor = clf_gb.named_steps['preprocessor']
fitted_onehot = fitted_preprocessor.named_transformers_['cat'].named_steps['onehot']
cat_features = fitted_onehot.get_feature_names_out(categorical_cols)

# The ColumnTransformer emits the numeric block first, then the one-hot block,
# so the names are concatenated in that same order.
all_features = np.concatenate([numerical_cols, cat_features])

# Importances from the fitted classifier, sorted from most to least important.
importances = clf_gb.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

plt.figure(figsize=(12,6))
plt.title("Feature Importances from Gradient Boosting Classifier")
plt.bar(range(len(importances)), importances[indices], align='center')
plt.xticks(range(len(importances)), all_features[indices], rotation=90)
plt.tight_layout()
plt.show()

#### Task: Batch Training of Advanced Models
- Explore and implement a method to train advanced machine learning models by dividing the preprocessed dataset into smaller batches.

- Train the model incrementally on these batches rather than all data at once.

- Combine or update the model progressively to obtain a final, fully trained model after processing all batches.

In [8]:
import pandas as pd
import numpy as np
import gc
import xgboost as xgb
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report
from IPython.display import display

Configuration

In [None]:
# Input CSV and the name of the label column.
data_path = "C:/Users/dinesh/Documents/Infosys Springboard/Dataset/gpt_CleanData/cleaned_us_accidents.csv"
target = "Severity"

# Rows per training chunk (smaller chunks to save RAM, was 200k) and
# rows in the validation sample.
chunksize, val_size = 100_000, 20_000

Detect column types

In [20]:
# Infer column types from the first 5000 rows only, with the target column
# removed when it is present.
sample_df = pd.read_csv(data_path, nrows=5000).drop(columns=[target], errors='ignore')

# object -> categorical; int64 / float64 / bool -> numerical
numerical_cols = list(sample_df.select_dtypes(include=['int64', 'float64', 'bool']).columns)
categorical_cols = list(sample_df.select_dtypes(include=['object']).columns)
print(f"Detected {len(numerical_cols)} numerical and {len(categorical_cols)} categorical columns.")

Detected 23 numerical and 10 categorical columns.


Preprocessing pipelines

In [21]:
# Numeric branch: median imputation, then scaling without centering
# (with_mean=False). The original note gives "avoids dense array" as the
# reason for disabling centering.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler(with_mean=False)),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: constant 'missing' imputation, then sparse one-hot
# encoding that ignores categories not seen during fit.
categorical_steps = [
    ('imputer', SimpleImputer(fill_value='missing', strategy='constant')),
    ('onehot', OneHotEncoder(sparse_output=True, handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Apply each branch to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

Prepare validation set

In [22]:
# The first `val_size` rows of the file form the validation sample.
val_df = pd.read_csv(data_path, nrows=val_size)
y_val = val_df[target]
X_val = val_df.drop(columns=[target])

# The preprocessor is fitted here, once, on the validation sample only; every
# later call uses transform().
# NOTE(review): imputation/scaling statistics and one-hot categories therefore
# come from these rows alone — confirm this is intended.
print("Fitting preprocessor on validation sample...")
X_val_trans = preprocessor.fit_transform(X_val)

# Re-index the labels to 0..num_class-1, in sorted order of the original labels.
# NOTE(review): only labels present in the validation sample get an index.
unique_classes = sorted(y_val.unique())
label_map = dict(zip(unique_classes, range(len(unique_classes))))
y_val_mapped = y_val.map(label_map)

# Build the validation DMatrix from the float32 feature matrix.
dval = xgb.DMatrix(X_val_trans.astype(np.float32), label=y_val_mapped)

# Drop the intermediates that are no longer needed and force a collection.
del X_val, val_df, X_val_trans
gc.collect()

Fitting preprocessor on validation sample...


520

XGBoost parameters

In [23]:
# Booster configuration passed to xgb.train().
params = dict(
    # Multi-class task that predicts a class index directly; the number of
    # classes comes from the labels seen in the validation sample.
    objective='multi:softmax',
    num_class=len(unique_classes),
    eval_metric='mlogloss',
    # Learning rate and tree size
    eta=0.1,
    max_depth=5,
    # Row / column subsampling per tree
    subsample=0.8,
    colsample_bytree=0.8,
    # Histogram-based tree construction on CPU (chosen for low RAM usage)
    tree_method='hist',
    device='cpu',
)

Incremental training

In [24]:
# Incremental XGBoost training.
#
# The CSV is streamed in chunks of `chunksize` rows, skipping the first
# `val_size` data rows (they form the validation set). Each chunk adds 50
# boosting rounds on top of the booster built so far, and the updated booster
# is scored on the validation DMatrix after every chunk.
bst = None
batch_i = 0

train_reader = pd.read_csv(
    data_path, chunksize=chunksize, skiprows=range(1, val_size+1), header=0
)

for chunk in train_reader:
    batch_i += 1
    print(f"\nProcessing Batch {batch_i}...")

    # Map labels to the 0..num_class-1 indices; labels missing from
    # `label_map` become NaN and are treated as invalid.
    y_chunk = chunk[target].map(label_map)
    valid_mask = y_chunk.notna()

    # A chunk with no usable labels would hand an empty frame to the
    # preprocessor / booster, so skip it instead of failing mid-run.
    if not valid_mask.any():
        print(f"Batch {batch_i} has no rows with a known label; skipping.")
        del chunk
        continue

    # Features and labels, restricted to rows with a valid label
    X_chunk = chunk.drop(columns=[target])[valid_mask]
    y_chunk = y_chunk[valid_mask]

    # Transform features with the already-fitted preprocessor
    X_chunk_trans = preprocessor.transform(X_chunk).astype(np.float32)
    dtrain = xgb.DMatrix(X_chunk_trans, label=y_chunk)

    del X_chunk, X_chunk_trans, chunk
    gc.collect()

    # Continue from the previous booster. xgb_model=None (first batch) starts
    # a fresh model, so no separate first-batch branch is needed.
    bst = xgb.train(params, dtrain, num_boost_round=50, xgb_model=bst)

    # The training matrix is not needed once the rounds are done.
    del dtrain
    gc.collect()

    # Evaluate on validation
    preds = bst.predict(dval)
    acc = accuracy_score(y_val_mapped, preds)
    print(f"Batch {batch_i} Validation Accuracy: {acc:.4f}")
    print(classification_report(y_val_mapped, preds, zero_division=0))

print("\nTraining complete — Incremental XGBoost finished successfully.")


Processing Batch 1...
Batch 1 Validation Accuracy: 0.8809
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.89      0.99      0.94     17432
           2       0.68      0.18      0.28      1695
           3       0.75      0.01      0.01       855

    accuracy                           0.88     20000
   macro avg       0.58      0.29      0.31     20000
weighted avg       0.86      0.88      0.84     20000


Processing Batch 2...
Batch 2 Validation Accuracy: 0.8843
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        18
           1       0.89      0.99      0.94     17432
           2       0.65      0.27      0.38      1695
           3       0.72      0.02      0.03       855

    accuracy                           0.88     20000
   macro avg       0.57      0.32      0.34     20000
weighted avg       0.86      0.88      0.85     20000


Processing 