Final Estimator: This is the model that takes the predictions of the base estimators as input and makes the final prediction. In your code, the final estimator is another CatBoostClassifier.

The CatBoost model was trained on the X_train data, which is derived from the X_resampled data, which in turn comes from the X DataFrame. Therefore, the CatBoost model used all the features present in the X DataFrame after the preprocessing steps.

In [None]:
# ===========================
# Step 1: Load Full Dataset
# ===========================

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

file_path = '/content/drive/My Drive/datasets/stroke_prediction_dataset.csv'
data = pd.read_csv(file_path)

print("Full dataset shape:", data.shape)

Mounted at /content/drive
Full dataset shape: (15000, 22)


In [None]:
# =========================
# Advanced Stroke Prediction Pipeline
# =========================

# Step 1: Load Libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from category_encoders import TargetEncoder # Import after installation
from imblearn.over_sampling import SMOTENC
from catboost import CatBoostClassifier
from sklearn.impute import KNNImputer

# Step 2: Load Dataset
file_path = '/content/drive/My Drive/datasets/stroke_prediction_dataset.csv'
data = pd.read_csv(file_path)

# Step 3: Initial Cleanup
cols_to_drop = ['Patient ID', 'Patient Name']
for col in cols_to_drop:
    if col in data.columns:
        data.drop(columns=[col], inplace=True)

In [None]:


# Step 3: Initial Cleanup
cols_to_drop = ['Patient ID', 'Patient Name']
for col in cols_to_drop:
    if col in data.columns:
        data.drop(columns=[col], inplace=True)

# Separate numerical and categorical columns
num_cols = data.select_dtypes(include=[np.number]).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Fill missing values
# Numerical: KNN Imputer
num_imputer = KNNImputer(n_neighbors=5)
data[num_cols] = num_imputer.fit_transform(data[num_cols])

# Categorical: fill with mode
for col in cat_cols:
    data[col] = data[col].fillna(data[col].mode()[0])

# Step 5: Feature Engineering (Optional but improves accuracy)
# Example: age groups and interaction feature
if 'Age' in data.columns:
    data['Age_group'] = pd.cut(data['Age'], bins=[0,30,50,70,100], labels=[1,2,3,4]).astype(int)
    cat_cols = cat_cols.tolist() + ['Age_group'] # Add Age_group to categorical columns
if 'Hypertension' in data.columns and 'Age' in data.columns:
    data['Hypertension_x_Age'] = data['Hypertension'] * data['Age']
    # Decide if 'Hypertension_x_Age' should be treated as categorical or numerical based on its values.
    # Since it's a product, likely numerical, so not adding to cat_cols here.

# Step 4: Encode Categorical Features (Target Encoding) - Moved after feature engineering
target_col = 'Diagnosis'
te = TargetEncoder()
data[cat_cols] = te.fit_transform(data[cat_cols], data[target_col])


# Step 6: Separate Features and Target
X = data.drop(target_col, axis=1)
y = data[target_col]

# Step 7: Handle Class Imbalance with SMOTENC
# Identify categorical feature indices in the data *before* scaling
categorical_indices = [X.columns.get_loc(col) for col in cat_cols if col in X.columns]
smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_resampled, y_resampled = smote_nc.fit_resample(X, y)

print("Original shape:", X.shape, y.value_counts())
print("Resampled shape:", X_resampled.shape, np.bincount(y_resampled))

# Step 8: Scale Features
scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)


# Step 9: Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_scaled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)

# Step 10: Initialize CatBoost Model
# Removed cat_features as data is scaled and encoded
cat_model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    eval_metric='AUC',
    random_state=42,
    verbose=100,
    early_stopping_rounds=50
)

# Step 11: Train CatBoost
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))

# Step 12: Evaluate Model
y_pred = cat_model.predict(X_test)
y_proba = cat_model.predict_proba(X_test)[:,1]

print("\n=== CatBoost Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_proba))

# ===============================
# Optional: Stacking Ensemble
# ===============================
rf = RandomForestClassifier(n_estimators=500, random_state=42)
gb = GradientBoostingClassifier(random_state=42)

estimators = [('cb', cat_model), ('rf', rf), ('gb', gb)]
stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=CatBoostClassifier(iterations=500, learning_rate=0.05, depth=6, verbose=0, random_state=42)
)

# Train stacking ensemble
stack_model.fit(X_train, y_train)
y_stack_pred = stack_model.predict(X_test)
y_stack_proba = stack_model.predict_proba(X_test)[:,1]

print("\n=== Stacking Ensemble Evaluation ===")
print("Accuracy:", accuracy_score(y_test, y_stack_pred))
print("Precision:", precision_score(y_test, y_stack_pred))
print("Recall:", recall_score(y_test, y_stack_pred))
print("F1 Score:", f1_score(y_test, y_stack_pred))
print("ROC AUC:", roc_auc_score(y_test, y_stack_proba))

Original shape: (15000, 21) Diagnosis
0.0    7532
1.0    7468
Name: count, dtype: int64
Resampled shape: (15064, 21) [7532 7532]
0:	test: 0.9414038	best: 0.9414038 (0)	total: 51.4ms	remaining: 51.4s
100:	test: 0.9737507	best: 0.9738159 (98)	total: 1.12s	remaining: 9.94s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9740670144
bestIteration = 136

Shrink model to first 137 iterations.

=== CatBoost Evaluation ===
Accuracy: 0.8987719880517756
Precision: 0.8963696369636963
Recall: 0.9017264276228419
F1 Score: 0.8990400529625951
ROC AUC: 0.9740670144020247
0:	total: 4.38ms	remaining: 4.38s
100:	total: 880ms	remaining: 7.83s
200:	total: 2.21s	remaining: 8.78s
300:	total: 2.94s	remaining: 6.82s
400:	total: 3.4s	remaining: 5.09s
500:	total: 3.88s	remaining: 3.87s
600:	total: 4.38s	remaining: 2.91s
700:	total: 4.83s	remaining: 2.06s
800:	total: 5.27s	remaining: 1.31s
900:	total: 5.79s	remaining: 637ms
999:	total: 6.26s	remaining: 0us
0:	total: 3.96ms	remaining: 3.95s
100:

In [None]:
%pip install category_encoders imbalanced-learn catboost

Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost, category_encoders
Successfully installed catboost-1.2.8 category_encoders-2.8.1


The code interacts with all the attributes (columns) in the dataset through several data preprocessing and modeling steps. Here's a breakdown of how the code interacts with the data's attributes:

Loading the Data: The first interaction is loading the dataset from the specified file path into a pandas DataFrame.
Initial Cleanup: The code drops specific columns (Patient ID, Patient Name) that are not relevant for the prediction task.
Separating Attribute Types: Attributes are separated into numerical and categorical types to apply different preprocessing techniques.
Handling Missing Values:
For numerical attributes, missing values are imputed using the K-Nearest Neighbors (KNN) imputer, which estimates missing values based on the values of their nearest neighbors.
For categorical attributes, missing values are filled with the mode (most frequent value) of each column.
Feature Engineering: New attributes are created based on existing ones:
Age_group is created by categorizing the Age attribute into bins.
Hypertension_x_Age is created by multiplying the Hypertension and Age attributes.
Encoding Categorical Attributes: Categorical attributes (including the newly engineered Age_group) are transformed into numerical representations using Target Encoding. This replaces each category with the mean of the target variable for that category.
Separating Features and Target: The target attribute (Diagnosis) is separated from the feature attributes (X) that will be used for training the models.
Handling Class Imbalance: SMOTENC is applied to the features and target to create synthetic samples for the minority class. This is done before scaling, using the indices of the original categorical columns to guide the synthesis process.
Scaling Features: The numerical feature attributes (which now include the encoded categorical features) are scaled using StandardScaler to have zero mean and unit variance. This is important for distance-based algorithms and can improve the performance of some models.
Splitting Data: The scaled feature attributes and the resampled target attribute are split into training and testing sets.
Model Training: The training data (including all preprocessed attributes) is used to train the CatBoost model and the Stacking Ensemble model.
Model Evaluation: The trained models are used to make predictions on the test data, and the performance is evaluated using various metrics.
In essence, the code interacts with all attributes by cleaning, transforming, and preparing them for use in the machine learning models. The specific interaction depends on whether an attribute is numerical or categorical and the stage of the pipeline.

In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid for CatBoost
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
}

# Initialize Stratified K-Fold for cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=CatBoostClassifier(random_state=42, verbose=0), # verbose=0 to reduce output
    param_grid=param_grid,
    cv=skf,
    scoring='roc_auc', # Use ROC AUC as the scoring metric
    n_jobs=-1, # Use all available CPU cores
    verbose=2 # Increase verbosity to see progress
)

print("Starting GridSearchCV...")

# Fit GridSearchCV to the training data
# Note: We use the resampled and scaled training data
grid_search.fit(X_train, y_train)

print("\n=== GridSearchCV Results ===")
print(f"Best parameters found: {grid_search.best_params_}")
print(f"Best cross-validation AUC score: {grid_search.best_score_:.4f}")

# Get the best estimator
best_cat_model = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best = best_cat_model.predict(X_test)
y_proba_best = best_cat_model.predict_proba(X_test)[:,1]

print("\n=== Best CatBoost Model Evaluation on Test Set ===")
print("Accuracy:", accuracy_score(y_test, y_pred_best))
print("Precision:", precision_score(y_test, y_pred_best))
print("Recall:", recall_score(y_test, y_pred_best))
print("F1 Score:", f1_score(y_test, y_pred_best))
print("ROC AUC:", roc_auc_score(y_test, y_proba_best))

Starting GridSearchCV...
Fitting 5 folds for each of 81 candidates, totalling 405 fits

=== GridSearchCV Results ===
Best parameters found: {'depth': 4, 'iterations': 300, 'l2_leaf_reg': 5, 'learning_rate': 0.05}
Best cross-validation AUC score: 0.9702

=== Best CatBoost Model Evaluation on Test Set ===
Accuracy: 0.9014271490209094
Precision: 0.8953564421190321
Recall: 0.9090305444887118
F1 Score: 0.9021416803953871
ROC AUC: 0.9747266188508519


In [None]:
from sklearn.model_selection import cross_val_score

# Define the number of folds
n_splits = 5

# Initialize Stratified K-Fold cross-validator
# StratifiedKFold is used to maintain the percentage of samples for each class
# as in the original dataset, which is important for imbalanced datasets.
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform cross-validation for CatBoost model
# We use the scaled resampled data (X_resampled_scaled, y_resampled)
# The scoring metric is 'roc_auc' as used in the model training
cv_scores_catboost = cross_val_score(
    cat_model,
    X_resampled_scaled,
    y_resampled,
    cv=skf,
    scoring='roc_auc',
    n_jobs=-1 # Use all available CPU cores
)

print(f"\n=== CatBoost Cross-validation (AUC) ===")
print(f"AUC scores for each fold: {cv_scores_catboost}")
print(f"Mean AUC: {cv_scores_catboost.mean():.4f}")
print(f"Standard deviation of AUC: {cv_scores_catboost.std():.4f}")


=== CatBoost Cross-validation (AUC) ===
AUC scores for each fold: [0.97284342 0.96848968 0.97043941 0.96510265 0.97163758]
Mean AUC: 0.9697
Standard deviation of AUC: 0.0027
