In [None]:
# ===========================
# Step 1: Load Full Dataset
# ===========================

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd

file_path = '/content/drive/My Drive/datasets/full_data.csv'
data = pd.read_csv(file_path)

print("Full dataset shape:", data.shape)


Mounted at /content/drive
Full dataset shape: (4981, 11)


In [None]:
# Full Stroke Prediction Pipeline

# Step 0: Install dependencies
!pip install catboost lightgbm imblearn

# Step 1: Load libraries
from google.colab import drive
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Import CatBoost and LightGBM
from catboost import CatBoostClassifier
import lightgbm as lgb


# Step 3: Preprocessing

# Drop non-predictive columns if they exist
cols_to_drop = ['Patient ID', 'Patient Name']
for col in cols_to_drop:
    if col in data.columns:
        data = data.drop(columns=[col])

# Fill missing values in 'Symptoms' if needed
if 'Symptoms' in data.columns:
    data['Symptoms'] = data['Symptoms'].fillna('Unknown')

# Encode categorical variables
cat_cols = data.select_dtypes(include=['object']).columns
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    le_dict[col] = le  # Save encoder if needed later

# Separate features and target
X = data.drop('stroke', axis=1)
y = data['stroke']

# Impute missing values for numerical columns after separating features and target
num_cols = X.select_dtypes(include=np.number).columns
imputer = SimpleImputer(strategy='mean')
X[num_cols] = imputer.fit_transform(X[num_cols])

# Check for missing values after imputation
print("\nMissing values after imputation:")
print(X.isnull().sum())

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply SMOTE for balancing
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y)

print("Original shape:", X.shape, y.value_counts())
print("Resampled shape:", X_resampled.shape, np.bincount(y_resampled))

# Step 4: Feature importance with Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_resampled, y_resampled)

feature_importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print("\nüå≥ Random Forest Feature Importances:")
print(feature_importance_df)

# Select features above threshold
importance_threshold = 0.01
important_features_df = feature_importance_df[feature_importance_df['Importance'] >= importance_threshold]
important_feature_names = important_features_df['Feature'].tolist()

print("\n‚ú® Selected Features based on Importance Threshold:")
print(important_feature_names)

# Step 5: Select important features from resampled data
important_feature_indices = [X.columns.get_loc(col) for col in important_feature_names]

X_resampled_selected = X_resampled[:, important_feature_indices]

# Step 6: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled_selected, y_resampled, test_size=0.2, random_state=42
)

# Step 7: Prepare test set (scale and select features)
# Since we scaled and balanced only training, apply scaling & selection on original test set from X_scaled (unbalanced)
# So, get test indices from train_test_split of the original dataset, then select and scale accordingly

# For simplicity, just split original scaled X and y to get test set before SMOTE:
X_train_orig, X_test_orig, y_train_orig, y_test_orig = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Select important features for original test set
X_test_selected = X_test_orig[:, important_feature_indices]

# Step 8: Initialize models
lr = LogisticRegression(max_iter=1000)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(probability=True, random_state=42)
gb = GradientBoostingClassifier(random_state=42)
catboost = CatBoostClassifier(iterations=100, random_state=42, verbose=0) # Added CatBoost
lgbm = lgb.LGBMClassifier(random_state=42) # Added LightGBM


estimators = [('lr', lr), ('rf', rf), ('svm', svm)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=gb, cv=5)

models = {
    'Logistic Regression': lr,
    'Random Forest': rf,
    'Support Vector Machine': svm,
    'Gradient Boosting': gb,
    'CatBoost': catboost, # Added CatBoost
    'LightGBM': lgbm, # Added LightGBM
    'Stacking Ensemble': stacking_model
}

# Step 9: Train and evaluate
for name, model in models.items():
    print(f"\nTraining and evaluating {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test_selected)

    if hasattr(model, "predict_proba"):
        y_proba = model.predict_proba(X_test_selected)[:, 1]
    else:
        y_proba = None

    print(f"\nüîç {name} Evaluation:")
    print("Accuracy:", accuracy_score(y_test_orig, y_pred))
    print("Precision:", precision_score(y_test_orig, y_pred))
    print("Recall:", recall_score(y_test_orig, y_pred))
    print("F1 Score:", f1_score(y_test_orig, y_pred))
    if y_proba is not None:
        print("ROC AUC Score:", roc_auc_score(y_test_orig, y_proba))

Original shape: (4981, 10) stroke
0    4733
1     248
Name: count, dtype: int64
Resampled shape: (9466, 10) [4733 4733]

üå≥ Random Forest Feature Importances:
             Feature  Importance
1                age    0.365767
7  avg_glucose_level    0.135290
8                bmi    0.120554
9     smoking_status    0.119349
5          work_type    0.103741
4       ever_married    0.066877
6     Residence_type    0.026539
0             gender    0.024595
2       hypertension    0.024297
3      heart_disease    0.012990

‚ú® Selected Features based on Importance Threshold:
['age', 'avg_glucose_level', 'bmi', 'smoking_status', 'work_type', 'ever_married', 'Residence_type', 'gender', 'hypertension', 'heart_disease']

Training and evaluating Logistic Regression...

üîç Logistic Regression Evaluation:
Accuracy: 0.7422266800401204
Precision: 0.15120274914089346
Recall: 0.8148148148148148
F1 Score: 0.25507246376811593
ROC AUC Score: 0.8501040807509523

Training and evaluating Random Forest...




üîç LightGBM Evaluation:
Accuracy: 0.9598796389167502
Precision: 0.6944444444444444
Recall: 0.46296296296296297
F1 Score: 0.5555555555555556
ROC AUC Score: 0.9417147794666353

Training and evaluating Stacking Ensemble...

üîç Stacking Ensemble Evaluation:
Accuracy: 0.9849548645937813
Precision: 0.9148936170212766
Recall: 0.7962962962962963
F1 Score: 0.8514851485148515
ROC AUC Score: 0.9864989591924904


In [None]:
%pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
