In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
#upload data file
from google.colab import files
uploaded = files.upload()
# Load full dataset
df = pd.read_csv("stroke_prediction_dataset.csv")

Saving stroke_prediction_dataset.csv to stroke_prediction_dataset.csv


In [None]:
# Load the dataset
df = pd.read_csv('stroke_prediction_dataset.csv')

In [None]:
# --- 1. Data Cleaning and Initial Preprocessing ---

# Convert column names to a clean format
df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('(', '').str.replace(')', '')

# Drop 'patient_id' and 'patient_name' as they are not features for prediction
df = df.drop(['patient_id', 'patient_name'], axis=1)

# Handle inconsistent 'gender' data: 'Other' to a more common category or drop if rare
# Check unique values first
# print(df['gender'].unique())
# If 'Other' exists and is rare, we might drop it or reassign. For simplicity, let's assume it's not a major issue or handle it during encoding.
# If 'Other' is present and significant, further investigation is needed. For this dataset, let's assume it's clean or handled by one-hot.

# Correct inconsistent 'marital_status' data (e.g., 'Married' vs 'married') - already handled by .lower() above
df['marital_status'] = df['marital_status'].str.lower()

# Clean 'blood_pressure_levels' and 'cholesterol_levels'
def clean_blood_pressure(bp_str):
    if isinstance(bp_str, str) and '/' in bp_str:
        systolic, diastolic = map(int, bp_str.split('/'))
        return systolic, diastolic
    return np.nan, np.nan

df[['systolic_bp', 'diastolic_bp']] = df['blood_pressure_levels'].apply(lambda x: pd.Series(clean_blood_pressure(x)))
df = df.drop('blood_pressure_levels', axis=1)

def clean_cholesterol(chol_str):
    if isinstance(chol_str, str):
        hdl_match = re.search(r'HDL:\s*(\d+)', chol_str)
        ldl_match = re.search(r'LDL:\s*(\d+)', chol_str)
        hdl = int(hdl_match.group(1)) if hdl_match else np.nan
        ldl = int(ldl_match.group(1)) if ldl_match else np.nan
        return hdl, ldl
    return np.nan, np.nan

df[['hdl_cholesterol', 'ldl_cholesterol']] = df['cholesterol_levels'].apply(lambda x: pd.Series(clean_cholesterol(x)))
df = df.drop('cholesterol_levels', axis=1)

# Convert 'smoking_status' to a more consistent format
df['smoking_status'] = df['smoking_status'].replace({
    'Formerly Smoked': 'formerly_smoked',
    'Currently Smokes': 'smokes',
    'Non-smoker': 'never_smoked'
})

# Convert 'alcohol_intake' to a more consistent format
df['alcohol_intake'] = df['alcohol_intake'].replace({
    'Social Drinker': 'social_drinker',
    'Frequent Drinker': 'frequent_drinker',
    'Rarely': 'rarely',
    'Never': 'never'
})

# Convert 'physical_activity' to a more consistent format
df['physical_activity'] = df['physical_activity'].replace({
    'Low': 'low',
    'Moderate': 'moderate',
    'High': 'high'
})

# Convert 'dietary_habits' to a more consistent format
df['dietary_habits'] = df['dietary_habits'].str.lower().str.replace('-', '_')

# Convert 'diagnosis' to numerical (target variable)
df['diagnosis'] = df['diagnosis'].apply(lambda x: 1 if x == 'Stroke' else 0)

# Convert 'stroke_history' and 'family_history_of_stroke' to numerical (binary)
df['stroke_history'] = df['stroke_history'].astype(int)
df['family_history_of_stroke'] = df['family_history_of_stroke'].map({'Yes': 1, 'No': 0})

# Handle 'symptoms' - One-hot encode symptoms. First, fill NaN with an empty string.
df['symptoms'] = df['symptoms'].fillna('')
all_symptoms = set()
for symptoms_list in df['symptoms'].apply(lambda x: x.split(', ') if x else []):
    for symptom in symptoms_list:
        if symptom:
            all_symptoms.add(symptom.strip())

for symptom in all_symptoms:
    df[f'symptom_{symptom.lower().replace(" ", "_")}'] = df['symptoms'].apply(lambda x: 1 if symptom in x else 0)
df = df.drop('symptoms', axis=1)



In [None]:
# --- 2. Missing Values ---

# Identify numerical and categorical columns
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
categorical_cols = df.select_dtypes(include='object').columns.tolist()

# Remove 'diagnosis' from numerical_cols as it's the target
if 'diagnosis' in numerical_cols:
    numerical_cols.remove('diagnosis')

# Impute missing numerical values with the median
for col in numerical_cols:
    if df[col].isnull().any():
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)

# Impute missing categorical values with the mode
for col in categorical_cols:
    if df[col].isnull().any():
        mode_val = df[col].mode()[0]
        df[col].fillna(mode_val, inplace=True)



In [None]:
# --- 3. Outlier Detection and Treatment (Winsorization for numerical features) ---
# We'll use IQR method for outlier detection and cap them.
for col in numerical_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])



In [None]:
# --- 4. Feature Engineering ---

# Create Age-BMI interaction
df['age_bmi_interaction'] = df['age'] * df['body_mass_index_bmi']

# Create Glucose-BP interaction
df['glucose_systolic_interaction'] = df['average_glucose_level'] * df['systolic_bp']
df['glucose_diastolic_interaction'] = df['average_glucose_level'] * df['diastolic_bp']

# Create Cholesterol Ratio
df['hdl_ldl_ratio'] = df['hdl_cholesterol'] / (df['ldl_cholesterol'] + 1e-6) # Add small epsilon to avoid division by zero

# Polynomial Features for key numerical columns
poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(df[numerical_cols])
poly_feature_names = poly.get_feature_names_out(numerical_cols)
poly_df = pd.DataFrame(poly_features, columns=poly_feature_names, index=df.index)
df = pd.concat([df.drop(columns=numerical_cols), poly_df], axis=1)

# Update numerical_cols after polynomial features
numerical_cols = poly_df.columns.tolist()

In [None]:
# --- 5. One-Hot Encoding for Categorical Features ---

# Re-identify categorical columns after previous steps
categorical_cols = df.select_dtypes(include='object').columns.tolist()

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True) # drop_first to avoid multicollinearity

# --- 6. Feature Scaling ---

# Separate features (X) and target (y)
X = df.drop('diagnosis', axis=1)
y = df['diagnosis']

# Identify numerical columns for scaling (excluding newly created binary symptom columns)
# Filter out columns that are already binary (0/1) from numerical_cols for scaling
cols_to_scale = [col for col in X.columns if X[col].nunique() > 2 and X[col].dtype != 'uint8'] # uint8 for one-hot encoded

scaler = StandardScaler()
X[cols_to_scale] = scaler.fit_transform(X[cols_to_scale])



In [None]:
# --- 7. Addressing Data Imbalance (SMOTE) ---

# Check imbalance
# print(y.value_counts())

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# print(y_resampled.value_counts())

# --- 8. Model Training and Hyperparameter Tuning (Random Forest) ---

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Define the model
rf_model = RandomForestClassifier(random_state=42)

# Hyperparameter Tuning using GridSearchCV
# A smaller grid for demonstration, expand for more exhaustive search
param_grid = {
    'n_estimators': [50, 100],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid,
                           cv=StratifiedKFold(n_splits=5), n_jobs=-1, verbose=2, scoring='accuracy')

grid_search.fit(X_train, y_train)

# Best parameters and best score
print("\nBest parameters found: ", grid_search.best_params_)
print("Best accuracy found: {:.4f}".format(grid_search.best_score_))

# Train the model with the best parameters
best_rf_model = grid_search.best_estimator_
best_rf_model.fit(X_train, y_train)



Fitting 5 folds for each of 48 candidates, totalling 240 fits

Best parameters found:  {'max_depth': None, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best accuracy found: 0.5102


In [None]:
# --- 9. Model Evaluation ---

y_pred = best_rf_model.predict(X_test)

print("\n--- Model Evaluation on Test Set ---")
print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Cross-validation on the resampled dataset
cv_scores = cross_val_score(best_rf_model, X_resampled, y_resampled, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
print("\nCross-validation Accuracy Scores (Resampled Data):", cv_scores)
print("Mean CV Accuracy (Resampled Data): {:.4f}".format(np.mean(cv_scores)))
print("Std Dev CV Accuracy (Resampled Data): {:.4f}".format(np.std(cv_scores)))

# --- 10. Feature Importance (Model-centric approach) ---
importances = best_rf_model.feature_importances_
feature_names = X_train.columns
forest_importances = pd.Series(importances, index=feature_names).sort_values(ascending=False)

print("\n--- Top 10 Feature Importances ---")
print(forest_importances.head(10))

# Optional: Plot feature importances
# plt.figure(figsize=(12, 8))
# sns.barplot(x=forest_importances.head(20).values, y=forest_importances.head(20).index)
# plt.title("Top 20 Feature Importances")
# plt.xlabel("Mean Decrease in Impurity")
# plt.ylabel("Feature")
# plt.tight_layout()
# plt.show()



--- Model Evaluation on Test Set ---
Accuracy: 0.5008

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.51      0.51      1507
           1       0.50      0.49      0.50      1506

    accuracy                           0.50      3013
   macro avg       0.50      0.50      0.50      3013
weighted avg       0.50      0.50      0.50      3013


Confusion Matrix:
 [[768 739]
 [765 741]]

Cross-validation Accuracy Scores (Resampled Data): [0.50116163 0.49220046 0.51178228 0.49585131 0.51527224]
Mean CV Accuracy (Resampled Data): 0.5033
Std Dev CV Accuracy (Resampled Data): 0.0089

--- Top 10 Feature Importances ---
systolic_bp diastolic_bp                 0.016467
average_glucose_level ldl_cholesterol    0.016460
body_mass_index_bmi systolic_bp          0.016359
hdl_cholesterol ldl_cholesterol          0.016320
age body_mass_index_bmi                  0.016108
systolic_bp ldl_cholesterol              0.016007
stress_levels dias

In [None]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

In [None]:
# Model Training with Bagging
# Define the base estimator
base_estimator = RandomForestClassifier(random_state=42)
# Create the Bagging Classifier
# Changed 'base_estimator' to 'estimator' to match the current scikit-learn API
bagging_model = BaggingClassifier(estimator=base_estimator, n_estimators=50, random_state=42)
# Train the model
bagging_model.fit(X_resampled, y_resampled)
# --- Model Evaluation ---
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)
# Make predictions
y_pred = bagging_model.predict(X_test)
print("\n--- Model Evaluation on Test Set ---")
print("Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
# Cross-validation on the resampled dataset
cv_scores = cross_val_score(bagging_model, X_resampled, y_resampled, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
print("\nCross-validation Accuracy Scores (Resampled Data):", cv_scores)
print("Mean CV Accuracy (Resampled Data): {:.4f}".format(np.mean(cv_scores)))
print("Std Dev CV Accuracy (Resampled Data): {:.4f}".format(np.std(cv_scores)))


--- Model Evaluation on Test Set ---
Accuracy: 1.0000

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      1507
           1       1.00      1.00      1.00      1506

    accuracy                           1.00      3013
   macro avg       1.00      1.00      1.00      3013
weighted avg       1.00      1.00      1.00      3013


Confusion Matrix:
 [[1507    0]
 [   0 1506]]

Cross-validation Accuracy Scores (Resampled Data): [0.49352805 0.50282111 0.50879522 0.50082974 0.50664011]
Mean CV Accuracy (Resampled Data): 0.5025
Std Dev CV Accuracy (Resampled Data): 0.0053
