## Imports

In [None]:
!pip install -U dabl

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dabl
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from mlens.ensemble import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix , classification_report
import optuna
from optuna.samplers import TPESampler
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings from seaborn / optuna /
# sklearn; consider narrowing the filter while developing.
warnings.filterwarnings('ignore')

## Load dataset

In [None]:
# Read the stroke dataset from the (Kaggle-style) relative input path and
# show the first rows as the cell output.
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df.head()

In [None]:
# Summary statistics of the columns, displayed as the cell output.
df.describe()

In [None]:
# Discard every row that contains at least one missing value.
df = df.dropna()

## EDA

### Heat Map Correlation

In [None]:
# Correlate numeric columns only: on pandas >= 2.0 `DataFrame.corr()` no
# longer drops string columns silently and raises instead. `select_dtypes`
# works on old and new pandas alike.
sns.heatmap(df.select_dtypes(include='number').corr(), cmap='viridis_r')

### Distribution Plots

In [None]:
# Histogram with KDE overlay of the age column.
# The theme set here stays active for every later seaborn plot.
sns.set_theme(style="darkgrid")
sns.displot(df['age'], kde=True)
plt.title("Distribution of Ages")
plt.xlabel("Age (in years)")
plt.show()

In [None]:
# Overlay the age distribution of the two target classes.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density',
# kde_kws=dict(cut=3))` is the replacement documented by seaborn.
sns.histplot(df[df['stroke'] == 0]["age"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='No Stroke')
sns.histplot(df[df['stroke'] == 1]["age"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='Stroke')
plt.title('No Stroke/Stroke by Age')
plt.legend()
plt.show()

In [None]:
# Histogram with KDE overlay of the average glucose level column.
sns.displot(df['avg_glucose_level'], kde=True)
plt.title("Distribution of Average Glucose Level")
plt.xlabel("Average Glucose Level")
plt.show()

In [None]:
# Overlay the glucose-level distribution of the two target classes.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density',
# kde_kws=dict(cut=3))` is the replacement documented by seaborn.
sns.histplot(df[df['stroke'] == 0]["avg_glucose_level"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='No Stroke')
sns.histplot(df[df['stroke'] == 1]["avg_glucose_level"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='Stroke')
plt.title('No Stroke/Stroke by Avg Glucose Level')
plt.legend()
plt.show()

In [None]:
# Histogram with KDE overlay of the BMI column.
sns.displot(df['bmi'], kde=True)
plt.title("Distribution of Body Mass Index")
plt.xlabel("Body Mass Index")
plt.show()

In [None]:
# Overlay the BMI distribution of the two target classes.
# `sns.distplot` is deprecated; `histplot(kde=True, stat='density',
# kde_kws=dict(cut=3))` is the replacement documented by seaborn.
sns.histplot(df[df['stroke'] == 0]["bmi"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='No Stroke')
sns.histplot(df[df['stroke'] == 1]["bmi"], kde=True, stat='density',
             kde_kws=dict(cut=3), alpha=.4, label='Stroke')
plt.title('No Stroke/Stroke by BMI')
plt.legend()
plt.show()

### General Plots

In [None]:
# Let dabl draw its automatic overview plots of the frame against the
# 'stroke' target column.
dabl.plot(df, target_col='stroke')

## Data Preprocessing

In [None]:
# Features: every column except the first and the last one.
# Target: the last column.
# (presumably the first column is a row id and the last is 'stroke' — verify
# against the CSV header)
x = df.iloc[:, 1:-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

### Encoding

In [None]:
# One-hot encode feature columns 0, 5 and 9; all other columns are passed
# through unchanged and end up AFTER the encoded columns in the output.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [0, 5, 9])],
    remainder='passthrough',
)
x = np.array(ct.fit_transform(x))

In [None]:
# Integer-encode columns 15 and 16 in place, re-fitting the same encoder for
# each (afterwards `le` holds the classes of column 16 only).
# presumably these are the two remaining string-valued columns after the
# one-hot step — verify the indices if the encoded categories change.
le = LabelEncoder()
for _col in (15, 16):
    x[:, _col] = le.fit_transform(x[:, _col])

### SMOTE

In [None]:
# Oversample the minority class with SMOTE. A fixed seed makes the synthetic
# samples, and therefore every score computed afterwards, reproducible.
# NOTE(review): resampling happens BEFORE the train/test split further down,
# so synthetic points interpolated from rows that end up in the test set can
# leak into the training set and inflate the reported scores. Consider
# splitting first and resampling only the training portion.
x, y = SMOTE(random_state=42).fit_resample(x, y)

### Split dataset

In [None]:
# Hold out 33% of the rows as the test set; the fixed seed keeps the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= 0.33, random_state= 0)

In [None]:
# Report the shape of each split array.
for _name, _arr in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    print(f"Size {_name}: ", _arr.shape)

### Feature Scaling

In [None]:
# Standardise the features. The scaler is fitted on the training split only
# and then applied to the test split, so test statistics never influence the
# scaling parameters.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

## Model Selection

In [None]:
class Optimizer:
    """Run an Optuna study over the model produced by the global ``create_model``.

    The study maximises either accuracy or F1 of the candidate model, fitted
    on the notebook-level ``x_train``/``y_train`` and scored on
    ``x_test``/``y_test``.

    NOTE(review): trials are scored on the same test split that the notebook
    later uses for reporting, so the reported numbers are optimistic.

    Args:
        metric: ``'acc'`` / ``'accuracy'`` to maximise accuracy, ``'f1'`` to
            maximise F1 (case-insensitive).
        trials: Number of Optuna trials to run.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """

    # Accepted spellings -> canonical metric key.
    _METRIC_ALIASES = {'acc': 'acc', 'accuracy': 'acc', 'f1': 'f1'}

    def __init__(self, metric, trials=30):
        # Validate first so a typo fails fast instead of silently tuning F1.
        self._normalize_metric(metric)
        self.metric = metric
        self.trials = trials
        self.sampler = TPESampler(seed=42)

    @classmethod
    def _normalize_metric(cls, metric):
        """Return the canonical key ('acc' or 'f1') for ``metric`` or raise ValueError."""
        key = metric.strip().lower() if isinstance(metric, str) else metric
        try:
            return cls._METRIC_ALIASES[key]
        except (KeyError, TypeError):
            raise ValueError(
                "Unknown metric {!r}; expected one of {}".format(
                    metric, sorted(cls._METRIC_ALIASES)
                )
            ) from None

    def objective(self, trial):
        """Fit the model built for ``trial`` and return its score on the test split."""
        model = create_model(trial)
        model.fit(x_train, y_train)
        preds = model.predict(x_test)
        if self._normalize_metric(self.metric) == 'acc':
            return accuracy_score(y_test, preds)
        return f1_score(y_test, preds)

    def optimize(self):
        """Run the study and return the best trial's parameter dict."""
        study = optuna.create_study(direction="maximize", sampler=self.sampler)
        study.optimize(self.objective, n_trials=self.trials)
        return study.best_params

### Random Forest

In [None]:
# Baseline: random forest with default hyper-parameters.
rf = RandomForestClassifier(random_state=42)
preds = rf.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ("Random Forest accuracy: ", accuracy_score),
    ("Random Forest f1-score: ", f1_score),
):
    print(_label, _metric(y_test, preds))

def create_model(trial):
    """Return a RandomForestClassifier configured from values sampled by `trial`.

    Sampled integers: max_depth in [2, 6], n_estimators in [2, 150] and
    min_samples_leaf in [1, 10]. The forest's seed is fixed at 666.
    """
    # Dict literals evaluate in order, so the suggestions are requested in
    # the sequence max_depth -> n_estimators -> min_samples_leaf.
    sampled = {
        "max_depth": trial.suggest_int("max_depth", 2, 6),
        "n_estimators": trial.suggest_int("n_estimators", 2, 150),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }
    return RandomForestClassifier(random_state=666, **sampled)

# --- Random forest tuned for F1 ---
# The refit uses seed 42 on top of the best sampled parameters.
optimizer = Optimizer('f1')
rf_f1_params = {**optimizer.optimize(), 'random_state': 42}
rf_f1 = RandomForestClassifier(**rf_f1_params)
preds = rf_f1.fit(x_train, y_train).predict(x_test)

print('Optimized on F1 score')
print('Optimized Random Forest: ', accuracy_score(y_test, preds))
print('Optimized Random Forest f1-score: ', f1_score(y_test, preds))

# --- Random forest tuned for accuracy ---
optimizer = Optimizer('acc')
rf_acc_params = {**optimizer.optimize(), 'random_state': 42}
rf_acc = RandomForestClassifier(**rf_acc_params)
preds = rf_acc.fit(x_train, y_train).predict(x_test)

print('Optimized on accuracy')
print('Optimized Random Forest: ', accuracy_score(y_test, preds))
print('Optimized Random Forest f1-score: ', f1_score(y_test, preds))

### XGBoost

In [None]:
# Baseline: XGBoost with default hyper-parameters.
xgb = XGBClassifier(random_state=42)
xgb.fit(x_train, y_train)
preds = xgb.predict(x_test)

for _label, _metric in (
    ('XGBoost accuracy: ', accuracy_score),
    ('XGBoost f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

def create_model(trial):
    """Return an XGBClassifier configured from values sampled by `trial`.

    Sampled: max_depth (int, 2-6), n_estimators (int, 1-150), learning_rate
    and gamma (float, 1e-7 to 1) and subsample (float, 1e-4 to 1). The
    model's seed is fixed at 666.
    """
    max_depth = trial.suggest_int("max_depth", 2, 6)
    n_estimators = trial.suggest_int("n_estimators", 1, 150)
    # `suggest_float` replaces the deprecated `suggest_uniform`; parameter
    # names and bounds are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 0.0000001, 1)
    gamma = trial.suggest_float('gamma', 0.0000001, 1)
    subsample = trial.suggest_float('subsample', 0.0001, 1.0)
    model = XGBClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        gamma=gamma,
        subsample=subsample,
        random_state=666
    )
    return model

# --- XGBoost tuned for F1 ---
# The refit uses seed 42 on top of the best sampled parameters.
optimizer = Optimizer('f1')
xgb_f1_params = {**optimizer.optimize(), 'random_state': 42}
xgb_f1 = XGBClassifier(**xgb_f1_params)
xgb_f1.fit(x_train, y_train)
preds = xgb_f1.predict(x_test)

print('Optimized on F1 score')
print('Optimized XGBoost accuracy: ', accuracy_score(y_test, preds))
print('Optimized XGBoost f1-score: ', f1_score(y_test, preds))

# --- XGBoost tuned for accuracy ---
optimizer = Optimizer('acc')
xgb_acc_params = {**optimizer.optimize(), 'random_state': 42}
xgb_acc = XGBClassifier(**xgb_acc_params)
xgb_acc.fit(x_train, y_train)
preds = xgb_acc.predict(x_test)

print('Optimized on accuracy')
print('Optimized XGBoost accuracy: ', accuracy_score(y_test, preds))
print('Optimized XGBoost f1-score: ', f1_score(y_test, preds))

### Logistic Regression

In [None]:
# Baseline: logistic regression with default hyper-parameters (not tuned).
lr = LogisticRegression(random_state=666)
preds = lr.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ('Logistic Regression: ', accuracy_score),
    ('Logistic Regression f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

### Decision Tree

In [None]:
# Baseline: decision tree with default hyper-parameters.
dt = DecisionTreeClassifier(random_state=666)
preds = dt.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ('Decision Tree accuracy: ', accuracy_score),
    ('Decision Tree f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

def create_model(trial):
    """Return a DecisionTreeClassifier configured from values sampled by `trial`.

    Sampled: max_depth (int, 2-6), min_samples_split (int, 2-16),
    min_weight_fraction_leaf (float, 0.0-0.5) and min_samples_leaf
    (int, 1-10). The tree's seed is fixed at 666.
    """
    max_depth = trial.suggest_int("max_depth", 2, 6)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    # `suggest_float` replaces the deprecated `suggest_uniform`; name and
    # bounds are unchanged.
    min_weight_fraction_leaf = trial.suggest_float('min_weight_fraction_leaf', 0.0, 0.5)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    model = DecisionTreeClassifier(
        min_samples_split=min_samples_split,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=666
    )
    return model

# --- Decision tree tuned for F1 ---
optimizer = Optimizer('f1')
dt_f1_params = {**optimizer.optimize(), 'random_state': 666}
dt_f1 = DecisionTreeClassifier(**dt_f1_params)
preds = dt_f1.fit(x_train, y_train).predict(x_test)

print('Optimized on F1-score')
print('Optimized Decision Tree accuracy: ', accuracy_score(y_test, preds))
print('Optimized Decision Tree f1-score: ', f1_score(y_test, preds))

# --- Decision tree tuned for accuracy ---
optimizer = Optimizer('acc')
dt_acc_params = {**optimizer.optimize(), 'random_state': 666}
dt_acc = DecisionTreeClassifier(**dt_acc_params)
preds = dt_acc.fit(x_train, y_train).predict(x_test)

print('Optimized on accuracy')
print('Optimized Decision Tree accuracy: ', accuracy_score(y_test, preds))
print('Optimized Decision Tree f1-score: ', f1_score(y_test, preds))

### K-Nearest Neighbors

In [None]:
# Baseline: k-nearest neighbours with default hyper-parameters.
knn = KNeighborsClassifier()
preds = knn.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ('KNN accuracy: ', accuracy_score),
    ('KNN f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

# Module-level TPE sampler with a fixed seed. The Optimizer class builds its
# own sampler and does not read this one.
sampler = TPESampler(seed=0)
def create_model(trial):
    """Return a KNeighborsClassifier whose n_neighbors (2-25) is sampled by `trial`."""
    return KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 2, 25)
    )

# --- KNN tuned for F1 ---
optimizer = Optimizer('f1')
knn_f1_params = optimizer.optimize()
knn_f1 = KNeighborsClassifier(**knn_f1_params)
preds = knn_f1.fit(x_train, y_train).predict(x_test)

print('Optimized on F1-score')
print('Optimized KNN accuracy: ', accuracy_score(y_test, preds))
print('Optimized KNN f1-score: ', f1_score(y_test, preds))

# --- KNN tuned for accuracy ---
optimizer = Optimizer('acc')
knn_acc_params = optimizer.optimize()
knn_acc = KNeighborsClassifier(**knn_acc_params)
preds = knn_acc.fit(x_train, y_train).predict(x_test)

print('Optimized on accuracy')
print('Optimized KNN accuracy: ', accuracy_score(y_test, preds))
print('Optimized KNN f1-score: ', f1_score(y_test, preds))

### AdaBoost

In [None]:
# Baseline: AdaBoost with default hyper-parameters.
abc = AdaBoostClassifier(random_state=666)
preds = abc.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ('AdaBoost accuracy: ', accuracy_score),
    ('AdaBoost f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

def create_model(trial):
    """Return an AdaBoostClassifier configured from values sampled by `trial`.

    Sampled: n_estimators (int, 2-150) and learning_rate (float, 0.0005-1.0).
    The model's seed is fixed at 666.
    """
    n_estimators = trial.suggest_int("n_estimators", 2, 150)
    # `suggest_float` replaces the deprecated `suggest_uniform`; name and
    # bounds are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 0.0005, 1.0)
    model = AdaBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        random_state=666
    )
    return model

# --- AdaBoost tuned for F1 ---
optimizer = Optimizer('f1')
abc_f1_params = {**optimizer.optimize(), 'random_state': 666}
abc_f1 = AdaBoostClassifier(**abc_f1_params)
preds = abc_f1.fit(x_train, y_train).predict(x_test)

print('Optimized on F1-score')
print('Optimized AdaBoost accuracy: ', accuracy_score(y_test, preds))
print('Optimized AdaBoost f1-score: ', f1_score(y_test, preds))

# --- AdaBoost tuned for accuracy ---
optimizer = Optimizer('acc')
abc_acc_params = {**optimizer.optimize(), 'random_state': 666}
abc_acc = AdaBoostClassifier(**abc_acc_params)
preds = abc_acc.fit(x_train, y_train).predict(x_test)

print('Optimized on accuracy')
print('Optimized AdaBoost accuracy: ', accuracy_score(y_test, preds))
print('Optimized AdaBoost f1-score: ', f1_score(y_test, preds))

### Support Vector Machine

In [None]:
# Baseline: support vector classifier with default hyper-parameters.
svc = SVC(random_state=666)
preds = svc.fit(x_train, y_train).predict(x_test)

for _label, _metric in (
    ("SupportVectorClassifier accuracy: ", accuracy_score),
    ("SupportVectorClassifier f1-score: ", f1_score),
):
    print(_label, _metric(y_test, preds))

def create_model(trial):
    """Return an SVC configured from values sampled by `trial`.

    Sampled: kernel ('rbf' or 'sigmoid'), degree (int, 2-5) and gamma
    ('auto' or 'scale'). The model's seed is fixed at 0.

    NOTE(review): per the scikit-learn docs `degree` only affects the 'poly'
    kernel, which is not among the sampled kernels — the dimension looks
    redundant; confirm before removing.
    """
    # Dict literals evaluate in order: kernel -> degree -> gamma.
    sampled = {
        'kernel': trial.suggest_categorical('kernel', ['rbf', 'sigmoid']),
        'degree': trial.suggest_int('degree', 2, 5),
        'gamma': trial.suggest_categorical('gamma', ['auto', 'scale']),
    }
    return SVC(random_state=0, **sampled)

# --- SVC tuned for F1 ---
# The refit uses seed 666 on top of the best sampled parameters.
optimizer = Optimizer('f1')
svc_f1_params = {**optimizer.optimize(), 'random_state': 666}
svc_f1 = SVC(**svc_f1_params)
preds = svc_f1.fit(x_train, y_train).predict(x_test)

print('Optimized on F1-score')
print("Optimized SupportVectorClassifier accuracy: ", accuracy_score(y_test, preds))
print("Optimized SupportVectorClassifier f1-score: ", f1_score(y_test, preds))

# --- SVC tuned for accuracy ---
# Use the same 'acc' key as every other accuracy run in this notebook so the
# study really maximises accuracy (the previous spelling 'accuracy' did not
# select the accuracy objective).
optimizer = Optimizer('acc')
svc_acc_params = optimizer.optimize()
svc_acc_params['random_state'] = 666
svc_acc = SVC(**svc_acc_params)
svc_acc.fit(x_train, y_train)
preds = svc_acc.predict(x_test)

print('Optimized on accuracy')
print("Optimized SupportVectorClassifier accuracy: ", accuracy_score(y_test, preds))
print("Optimized SupportVectorClassifier f1-score: ", f1_score(y_test, preds))

### Super Learner

In [None]:
# Stack the default-parameter baseline models into a 5-fold SuperLearner
# with a logistic-regression meta learner.
model = SuperLearner(folds=5, random_state=42)
# Base layer: the estimators fitted in the baseline cells above.
model.add([svc, abc, xgb, rf, dt, knn])
# Meta learner that combines the base-layer predictions.
model.add_meta(LogisticRegression())
model.fit(x_train, y_train)
preds = model.predict(x_test)
print('SuperLearner accuracy: ', accuracy_score(y_test, preds))
print('SuperLearner f1-score: ', f1_score(y_test, preds))

### Ensemble

In [None]:
# Catalogue of candidate estimators for the ensemble search, keyed by a short
# code. Plain codes (RF, XGB, ...) are default-parameter models; codes
# prefixed with 'OA' / 'OF' are built from the *_acc_params / *_f1_params
# dicts produced by the tuning cells above.
mdict = {
    'RF': RandomForestClassifier(random_state=666),
    'XGB': XGBClassifier(random_state=666),
    'LR': LogisticRegression(random_state=666),
    'DT': DecisionTreeClassifier(random_state=666),
    'KNN': KNeighborsClassifier(),
    'ABC': AdaBoostClassifier(random_state=666),
    'SVC': SVC(random_state=666),
    'OARF': RandomForestClassifier(**rf_acc_params),
    'OFRF': RandomForestClassifier(**rf_f1_params),
    'OAXGB': XGBClassifier(**xgb_acc_params),
    'OFXGB': XGBClassifier(**xgb_f1_params),
    'OADT': DecisionTreeClassifier(**dt_acc_params),
    'OFDT': DecisionTreeClassifier(**dt_f1_params),
    'OAKNN': KNeighborsClassifier(**knn_acc_params),
    'OFKNN': KNeighborsClassifier(**knn_f1_params),
    'OAABC': AdaBoostClassifier(**abc_acc_params),
    'OFABC': AdaBoostClassifier(**abc_f1_params),
    'OASVC': SVC(**svc_acc_params),
    'OFSVC': SVC(**svc_f1_params)
}

In [None]:
def create_model(trial):
    """Assemble a SuperLearner whose layout is sampled by `trial`.

    The trial picks, in this order: the number of base-learner slots
    ("n_models", 2-6), one `mdict` key per slot ("model_<i>"), the fold
    count ("folds", 2-6) and the meta learner ("head"). Keys picked more
    than once are used only once, so the ensemble may hold fewer base
    learners than "n_models".
    """
    candidate_keys = [
        'RF', 'XGB', 'DT', 'LR', 'KNN', 'ABC', 'SVC', 'OARF', 'OFRF', 'OAXGB', 'OFXGB',
        'OADT', 'OFDT', 'OAKNN', 'OFKNN', 'OAABC', 'OFABC', 'OASVC', 'OFSVC'
    ]
    meta_keys = ['RF', 'XGB', 'DT', 'KNN', 'LR', 'ABC', 'SVC']

    slot_count = trial.suggest_int("n_models", 2, 6)
    # A dict keeps first-seen order and silently drops repeated picks.
    chosen = {}
    for slot in range(slot_count):
        pick = trial.suggest_categorical('model_{}'.format(slot), candidate_keys)
        chosen.setdefault(pick, None)

    ensemble = SuperLearner(
        folds=trial.suggest_int("folds", 2, 6),
        random_state=666,
    )
    ensemble.add([mdict[key] for key in chosen])
    ensemble.add_meta(mdict[trial.suggest_categorical('head', meta_keys)])
    return ensemble

def objective(trial):
    """Fit the ensemble built for `trial` and return its accuracy on the test split."""
    candidate = create_model(trial)
    candidate.fit(x_train, y_train)
    return accuracy_score(y_test, candidate.predict(x_test))

# Search ensemble layouts for 50 trials, maximising the value returned by
# `objective`. `sampler` is the module-level TPESampler created in the KNN
# cell, so that cell must have been run first.
study = optuna.create_study(
    direction="maximize", 
    sampler=sampler
)
study.optimize(
    objective, 
    n_trials=50
)

In [None]:
# Pull the winning layout out of the study: meta learner, fold count and the
# ordered, de-duplicated list of base-learner keys.
params = study.best_params

head = params.pop('head')
folds = params.pop('folds')
params.pop('n_models')

# Only the "model_<i>" entries remain; dict.fromkeys keeps first-seen order
# while dropping repeated picks.
result = list(dict.fromkeys(params.values()))

print(result)

## Final Model

In [None]:
# Rebuild the best ensemble found by the study and evaluate it.
model = SuperLearner(folds=folds, random_state=666)

# Base layer: the unique keys chosen by the best trial, looked up in `mdict`.
models = [mdict[item] for item in result]
model.add(models)
# Meta learner chosen by the best trial.
model.add_meta(mdict[head])

model.fit(x_train, y_train)
preds = model.predict(x_test)

for _label, _metric in (
    ('Optimized SuperLearner accuracy: ', accuracy_score),
    ('Optimized SuperLearner f1-score: ', f1_score),
):
    print(_label, _metric(y_test, preds))

In [None]:
# Confusion matrix and per-class precision/recall/F1 for the predictions
# currently held in `preds`.
print(confusion_matrix(y_test, preds))
print(classification_report(y_test, preds))