<pre>
                               ___________
                              [___________]
                               {=========}
                             .-'         '-.
                            /               \
                           /_________________\
                           | _  _  _ __    _ |
                           ||_)|_)/ \ /|\ /  |
      ,.----.   ,.----.    ||  | \\_//_|~\\_ |
     //  \   \ //  \   \   |_________________|
     \\   \  / \\   \  /   |                 |
      `'----'   `'----'    '-----------------'
      
<b>Super Learner Predictions for Strokes</b> - 95% accuracy
by Alin Cijov
</pre>

In [None]:
import numpy as np
import pandas as pd

from numpy import hstack
from numpy import vstack
from numpy import asarray
from sklearn.datasets import make_blobs
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Dataset

In [None]:
class StrokeDataset:
    """Load the stroke CSV and convert it into model-ready NumPy arrays.

    Construction reads the CSV at ``path``, drops the ``id`` column, fills
    missing ``bmi`` values with the column mean, scales the continuous
    columns by their maximum and replaces string categories with integer
    codes.

    Attributes set by ``__init__``:
        df: the cleaned ``pandas.DataFrame``.
        features: every column of ``df`` except ``stroke``, as an array.
        labels: the ``stroke`` column, as an array.
        categories_to_idx: ``{column: {category: code}}``.
        idx_to_categories: ``{column: {code: category}}``.
    """

    def __init__(self, path):
        """Read the CSV at ``path`` and run the whole cleaning pipeline."""
        df = pd.read_csv(path)
        df = df.drop(columns=['id'])
        df = self.fill_bmi_na(df)
        df = self.normalize(df)
        df = self.categorize(df)
        # Select the features by name. A positional ``[1:-1]`` slice here
        # would skip the first remaining column (``id`` is already gone)
        # and would also assume that the label is the last column.
        self.features = df.drop(columns=['stroke']).values
        self.labels = df['stroke'].values
        self.df = df

    def fill_bmi_na(self, df):
        """Replace missing ``bmi`` values with the column mean; return ``df``."""
        df['bmi'] = df['bmi'].fillna(df['bmi'].mean())
        return df

    def normalize(self, df):
        """Divide each continuous column by its maximum; return ``df``."""
        for column in ('age', 'avg_glucose_level', 'bmi'):
            peak = df[column].max()
            # A maximum of zero would turn the column into NaN/inf; leave
            # such a column untouched instead.
            if peak != 0:
                df[column] = df[column] / peak
        return df

    def categorize(self, df):
        """Replace string categories with integer codes.

        Codes follow the sorted order returned by ``np.unique``. The
        forward and reverse lookup tables are stored on the instance as
        ``categories_to_idx`` and ``idx_to_categories``.

        Returns a new frame; the frame passed in is not modified.
        """
        self.categories_to_idx = {}
        self.idx_to_categories = {}
        df = df.copy()
        columns = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
        for column in columns:
            categories = np.unique(df[column].values)
            to_idx = {category: idx for idx, category in enumerate(categories)}
            self.categories_to_idx[column] = to_idx
            self.idx_to_categories[column] = {idx: category for category, idx in to_idx.items()}
            # Map column by column so the result has an integer dtype
            # without relying on ``DataFrame.replace`` downcasting.
            df[column] = df[column].map(to_idx)
        return df

In [None]:
# Relative path to the stroke data CSV.
path = "../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"

# Read and preprocess the CSV; the result is reused by the cells below.
sds = StrokeDataset(path)

In [None]:
# Preview the first rows of the preprocessed frame.
sds.df.head()

# Super Learner

In [None]:
class SuperLearner:
    """Stacked ensemble: base classifiers feeding a logistic-regression meta model.

    Base models, in the order their probabilities are stacked:
        LogisticRegression, DecisionTreeClassifier, SVC, GaussianNB,
        KNeighborsClassifier, AdaBoostClassifier, BaggingClassifier,
        RandomForestClassifier, ExtraTreesClassifier.
    """

    def __init__(self, n_estimators):
        """Store ``n_estimators`` and build the list of base models."""
        self.n_estimators = n_estimators
        self.models = self.get_models(n_estimators)

    def get_models(self, n_estimators):
        """Return freshly constructed, unfitted base models.

        ``n_estimators`` is forwarded to the bagging, random-forest and
        extra-trees classifiers only.
        """
        return [
            LogisticRegression(solver='liblinear'),
            DecisionTreeClassifier(),
            SVC(gamma='scale', probability=True),
            GaussianNB(),
            KNeighborsClassifier(),
            AdaBoostClassifier(),
            BaggingClassifier(n_estimators=n_estimators),
            RandomForestClassifier(n_estimators=n_estimators),
            ExtraTreesClassifier(n_estimators=n_estimators),
        ]

    def fit_base_models(self, X, y, models):
        """Fit every model in ``models`` on ``(X, y)`` in place."""
        for estimator in models:
            estimator.fit(X, y)

    def fit_meta_model(self, X, y):
        """Fit and return a new logistic-regression meta model on ``(X, y)``."""
        meta = LogisticRegression(solver='liblinear')
        meta.fit(X, y)
        return meta

    def evaluate_models(self, X, y, models):
        """Print the accuracy of each model in ``models`` on ``(X, y)``."""
        for estimator in models:
            score = accuracy_score(y, estimator.predict(X))
            print("{:25s}: {:1.3f}".format(type(estimator).__name__, score))

    def super_learner_predictions(self, X, models, meta_model):
        """Predict with the stack.

        The ``predict_proba`` outputs of all ``models`` are concatenated
        column-wise and handed to ``meta_model.predict``.
        """
        stacked = hstack([estimator.predict_proba(X) for estimator in models])
        return meta_model.predict(stacked)

    def get_out_of_fold_predictions(self, X, y, models, n_splits=10):
        """Build the meta model's training set from out-of-fold predictions.

        For every shuffled K-fold split, each model is refitted on the
        training part and its ``predict_proba`` output on the held-out
        part is recorded. Note that this refits the model objects passed
        in.

        Returns:
            ``(meta_X, meta_y)``: the stacked probabilities and the
            matching targets, both in fold order.
        """
        splitter = KFold(n_splits=n_splits, shuffle=True)
        fold_features = []
        fold_targets = []

        for train_ix, test_ix in splitter.split(X):
            X_fit, y_fit = X[train_ix], y[train_ix]
            X_held = X[test_ix]
            fold_targets.extend(y[test_ix])

            per_model = []
            for estimator in models:
                estimator.fit(X_fit, y_fit)
                per_model.append(estimator.predict_proba(X_held))
            fold_features.append(hstack(per_model))

        return vstack(fold_features), asarray(fold_targets)

# Training

In [None]:
# Split features and labels in half: one half for training, one for testing.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm whether reproducible results are wanted.
X_train, X_test, y_train, y_test = train_test_split(sds.features, sds.labels, test_size=0.50)

# Build the super learner; 10 is passed as n_estimators.
sl = SuperLearner(10)

In [None]:
# Collect the base models' out-of-fold predictions on the training split;
# they become the meta model's inputs (meta_X) and targets (meta_y).
meta_X, meta_y = sl.get_out_of_fold_predictions(X_train, y_train, sl.models)

# Refit the base models on the whole training split. This has to follow the
# out-of-fold pass, which refits the same model objects on each fold.
sl.fit_base_models(X_train, y_train, sl.models)

# Fit the meta model on the out-of-fold predictions.
meta_model = sl.fit_meta_model(meta_X, meta_y)

# Evaluate

In [None]:
# Print the accuracy of every base model on the held-out test split.
print("Base models:")
print("--------------------------")
sl.evaluate_models(X_test, y_test, sl.models)

# Predict the test split with the full stack (base models -> meta model)
# and print its accuracy under the meta model's heading.
yhat = sl.super_learner_predictions(X_test, sl.models, meta_model)
print()
print("Logistic regression:")
print("--------------------------")
print("{:25s}: {:1.3f}".format("Super Learner", accuracy_score(y_test, yhat)))