In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import re
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score, learning_curve, ShuffleSplit
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import make_column_transformer, make_column_selector, ColumnTransformer
from sklearn.feature_selection import SelectPercentile, SelectorMixin
from sklearn.base import TransformerMixin, BaseEstimator

# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Load the stroke dataset CSV into `df` and preview its first rows.
df = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv')
df.head()

# EDA

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Number of rows per value of the target column `stroke`.
df['stroke'].value_counts()

In [None]:
# Inspect the rows whose age is exactly 0.08
# (presumably the smallest age reported by describe() - TODO confirm).
df[df['age'] == 0.08]

In [None]:
# Histogram of patient age, 40 equal-width bins.
plt.figure(figsize=(14, 8))
plt.hist(df['age'], bins=40)
plt.title('Age distribution')
plt.show()

In [None]:
# Kernel density estimate of age, one curve per value of `stroke`.
g = sns.FacetGrid(df, hue='stroke', height=7, aspect=2).map(sns.kdeplot, 'age')
plt.title('Age distribution with class')
plt.legend()
plt.show()

In [None]:
# Kernel density estimate of average glucose level, one curve per value of `stroke`.
g = sns.FacetGrid(df, hue='stroke', height=7, aspect=2).map(sns.kdeplot, 'avg_glucose_level')
plt.title('Glucose level distribution with class')
plt.legend()
plt.show()

In [None]:
# Kernel density estimate of BMI, one curve per value of `stroke`.
g = sns.FacetGrid(df, hue='stroke', height=7, aspect=2).map(sns.kdeplot, 'bmi')
plt.title('Body Mass Index distribution with class')
plt.legend()
plt.show()

In [None]:
# Pairwise scatter plots of the continuous features, coloured by the target.
pairplot_columns = ['age', 'avg_glucose_level', 'bmi', 'stroke']
sns.pairplot(data=df[pairplot_columns], hue='stroke', palette='magma', height=4)
plt.show()

### Effect of smoking

In [None]:
# One pie chart per smoking status: share of rows for each value of `stroke`.
for status in df['smoking_status'].unique():
    in_group = df['smoking_status'] == status
    stroke_counts = df.loc[in_group].groupby("stroke")['id'].count()
    stroke_counts.plot.pie(autopct="%.1f%%")
    plt.ylabel('#')
    plt.title('Strokes among ' + status)
    plt.show()

### Marriage status difference

In [None]:
# One pie chart per `ever_married` value: share of rows for each value of `stroke`.
for status in df['ever_married'].unique():
    in_group = df['ever_married'] == status
    stroke_counts = df.loc[in_group].groupby("stroke")['id'].count()
    stroke_counts.plot.pie(autopct="%.1f%%")
    plt.ylabel('#')
    plt.title('Strokes among ' + status)
    plt.show()

### Worktype

In [None]:
# One pie chart per work type: share of rows for each value of `stroke`.
for status in df['work_type'].unique():
    in_group = df['work_type'] == status
    stroke_counts = df.loc[in_group].groupby("stroke")['id'].count()
    stroke_counts.plot.pie(autopct="%.1f%%")
    plt.ylabel('#')
    plt.title('Strokes among ' + status)
    plt.show()

### Gender

In [None]:
# One pie chart per gender value: share of rows for each value of `stroke`.
for status in df['gender'].unique():
    in_group = df['gender'] == status
    stroke_counts = df.loc[in_group].groupby("stroke")['id'].count()
    stroke_counts.plot.pie(autopct="%.1f%%")
    plt.ylabel('#')
    plt.title('Strokes among ' + status)
    plt.show()

In [None]:
# Rows that have a missing value in any column other than `bmi`.
df[df.drop('bmi', axis= 1).isnull().any(axis=1)]

Missing (N/A) values occur only in the `bmi` column.

# Split

In [None]:
# Hold out 30% of the rows for the final evaluation.
# The split is stratified on the target so both parts keep the same
# stroke / no-stroke ratio (the class_weight grid used later suggests the
# target is imbalanced, where an unstratified split can skew the minority share).
X_train, X_test, y_train, y_test = train_test_split(df.drop('stroke', axis = 1), 
                                                    df['stroke'], 
                                                    test_size=0.3, 
                                                    stratify=df['stroke'],
                                                    random_state=101)

In [None]:
# Preview the training features.
X_train.head()

# Data preparation

### Columns preprocessing

In [None]:
class DataPreprocessor(BaseEstimator, TransformerMixin):
    """Drop the ``id`` column, optionally bin numeric columns and fill missing ``bmi``.

    Parameters
    ----------
    binning_age, binning_glucose_level, binning_bmi : bool, default=False
        When True the corresponding column (``age``, ``avg_glucose_level``,
        ``bmi``) is cut into 7 equal-width intervals between the minimum and
        maximum seen during ``fit`` and stored as string labels.

    Attributes
    ----------
    columns_data : dict
        Set by ``fit``; maps each binned column to ``{'min': ..., 'max': ...}``.
    """

    # Label given to missing ``bmi`` values when ``bmi`` is binned.  Same
    # characters as the original 'N\A' literal, written with an escaped
    # backslash so Python does not see the invalid escape sequence.
    MISSING_BMI_LABEL = 'N\\A'

    def __init__(self, binning_age = False, binning_glucose_level = False, binning_bmi = False):
        self.binning_age = binning_age
        self.binning_glucose_level = binning_glucose_level
        self.binning_bmi = binning_bmi

    def __binning_func(self, X, column_to_bin):
        """Return a copy of ``X`` with ``column_to_bin`` replaced by interval labels.

        Missing values come out as the string ``'nan'``.
        """
        X = X.copy()
        min_value = self.columns_data[column_to_bin]['min']
        max_value = self.columns_data[column_to_bin]['max']
        bins = np.linspace(min_value, max_value, 8)
        # Values outside the range seen in fit() (e.g. in a validation fold) are
        # clipped into the edge bins instead of silently becoming 'nan'.
        clipped = X[column_to_bin].clip(lower=min_value, upper=max_value)
        X[column_to_bin] = pd.cut(clipped, bins=bins, include_lowest=True).astype('str')
        return X

    def fit(self, X, y = None):
        """Record the min/max of every column that is going to be binned."""
        enabled = {
            'age': self.binning_age,
            'avg_glucose_level': self.binning_glucose_level,
            'bmi': self.binning_bmi,
        }
        # Series.min()/max() skip NaN.  The builtin min()/max() return NaN when
        # the first element is NaN, which made the bmi bin edges NaN.
        self.columns_data = {
            col: {'min': X[col].min(), 'max': X[col].max()}
            for col, wanted in enabled.items() if wanted
        }
        return self

    def transform(self, X, y = None):
        """Return a transformed copy of ``X``; the input frame is left untouched."""
        X_copy = X.drop(columns='id', errors='ignore')
        # Remember which rows lack bmi before binning turns NaN into 'nan'.
        bmi_missing = X_copy['bmi'].isna()

        if self.binning_age:
            X_copy = self.__binning_func(X_copy, 'age')

        if self.binning_glucose_level:
            X_copy = self.__binning_func(X_copy, 'avg_glucose_level')

        if self.binning_bmi:
            X_copy = self.__binning_func(X_copy, 'bmi')
            # fillna() after astype('str') never matched anything, so label the
            # originally-missing rows explicitly.
            X_copy['bmi'] = X_copy['bmi'].mask(bmi_missing, self.MISSING_BMI_LABEL)
        else:
            # Plain assignment instead of chained inplace fillna, which pandas
            # with copy-on-write does not propagate to the frame.
            X_copy['bmi'] = X_copy['bmi'].fillna(-1)

        return X_copy
    

In [None]:
# Demo: enable all three binnings and preview the result.
# Note this fits on the complete `df` (including the `stroke` column) purely for display.
processor = DataPreprocessor(
    binning_age = True,
    binning_glucose_level = True,
    binning_bmi = True
)
processor.fit_transform(df).head()

### Interactions and polynomials

In [None]:
class InterPolinomsFeatures(BaseEstimator, TransformerMixin):
    """Expand the non-object columns with polynomial / interaction terms.

    Object-dtype columns are passed through unchanged and placed after the
    generated features.  The result is a DataFrame with a fresh RangeIndex
    whose generated columns are named after the source columns
    (e.g. ``age``, ``age^2``, ``age bmi``).

    Parameters
    ----------
    degree : int, default=1
        Forwarded to ``PolynomialFeatures``.
    interaction_only : bool, default=False
        Forwarded to ``PolynomialFeatures``.

    Attributes (set by ``fit``)
    ---------------------------
    encoder_inner : PolynomialFeatures
    object_columns_dict : dict
        ``'x<i>'`` -> name of the i-th expanded column (kept from the earlier
        implementation).
    """

    def __init__(self, degree = 1, interaction_only = False):
        # Only store the hyper-parameters here.  GridSearchCV changes them
        # through set_params(), which just sets these attributes; an estimator
        # built in __init__ would keep the old values and the search over
        # degree / interaction_only would have no effect.
        self.degree = degree
        self.interaction_only = interaction_only

    def _poly_feature_names(self):
        """Names of the generated features, independent of the sklearn version."""
        if hasattr(self.encoder_inner, 'get_feature_names_out'):
            names = self.encoder_inner.get_feature_names_out(self.numeric_columns_)
        else:
            # Older scikit-learn releases only provide get_feature_names().
            names = self.encoder_inner.get_feature_names(self.numeric_columns_)
        return list(names)

    def fit(self, X, y = None):
        """Fit the polynomial expansion on the non-object columns of ``X``."""
        self.numeric_columns_ = list(X.select_dtypes(exclude='object').columns)
        self.passthrough_columns_ = [
            col for col in X.columns if col not in self.numeric_columns_
        ]
        self.object_columns_dict = {
            f'x{i}': col for i, col in enumerate(self.numeric_columns_)
        }
        self.encoder_inner = PolynomialFeatures(
            degree = self.degree,
            interaction_only = self.interaction_only,
            include_bias = False
        )
        self.poly_feature_names_ = []
        if self.numeric_columns_:
            self.encoder_inner.fit(X[self.numeric_columns_])
            self.poly_feature_names_ = self._poly_feature_names()
        return self

    def transform(self, X, y = None):
        """Return generated features followed by the untouched object columns."""
        parts = []
        if self.numeric_columns_:
            expanded = self.encoder_inner.transform(X[self.numeric_columns_])
            parts.append(pd.DataFrame(
                np.asarray(expanded, dtype=float),
                columns = self.poly_feature_names_
            ))
        if self.passthrough_columns_:
            # Drop the incoming index so both parts line up row by row.
            parts.append(X[self.passthrough_columns_].reset_index(drop=True))
        if not parts:
            return pd.DataFrame(index=range(len(X)))
        return pd.concat(parts, axis=1)

In [None]:
# Demo: degree-2 expansion of the training features (missing values set to 0 first).
poly_demo = InterPolinomsFeatures(degree=2, interaction_only=False)
poly_demo.fit_transform(X_train.fillna(0)).head()

In [None]:
# Check that X_train itself was not modified by the transformer above.
X_train.head()

### Categorical encoding

In [None]:
class CategoricalEncoderOneHot():
    """One-hot encode the object columns; all other columns are appended unchanged.

    ``transform`` returns a DataFrame with a fresh RangeIndex.  Encoded columns
    come first and are named ``<column>_<category>``; the remaining columns keep
    their names.  Categories unseen during ``fit`` encode to all zeros.
    """

    def __init__(self):
        self.encoder_inner = self._make_encoder()

    @staticmethod
    def _make_encoder():
        """Build a dense OneHotEncoder on both old and new scikit-learn."""
        try:
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Releases before the ``sparse`` -> ``sparse_output`` rename.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def _encoded_feature_names(self):
        """Names of the one-hot columns, independent of the sklearn version."""
        if hasattr(self.encoder_inner, 'get_feature_names_out'):
            names = self.encoder_inner.get_feature_names_out(self.object_columns)
        else:
            names = self.encoder_inner.get_feature_names(self.object_columns)
        return list(names)

    def fit(self, X, y = None):
        """Learn the categories of every object column in ``X``."""
        self.object_columns = list(X.select_dtypes(include='object').columns)
        self.passthrough_columns = [
            col for col in X.columns if col not in self.object_columns
        ]
        self.object_columns_dict = {
            f'x{i}': col for i, col in enumerate(self.object_columns)
        }
        self.encoded_names = []
        if self.object_columns:
            self.encoder_inner.fit(X[self.object_columns])
            self.encoded_names = self._encoded_feature_names()
        return self

    def transform(self, X, y = None):
        """Return the one-hot columns followed by the untouched columns."""
        blocks = []
        if self.object_columns:
            blocks.append(np.asarray(self.encoder_inner.transform(X[self.object_columns])))
        if self.passthrough_columns:
            blocks.append(X[self.passthrough_columns].to_numpy())
        data = np.hstack(blocks) if blocks else np.empty((len(X), 0))
        return pd.DataFrame(data, columns = self.encoded_names + self.passthrough_columns)



class CategoricalEncoderOrdinal():
    """Replace every object column by integer codes (stored as floats).

    Categories that were not seen during ``fit`` are encoded as ``-1``.  They
    used to become NaN, which SelectPercentile and LogisticRegression reject,
    so any fold containing a category absent from its training part failed.
    """

    # Code assigned to categories that were not present during fit().
    UNKNOWN_CODE = -1

    def __init__(self):
        self.encoder = OrdinalEncoder(
            handle_unknown='use_encoded_value',
            unknown_value=self.UNKNOWN_CODE
        )

    def fit(self, X, y = None):
        """Learn the category -> code mapping of every object column."""
        self.object_columns = X.select_dtypes(include='object').columns
        self.non_object_columns = X.select_dtypes(exclude='object').columns
        # Nothing to learn when the frame has no object columns.
        if len(self.object_columns) > 0:
            self.encoder.fit(X[self.object_columns])
        return self

    def transform(self, X, y = None):
        """Return a copy of ``X`` with the object columns replaced by their codes."""
        X_copy = X.copy()
        if len(self.object_columns) > 0:
            X_copy[self.object_columns] = self.encoder.transform(X_copy[self.object_columns])
        return X_copy

    
class CategoricalEncoder(BaseEstimator, TransformerMixin):
    """Pipeline step that encodes object columns with the strategy named by ``mode``.

    Parameters
    ----------
    mode : {'one_hot', 'ordinal'}, default='one_hot'
        Key into ``encoders`` selecting the implementation.

    Raises
    ------
    AttributeError
        If ``mode`` is not a key of ``encoders`` (checked at construction and
        again in ``fit``).
    """

    # Available encoding strategies, keyed by the ``mode`` value.
    encoders = {
        'one_hot' : CategoricalEncoderOneHot,
        'ordinal' : CategoricalEncoderOrdinal
    }

    def __init__(self, mode = 'one_hot'):
        if mode not in self.encoders:
            raise AttributeError('Wrong mode name')
        self.mode = mode

    def fit(self, X, y = None):
        """Instantiate the encoder for the current ``mode`` and fit it."""
        # set_params() (used by GridSearchCV) bypasses __init__, so validate
        # again and build the encoder here; building it in __init__ meant the
        # 'categoricalencoder__mode' grid entry was silently ignored.
        if self.mode not in self.encoders:
            raise AttributeError('Wrong mode name')
        self.encoder = self.encoders[self.mode]()
        self.encoder.fit(X, y)
        return self

    def transform(self, X, y = None):
        """Delegate to the encoder created in ``fit``."""
        return self.encoder.transform(X, y)

In [None]:
# Demo: ordinal-encode the binned training frame and preview the first rows.
test_df = processor.fit_transform(X_train)
a = CategoricalEncoder(mode='ordinal').fit(test_df)
test_df_cat = a.transform(test_df).head()
test_df_cat

In [None]:
# Check that X_train itself was not modified by the encoder above.
X_train.head()

In [None]:
# Demo: ordinal encoding of the raw training features, with fit and transform as separate steps.
a = CategoricalEncoder(mode = 'ordinal')
a.fit(X_train)
a.transform(X_train).head()

In [None]:
# Same encoding through fit_transform (provided by TransformerMixin).
CategoricalEncoder(mode = 'ordinal').fit_transform(X_train).head()

In [None]:
# Demo: one-hot encoding of the raw training features.
CategoricalEncoder(mode = 'one_hot').fit_transform(X_train).head()

### Feature selection

In [None]:
class Selector(BaseEstimator, TransformerMixin):
    """Keep the best-scoring ``percent`` % of the columns via ``SelectPercentile``.

    Parameters
    ----------
    percent : int, default=50
        Forwarded to ``SelectPercentile`` as ``percentile``.

    Attributes (set by ``fit``)
    ---------------------------
    selector_inner : SelectPercentile
    columns_names : pandas.Index
        Names of the columns that were kept.
    """

    def __init__(self, percent = 50):
        # Store the hyper-parameter only.  The selector is built in fit() so a
        # value changed through set_params() (GridSearchCV) is actually used.
        self.percent = percent

    def fit(self, X, y):
        """Score the columns of DataFrame ``X`` against ``y`` and remember the kept ones."""
        self.selector_inner = SelectPercentile(percentile=self.percent)
        self.selector_inner.fit(X, y)
        self.columns_names = X.columns[self.selector_inner.get_support()]
        return self

    def transform(self, X, y = None):
        """Return the kept columns as a DataFrame with a fresh RangeIndex."""
        selected = self.selector_inner.transform(X.copy())
        return pd.DataFrame(selected, columns = self.columns_names)

In [None]:
# Demo: keep the top 30% of three numeric columns, scored against y_train.
Selector(percent = 30).fit_transform(X_train[['age', 'avg_glucose_level', 'id']], y_train).head()

### Scaling

In [None]:
class Scaler(BaseEstimator, TransformerMixin):
    """Scale every column and return the result as a DataFrame.

    Parameters
    ----------
    mode : {'standart', 'standard', 'minmax'}, default='minmax'
        Key into ``scalers``.  ``'standart'`` is the original (misspelled) key
        and is kept for compatibility; ``'standard'`` is an alias.

    Raises
    ------
    AttributeError
        If ``mode`` is not a key of ``scalers`` (checked at construction and
        again in ``fit``).
    """

    # Available scaling strategies, keyed by the ``mode`` value.
    scalers = {
        'standart': StandardScaler,
        'standard': StandardScaler,
        'minmax'  : MinMaxScaler
    }

    def __init__(self, mode = 'minmax'):
        if mode not in self.scalers:
            # Was spelled ``AttibuteError``, which raised NameError instead.
            raise AttributeError('Wrong mode name')
        self.mode = mode

    def fit(self, X, y = None):
        """Instantiate the scaler for the current ``mode`` and fit it on ``X``."""
        # set_params() (used by GridSearchCV) bypasses __init__, so validate
        # again and build the scaler here; building it in __init__ meant the
        # 'scaler__mode' grid entry was silently ignored.
        if self.mode not in self.scalers:
            raise AttributeError('Wrong mode name')
        self.scaler_inner = self.scalers[self.mode]()
        self.scaler_inner.fit(X)
        self.columns_names = X.columns
        return self

    def transform(self, X, y = None):
        """Return the scaled data as a DataFrame with a fresh RangeIndex."""
        scaled = self.scaler_inner.transform(X.copy())
        return pd.DataFrame(scaled, columns = self.columns_names)

In [None]:
# Demo: min-max scale two columns of the full frame.
Scaler(mode = 'minmax').fit_transform(df[['age', 'heart_disease']]).head()

# Modeling

In [None]:
def plot_learning_curve(estimator, X, y, axes=None, ylim=None, cv=5,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    '''
    Draw three panels: learning curve, fit time vs. training size, and
    cross-validation score vs. fit time.

    estimator, X, y, cv, n_jobs and train_sizes are forwarded to
    learning_curve. axes, when given, is indexed at 0, 1 and 2; otherwise a
    1x3 figure is created. ylim, when given, is unpacked into set_ylim of the
    first panel. The figure is shown with plt.show(); nothing is returned.

    Adapted from
    https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
    '''

    def band(ax, xs, centre, spread, **style):
        # Shaded envelope of one standard deviation around a mean line.
        ax.fill_between(xs, centre - spread, centre + spread, alpha=0.1, **style)

    if axes is None:
        axes = plt.subplots(1, 3, figsize=(20, 5))[1]

    curve_ax = axes[0]
    curve_ax.set_title('Learning curve')
    if ylim is not None:
        curve_ax.set_ylim(*ylim)
    curve_ax.set_xlabel("Training examples")
    curve_ax.set_ylabel("Score")

    sizes, train_scores, cv_scores, fit_times, _ = learning_curve(
        estimator, X, y,
        cv=cv, n_jobs=n_jobs, train_sizes=train_sizes, return_times=True,
    )
    train_mean, train_std = np.mean(train_scores, axis=1), np.std(train_scores, axis=1)
    cv_mean, cv_std = np.mean(cv_scores, axis=1), np.std(cv_scores, axis=1)
    time_mean, time_std = np.mean(fit_times, axis=1), np.std(fit_times, axis=1)

    # Panel 1: training and cross-validation score against training-set size.
    curve_ax.grid()
    band(curve_ax, sizes, train_mean, train_std, color="r")
    band(curve_ax, sizes, cv_mean, cv_std, color="g")
    curve_ax.plot(sizes, train_mean, 'o-', color="r", label="Training score")
    curve_ax.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")
    curve_ax.legend(loc="best")

    # Panel 2: fit time against training-set size.
    time_ax = axes[1]
    time_ax.grid()
    time_ax.plot(sizes, time_mean, 'o-')
    band(time_ax, sizes, time_mean, time_std)
    time_ax.set_xlabel("Training examples")
    time_ax.set_ylabel("fit_times")
    time_ax.set_title("Scalability of the model")

    # Panel 3: cross-validation score against fit time.
    perf_ax = axes[2]
    perf_ax.grid()
    perf_ax.plot(time_mean, cv_mean, 'o-')
    band(perf_ax, time_mean, cv_mean, cv_std)
    perf_ax.set_xlabel("fit_times")
    perf_ax.set_ylabel("Score")
    perf_ax.set_title("Performance of the model")
    plt.show()



def precision_recall(model, X_test, y_test):
    """Plot recall against precision for ``model`` and mark the threshold nearest 0.

    The scores come from ``model.decision_function(X_test)``, so the model must
    provide that method.  The figure is shown with ``plt.show()``.
    """
    scores = model.decision_function(X_test)
    precision, recall, thresholds = precision_recall_curve(y_test, scores)

    # Index of the decision threshold closest to zero.
    zero_idx = np.argmin(np.abs(thresholds))
    plt.plot(
        precision[zero_idx], recall[zero_idx], 'o',
        markersize=10, label="threshold 0", fillstyle="none", c='k', mew=2,
    )
    plt.plot(precision, recall, label="precision recall curve")
    plt.xlabel("Precision")
    plt.ylabel("Recall")
    plt.legend(loc="best")
    plt.show()


def eval_result(model, X_test, y_test, X_train, y_train, validation = False):
    """Print and plot an evaluation report for a fitted classifier.

    Parameters
    ----------
    model : fitted estimator, Pipeline or GridSearchCV
        For a GridSearchCV its ``best_estimator_`` is evaluated.  For a
        Pipeline the final step is inspected and all previous steps are used
        to transform ``X_test`` / ``X_train`` first.
    X_test, y_test : data the classification report and confusion matrix are computed on.
    X_train, y_train : data used for the learning curve.
    validation : bool, default=False
        When True only the classification report and the confusion matrix are
        shown; feature importances and plots are skipped.

    Notes
    -----
    Relies on ``display`` being available, as it is inside a notebook.
    Warnings raised while evaluating are suppressed.
    """
    if isinstance(model, GridSearchCV):
        model = model.best_estimator_

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        pred = model.predict(X_test)
        print(classification_report(y_test, pred, target_names = ['No Stroke', 'Stroke']))
        display(pd.DataFrame(confusion_matrix(y_test, pred),
                             columns = ['Stroke Not Predicted', 'Stroke Predicted'],
                             index = ['No Stroke', 'Stroke']))

        if validation:
            return

        if isinstance(model, Pipeline):
            # Inspect the final estimator on data run through the preceding steps.
            pipe = model[:-1]
            model = model[-1]
            X_test = pipe.transform(X_test)
            X_train = pipe.transform(X_train)

        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
        elif hasattr(model, 'coef_'):
            # Linear models: absolute coefficients of the first output row.
            importances = np.abs(np.atleast_2d(model.coef_)[0])
        else:
            importances = None

        if importances is not None:
            features = pd.DataFrame({
                'Variable'  : getattr(X_test, 'columns', range(len(importances))),
                'Importance': importances
            }).sort_values('Importance', ascending=False)
            display(features.head(20))

        # The plots are best effort, but a failure is reported instead of being
        # swallowed silently, and one failing plot no longer skips the other.
        plots = (
            ('precision/recall curve', lambda: precision_recall(model, X_test, y_test)),
            ('learning curve', lambda: plot_learning_curve(model, X_train, y_train, n_jobs=-1)),
        )
        for label, draw in plots:
            try:
                draw()
            except Exception as exc:
                # print rather than warnings.warn: warnings are muted in this block.
                print(f'Skipped {label}: {exc!r}')
        

### Linear model

In [None]:
# Full preprocessing + logistic regression pipeline.  make_pipeline names each
# step after its lowercased class name, which is what the param_grid keys use.
pipe_lm = make_pipeline(
    DataPreprocessor(),
    InterPolinomsFeatures(),
    CategoricalEncoder(),
    Selector(),
    Scaler(),
    LogisticRegression(max_iter = 500, random_state = 1)
)

# Hyper-parameter grid: "<step name>__<parameter>" -> values to try.
param_grid = {
    'datapreprocessor__binning_age': [True, False],
    'datapreprocessor__binning_glucose_level': [True, False],
    'datapreprocessor__binning_bmi': [True, False],
    'interpolinomsfeatures__interaction_only': [True, False],
    'interpolinomsfeatures__degree': [1, 2, 3],
    'categoricalencoder__mode': ['one_hot', 'ordinal'],
    'selector__percent': [30, 50, 100],
    'scaler__mode': ['standart', 'minmax'],
    # Weight the positive (stroke) class 5x-15x more than the negative class.
    'logisticregression__class_weight': [
        {0:1, 1:5},
        {0:1, 1:7},
        {0:1, 1:10},
        {0:1, 1:15},
    ],
    'logisticregression__C': [0.1, 0.5, 0.7, 1, 2]
}
pipe_lm

In [None]:
# Run every step except the final classifier to preview the feature matrix it receives.
pipe_lm[:-1].fit_transform(X_train, y_train).head()

In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by
# macro-averaged F1 and run on all cores (n_jobs = -1).
# With the grid defined above this is 11,520 parameter combinations, each
# fitted 5 times, so expect this cell to run for a long time.
grid = GridSearchCV(
        pipe_lm, 
        param_grid=param_grid, 
        cv=5, 
        n_jobs = -1, 
        verbose = 2,
        scoring = 'f1_macro'
    )
grid.fit(X_train, y_train)

Best and worst parameter combinations:

In [None]:
# Cross-validation results ordered from best to worst mean test score.
lm_results = (
    pd.DataFrame(grid.cv_results_)
    .sort_values(by='mean_test_score', ascending=False)
)

display(lm_results.head(4))
display(lm_results.tail(4))

In [None]:
# Report the winning configuration, then evaluate it on the held-out test set.
print(f"Best cross val score: {grid.best_score_}")
print("\nBest params:")
for name, value in grid.best_params_.items():
    print(f'{name}: {value}')
print('\n')
eval_result(grid, X_test, y_test, X_train, y_train)