In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import itertools

import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
ds = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv', delimiter=',')

In [None]:
ds.head(3)

In [None]:
ds.drop(['id'], axis=1, inplace=True)

In [None]:
ds.stroke[ds.gender == 'Other']

In [None]:
# Remove the 'Other' gender row(s) by value instead of by a hard-coded row
# label: the filter keeps working if the CSV is re-ordered and, unlike
# drop([3116]), does not raise KeyError when the cell is run a second time.
# NOTE(review): presumably label 3116 was the only 'Other' row - confirm.
ds = ds[ds.gender != 'Other'].copy()

In [None]:
ds.columns

In [None]:
# Continuous features.
columns_numeric = ['age', 'avg_glucose_level', 'bmi']

# Discrete features.
columns_categorical = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# Subset of the discrete features that is handed to pd.get_dummies:
# every categorical column except 'hypertension' and 'heart_disease'.
columns_dummis = [
    name for name in columns_categorical
    if name not in ('hypertension', 'heart_disease')
]

# Prediction target.
columns_target = ['stroke']

In [None]:
def replace_nan(data, to_replace, replacement_data, random_state=None, groups=(0, 1)):
    """Fill missing values of one column with random integers drawn per group.

    For every value in ``groups`` the rows where ``data[replacement_data]``
    equals that value are considered. The first and third quartiles of the
    observed ``to_replace`` values in the group are widened by half the
    inter-quartile range on each side, and every missing entry of the group is
    replaced by an integer drawn uniformly from ``[int(lower), int(upper))``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified; a deep copy is filled and returned.
    to_replace : str
        Name of the column whose missing values are filled.
    replacement_data : str
        Name of the grouping column.
    random_state : int or None, optional
        Seed for a private ``numpy.random.RandomState``. When ``None`` the
        global NumPy generator is used, so ``np.random.seed`` still applies.
    groups : iterable, optional
        Values of ``replacement_data`` to process. Defaults to ``(0, 1)``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the missing values of the processed groups filled.

    Raises
    ------
    ValueError
        If a group has missing values but no observed value to estimate the
        quartiles from.
    """
    data_def = data.copy(deep=True)

    # Keep honouring np.random.seed() when no explicit seed is supplied.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    is_missing = data_def[to_replace].isnull()

    for group_value in groups:
        in_group = data_def[replacement_data] == group_value
        fill_mask = in_group & is_missing
        count = int(fill_mask.sum())
        if count == 0:
            # Nothing to fill; also avoids computing quantiles of an empty group.
            continue

        observed = data_def.loc[in_group, to_replace]
        lower_quartile = observed.quantile(0.25)
        upper_quartile = observed.quantile(0.75)
        if pd.isnull(lower_quartile) or pd.isnull(upper_quartile):
            raise ValueError(
                "group {!r} of column {!r} has no observed values in {!r}".format(
                    group_value, replacement_data, to_replace))

        # Compute the margin once so both bounds are widened by the same
        # amount. Updating the lower bound first and then re-deriving the
        # spread from it would push the upper bound out further than intended.
        margin = (upper_quartile - lower_quartile) * 0.5
        low = int(lower_quartile - margin)
        high = int(upper_quartile + margin)
        # Guarantee at least one candidate value when the bounds collapse.
        high = max(high, low + 1)

        # A single .loc assignment writes into data_def itself; chained
        # indexing (frame[col][rows] = ...) may write into a temporary copy.
        data_def.loc[fill_mask, to_replace] = rng.choice(np.arange(low, high), count)

    return data_def

In [None]:
# Seed NumPy's global generator before imputing: replace_nan draws the fill
# values through np.random, so without a seed every run of the notebook
# produces a different dataset (and different scores further down).
np.random.seed(33)
ds = replace_nan(ds, 'bmi', 'stroke')

In [None]:
ds.info()

In [None]:
ds = pd.get_dummies(ds, columns=columns_dummis, prefix_sep='_', drop_first=True)

In [None]:
ds.head(3)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
ds_train, ds_test = train_test_split(ds, test_size=0.3, random_state=42, stratify=ds.stroke)

In [None]:
ds_train.stroke.value_counts(normalize=True), ds_test.stroke.value_counts(normalize=True)

In [None]:
ds_train.shape[0] + ds_test.shape[0], ds.shape

In [None]:
# How many positive rows would have to be added for the positive class to
# reach 80% of the size of the negative class in the training split.
class_counts = ds_train.stroke.value_counts()
add_rows = int(0.8 * class_counts[0] - class_counts[1])
add_rows

In [None]:
int(add_rows / ds_train.stroke.value_counts()[1]), ds_train.stroke.value_counts()

In [None]:
# Pick the positive rows with a boolean mask rather than a label lookup.
# A .loc[labels] lookup returns every row matching each label, so once the
# index contains duplicates (i.e. after the oversampling cell has run) it
# multiplies rows; the mask always returns each row exactly once.
ds_train_one = ds_train[ds_train.stroke == 1]
index_train_one = ds_train_one.index

In [None]:
# Oversample the positive class by stacking 14 extra copies of its rows onto
# the training frame. DataFrame.append was removed in pandas 2.0; one concat
# gives the same frame (original index labels kept) without re-copying the
# growing frame on every pass.
# NOTE(review): 14 is presumably the ratio displayed by the cell above - confirm.
ds_train = pd.concat([ds_train] + [ds_train_one] * 14)

In [None]:
ds_train.stroke.value_counts(normalize=True)

In [None]:
from sklearn.utils import shuffle

In [None]:
# Fix the permutation so the row order of the training frame is reproducible.
ds_train = shuffle(ds_train, random_state=42)

### --------------- metrics ------------------------

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

In [None]:
def print_metrics(actual, predict):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels.
    predict : array-like
        Predicted labels, passed unchanged to each scikit-learn scorer.
    """
    report = (
        ('Accuracy: {:.3f}', accuracy_score),
        ('Precision: {:.3f}', precision_score),
        ('Recall: {:.3f}', recall_score),
        ('F1 score: {:.3f}', f1_score),
    )
    # One line per metric, in the order listed above.
    for template, scorer in report:
        print(template.format(scorer(actual, predict)))

In [None]:
def plot_roc_auc(actual, predict):
    """Draw the ROC curve on the current axes and put the AUC in the title.

    Parameters
    ----------
    actual : array-like
        Ground-truth labels, forwarded to ``roc_curve`` and ``roc_auc_score``.
    predict : array-like
        Scores forwarded to ``roc_curve`` and ``roc_auc_score``.
    """
    false_positive_rate, true_positive_rate, _ = roc_curve(actual, predict)

    axes = plt.gca()
    axes.plot(false_positive_rate, true_positive_rate, color='b')
    axes.set_xlim([0.0, 1.0])
    axes.set_ylim([0.0, 1.05])
    # Red diagonal from (0, 0) to (1, 1) as a visual reference line.
    axes.plot([0.0, 1.0], [0.0, 1.0], color='r')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_title('ROC AUC = {:.3f}'.format(roc_auc_score(actual, predict)))

### -------------- Tree ----------------------

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
np.random.seed(33)

In [None]:
model_tree = DecisionTreeClassifier(random_state=33)

In [None]:
model_tree.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred = model_tree.predict(ds_test.drop(['stroke'], axis=1))

In [None]:
y_pred_proba = model_tree.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
conf_matrix = confusion_matrix(ds_test.stroke, y_pred)

In [None]:
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='');

### ---------- selection of parameters ----------------

In [None]:
%%time
# Exhaustive search over decision-tree hyperparameters. For every combination
# the tree is fit on ds_train and scored on ds_test; the scores are collected
# in parallel lists, with x_keys holding a readable label per combination.
# NOTE(review): candidates are compared on ds_test, the same frame used for
# the final evaluation, so the reported scores are optimistic - consider a
# separate validation split.

# The feature/target frames do not change between candidates: build them once.
features_train = ds_train.drop(['stroke'], axis=1)
target_train = ds_train.stroke
features_test = ds_test.drop(['stroke'], axis=1)
target_test = ds_test.stroke

# itertools.product iterates in the same order as the equivalent nested
# for-loops (the right-most list varies fastest).
param_grid = itertools.product(
    ['gini', 'entropy'],             # criterion
    ['best', 'random'],              # splitter
    [1, 2, 3, 4, 5, 6, 7, None],     # max_depth
    [1, 2, 3, 4, 5],                 # min_samples_leaf
    [1, 2, 3, 4, 'sqrt', 'log2'],    # max_features
)

best_f1 = []
best_recall = []
best_roc_auc = []
best_precision = []
x_keys = []

for cr, sp, depth, leaf, feature in param_grid:
    model = DecisionTreeClassifier(criterion=cr, splitter=sp, max_depth=depth,
                                   min_samples_leaf=leaf, max_features=feature,
                                   random_state=33)
    model.fit(features_train, target_train)

    y_pred = model.predict(features_test)
    y_pred_prob = model.predict_proba(features_test)

    best_f1.append(f1_score(target_test, y_pred))
    best_recall.append(recall_score(target_test, y_pred))
    best_precision.append(precision_score(target_test, y_pred))
    best_roc_auc.append(roc_auc_score(target_test, y_pred_prob[:, 1]))

    x_keys.append(' '.join(str(value) for value in (cr, sp, depth, leaf, feature)))

# Running position of each candidate (x axis of the score plot) and the
# total number of candidates evaluated.
x = list(range(len(x_keys)))
n = len(x_keys)

In [None]:
# One line per metric across all evaluated hyperparameter combinations.
plt.figure(figsize=(15, 5))
for label, scores, colour in (
    ('f1', best_f1, 'r'),
    ('recall', best_recall, 'b'),
    ('roc_auc', best_roc_auc, 'green'),
    ('precision', best_precision, 'yellow'),
):
    plt.plot(x, scores, c=colour, label=label)
plt.legend()
plt.show()

In [None]:
data_score = {'f1': best_f1, 'recall': best_recall, 'precision': best_precision, 'roc_auc': best_roc_auc}

In [None]:
ds_scores = pd.DataFrame(data_score, index=x_keys)

In [None]:
plt.figure(figsize=(4, 6))
sns.heatmap(ds_scores.sort_values(by=['f1'], ascending=False)[:30], annot=True, fmt='.3f');

In [None]:
# Decision tree with explicitly chosen hyperparameters.
# NOTE(review): presumably these values were read off the F1-sorted heatmap
# above - confirm. The grid search fits every candidate with random_state=33
# while this model uses random_state=151; with max_features=4 the seed can
# change which features are considered, so the scores may not match the table.
model_tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=6, min_samples_leaf=1, 
                                   max_features=4, random_state=151)

In [None]:
model_tree.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
# Recompute the hard predictions for the refit tree as well. Without this,
# y_pred still holds the predictions of the last candidate from the grid
# search, and the metrics / confusion matrix below would describe that model
# instead of model_tree.
y_pred = model_tree.predict(ds_test.drop(['stroke'], axis=1))
y_pred_prob = model_tree.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
conf_matrix = confusion_matrix(ds_test.stroke, y_pred)

In [None]:
sns.heatmap(conf_matrix, annot=True, cmap='Blues', fmt='');

### ------------- selection of threshold -----------------

In [None]:
model_tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=6, min_samples_leaf=1, 
                                   max_features=4, random_state=151)

In [None]:
model_tree.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred_prob = model_tree.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
# Sweep 100 evenly spaced decision thresholds over [0, 1] and record recall,
# precision and F1 of "P(class 1) > threshold" on the test set.
threshold = list(np.linspace(0.0, 1.0, 100))
x = list(threshold)

positive_proba = y_pred_prob[:, 1]

rec_sc = []
prec_sc = []
f1_sc = []
for cutoff in threshold:
    flagged = positive_proba > cutoff
    rec_sc.append(recall_score(ds_test.stroke, flagged))
    prec_sc.append(precision_score(ds_test.stroke, flagged))
    f1_sc.append(f1_score(ds_test.stroke, flagged))

In [None]:
# Recall, precision and F1 as a function of the decision threshold.
plt.figure(figsize=(15, 8))
threshold_grid = np.linspace(0.0, 1.0, 100)
for scores, colour, name in (
    (rec_sc, 'b', 'recall'),
    (prec_sc, 'r', 'precision'),
    (f1_sc, 'green', 'f1'),
):
    plt.plot(threshold_grid, scores, color=colour, label=name)
plt.legend(loc='upper right')
plt.show()

In [None]:
ds_sc = pd.DataFrame({'threshold': threshold,'f1': f1_sc, 'recall': rec_sc, 'precision': prec_sc}, index=x)

In [None]:
plt.figure(figsize=(7, 30))
sns.heatmap(ds_sc.sort_values(by='f1',ascending=False), annot=True, fmt='.6f')
plt.title('Table scores');

In [None]:
# Take the F1-maximising threshold straight from the score table instead of a
# hand-copied, truncated literal: the literal is not exactly the value that
# was scored and goes stale whenever the data or the model changes.
best_threshold = ds_sc.loc[ds_sc['f1'].idxmax(), 'threshold']
conf_matrix = confusion_matrix(ds_test.stroke, y_pred_prob[:, 1] > best_threshold)

In [None]:
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='');

In [None]:
# Score at the F1-maximising threshold from the score table rather than at a
# hand-copied literal, so the report always matches the current sweep.
best_threshold = ds_sc.loc[ds_sc['f1'].idxmax(), 'threshold']
print_metrics(ds_test.stroke, y_pred_prob[:, 1] > best_threshold)

### --------------- Bagging ------------------

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging ensemble built on model_tree: 1000 estimators, max_samples=100 rows
# drawn per estimator, sampled with replacement (bootstrap=True).
# n_jobs=-1 lets scikit-learn use all available processors.
model_bagging = BaggingClassifier(model_tree, n_estimators=1000, max_samples=100, bootstrap=True,
                                 random_state=160, n_jobs=-1)

In [None]:
%%time
model_bagging.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred = model_bagging.predict(ds_test.drop(['stroke'], axis=1))

In [None]:
y_pred_prob = model_bagging.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
# Same threshold sweep as for the single tree, now on the bagging
# probabilities: 100 cut-offs in [0, 1], scored on the test set.
x = list(np.linspace(0.0, 1.0, 100))
threshold = list(x)

# Boolean prediction vector for every cut-off, built once and reused by all
# three metrics.
decisions = [y_pred_prob[:, 1] > cutoff for cutoff in x]

rec_sc = [recall_score(ds_test.stroke, decision) for decision in decisions]
prec_sc = [precision_score(ds_test.stroke, decision) for decision in decisions]
f1_sc = [f1_score(ds_test.stroke, decision) for decision in decisions]

In [None]:
# Recall, precision and F1 of the bagging model against the threshold.
curves = {
    'recall': (rec_sc, 'b'),
    'precision': (prec_sc, 'r'),
    'f1': (f1_sc, 'green'),
}

plt.figure(figsize=(15, 8))
for curve_label, (curve_values, curve_colour) in curves.items():
    plt.plot(np.linspace(0.0, 1.0, 100), curve_values, color=curve_colour, label=curve_label)
plt.legend(loc='upper right')
plt.show()

In [None]:
ds_sc = pd.DataFrame({'threshold': threshold,'f1': f1_sc, 'recall': rec_sc, 'precision': prec_sc}, index=x)

In [None]:
plt.figure(figsize=(7, 30))
sns.heatmap(ds_sc.sort_values(by='f1',ascending=False), annot=True, fmt='.6f')
plt.title('Table scores');

In [None]:
# Use the F1-maximising threshold from the score table. The hand-copied
# literal used here differed in its trailing digits from the one used for the
# metrics below, so the two cells were not guaranteed to describe the same cut.
best_threshold = ds_sc.loc[ds_sc['f1'].idxmax(), 'threshold']
conf_matrix = confusion_matrix(ds_test.stroke, y_pred_prob[:, 1] > best_threshold)

In [None]:
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='');

In [None]:
# Score at the F1-maximising threshold from the score table rather than at a
# hand-copied literal, so the report always matches the current sweep.
best_threshold = ds_sc.loc[ds_sc['f1'].idxmax(), 'threshold']
print_metrics(ds_test.stroke, y_pred_prob[:, 1] > best_threshold)