In [None]:
import warnings
# Silence every warning category for the whole notebook.
# NOTE(review): this also hides DeprecationWarning / FutureWarning messages
# from pandas and scikit-learn, so API removals will not be announced before
# they turn into errors.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Load the raw stroke dataset from the Kaggle input directory.
ds = pd.read_csv(
    '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv',
    sep=',',
)

In [None]:
# Peek at the first three raw rows.
ds.head(n=3)

In [None]:
# The row identifier is not used as a feature, so remove the column.
ds = ds.drop(columns=['id'])

In [None]:
# Show the target value of every row whose gender is recorded as 'Other'.
ds.loc[ds['gender'] == 'Other', 'stroke']

In [None]:
# Remove the rows with gender == 'Other' (displayed in the previous cell).
# Filtering on the condition instead of a hard-coded index label keeps the cell
# correct if the file's row order changes and makes it safe to re-run
# (dropping a label twice raises KeyError).
ds = ds.loc[ds['gender'] != 'Other']

In [None]:
# List the column names that remain after the drops above.
ds.columns

In [None]:
# Column groups for the stroke dataset.
# Only `columns_dummis` is referenced by the cells shown in this notebook
# (it is passed to pd.get_dummies); the other lists are declared for reference.
columns_numeric = ['age', 'avg_glucose_level', 'bmi']
columns_categorical = ['gender', 'hypertension', 'heart_disease', 'ever_married', 
                       'work_type', 'Residence_type', 'smoking_status']
# Subset of the categorical columns that gets one-hot encoded.
# NOTE(review): 'hypertension' and 'heart_disease' are left out — presumably
# because they are already 0/1 encoded; confirm against the data.
columns_dummis = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
columns_target = ['stroke']

In [None]:
def replace_nan(data, to_replace, replacement_data, random_state=None):
    """Fill missing values of one column with random integers drawn per class.

    For each class label (0 and 1) in ``replacement_data``, the observed values
    of ``to_replace`` inside that class define a sampling window of
    ``[Q1 - 0.5 * IQR, Q3 + 0.5 * IQR)``. The bounds are truncated with ``int``
    and the upper bound is exclusive. Every missing value of the class is
    replaced by an integer drawn uniformly from that window.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. It is not modified.
    to_replace : str
        Name of the column whose NaN values are filled.
    replacement_data : str
        Name of the binary (0/1) column used to group the rows.
    random_state : int or None, optional
        Seed for reproducible draws. ``None`` (the default) draws from NumPy's
        global random state.

    Returns
    -------
    pandas.DataFrame
        Deep copy of ``data`` with the missing values of ``to_replace`` filled.

    Raises
    ------
    ValueError
        If a class contains missing values but no observed value from which a
        sampling window could be derived.
    """
    data_def = data.copy(deep=True)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    for label in range(2):
        in_class = data_def[replacement_data] == label
        missing = in_class & data_def[to_replace].isnull()
        count = int(missing.sum())
        if count == 0:
            continue

        observed = data_def.loc[in_class, to_replace].dropna()
        if observed.empty:
            raise ValueError(
                'Cannot fill {!r}: no observed values where {!r} == {}'.format(
                    to_replace, replacement_data, label))

        q1 = observed.quantile(0.25)
        q3 = observed.quantile(0.75)
        # Compute the widening once from the untouched quartiles. Shifting the
        # lower bound first and then reusing it would stretch the upper bound
        # by 0.75 * IQR instead of 0.5 * IQR.
        half_iqr = (q3 - q1) * 0.5
        low = int(q1 - half_iqr)
        # Keep at least one candidate value when the window collapses.
        high = max(int(q3 + half_iqr), low + 1)

        # A single .loc assignment writes into data_def itself; chained
        # indexing (frame[col][idx] = ...) may write into a temporary copy.
        data_def.loc[missing, to_replace] = rng.randint(low, high, size=count)

    return data_def

In [None]:
# Impute the missing bmi values separately for each stroke class.
# NOTE(review): the imputation is conditioned on the target and runs before the
# train/test split, so test rows are filled using label information.
ds = replace_nan(data=ds, to_replace='bmi', replacement_data='stroke')

In [None]:
# Check dtypes and non-null counts after the bmi imputation.
ds.info()

In [None]:
# One-hot encode the string-valued categorical columns; the first level of
# each column is dropped to avoid a redundant indicator.
ds = pd.get_dummies(
    ds,
    columns=columns_dummis,
    prefix_sep='_',
    drop_first=True,
)

In [None]:
# Peek at the first three rows of the encoded frame.
ds.head(n=3)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split, stratified on the target so both parts keep the class ratio.
ds_train, ds_test = train_test_split(
    ds,
    test_size=0.3,
    random_state=42,
    stratify=ds['stroke'],
)

In [None]:
# Class proportions in the train and test parts (should match after stratifying).
tuple(part['stroke'].value_counts(normalize=True) for part in (ds_train, ds_test))

In [None]:
# Sanity check: the two parts together contain every row of ds.
len(ds_train) + len(ds_test), ds.shape

In [None]:
# Number of positive rows to add so that positives reach 80% of the negatives.
add_rows = int(
    ds_train['stroke'].value_counts().loc[0] * 0.8
    - ds_train['stroke'].value_counts().loc[1]
)
add_rows

In [None]:
# How many whole copies of the positive rows fit into add_rows, shown next to
# the current class counts.
(
    int(add_rows / ds_train['stroke'].value_counts().loc[1]),
    ds_train['stroke'].value_counts(),
)

In [None]:
# Collect the positive (stroke == 1) training rows for oversampling.
index_train_one = ds_train.index[(ds_train['stroke'] == 1).to_numpy()]
ds_train_one = ds_train.loc[index_train_one]

In [None]:
# Oversample the minority class by adding 14 extra copies of the stroke == 1
# rows (14 is the whole-copy count computed two cells above).
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies it on every pass, so build the result with a single pd.concat.
# Row order and index labels are the same as with repeated appends.
ds_train = pd.concat([ds_train] + [ds_train_one] * 14)

In [None]:
# Class proportions in the training part after oversampling.
ds_train['stroke'].value_counts(normalize=True)

In [None]:
from sklearn.utils import shuffle

In [None]:
# Mix the appended positive copies into the rest of the training rows.
# A fixed seed (the same one used for the split) makes the row order, and
# therefore every order-sensitive fit below, reproducible across runs.
ds_train = shuffle(ds_train, random_state=42)

### --------------- metrics ------------------------

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

In [None]:
def print_metrics(actual, predict):
    """Print accuracy, precision, recall and F1 of ``predict`` against ``actual``.

    Each score is written on its own line with three decimals.
    """
    report_lines = (
        ('Accuracy: {:.3f}', accuracy_score),
        ('Precision: {:.3f}', precision_score),
        ('Recall: {:.3f}', recall_score),
        ('F1 score: {:.3f}', f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(actual, predict)))

In [None]:
def plot_roc_auc(actual, predict):
    """Plot the ROC curve for ``predict`` against ``actual``.

    The curve is drawn in blue, the chance diagonal in red, and the ROC AUC
    (three decimals) is shown in the title. Drawing happens on the current
    matplotlib axes.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(actual, predict)
    chance_line = [0.0, 1.0]

    plt.plot(false_pos_rate, true_pos_rate, color='b')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.plot(chance_line, chance_line, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    auc_value = roc_auc_score(actual, predict)
    plt.title('ROC AUC = {:.3f}'.format(auc_value))

### -------------- Voting ------------------------

In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Base learners and the weighted hard-voting ensemble built from them.
# Each estimator also receives an explicit random_state; the np.random.seed
# calls are kept so the global RNG state matches the original cell.
np.random.seed(160)
model_tree = DecisionTreeClassifier(criterion='gini', splitter='best', max_depth=6, min_samples_leaf=1, 
                                    max_features=4, random_state=160)

np.random.seed(151)
# max_features='auto' was removed from RandomForestClassifier in
# scikit-learn 1.3; for classifiers it meant 'sqrt', so this keeps the same
# feature subsampling while working on current releases.
model_RF = RandomForestClassifier(n_estimators=1000, criterion='entropy', max_leaf_nodes=3, 
                                  max_features='sqrt', bootstrap=False, random_state=151, n_jobs=-1)
np.random.seed(33)
# probability=True enables predict_proba on the SVC (extra internal fitting cost).
model_svc = SVC(kernel='rbf', C=0.4, gamma=0.01, probability=True, random_state=33)
model_LR = LogisticRegression(penalty='l2', C=1.0, random_state=33, n_jobs=-1)

# Hard (majority) voting; the SVC vote is weighted slightly higher than the others.
model_voting = VotingClassifier(estimators=[('tree', model_tree), ('RF', model_RF), ('SVC', model_svc), ('LR', model_LR)], 
                                voting='hard', weights=[0.8, 0.8, 1.0, 0.8], n_jobs=-1)

In [None]:
%%time
model_tree.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)
model_RF.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)
model_svc.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)
model_LR.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)
model_voting.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
# Training-set metrics for each base learner and for the ensemble.
for label, model in (('tree', model_tree), ('RF', model_RF), ('SVC', model_svc),
                     ('LR', model_LR), ('Voting', model_voting)):
    print('-----', label, '-----')
    train_predictions = model.predict(ds_train.drop(columns=['stroke']))
    print_metrics(ds_train['stroke'], train_predictions)
    print()

In [None]:
# Same ensemble definition as in the model-setup cell above; re-creating it
# replaces the already fitted voting model with an unfitted one.
model_voting = VotingClassifier(
    estimators=[
        ('tree', model_tree),
        ('RF', model_RF),
        ('SVC', model_svc),
        ('LR', model_LR),
    ],
    voting='hard',
    weights=[0.8, 0.8, 1.0, 0.8],
    n_jobs=-1,
)

In [None]:
# Fit the freshly created ensemble on the oversampled training data.
model_voting.fit(ds_train.drop(columns=['stroke']), ds_train['stroke'])

#### **TRAIN**

In [None]:
# Ensemble predictions on the (oversampled) training rows.
y_pred = model_voting.predict(ds_train.drop(columns=['stroke']))

In [None]:
# Training-set scores of the ensemble.
print_metrics(actual=ds_train['stroke'], predict=y_pred)

In [None]:
# Confusion matrix on the training rows; fmt='' prints the raw integer counts.
sns.heatmap(
    confusion_matrix(ds_train['stroke'], y_pred),
    cmap='Blues',
    annot=True,
    fmt='',
);

#### **TEST**

In [None]:
# Ensemble predictions on the held-out test rows.
y_pred_test = model_voting.predict(ds_test.drop(columns=['stroke']))

In [None]:
# Test-set scores of the ensemble.
print_metrics(actual=ds_test['stroke'], predict=y_pred_test)

In [None]:
# Confusion matrix on the test rows; fmt='' prints the raw integer counts.
sns.heatmap(
    confusion_matrix(ds_test['stroke'], y_pred_test),
    cmap='Blues',
    annot=True,
    fmt='',
);