In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
ds = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv', delimiter=',')

In [None]:
ds.head(3)

In [None]:
ds.drop(['id'], axis=1, inplace=True)

In [None]:
ds.stroke[ds.gender == 'Other']

In [None]:
# Remove the record(s) whose gender is 'Other' by value instead of by a
# hard-coded row label: this keeps working if the CSV row order changes and
# the cell can be re-run safely (dropping a label twice raises KeyError).
ds = ds[ds.gender != 'Other'].copy()

In [None]:
ds.columns

In [None]:
# Continuous features; these are the columns later passed to the scalers.
columns_numeric = ['age', 'avg_glucose_level', 'bmi']
# Categorical / binary feature names (not referenced elsewhere in the visible notebook).
columns_categorical = ['gender', 'hypertension', 'heart_disease', 'ever_married', 
                       'work_type', 'Residence_type', 'smoking_status']
# Columns one-hot encoded with pd.get_dummies further down.
columns_dummis = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
# Target column name (not referenced elsewhere in the visible notebook).
columns_target = ['stroke']

In [None]:
def replace_nan(data, to_replace, replacement_data):
    """Fill missing values of one column with random integers, per class.

    For each class label (0 and 1) found in ``replacement_data`` the quartiles
    of ``to_replace`` are computed inside that class, the inter-quartile range
    is widened by half of its width on both sides, and every missing value of
    the class is replaced by an integer drawn uniformly (via
    ``np.random.choice``) from ``range(int(lower), int(upper))``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    to_replace : str
        Name of the numeric column whose NaN values are filled.
    replacement_data : str
        Name of the column holding the 0/1 class labels used for grouping.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``data`` with the missing values of ``to_replace``
        filled for rows whose label is 0 or 1.
    """
    data_def = data.copy(deep=True)

    for label in range(2):
        in_group = data_def[replacement_data] == label
        group_values = data_def.loc[in_group, to_replace]

        index_null = group_values[group_values.isnull()].index
        if len(index_null) == 0:
            # Nothing to impute for this class (also covers an empty class,
            # whose quartiles would be NaN).
            continue

        q1 = group_values.quantile(0.25)
        q3 = group_values.quantile(0.75)

        # Widen both bounds by the SAME amount. Computing the upper bound from
        # the already-lowered minimum would stretch the range upwards only.
        half_iqr = (q3 - q1) * 0.5
        minimum = q1 - half_iqr
        maximum = q3 + half_iqr

        data_for_nan = np.random.choice(range(int(minimum), int(maximum)), len(index_null))

        # Single .loc assignment: chained indexing (df[col][idx] = ...) may
        # write to a temporary copy and leave the frame unchanged.
        data_def.loc[index_null, to_replace] = data_for_nan

    return data_def

In [None]:
ds = replace_nan(ds, 'bmi', 'stroke')

In [None]:
ds.info()

In [None]:
ds = pd.get_dummies(ds, columns=columns_dummis, prefix_sep='_', drop_first=True)

In [None]:
ds.head(3)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
ds_train, ds_test = train_test_split(ds, test_size=0.3, random_state=42, stratify=ds.stroke)

In [None]:
ds_train.stroke.value_counts(normalize=True), ds_test.stroke.value_counts(normalize=True)

In [None]:
ds_train.shape[0] + ds_test.shape[0], ds.shape

In [None]:
# Number of extra stroke == 1 rows needed for the positive class to reach
# 80% of the size of the stroke == 0 class in the training split.
add_rows = int(ds_train.stroke.value_counts()[0] * .8 - ds_train.stroke.value_counts()[1])
add_rows

In [None]:
int(add_rows / ds_train.stroke.value_counts()[1]), ds_train.stroke.value_counts()

In [None]:
index_train_one = ds_train.stroke[ds_train.stroke == 1].index
ds_train_one = ds_train.loc[index_train_one]

In [None]:
# Oversample the minority class: append 14 extra copies of the stroke == 1
# training rows. DataFrame.append was removed in pandas 2.0, and appending in
# a loop re-copies the whole frame each time, so build the result with a
# single concat (same rows, same order).
ds_train = pd.concat([ds_train] + [ds_train_one] * 14)

In [None]:
ds_train.stroke.value_counts(normalize=True)

In [None]:
from sklearn.utils import shuffle

In [None]:
# Fixed seed so the row order of the oversampled training set is reproducible.
ds_train = shuffle(ds_train, random_state=42)

### --------------- metrics ------------------------

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score, roc_curve

In [None]:
def print_metrics(actual, predict):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    ``actual`` holds the true labels and ``predict`` the predicted labels;
    every score is printed on its own line with three decimals.
    """
    report_lines = (
        ('Accuracy: {:.3f}', accuracy_score),
        ('Precision: {:.3f}', precision_score),
        ('Recall: {:.3f}', recall_score),
        ('F1 score: {:.3f}', f1_score),
    )
    for template, scorer in report_lines:
        print(template.format(scorer(actual, predict)))

In [None]:
def plot_roc_auc(actual, predict):
    """Draw the ROC curve on the current matplotlib figure.

    ``actual`` holds the true labels and ``predict`` the scores passed to
    ``roc_curve`` / ``roc_auc_score``. The curve is drawn in blue, the
    diagonal reference line in red, and the AUC value goes into the title.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(actual, predict)

    plt.plot(false_pos_rate, true_pos_rate, color='b')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0.0, 1.0], [0.0, 1.0], color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

    auc_value = roc_auc_score(actual, predict)
    plt.title('ROC AUC = {:.3f}'.format(auc_value))

### --------------- Scaler ---------------------

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

In [None]:
scaler = StandardScaler()

In [None]:
ds_train[columns_numeric] =  scaler.fit_transform(ds_train[columns_numeric])

In [None]:
ds_train

In [None]:
ds_test[columns_numeric] = scaler.transform(ds_test[columns_numeric])

### ------------ SVC ----------------------

In [None]:
from sklearn.svm import SVC

In [None]:
np.random.seed(33)
model_svc = SVC(probability=True, random_state=33)

In [None]:
%%time
model_svc.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred = model_svc.predict(ds_test.drop(['stroke'], axis=1))

In [None]:
y_pred_prob = model_svc.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
sns.heatmap(confusion_matrix(ds_test.stroke, y_pred), cmap='Blues', annot=True, fmt='');

In [None]:
plot_roc_auc(ds_test.stroke, y_pred_prob[:, 1])

#### selection of parameters

In [None]:
%%time
# Manual grid search over C and gamma for an RBF-kernel SVC.
# One entry per parameter combination is appended to each list; `x` holds the
# running row number and `x_keys` the "kernel C gamma" label.
# NOTE(review): every candidate is scored on ds_test, so the selected
# parameters are tuned on the test data — consider a validation split or
# cross-validation on the training data instead.
best_f1 = []
best_recall = []
best_roc_auc = []
best_precision = []
x = []
x_keys = []
n = 0

# The feature matrices do not change between candidates: build them once
# instead of dropping the target column three times per iteration.
X_train = ds_train.drop(['stroke'], axis=1)
X_test = ds_test.drop(['stroke'], axis=1)

for cr in ['rbf']:
    for c in [0.8, 0.85, 0.9, 0.95, 1.0]:
        for g in [0.004, 0.0045, 0.005, 0.0055, 0.006]:
            model = SVC(kernel=cr, C=c, gamma=g, probability=True, random_state=33)

            model.fit(X_train, ds_train.stroke)
            y_pred = model.predict(X_test)

            f1 = f1_score(ds_test.stroke, y_pred)
            best_f1.append(f1)

            recall = recall_score(ds_test.stroke, y_pred)
            best_recall.append(recall)

            precision = precision_score(ds_test.stroke, y_pred)
            best_precision.append(precision)

            # Probability of the positive class is in column 1.
            y_pred_prob = model.predict_proba(X_test)
            roc_auc = roc_auc_score(ds_test.stroke, y_pred_prob[:, 1])
            best_roc_auc.append(roc_auc)

            x_keys.append(str(cr) +' '+ str(c) +' '+ str(g))
            x.append(n)
            n += 1

In [None]:
scores = {'f1': best_f1, 'recall': best_recall, 'precision': best_precision, 'roc_auc': best_roc_auc, 
          'key': x_keys}

In [None]:
df_scores = pd.DataFrame(scores, index=x)

In [None]:
df_scores.sort_values(by=['f1', 'recall', 'precision'], ascending=False)

In [None]:
np.random.seed(33)

In [None]:
model_svc = SVC(kernel='rbf', C=0.95, gamma=0.0045, probability=True, random_state=33)

In [None]:
model_svc.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred = model_svc.predict(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
# Class probabilities of the tuned model. The name must be `y_pred_prob`
# (it was misspelled `y_ped_prob`): the ROC plot and the threshold search in
# the following cells read `y_pred_prob`, which otherwise still holds the
# probabilities of the last model fitted in the grid-search loop.
y_pred_prob = model_svc.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
sns.heatmap(confusion_matrix(ds_test.stroke, y_pred), 
            cmap='Blues', annot=True, fmt='');

In [None]:
plot_roc_auc(ds_test.stroke, y_pred_prob[:, 1])

#### selection of threshold

In [None]:
%%time
# Sweep 1000 candidate decision thresholds in [0.5, 0.75] and record the
# recall, precision and F1 obtained on ds_test for each of them.
threshold, f1_sc, rec_sc, prec_sc = [], [], [], []

proba_positive = y_pred_prob[:, 1]

for i in np.linspace(0.5, 0.75, 1000):
    flagged = proba_positive > i  # predicted positive when probability exceeds i

    recall_sc = recall_score(ds_test.stroke, flagged)
    precision_sc = precision_score(ds_test.stroke, flagged)
    f_one = f1_score(ds_test.stroke, flagged)

    threshold.append(i)
    f1_sc.append(f_one)
    rec_sc.append(recall_sc)
    prec_sc.append(precision_sc)

In [None]:
# Recall, precision and F1 as a function of the decision threshold.
# The x values come from the `threshold` list filled by the sweep, so the
# plot cannot drift out of sync with the range that was actually evaluated.
plt.figure(figsize=(15, 8))
plt.plot(threshold, rec_sc, color='b', label='recall')
plt.plot(threshold, prec_sc, color='r', label='precision')
plt.plot(threshold, f1_sc, color='green', label='f1')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.title('Recall, precision and F1 vs. decision threshold')
plt.legend(loc='upper right')
plt.show()

In [None]:
df_threshold = pd.DataFrame({'threshold': threshold, 'f1': f1_sc, 'recall': rec_sc, 'precision': prec_sc}, 
                           index=range(len(threshold)))

In [None]:
plt.figure(figsize=(8, 8))
sns.heatmap(df_threshold.sort_values(by=['f1'], ascending=False)[:40], annot=True, fmt='.4f');

In [None]:
threshold[703]

In [None]:
conf_matrix = confusion_matrix(ds_test.stroke, y_pred_prob[:, 1] > 0.6759259259259259)

In [None]:
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='');

In [None]:
print_metrics(ds_test.stroke, y_pred_prob[:, 1] > 0.6759259259259259)

### --------------- MinMaxScaler ---------------------

#### Reload the dataset for the MinMaxScaler experiment

In [None]:
ds = pd.read_csv('../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv', delimiter=',')

In [None]:
ds.head(3)

In [None]:
ds.drop(['id'], axis=1, inplace=True)

In [None]:
ds.stroke[ds.gender == 'Other']

In [None]:
# Remove the record(s) whose gender is 'Other' by value instead of by a
# hard-coded row label: this keeps working if the CSV row order changes and
# the cell can be re-run safely (dropping a label twice raises KeyError).
ds = ds[ds.gender != 'Other'].copy()

In [None]:
def replace_nan(data, to_replace, replacement_data):
    """Fill missing values of one column with random integers, per class.

    For each class label (0 and 1) found in ``replacement_data`` the quartiles
    of ``to_replace`` are computed inside that class, the inter-quartile range
    is widened by half of its width on both sides, and every missing value of
    the class is replaced by an integer drawn uniformly (via
    ``np.random.choice``) from ``range(int(lower), int(upper))``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; it is not modified.
    to_replace : str
        Name of the numeric column whose NaN values are filled.
    replacement_data : str
        Name of the column holding the 0/1 class labels used for grouping.

    Returns
    -------
    pandas.DataFrame
        A deep copy of ``data`` with the missing values of ``to_replace``
        filled for rows whose label is 0 or 1.
    """
    data_def = data.copy(deep=True)

    for label in range(2):
        in_group = data_def[replacement_data] == label
        group_values = data_def.loc[in_group, to_replace]

        index_null = group_values[group_values.isnull()].index
        if len(index_null) == 0:
            # Nothing to impute for this class (also covers an empty class,
            # whose quartiles would be NaN).
            continue

        q1 = group_values.quantile(0.25)
        q3 = group_values.quantile(0.75)

        # Widen both bounds by the SAME amount. Computing the upper bound from
        # the already-lowered minimum would stretch the range upwards only.
        half_iqr = (q3 - q1) * 0.5
        minimum = q1 - half_iqr
        maximum = q3 + half_iqr

        data_for_nan = np.random.choice(range(int(minimum), int(maximum)), len(index_null))

        # Single .loc assignment: chained indexing (df[col][idx] = ...) may
        # write to a temporary copy and leave the frame unchanged.
        data_def.loc[index_null, to_replace] = data_for_nan

    return data_def

In [None]:
ds = replace_nan(ds, 'bmi', 'stroke')

In [None]:
ds = pd.get_dummies(ds, columns=columns_dummis, prefix_sep='_', drop_first=True)

In [None]:
ds.head(3)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
ds_train, ds_test = train_test_split(ds, test_size=0.3, random_state=42, stratify=ds.stroke)

In [None]:
ds_train.stroke.value_counts(normalize=True), ds_test.stroke.value_counts(normalize=True)

In [None]:
ds_train.shape[0] + ds_test.shape[0], ds.shape

In [None]:
add_rows = int(ds_train.stroke.value_counts()[0] * .8 - ds_train.stroke.value_counts()[1])
add_rows

In [None]:
int(add_rows / ds_train.stroke.value_counts()[1]), ds_train.stroke.value_counts()

In [None]:
index_train_one = ds_train.stroke[ds_train.stroke == 1].index
ds_train_one = ds_train.loc[index_train_one]

In [None]:
# Oversample the minority class: append 14 extra copies of the stroke == 1
# training rows. DataFrame.append was removed in pandas 2.0, and appending in
# a loop re-copies the whole frame each time, so build the result with a
# single concat (same rows, same order).
ds_train = pd.concat([ds_train] + [ds_train_one] * 14)

In [None]:
ds_train.stroke.value_counts(normalize=True)

In [None]:
from sklearn.utils import shuffle

In [None]:
# Fixed seed so the row order of the oversampled training set is reproducible.
ds_train = shuffle(ds_train, random_state=42)

In [None]:
scaler_MM = MinMaxScaler()

In [None]:
ds_train[columns_numeric] =  scaler_MM.fit_transform(ds_train[columns_numeric])

In [None]:
ds_train.head(3)

In [None]:
ds_test[columns_numeric] = scaler_MM.transform(ds_test[columns_numeric])

### --------------- SVC --------------------------

#### selection of params

In [None]:
%%time
# Manual grid search over C and gamma for an RBF-kernel SVC on the
# MinMax-scaled data. One entry per parameter combination is appended to each
# list; `x` holds the running row number and `x_keys` the "kernel C gamma" label.
# NOTE(review): every candidate is scored on ds_test, so the selected
# parameters are tuned on the test data — consider a validation split or
# cross-validation on the training data instead.
best_f1 = []
best_recall = []
best_roc_auc = []
best_precision = []
x = []
x_keys = []
n = 0

# The feature matrices do not change between candidates: build them once
# instead of dropping the target column three times per iteration.
X_train = ds_train.drop(['stroke'], axis=1)
X_test = ds_test.drop(['stroke'], axis=1)

for cr in ['rbf']:
    for c in [0.20, 0.25, 0.30, 0.35, 0.4]:
        for g in [0.0001, 0.005, 0.01, 0.05, 0.1]:
            model = SVC(kernel=cr, C=c, gamma=g, probability=True, random_state=33)

            model.fit(X_train, ds_train.stroke)
            y_pred = model.predict(X_test)

            f1 = f1_score(ds_test.stroke, y_pred)
            best_f1.append(f1)

            recall = recall_score(ds_test.stroke, y_pred)
            best_recall.append(recall)

            precision = precision_score(ds_test.stroke, y_pred)
            best_precision.append(precision)

            # Probability of the positive class is in column 1.
            y_pred_prob = model.predict_proba(X_test)
            roc_auc = roc_auc_score(ds_test.stroke, y_pred_prob[:, 1])
            best_roc_auc.append(roc_auc)

            x_keys.append(str(cr) +' '+ str(c) +' '+ str(g))
            x.append(n)
            n += 1

In [None]:
scores = {'f1': best_f1, 'recall': best_recall, 'precision': best_precision, 'roc_auc': best_roc_auc, 'key': x_keys}

df_scores = pd.DataFrame(scores, index=x)

In [None]:
df_scores.sort_values(by=['f1', 'recall'], ascending=False)[:20]

In [None]:
plt.figure(figsize=(8, 10))
sns.heatmap(df_scores.loc[:, ['f1', 'recall', 'precision', 'roc_auc']].sort_values(by='f1', ascending=False)[:30], 
           annot=True, fmt='.5f');

In [None]:
model_svc = SVC(kernel='rbf', C=0.4, gamma=0.01, probability=True, random_state=33)

In [None]:
model_svc.fit(ds_train.drop(['stroke'], axis=1), ds_train.stroke)

In [None]:
y_pred = model_svc.predict(ds_test.drop(['stroke'], axis=1))

In [None]:
print_metrics(ds_test.stroke, y_pred)

In [None]:
# Class probabilities of the tuned model. The name must be `y_pred_prob`
# (it was misspelled `y_ped_prob`): the ROC plot and the threshold search in
# the following cells read `y_pred_prob`, which otherwise still holds the
# probabilities of the last model fitted in the grid-search loop.
y_pred_prob = model_svc.predict_proba(ds_test.drop(['stroke'], axis=1))

In [None]:
sns.heatmap(confusion_matrix(ds_test.stroke, y_pred), 
            cmap='Blues', annot=True, fmt='');

In [None]:
plot_roc_auc(ds_test.stroke, y_pred_prob[:, 1])

#### selection of threshold

In [None]:
# Sweep 1000 candidate decision thresholds in [0.50, 0.80] and record the
# recall, precision and F1 obtained on ds_test for each of them.
threshold, f1_sc, rec_sc, prec_sc = [], [], [], []

proba_positive = y_pred_prob[:, 1]

for i in np.linspace(0.50, 0.80, 1000):
    flagged = proba_positive > i  # predicted positive when probability exceeds i

    recall_sc = recall_score(ds_test.stroke, flagged)
    precision_sc = precision_score(ds_test.stroke, flagged)
    f_one = f1_score(ds_test.stroke, flagged)

    threshold.append(i)
    f1_sc.append(f_one)
    rec_sc.append(recall_sc)
    prec_sc.append(precision_sc)

In [None]:
# Recall, precision and F1 as a function of the decision threshold.
# The x values come from the `threshold` list filled by the sweep, so the
# plot cannot drift out of sync with the range that was actually evaluated.
plt.figure(figsize=(15, 8))
plt.plot(threshold, rec_sc, color='b', label='recall')
plt.plot(threshold, prec_sc, color='r', label='precision')
plt.plot(threshold, f1_sc, color='green', label='f1')
plt.xlabel('Decision threshold')
plt.ylabel('Score')
plt.title('Recall, precision and F1 vs. decision threshold')
plt.legend(loc='upper right')
plt.show()

In [None]:
df_threshold = pd.DataFrame({'threshold': threshold, 'f1': f1_sc, 'recall': rec_sc, 'precision': prec_sc}, 
                           index=range(len(threshold)))

In [None]:
plt.figure(figsize=(8, 10))
sns.heatmap(df_threshold.sort_values(by=['f1'], ascending=False)[:50], annot=True, fmt='.4f');

In [None]:
threshold[504]

In [None]:
conf_matrix = confusion_matrix(ds_test.stroke, y_pred_prob[:, 1] > 0.6513513513513514)

In [None]:
sns.heatmap(conf_matrix, cmap='Blues', annot=True, fmt='');

In [None]:
print_metrics(ds_test.stroke, y_pred_prob[:, 1] > 0.6513513513513514)

### ------ Best params --------------

- Scaler -> StandardScaler      
- SVC      
    - kernel='rbf'       
    - C=0.95         
    - gamma=0.0045       

- Decision threshold on the predicted stroke probability: 0.6759259259259259      


Results on the test set:      
- Accuracy: 0.871
- Precision: 0.214
- Recall: 0.613
- F1 score: 0.317   