### 1. Load libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE

import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px

### 2. Read data and EDA using plotly

In [None]:
# Load the stroke dataset CSV into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv"
)

In [None]:
# Number of missing values in each column.
data.isna().sum(axis="index")

In [None]:
# Overview of every column on a 5x2 grid, filled left-to-right, top-to-bottom.
# Each entry is (column, legend name, is_numeric): numeric columns are drawn
# as histograms, all others as bar charts of their category counts.
panel_specs = [
    ("gender", "Gender", False),
    ("age", "Age", True),
    ("hypertension", "HyperTension", False),
    ("heart_disease", "heart_disease", False),
    ("ever_married", "ever_married", False),
    ("work_type", "work_type", False),
    ("Residence_type", "Residence_type", False),
    ("avg_glucose_level", "avg_glucose_level", True),
    ("smoking_status", "smoking_status", False),
    ("stroke", "stroke", False),
]

fig = make_subplots(rows=5, cols=2)
for position, (column, label, is_numeric) in enumerate(panel_specs):
    if is_numeric:
        trace = go.Histogram(x=data[column], name=label)
    else:
        counts = data[column].value_counts()
        trace = go.Bar(x=counts.index, y=counts.values, name=label)
    # Convert the running position into 1-based grid coordinates.
    grid_row, grid_col = divmod(position, 2)
    fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)
fig.show()

In [None]:
# Remove the rows whose gender is recorded as 'Other'.
other_index = data.loc[data['gender'] == 'Other'].index
data = data.drop(index=other_index)

In [None]:
# Distribution of bmi, coloured by the stroke label.
px.histogram(data_frame=data, x='bmi', color='stroke')

In [None]:
# Impute missing bmi values with the column mean (NaNs are skipped by
# default), rounded to one decimal place.
bmi_mean = round(data['bmi'].mean(), 1)
data['bmi'] = data['bmi'].fillna(value=bmi_mean)
# Sanity check: should display False once every gap is filled.
data['bmi'].isna().any()

### 3. Data Preprocessing

> In general, children belong to the 'never smoked' category.

> So, if work_type is 'children', smoking_status can be changed from 'Unknown' to 'never smoked'.

In [None]:
# Smoking status counts, split by work type.
px.bar(data_frame=data, x='smoking_status', color='work_type')

In [None]:
# Children with an 'Unknown' smoking status are relabelled as 'never smoked'.
status_unknown = data['smoking_status'].eq('Unknown')
is_child = data['work_type'].eq('children')
data.loc[status_unknown & is_child, 'smoking_status'] = 'never smoked'

In [None]:
# Same chart again, to compare the smoking status counts after relabelling.
px.bar(data_frame=data, x='smoking_status', color='work_type')

In [None]:
# One-hot encode each categorical column into its own indicator frame.
gender_dummy, ever_married_dummy, work_type_dummy, Residence_type_dummy = (
    pd.get_dummies(data[column])
    for column in ('gender', 'ever_married', 'work_type', 'Residence_type')
)

In [None]:
# Learn the mean and standard deviation of the continuous columns.
numeric_columns = ['age', 'avg_glucose_level', 'bmi']
sc = StandardScaler()
sc.fit(X=data[numeric_columns])

In [None]:
# Standardise the continuous columns with the fitted scaler.
scaled_columns = ['age', 'avg_glucose_level', 'bmi']
data[scaled_columns] = sc.transform(data[scaled_columns])

# Append the indicator frames, then remove the raw categorical columns they replace.
encoded_frames = [gender_dummy, ever_married_dummy, work_type_dummy, Residence_type_dummy]
data = pd.concat([data, *encoded_frames], axis=1)
data = data.drop(columns=['gender', 'ever_married', 'work_type', 'Residence_type'])

> For the analysis, I will predict the remaining 'Unknown' smoking_status values using a random forest.

In [None]:
# Work on an independent copy so the imputation steps leave `data` untouched.
smoke_train = data.copy(deep=True)

In [None]:
# Rows with a known smoking status train the imputer; 'Unknown' rows get predicted.
status_is_unknown = smoke_train['smoking_status'] == 'Unknown'
smoke_tr = smoke_train.loc[~status_is_unknown, :]
smoke_te = smoke_train.loc[status_is_unknown, :]

In [None]:
# Features are every column except the smoking status; the status is the target.
smoke_x = smoke_tr.drop(columns=['smoking_status'])
smoke_y = smoke_tr.loc[:, 'smoking_status']

In [None]:
# Check whether the imputation target contains any missing labels.
smoke_y.isna().any()

In [None]:
# Fit a random forest with default settings to predict the smoking status.
rf = RandomForestClassifier()
rf.fit(X=smoke_x, y=smoke_y)

In [None]:
# Drop the placeholder 'Unknown' column so the columns match the training features.
smoke_te = smoke_te.drop(columns=['smoking_status'])

In [None]:
# Predict the smoking status of the 'Unknown' rows and store it back as a column.
smoke_pred = rf.predict(X=smoke_te)
smoke_te = smoke_te.assign(smoking_status=smoke_pred)

In [None]:
# Stack the originally labelled rows and the imputed rows back into one frame.
data_pre = pd.concat([smoke_tr, smoke_te], axis=0)

In [None]:
# Per-column check for any remaining missing values.
data_pre.isna().any()

### 4. SMOTE (balancing the imbalanced stroke data)

In [None]:
# Class balance of the stroke target before oversampling.
data_pre.loc[:, 'stroke'].value_counts()

In [None]:
# One-hot encode the (now fully populated) smoking status column.
smoking_status_dummy = pd.get_dummies(data_pre['smoking_status'])

In [None]:
# Swap the raw smoking status column for its indicator columns.
data_pre = (
    pd.concat([data_pre, smoking_status_dummy], axis=1)
    .drop(columns=['smoking_status'])
)
data_pre

In [None]:
# Oversample the minority stroke class with SMOTE; the seed keeps the
# synthetic samples reproducible.
sm = SMOTE(random_state=0)

# NOTE(review): every column of data_pre other than 'stroke' is used as a
# feature -- confirm that no identifier-like column is still present.
X_train = data_pre.drop(columns=['stroke'])
y_train = data_pre['stroke']

# Series.ravel() is deprecated in recent pandas releases; to_numpy() hands
# SMOTE the same 1-D ndarray, so y_train_res is still returned as an ndarray.
X_train_res, y_train_res = sm.fit_resample(X_train, y_train.to_numpy())

In [None]:
# Shapes after oversampling, followed by the sample count of class 1 and class 0.
print(X_train_res.shape)
print(y_train_res.shape)
for label in (1, 0):
    print((y_train_res == label).sum())

> Define a function that plots a confusion matrix.

In [None]:
import itertools
import matplotlib.pyplot as plt

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Draw a confusion matrix as an annotated heat map on the current figure.

    The function neither creates nor shows a figure; it only draws with
    pyplot and returns None.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are labelled as true labels and columns as
        predicted labels.
    classes : sequence
        Tick labels used on both axes.
    normalize : bool, default False
        When True, every row is divided by its row sum before anything is
        drawn, so colours, annotations and the text-colour threshold all
        refer to the same per-row fractions. Rows that sum to zero are
        shown as zeros.
    title : str
        Title placed above the plot.
    cmap : matplotlib colormap
        Colormap passed to ``imshow``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing; otherwise the image shows raw counts while
    # the annotations and the threshold below use fractions.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Fractions are shortened to two decimals; raw values are shown as-is.
        label = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

### 5. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_auc_score, roc_curve, recall_score, classification_report, accuracy_score

In [None]:
# Search the regularisation parameter C over 1..15 with 5-fold cross-validation
# on the oversampled data.
lr_cv_param = {'C': [*range(1, 16)]}
lr = LogisticRegression()
clf = GridSearchCV(estimator=lr, param_grid=lr_cv_param, cv=5, verbose=5, n_jobs=3)
clf.fit(X_train_res, np.ravel(y_train_res))

In [None]:
# Show the parameter setting that scored best in the logistic regression grid search.
clf.best_params_

In [None]:
# Refit on the oversampled data with the C selected by the grid search,
# rather than a hand-copied constant that can go stale when the data or the
# search grid changes.
lr1 = LogisticRegression(C=clf.best_params_['C'], penalty='l2', verbose=5)
lr1.fit(X_train_res, y_train_res.ravel())

In [None]:
# Evaluate the logistic regression on the original (not oversampled) data.
y_train_pre = lr1.predict(X_train)

cnf_matrix_tra = confusion_matrix(y_train, y_train_pre)
# accuracy_score returns a fraction in [0, 1]; scale it to a percentage so
# the printed number matches the '%' suffix.
print(round(accuracy_score(y_train, y_train_pre) * 100, 2), '%')

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()

### 6. Random Forest

In [None]:
# Grid-search the number of trees and the tree depth (both 5..15) with
# 4-fold cross-validation, scored by accuracy, on the oversampled data.
rf = RandomForestClassifier()
rf_param_grid = {
    'n_estimators': [*range(5, 16)],
    'max_depth': [*range(5, 16)],
}
rf_grid = GridSearchCV(
    rf,
    rf_param_grid,
    scoring='accuracy',
    cv=4,
    return_train_score=True,
)

rf_grid.fit(X_train_res, np.ravel(y_train_res))

In [None]:
# Show the parameter setting that scored best in the random forest grid search.
rf_grid.best_params_

In [None]:
# Refit with the parameters the grid search actually selected. The forest is
# unseeded, so the winning combination can differ between runs and a
# hard-coded pair may not match it.
rf_best = RandomForestClassifier(**rf_grid.best_params_)
rf_best.fit(X_train_res, y_train_res.ravel())

In [None]:
# Evaluate the tuned random forest on the original (not oversampled) data.
y_train_pre = rf_best.predict(X_train)

cnf_matrix_tra = confusion_matrix(y_train, y_train_pre)
# accuracy_score returns a fraction in [0, 1]; scale it to a percentage so
# the printed number matches the '%' suffix.
print(round(accuracy_score(y_train, y_train_pre) * 100, 2), '%')

class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix_tra , classes=class_names, title='Confusion matrix')
plt.show()