In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
np.set_printoptions(precision=4)
from sklearn.metrics import accuracy_score, recall_score, precision_score , confusion_matrix
from sklearn.inspection import permutation_importance
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the stroke dataset from the Kaggle input directory and preview the first rows.
DATA_PATH = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Number of records per gender value (counted on the `id` column).
df.groupby('gender')[['id']].count()

In [None]:
# Remove the rows whose gender is 'Other'.
# `.copy()` makes the filtered frame an independent object: later cells assign
# into it with `df.loc[...] = ...`, which on a boolean-filtered slice is a
# chained assignment (the warning is hidden by warnings.filterwarnings('ignore')).
df = df[df['gender'] != 'Other'].copy()
df.head()

In [None]:
# Missing values per column.
df.isna().sum()

In [None]:
# Mean BMI over the rows with stroke == 1 (missing values are skipped by .mean()).
mean_bmi_has_stroke = df.loc[df['stroke'] == 1, 'bmi'].mean()
mean_bmi_has_stroke

In [None]:
# Mean BMI over the rows with stroke == 0 (missing values are skipped by .mean()).
mean_bmi_no_stroke = df.loc[df['stroke'] == 0, 'bmi'].mean()
mean_bmi_no_stroke

In [None]:
# Fill missing BMI values with the mean BMI of the row's own stroke class.
# NOTE(review): the fill value depends on `stroke`, which is the prediction
# target further down; if `bmi` is ever used as a model feature this leaks the
# label into the feature. Consider a class-independent fill in that case.
bmi_fill = df['stroke'].map({1: mean_bmi_has_stroke, 0: mean_bmi_no_stroke})
df['bmi'] = df['bmi'].fillna(bmi_fill)

In [None]:
# Display the frame after the BMI imputation above.
df

In [None]:
# Number of records per stroke class, returned as a flat two-column table.
df.groupby('stroke')[['id']].count().reset_index()

In [None]:
# Frames read by the plotting / summary helpers below:
#   df_all    - every record (independent copy of df)
#   df_stroke - records with stroke == 1
#   df_non    - records with stroke == 0
df_all = df.copy()
df_stroke = df.loc[df['stroke'] == 1]
df_non = df.loc[df['stroke'] == 0]

In [None]:
def plot_pie(column, title="All"):
    """Show a pie chart of the value shares of ``column`` in ``df_all``.

    Args:
        column: Name of the ``df_all`` column to summarise.
        title: Text placed above the chart.
    """
    _, ax = plt.subplots(1, 1)
    shares = df_all[column].value_counts()
    # Each wedge is labelled with its category and its percentage share.
    ax.pie(shares, autopct='%1.2f%%', labels=shares.index)
    ax.set_title(title)
    plt.show()
    
def plot_hist(column, title="all"):
    """Show a density-normalised histogram of ``column`` in ``df_all``.

    Args:
        column: Name of the ``df_all`` column to plot.
        title: Text placed above the chart.
    """
    values = df_all[column]
    ax = plt.gca()
    ax.hist(values, density=True)
    ax.set_title(title)
    plt.show()

def plot_bar(column, sort=False, title="all"):
    """Show a bar chart of the value counts of ``column`` in ``df_all``.

    Args:
        column: Name of the ``df_all`` column to count.
        sort: When true, order the bars by category value instead of by count.
        title: Text placed above the chart.
    """
    counts = df_all[column].value_counts()
    if sort:
        counts = counts.sort_index()
    ax = plt.gca()
    ax.bar(counts.index, counts)
    ax.set_title(title)
    plt.show()
    
def plot_bar_compare(column, sort=False):
    """Show stacked bar charts of ``column`` counts: no-stroke above, stroke below.

    Args:
        column: Column present in both ``df_non`` and ``df_stroke``.
        sort: When true, order the bars by category value instead of by count.
    """
    stroke_counts = df_stroke[column].value_counts()
    non_stroke_counts = df_non[column].value_counts()
    if sort:
        stroke_counts = stroke_counts.sort_index()
        non_stroke_counts = non_stroke_counts.sort_index()

    _, (top_ax, bottom_ax) = plt.subplots(2, 1)
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    panels = (
        (top_ax, non_stroke_counts, 'No Stroke'),
        (bottom_ax, stroke_counts, 'Has Stroke'),
    )
    for ax, counts, label in panels:
        ax.bar(counts.index, counts)
        ax.title.set_text(label)
    plt.show()

def plot_hist_compare(column, bins=5):
    """Show side-by-side histograms of ``column`` for the two stroke classes.

    Args:
        column: Column present in both ``df_non`` and ``df_stroke``.
        bins: Number of histogram bins (or any bin specification accepted by
            ``plt.hist``). Previously this argument was accepted but ignored.
    """
    plt.hist(
        [df_non[column], df_stroke[column]],
        bins=bins,
        color=['c', 'r'],
        label=['No Stroke', 'Has Stroke'],
    )
    plt.legend()
    plt.show()
    
def plot_pie_compare(column):
    """Show stacked pie charts of ``column`` shares: no-stroke above, stroke below.

    Args:
        column: Column present in both ``df_non`` and ``df_stroke``.
    """
    stroke_shares = df_stroke[column].value_counts()
    non_stroke_shares = df_non[column].value_counts()

    _, (top_ax, bottom_ax) = plt.subplots(2, 1)
    plt.subplots_adjust(left=0, bottom=0, right=1, top=2, wspace=0, hspace=0.2)
    panels = (
        (top_ax, non_stroke_shares, 'No Stroke'),
        (bottom_ax, stroke_shares, 'Has Stroke'),
    )
    for ax, shares, label in panels:
        ax.pie(shares, autopct='%1.2f%%', labels=shares.index)
        ax.title.set_text(label)
    plt.show()

def plot_boxplot(column, title=""):
    """Draw a box plot of ``column`` from ``df_all``, one box per stroke class.

    Args:
        column: Numeric ``df_all`` column placed on the y axis.
        title: Text placed above the chart.
    """
    ax = sns.boxplot(
        data=df_all,
        x="stroke",
        y=column,
        hue="stroke",
        palette=["c", "r"],
    )
    ax.set_title(title, fontsize=15)

def check_median(column):
    """Print the median (the ``50%`` entry of ``describe()``) of ``column`` per class.

    Args:
        column: Numeric column present in both ``df_non`` and ``df_stroke``.
    """
    stroke_summary = df_stroke[column].describe()
    non_stroke_summary = df_non[column].describe()
    for label, summary in (('No Stroke', non_stroke_summary), ('Has Stroke', stroke_summary)):
        print(f"{label}: {summary['50%']}")

def check_most(column):
    """Print the first (most frequent) ``value_counts()`` entry of ``column`` per class.

    Args:
        column: Column present in both ``df_non`` and ``df_stroke``.
    """
    stroke_counts = df_stroke[column].value_counts()
    non_stroke_counts = df_non[column].value_counts()
    for label, counts in (('No Stroke', non_stroke_counts), ('Has Stroke', stroke_counts)):
        print(f"{label}: {counts.index[0]}")

# Stroke

In [None]:
# Share of each stroke class across all records.
plot_pie(column='stroke')

# Gender

In [None]:
# Gender shares overall, then separately for each stroke class.
plot_pie(column='gender')
plot_pie_compare(column='gender')

# Age

In [None]:
# Age distribution overall, then separately for each stroke class.
plot_hist(column='age')
plot_hist_compare(column='age')

In [None]:
# Median age per stroke class.
check_median(column='age')

# hypertension

In [None]:
# Hypertension shares overall, then separately for each stroke class.
plot_pie(column='hypertension')
plot_pie_compare(column='hypertension')

In [None]:
# Most frequent hypertension value per stroke class.
check_most(column='hypertension')

# heart_disease

In [None]:
# Heart-disease shares overall, then separately for each stroke class.
plot_pie(column='heart_disease')
plot_pie_compare(column='heart_disease')

# ever_married

In [None]:
# Marital-history shares overall, then separately for each stroke class.
plot_pie(column='ever_married')
plot_pie_compare(column='ever_married')

In [None]:
# Most frequent ever_married value per stroke class.
check_most(column='ever_married')

# work_type

In [None]:
# Work-type shares overall, then separately for each stroke class.
plot_pie(column='work_type')
plot_pie_compare(column='work_type')

# Residence_type

In [None]:
# Residence-type shares overall, then separately for each stroke class.
plot_pie(column='Residence_type')
plot_pie_compare(column='Residence_type')

# avg_glucose_level

In [None]:
# Average glucose level distribution overall, then per stroke class.
plot_hist(column='avg_glucose_level')
plot_hist_compare(column='avg_glucose_level')

In [None]:
# Box plot of average glucose level, one box per stroke class.
plot_boxplot(column='avg_glucose_level')

In [None]:
# Median average glucose level per stroke class.
check_median(column='avg_glucose_level')

# bmi

In [None]:
# BMI distribution overall, then separately for each stroke class.
plot_hist(column='bmi')
plot_hist_compare(column='bmi')

In [None]:
# Box plot of BMI, one box per stroke class.
plot_boxplot(column='bmi')

In [None]:
# Median BMI per stroke class.
check_median(column='bmi')

# smoking_status

In [None]:
# Smoking-status shares overall, then separately for each stroke class.
plot_pie(column='smoking_status')
plot_pie_compare(column='smoking_status')

### Data Visualization Result

| | No Stroke | Has Stroke | Note |
| :- | :-: | :-: | :-: |
| gender (Most) | Female | Female | No clear difference |
| age (Median) | 43 | 71 | The median age of stroke patients is higher than that of patients with no stroke |
| hypertension (Most) | 0 | 0 | The proportion of patients with hypertension is 18% higher among stroke patients than among patients with no stroke |
| heart_disease (Most) | 0 | 0 | The proportion of patients with heart disease is 14% higher among stroke patients than among patients with no stroke |
| ever_married (Most) | Yes | Yes | The proportion of patients who have ever been married is 24% higher among stroke patients than among patients with no stroke |
| work_type (Most) | Private | Private | The proportion of self-employed patients is 11.4% higher among stroke patients than among patients with no stroke |
| Residence_type (Most) | Urban | Urban | No clear difference |
| avg_glucose_level (Median) | 91.5 | 105.2 | The median avg_glucose_level of stroke patients is higher than that of patients with no stroke |
| bmi (Median) | 28.3 | 30.5 | The median bmi of stroke patients is slightly higher than that of patients with no stroke |
| smoking_status (Most) | never smoked | never smoked | The proportion of patients who smoke or formerly smoked is 13% higher among stroke patients than among patients with no stroke |

# Data Preprocessing

In [None]:
# Work on a copy so the cleaned frame `df` stays untouched.
X = df.copy()

# Target vector: the stroke label.
y = X['stroke']

# Remove the target, the record id and every column excluded from modelling.
# NOTE(review): of the columns referenced in this notebook only `age` is not
# in this list, so X presumably ends up with a single feature - confirm with
# X.columns below.
X = X.drop(
    columns=[
        'stroke',
        'gender',
        'Residence_type',
        'work_type',
        'ever_married',
        'smoking_status',
        'bmi',
        'avg_glucose_level',
        'heart_disease',
        'hypertension',
        'id',
    ]
)

In [None]:
# transform categorical data
#X = pd.get_dummies(X, columns=['smoking_status'])

In [None]:
# Feature columns that remain in X after the drop above.
X.columns

In [None]:
#X = X.drop(columns=['smoking_status_smokes'])

In [None]:
# Re-check the feature columns (unchanged while the drop in the previous cell stays commented out).
X.columns

# Split data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the records for testing; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y; with a rare positive class,
# consider passing stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

# Balancing Data

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample on the training split only, so the test split keeps its original
# class distribution.
sm = SMOTE(random_state=1234)

X_smote, y_smote = sm.fit_resample(X_train, y_train)

# Report the shape of the data SMOTE actually received (X_train, not the full
# X) so the before/after figures are comparable.
print(f'''Shape of X before SMOTE: {X_train.shape}
Shape of X after SMOTE: {X_smote.shape}''')

print('\nBalance of positive and negative classes (%):')
y_smote.value_counts(normalize=True) * 100

# Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the resampled training data only, then
# apply the same transform to both the training and the test features.
# NOTE(review): X_smote and X_test are overwritten in place, so running this
# cell a second time scales already-scaled data - re-run the split and SMOTE
# cells first.
sc = StandardScaler().fit(X_smote)

X_smote = sc.transform(X_smote)
X_test = sc.transform(X_test)

In [None]:
# Import ML Libraries
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# One seed shared by every randomised learner, so that Restart & Run All
# reproduces the same scores (same value as the split / SMOTE seed).
MODEL_SEED = 1234

# [estimator, display name] pairs evaluated by the loop in the next cell.
# The remaining estimators are left with their default arguments.
classifiers = [
    [CatBoostClassifier(verbose=0, random_seed=MODEL_SEED), 'CatBoost Classifier'],
    [XGBClassifier(random_state=MODEL_SEED), 'XGB Classifier'],
    [RandomForestClassifier(random_state=MODEL_SEED), 'Random Forest'],
    [KNeighborsClassifier(), 'K-Nearest Neighbours'],
    [SGDClassifier(random_state=MODEL_SEED), 'SGD Classifier'],
    [SVC(), 'SVC'],
    [LGBMClassifier(random_state=MODEL_SEED), 'LGBM Classifier'],
    [GaussianNB(), 'GaussianNB'],
    [DecisionTreeClassifier(random_state=MODEL_SEED), 'Decision Tree Classifier'],
    [LogisticRegression(), 'Logistic Regression'],
]

In [None]:
# Fit every candidate on the balanced training data and report its
# confusion matrix, accuracy, recall and precision (in %) on the test split.
for model, name in classifiers:
    model.fit(X_smote, y_smote)
    y_pred = model.predict(X_test)

    print(name)
    print('Confusion Matrix:')
    print(confusion_matrix(y_test, y_pred))
    for metric_label, metric in (
        ("Accuracy : ", accuracy_score),
        ("Recall : ", recall_score),
        ("Precision : ", precision_score),
    ):
        print(metric_label, metric(y_test, y_pred) * 100)

# Result

The best algorithms for predicting stroke are SVC and GaussianNB, with a recall of 88.6% and an accuracy of 67.12%.

Any advice on how to improve this analysis is welcome.

Thank you.