In [None]:
import os
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.simplefilter('ignore')


# Walk the Kaggle input directory tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)



In [None]:
# Read the stroke CSV into a DataFrame and preview the first rows.
csv_path = '/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Count missing values per column (isna is the same method as isnull).
df.isna().sum()

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
# Column groups used by the plotting and encoding cells below.
# Continuous features.
num_columns = [
    'age',
    'avg_glucose_level',
    'bmi',
]
# Features treated as categorical (label-encoded later on).
category_columns = [
    'gender',
    'hypertension',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
# Prediction target.
target_columns = ['stroke']

In [None]:
# Class balance of the target. The column is passed by keyword because recent
# seaborn releases no longer accept the data vector as a positional argument.
sns.countplot(x='stroke', data=df)

In [None]:
# Exact number of rows per target class.
df['stroke'].value_counts()

# EDA

In [None]:
# For each numeric feature, overlay its distribution for non-stroke (green)
# and stroke (blue) rows on a shared axes.
fig = plt.figure(figsize=(16, 16))
for i, col in enumerate(num_columns):
    ax = fig.add_subplot(2, 2, i + 1)
    no_stroke_values = df.loc[df['stroke'] == 0, col]
    stroke_values = df.loc[df['stroke'] == 1, col]
    # NOTE: distplot is deprecated in seaborn; kept here so the figure is unchanged.
    sns.distplot(no_stroke_values, label='stroke_0', color='green', ax=ax)
    sns.distplot(stroke_values, label='stroke_1', color='blue', ax=ax)
    ax.set_title(f'{col} stroke data', fontsize=20)
    # Attach the legend to this subplot explicitly instead of the implicit current axes.
    ax.legend()
plt.show()

In [None]:
# One bar chart per categorical feature: counts of each category, grouped by stroke label.
fig, axes = plt.subplots(3, 2, figsize=(16, 18))
for ax, col in zip(axes.ravel(), category_columns):
    counts = pd.crosstab(index=df['stroke'], columns=df[col])
    counts.plot(kind='bar', ax=ax)
    ax.set_title(f'{col} stroke data', fontsize=20)
plt.show()

In [None]:
# Age distribution per stroke label; x/y passed by keyword (positional form fails on seaborn >= 0.12).
sns.catplot(x='stroke', y='age', data=df, kind='box')

In [None]:
# Average glucose level per stroke label; x/y passed by keyword (positional form fails on seaborn >= 0.12).
sns.catplot(x='stroke', y='avg_glucose_level', data=df, kind='box')

In [None]:
# BMI per stroke label; x/y passed by keyword (positional form fails on seaborn >= 0.12).
sns.catplot(x='stroke', y='bmi', data=df, kind='box')

In [None]:
# Age by work type, split by stroke label; x/y passed by keyword (positional form fails on seaborn >= 0.12).
sns.catplot(x='work_type', y='age', data=df, hue='stroke', kind='violin')

In [None]:
# Age by smoking status, split by stroke label; x/y passed by keyword (positional form fails on seaborn >= 0.12).
sns.catplot(x='smoking_status', y='age', data=df, hue='stroke', kind='violin')

In [None]:
# Bar chart of smoking-status counts within each work type.
pd.crosstab(index=df['work_type'],columns=df['smoking_status']).plot(kind='bar')

In [None]:
# Age vs. average glucose level, one panel per heart_disease value, coloured by stroke.
# `height` is the current name of the facet-size argument (`size` was renamed and later removed).
g = sns.FacetGrid(df, hue='stroke', col='heart_disease', height=5)
g.map(sns.scatterplot, 'age', 'avg_glucose_level')
g.add_legend()
plt.show()

In [None]:
# Pairwise correlation of the numeric columns only. The categorical columns are
# still strings at this point, and pandas >= 2.0 raises if they are passed to corr().
df.select_dtypes(include='number').corr()

In [None]:
# Compare the bmi histogram before (left) and after (right) filling missing values with the median.
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
raw_ax, filled_ax = ax
df['bmi'].plot.hist(ax=raw_ax)
bmi_filled_preview = df['bmi'].fillna(df['bmi'].median())
bmi_filled_preview.plot.hist(ax=filled_ax)

In [None]:
# Replace missing bmi values with the column median.
bmi_median = df['bmi'].median()
df['bmi'] = df['bmi'].fillna(bmi_median)


In [None]:
# label encode
from sklearn.preprocessing import LabelEncoder

# Replace each categorical column by integer codes, using a fresh encoder per column.
for col in category_columns:
    df[col] = LabelEncoder().fit_transform(df[col])

In [None]:
# Preview the frame after imputation and label encoding.
df.head()

In [None]:
# Pairwise scatter/distribution grid over the frame's columns, coloured by stroke label.
sns.pairplot(df,hue='stroke')

In [None]:
# Separate the features from the target; `id` is dropped together with the label column.
target = df['stroke']
data = df.drop(columns=['id', 'stroke'])
# Names of the original (pre-clustering) feature columns.
data_cols = data.columns

In [None]:
# clustering
# make_feature => cluster_data
from sklearn.cluster import KMeans
from sklearn.mixture import BayesianGaussianMixture
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out split.
train_data, test_data, train_target, test_target = train_test_split(
    data, target, test_size=0.2, stratify=target, random_state=42
)
# Work on explicit copies: new columns are added below, and assigning into the
# frames returned by the split can trigger pandas' SettingWithCopyWarning.
train_data = train_data.copy()
test_data = test_data.copy()

# Add cluster-membership features for several cluster counts. Every model is
# fitted on the ORIGINAL training columns only (data_cols), so cluster columns
# added in earlier iterations never feed into later fits.
for cluster in [3, 5, 7, 10]:
    # n_init is pinned because scikit-learn changed its default across releases.
    kme = KMeans(n_clusters=cluster, n_init=10, random_state=42)
    kme.fit(train_data[data_cols])
    train_data[f'kmeans_cluster_{cluster}'] = kme.predict(train_data[data_cols])
    test_data[f'kmeans_cluster_{cluster}'] = kme.predict(test_data[data_cols])

    bgm = BayesianGaussianMixture(n_components=cluster, random_state=42)
    bgm.fit(train_data[data_cols])
    train_data[f'gaussian_cluster_{cluster}'] = bgm.predict(train_data[data_cols])
    test_data[f'gaussian_cluster_{cluster}'] = bgm.predict(test_data[data_cols])

train_data.head()

In [None]:
# Side-by-side view of the 5-cluster assignments in the age / glucose plane.
# x, y and hue are passed by keyword: seaborn >= 0.12 rejects positional x/y.
fig = plt.figure(figsize=(12, 4))
ax1 = fig.add_subplot(1, 2, 1)
sns.scatterplot(data=train_data, x='age', y='avg_glucose_level', hue='kmeans_cluster_5', ax=ax1)
ax1.set_title('Kmeans Cluster')

ax2 = fig.add_subplot(1, 2, 2)
sns.scatterplot(data=train_data, x='age', y='avg_glucose_level', hue='gaussian_cluster_5', ax=ax2)
ax2.set_title('Gaussian Cluster')
plt.show()

In [None]:
# Age vs. glucose per KMeans cluster (k=5), coloured by stroke label.
# `height` is the current name of the facet-size argument (`size` was renamed and later removed).
train_with_target = pd.concat([train_data, train_target], axis=1)
g = sns.FacetGrid(train_with_target, hue='stroke', col='kmeans_cluster_5', height=5)
g.map(sns.scatterplot, 'age', 'avg_glucose_level')
g.add_legend()
plt.show()

In [None]:
# Age vs. glucose per Bayesian-GMM cluster (5 components), coloured by stroke label.
# `height` is the current name of the facet-size argument (`size` was renamed and later removed).
train_with_target = pd.concat([train_data, train_target], axis=1)
g = sns.FacetGrid(train_with_target, hue='stroke', col='gaussian_cluster_5', height=5)
g.map(sns.scatterplot, 'age', 'avg_glucose_level')
g.add_legend()
plt.show()

# Model Training

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score,roc_auc_score,roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import lightgbm as lgb

from imblearn.over_sampling import SMOTE

In [None]:
# Remember the column labels of the training frame (original features plus the
# cluster columns) so they can be re-attached after scaling returns plain arrays.
data_columns = train_data.columns
data_columns

In [None]:
# Standardise the features: statistics are learned on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_data)
test_scaled = scaler.transform(test_data)

# The scaler returns arrays; wrap them back into DataFrames with the saved column labels.
train_data = pd.DataFrame(train_scaled, columns=data_columns)
test_data = pd.DataFrame(test_scaled, columns=data_columns)

## SMOTE Oversampling

In [None]:
# Oversample the minority class with SMOTE. The seed is fixed (same value as the
# other seeded steps in this notebook) so the synthetic rows are reproducible.
smote = SMOTE(random_state=42)
train_smote_data, train_smote_target = smote.fit_resample(train_data, train_target)
train_smote_target.value_counts()

In [None]:
# Hold out the validation rows from the REAL (scaled) training data first and
# oversample only the remaining rows. Splitting after SMOTE puts synthetic
# points, interpolated from training neighbours, into the validation set and
# inflates the validation scores.
# Starting from train_data/train_target also makes this cell safe to re-run,
# and the fixed seed makes the split reproducible.
fit_data, val_smote_data, fit_target, val_smote_target = train_test_split(
    train_data, train_target, test_size=0.2, stratify=train_target, random_state=42
)
train_smote_data, train_smote_target = SMOTE(random_state=42).fit_resample(fit_data, fit_target)
train_smote_data.shape,val_smote_data.shape

In [None]:
# Preview the resampled training features.
train_smote_data.head()

In [None]:
# train_model func
def train_and_plot(x,y,xval,yval,model,model_name):
    """Fit ``model`` and report its validation performance.

    Prints accuracy, F1 and ROC AUC for the validation set, then shows a
    confusion-matrix heatmap and the ROC curve.

    Parameters
    ----------
    x, y : training features and labels passed to ``model.fit``.
    xval, yval : validation features and labels used for all metrics.
    model : estimator exposing ``fit``, ``predict`` and ``predict_proba``.
    model_name : str used (upper-cased) in the printed header and plot title.

    Returns
    -------
    None. The fitted ``model`` is mutated in place.
    """
    model.fit(x, y)
    preds = model.predict(xval)
    # Score of the positive class (second predict_proba column); used for both
    # the AUC value and the ROC curve so the two always agree.
    positive_scores = model.predict_proba(xval)[:, 1]

    acc = accuracy_score(yval, preds)
    f1 = f1_score(yval, preds)
    # AUC must be computed from continuous scores; hard 0/1 predictions reduce
    # the ROC curve to a single operating point.
    auc = roc_auc_score(yval, positive_scores)
    print(f'{model_name.upper()} Validation Score:\nAccuracy Score:{acc:.3f} F1 Score:{f1:.3f} Auc Score:{auc:.3f}')

    mat = confusion_matrix(yval, preds)
    sns.heatmap(mat, cmap='Blues', annot=True, fmt='g')
    plt.show()

    fpr, tpr, _ = roc_curve(yval, positive_scores)
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr, label='roc_curve')
    # Diagonal reference line.
    plt.plot([0, 1], [0, 1])
    plt.title(f'{model_name.upper()} ROC CURVE', fontsize=16)
    plt.show()
    

In [None]:
# Logistic regression on the SMOTE training data.
logistic = LogisticRegression()
train_and_plot(
    x=train_smote_data,
    y=train_smote_target,
    xval=val_smote_data,
    yval=val_smote_target,
    model=logistic,
    model_name='logistic',
)

In [None]:
# Decision tree on the SMOTE training data. Seeded so repeated runs give the
# same tree (same seed value as the other seeded steps in this notebook).
tree = DecisionTreeClassifier(random_state=42)
train_and_plot(train_smote_data,train_smote_target,val_smote_data,val_smote_target,tree,'tree')

In [None]:
# Random forest on the SMOTE training data. Seeded so the scores and the
# feature importances plotted later are reproducible.
randomforest = RandomForestClassifier(random_state=42)
train_and_plot(train_smote_data,train_smote_target,val_smote_data,val_smote_target,randomforest,'randomforest')

In [None]:
# Distance-weighted 5-nearest-neighbours on the SMOTE training data.
kneighbors = KNeighborsClassifier(n_neighbors=5, weights='distance')
train_and_plot(
    x=train_smote_data,
    y=train_smote_target,
    xval=val_smote_data,
    yval=val_smote_target,
    model=kneighbors,
    model_name='kneighbor',
)

In [None]:
# XGBoost classifier on the SMOTE training data.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
train_and_plot(
    x=train_smote_data,
    y=train_smote_target,
    xval=val_smote_data,
    yval=val_smote_target,
    model=xgb_model,
    model_name='xgb',
)

In [None]:
# LightGBM classifier on the SMOTE training data.
lgb_model = lgb.LGBMClassifier()
train_and_plot(
    x=train_smote_data,
    y=train_smote_target,
    xval=val_smote_data,
    yval=val_smote_target,
    model=lgb_model,
    model_name='lgb',
)

In [None]:
# Bar chart of the fitted random forest's feature importances, largest first.
feature_importances = randomforest.feature_importances_
feature_df = pd.DataFrame({'importances': feature_importances}, index=data_columns)
feature_df = feature_df.sort_values(by='importances', ascending=False)
feature_df.plot.bar()
plt.title('RandomForest Feature Importance', fontsize=16)

In [None]:
# Bar chart of the fitted XGBoost model's feature importances, largest first.
feature_importances = xgb_model.feature_importances_
feature_df = pd.DataFrame({'importances': feature_importances}, index=data_columns)
feature_df = feature_df.sort_values(by='importances', ascending=False)
feature_df.plot.bar()
plt.title('XGB_Model Feature Importance', fontsize=16)

In [None]:
# Bar chart of the fitted LightGBM model's feature importances, largest first.
feature_importances = lgb_model.feature_importances_
feature_df = pd.DataFrame({'importances': feature_importances}, index=data_columns)
feature_df = feature_df.sort_values(by='importances', ascending=False)
feature_df.plot.bar()
plt.title('LGB_Model Feature Importance', fontsize=16)

## Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Randomly undersample the majority class. The seed is fixed (same value as the
# other seeded steps in this notebook) so the retained rows are reproducible.
sampler = RandomUnderSampler(random_state=42)
train_under_data, train_under_target = sampler.fit_resample(train_data, train_target)
train_under_target.value_counts()

In [None]:
# Hold out the validation rows from the REAL (scaled) training data first and
# undersample only the remaining rows, so validation metrics reflect the true
# class ratio rather than an artificially balanced one.
# Starting from train_data/train_target also makes this cell safe to re-run
# (the previous form split its own output again on every execution), and the
# fixed seed makes the split reproducible.
fit_data, val_under_data, fit_target, val_under_target = train_test_split(
    train_data, train_target, test_size=0.2, stratify=train_target, random_state=42
)
train_under_data, train_under_target = RandomUnderSampler(random_state=42).fit_resample(fit_data, fit_target)
train_under_data.shape,val_under_data.shape

In [None]:
# Logistic regression on the undersampled training data.
logistic = LogisticRegression()
train_and_plot(
    x=train_under_data,
    y=train_under_target,
    xval=val_under_data,
    yval=val_under_target,
    model=logistic,
    model_name='logistic',
)

In [None]:
# Decision tree on the undersampled training data. Seeded so repeated runs give
# the same tree (same seed value as the other seeded steps in this notebook).
tree = DecisionTreeClassifier(random_state=42)
train_and_plot(train_under_data,train_under_target,val_under_data,val_under_target,tree,'tree')

In [None]:
# Random forest on the undersampled training data. Seeded for reproducible scores.
randomforest = RandomForestClassifier(random_state=42)
train_and_plot(train_under_data,train_under_target,val_under_data,val_under_target,randomforest,'randomforest')

In [None]:
# Distance-weighted 5-nearest-neighbours on the undersampled training data.
kneighbors = KNeighborsClassifier(n_neighbors=5, weights='distance')
train_and_plot(
    x=train_under_data,
    y=train_under_target,
    xval=val_under_data,
    yval=val_under_target,
    model=kneighbors,
    model_name='kneighbors',
)

In [None]:
# XGBoost classifier on the undersampled training data.
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
train_and_plot(
    x=train_under_data,
    y=train_under_target,
    xval=val_under_data,
    yval=val_under_target,
    model=xgb_model,
    model_name='xgb',
)

In [None]:
# LightGBM classifier on the undersampled training data.
lgb_model = lgb.LGBMClassifier()
train_and_plot(
    x=train_under_data,
    y=train_under_target,
    xval=val_under_data,
    yval=val_under_target,
    model=lgb_model,
    model_name='lgb',
)

# Test Score (XGB and LGB)

In [None]:
# Refit XGBoost on each resampled training set and score it on the untouched test split.
# AUC is computed from the predicted positive-class probability; feeding hard
# 0/1 predictions to roc_auc_score collapses the ROC curve to one operating point.

# Model trained on the SMOTE-oversampled data.
xgb_smote_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_smote_model.fit(train_smote_data, train_smote_target)
smote_model_preds = xgb_smote_model.predict(test_data)
smote_model_proba = xgb_smote_model.predict_proba(test_data)[:, 1]
smote_model_accuracy = accuracy_score(test_target, smote_model_preds)
smote_model_f1 = f1_score(test_target, smote_model_preds)
smote_model_auc = roc_auc_score(test_target, smote_model_proba)
smote_model_matrix = confusion_matrix(test_target, smote_model_preds)

# Model trained on the undersampled data.
xgb_under_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_under_model.fit(train_under_data, train_under_target)
under_model_preds = xgb_under_model.predict(test_data)
under_model_proba = xgb_under_model.predict_proba(test_data)[:, 1]
under_model_accuracy = accuracy_score(test_target, under_model_preds)
under_model_f1 = f1_score(test_target, under_model_preds)
under_model_auc = roc_auc_score(test_target, under_model_proba)
under_model_matrix = confusion_matrix(test_target, under_model_preds)

print('XGB Model')
print(f'Oversampling Model:Accuracy:{smote_model_accuracy:.3f} F1:{smote_model_f1:.3f} AUC:{smote_model_auc:.3f}')
print(f'Undersampling Model:Accuracy:{under_model_accuracy:.3f} F1:{under_model_f1:.3f} AUC:{under_model_auc:.3f}')

In [None]:
# Test-set confusion matrices of the two XGBoost models, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
heatmap_style = dict(annot=True, fmt='g', cmap='Blues')

sns.heatmap(smote_model_matrix, ax=ax1, **heatmap_style)
ax1.set_title('Oversampling XGB-Model Matrix', fontsize=16)

sns.heatmap(under_model_matrix, ax=ax2, **heatmap_style)
ax2.set_title('Undersampling XGB-Model Matrix', fontsize=16)


In [None]:
# Refit LightGBM on each resampled training set and score it on the untouched test split.
# AUC is computed from the predicted positive-class probability; feeding hard
# 0/1 predictions to roc_auc_score collapses the ROC curve to one operating point.

# Model trained on the SMOTE-oversampled data.
lgb_smote_model = lgb.LGBMClassifier()
lgb_smote_model.fit(train_smote_data, train_smote_target)
smote_model_preds = lgb_smote_model.predict(test_data)
smote_model_proba = lgb_smote_model.predict_proba(test_data)[:, 1]
smote_model_accuracy = accuracy_score(test_target, smote_model_preds)
smote_model_f1 = f1_score(test_target, smote_model_preds)
smote_model_auc = roc_auc_score(test_target, smote_model_proba)
smote_model_matrix = confusion_matrix(test_target, smote_model_preds)

# Model trained on the undersampled data.
lgb_under_model = lgb.LGBMClassifier()
lgb_under_model.fit(train_under_data, train_under_target)
under_model_preds = lgb_under_model.predict(test_data)
under_model_proba = lgb_under_model.predict_proba(test_data)[:, 1]
under_model_accuracy = accuracy_score(test_target, under_model_preds)
under_model_f1 = f1_score(test_target, under_model_preds)
under_model_auc = roc_auc_score(test_target, under_model_proba)
under_model_matrix = confusion_matrix(test_target, under_model_preds)

print('LGB Model')
print(f'Oversampling Model:Accuracy:{smote_model_accuracy:.3f} F1:{smote_model_f1:.3f} AUC:{smote_model_auc:.3f}')
print(f'Undersampling Model:Accuracy:{under_model_accuracy:.3f} F1:{under_model_f1:.3f} AUC:{under_model_auc:.3f}')

In [None]:
# Test-set confusion matrices of the two LightGBM models, side by side.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
heatmap_style = dict(annot=True, fmt='g', cmap='Blues')

sns.heatmap(smote_model_matrix, ax=ax1, **heatmap_style)
ax1.set_title('Oversampling LGB-Model Matrix', fontsize=16)

sns.heatmap(under_model_matrix, ax=ax2, **heatmap_style)
ax2.set_title('Undersampling LGB-Model Matrix', fontsize=16)
