In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# IMPORTING DATA

**The data is loaded with the pandas function `read_csv`.**

In [1]:
# Read the stroke dataset CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

**.info() function is used to check the columns data types so it can be processed further**

In [1]:
# Summarise the columns (names, non-null counts, dtypes) before cleaning.
df.info()

# DATA CLEANING

**Check for duplicate values in the `id` column. If there are none, the column can be dropped, since it no longer serves a purpose in the analysis.**

In [1]:
# Flag rows whose `id` repeats an earlier row, then display the flagged rows.
duplicates = df.duplicated(subset='id')
df.loc[duplicates]

In [1]:
# The identifier column is not used in the analysis, so remove it.
df = df.drop(columns=['id'])

**Created a list containing columns based on their data types.**

In [1]:
# Group column names by dtype: datetime, categorical (object/bool), numeric.
date = df.select_dtypes(include=['datetime64[ns]']).columns.tolist()
cats = df.select_dtypes(include=['object', 'bool']).columns.tolist()
nums = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
for column_group in (date, cats, nums):
    print(column_group)

**Search for null values in the data.**

In [1]:
# Number of missing values per column.
df.isna().sum()

**There are 201 missing values in the data. `.describe()` is used to check whether the mean is close to the median. If they are close, it can be assumed that the mean is a reasonable measure of the centre and that the data do not contain many extreme outliers.**

In [1]:
# Summary statistics of the numeric columns (the 50% row is the median).
df.describe()

**Since the mean and the median values are pretty close in the "bmi" column of the dataframe, the mean values can be used to fill the null.**

In [1]:
# Fill missing BMI values with the column mean.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the column selection: that chained in-place call acts on an intermediate
# object and, under pandas copy-on-write, may leave `df` unchanged.
df['bmi'] = df['bmi'].fillna(df['bmi'].mean())

**Check for fully duplicated rows in the dataframe; there are none (the count is 0).**

In [1]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

**Boxplots are used to check whether the data contain outliers. Based on the boxplots, the avg_glucose_level and bmi columns contain outliers.**

In [1]:
# One vertical boxplot per numeric column, laid out on a 2x3 grid.
plt.figure(figsize=(15, 7))
for position, column in enumerate(nums, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(y=df[column], color='green', orient='v')
    plt.tight_layout()

**Distribution plots are used to check the distribution of the data. Columns that turn out to be skewed are listed so they can be processed further.**

In [1]:
# Histogram with a KDE overlay for every numeric column, on a 2x3 grid.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is the
# replacement seaborn documents for it.
plt.figure(figsize=(15, 7))
for position, column in enumerate(nums, start=1):
    plt.subplot(2, 3, position)
    sns.histplot(df[column], kde=True, stat='density', color='gray')
# Adjust spacing once, after all panels exist.
plt.tight_layout()

**The distributions of the avg_glucose_level and bmi columns are right-skewed (most values sit on the left, with a long tail to the right). In addition, both columns contain outliers.**

In [1]:
# Columns to log-transform (skewed) and to filter by z-score (outlier).
# The two lists have the same contents but are kept as separate objects.
skewed = ['avg_glucose_level', 'bmi']
outlier = list(skewed)

**Log transformation is used to remove the skewness from the data.**

In [1]:
# Replace each skewed column with its natural logarithm.
# Running this cell again would apply the log a second time.
df[skewed] = np.log(df[skewed])

In [1]:
# Re-plot the numeric distributions after the log transform.
# plt.subplot needs integer grid dimensions; `len(nums)/2` is a float, which
# current matplotlib rejects. Ceiling division keeps two rows and still fits
# an odd number of columns.
n_rows = 2
n_cols = max(1, -(-len(nums) // n_rows))

plt.figure(figsize=(12, 5))
for position, column in enumerate(nums, start=1):
    plt.subplot(n_rows, n_cols, position)
    # histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
    sns.histplot(df[column], kde=True, stat='density', color='gray')
plt.tight_layout()

**The distributions of the avg_glucose_level and bmi columns are much better than before; they are closer to a normal distribution.**

**Remove the outliers using the z-score method, which is appropriate here because the data are already close to a normal distribution thanks to the log transformation.**

In [1]:
from scipy import stats
print(f'Length of the data before filtering outlier: {len(df)}')

# Start by keeping every row, then keep only rows whose absolute z-score is
# below 3 in every column listed in `outlier`.
filtered_entries = np.ones(len(df), dtype=bool)
for col in outlier:
    within_three_sigma = np.abs(stats.zscore(df[col])) < 3
    filtered_entries = within_three_sigma & filtered_entries

df = df[filtered_entries]

print(f'Length of the data after filtering outlier: {len(df)}')

# DATA ENCODING AND EXPLORATORY DATA ANALYSIS 

**Used countplot just to see a brief summary of the categorical data.**

In [1]:
# Count plot for every categorical column.
# plt.subplot needs integer grid dimensions; `len(cats)/2` is a float (and
# fractional for an odd number of columns), so the column count is derived
# with ceiling division for a fixed three rows.
n_rows = 3
n_cols = max(1, -(-len(cats) // n_rows))

plt.figure(figsize=(12, 10))
for position, column in enumerate(cats, start=1):
    plt.subplot(n_rows, n_cols, position)
    # Pass the data by keyword; seaborn no longer accepts it positionally.
    sns.countplot(x=df[column], color='gray')
plt.tight_layout()

**The `.value_counts()` function is applied to the stroke (target) column to see whether the target is imbalanced, and it is!**

In [1]:
# Bar chart of how many rows fall in each target class.
stroke_counts = df['stroke'].value_counts()
stroke_counts.plot.bar()

In [1]:
# Print each categorical column name followed by its distinct values.
for name in cats:
    print(str(name), df[name].unique(), sep='\n')

**Encode the data based on their unique values, labelencoding and one-hot encoding are used.**

In [1]:
# Columns with exactly two distinct values are label-encoded;
# every other categorical column is one-hot encoded.
labenco, onehot = [], []
for name in cats:
    is_binary = len(df[name].unique()) == 2
    (labenco if is_binary else onehot).append(name)

for column_group in (labenco, onehot):
    print(column_group)

In [1]:
# Copy of df in which each two-valued categorical column is replaced by its
# integer category codes.
df_labencoded = df.assign(**{
    name: df[name].astype('category').cat.codes
    for name in labenco
})
df_labencoded.head()

**NUMERICAL VARIABLES EDA**

**Heatmaps and barplots are mainly used, because they are representative enough to show whether the variables are correlated.**

In [1]:
# Correlation heatmap of the numeric and label-encoded columns.
# df_labencoded still holds the multi-valued categorical columns as strings;
# pandas >= 2.0 raises when .corr() meets non-numeric columns, so the frame is
# restricted to numeric dtypes first.
plt.figure(figsize=(15, 8))
numeric_view = df_labencoded.select_dtypes(include='number')
sns.heatmap(numeric_view.corr(), cmap='Blues', annot=True, fmt='.2f');

In [1]:
# Bar plot of each numeric feature grouped by the stroke label.
for col in nums:
    if col == 'stroke':
        # The target plotted against itself carries no information.
        continue
    plt.figure(figsize=(15, 8))
    # Draw the plot directly; wrapping it in print() only echoes the Axes repr.
    sns.barplot(x='stroke', y=col, data=df_labencoded)
    plt.title(f'{col} by stroke')
    plt.show()

**Based on the heatmap and barplots above, it can be said that the age, hypertension, and heart_disease columns can be used to predict stroke.**

**ONEHOT ENCODING EDA**

In [1]:
# For each multi-valued categorical column, correlate its one-hot indicators
# with the stroke label and draw the correlation matrix.
for col in onehot:
    # Integer indicators keep the correlation input purely numeric.
    dummies = pd.get_dummies(df_labencoded[col], prefix=col, dtype=int)
    # The raw string column is left out: pandas >= 2.0 raises when .corr()
    # meets a non-numeric column.
    df_loop = df_labencoded[['stroke']].join(dummies)
    plt.figure(figsize=(15, 8))
    sns.heatmap(df_loop.corr(), cmap='Blues', annot=True, fmt='.2f')
    plt.show()

In [1]:
# Bar plot of the stroke label grouped by each multi-valued categorical column.
# The plot reads df_labencoded directly, so the one-hot frame previously built
# in this loop was never used and has been removed.
for col in onehot:
    plt.figure(figsize=(15, 8))
    sns.barplot(x=col, y='stroke', data=df_labencoded)
    plt.title(f'stroke by {col}')
    plt.show()

**Based on the heatmap and barplot above, it can be said that smoking_status and work_type can be used to predict stroke.**

# DATA STANDARDIZATION

**Create a new dataframe containing only the selected columns from the original dataframe.**

In [1]:
# Keep only the chosen predictors plus the target column.
selected_feat = [
    'age', 'hypertension', 'heart_disease',
    'work_type', 'smoking_status',
    'stroke',
]
df_pre_model = df.loc[:, selected_feat].copy()
df_pre_model.head()

In [1]:
# One-hot encode the multi-valued categorical predictors. The indicator
# columns are appended after the remaining columns and the source columns are
# dropped.
onehot_pre_model = ['work_type', 'smoking_status']
df_pre_model = pd.get_dummies(df_pre_model, columns=onehot_pre_model)
df_pre_model.head()

**The values in the age column are much larger than those in the other columns, so the age column is standardized.**

In [1]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
df_model = df_pre_model.copy()
# Standardize `age`. The 2-D input is taken from df_pre_model itself rather
# than reshaped with len(df), so the cell no longer depends on another frame
# having the same number of rows.
age_values = df_pre_model[['age']].to_numpy()
df_model['age'] = StandardScaler().fit_transform(age_values).ravel()
df_model.head()

# MODELING AND EVALUATION

In [1]:
# Target vector and feature matrix (every column except the target).
y = df_model['stroke']
X = df_model.drop(columns=['stroke'])

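**Since the target classes are imbalanced, resampling methods for imbalanced data (random undersampling, random oversampling, and SMOTE) are applied; the SMOTE-resampled data is used for modeling.**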

In [1]:
from imblearn import under_sampling, over_sampling
# Resample so that the minority class is half the size of the majority class.
# sampling_strategy is passed by keyword because current imbalanced-learn
# releases make the sampler arguments keyword-only, and random_state is fixed
# so that a re-run reproduces the same resampled data.
# NOTE(review): resampling happens on the full dataset before the train/test
# split, so duplicated or synthetic minority rows can end up on both sides of
# the split; consider resampling the training portion only.
SAMPLING_RATIO = 0.5
RESAMPLE_SEED = 42

X_under, y_under = under_sampling.RandomUnderSampler(
    sampling_strategy=SAMPLING_RATIO, random_state=RESAMPLE_SEED
).fit_resample(X, y)
X_over, y_over = over_sampling.RandomOverSampler(
    sampling_strategy=SAMPLING_RATIO, random_state=RESAMPLE_SEED
).fit_resample(X, y)
X_over_SMOTE, y_over_SMOTE = over_sampling.SMOTE(
    sampling_strategy=SAMPLING_RATIO, random_state=RESAMPLE_SEED
).fit_resample(X, y)

In [1]:
# Print the class counts of the original target and of each resampled target.
labelled_targets = (
    ('Original', y),
    ('UNDERSAMPLING', y_under),
    ('OVERSAMPLING', y_over),
    ('SMOTE', y_over_SMOTE),
)
for label, target in labelled_targets:
    print(label)
    print(pd.Series(target).value_counts())

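**As we can see above, the resampled data are much less imbalanced: with a sampling ratio of 0.5, the minority class is half the size of the majority class.**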

**Created a function so it will be easier to evaluate a model.**

In [1]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc

def eval_classification(model, pred, xtrain, ytrain, xtest, ytest):
    """Print test-set accuracy, precision, recall, F1 and ROC AUC.

    Parameters
    ----------
    model : fitted classifier; used to obtain class-1 probabilities for the
        AUC when it exposes ``predict_proba`` and ``classes_``.
    pred : predicted labels for ``xtest``.
    xtrain, ytrain : accepted for call compatibility; not used.
    xtest : test features, used only to compute probabilities for the AUC.
    ytest : true test labels.
    """
    print("Accuracy (Test Set): %.2f" % accuracy_score(ytest, pred))
    print("Precision (Test Set): %.2f" % precision_score(ytest, pred))
    print("Recall (Test Set): %.2f" % recall_score(ytest, pred))
    print("F1-Score (Test Set): %.2f" % f1_score(ytest, pred))

    # ROC AUC measures ranking quality, so it should be computed from
    # continuous scores. Hard 0/1 predictions reduce the curve to a single
    # operating point, so they are used only as a fallback.
    scores = pred
    if hasattr(model, "predict_proba") and hasattr(model, "classes_"):
        classes = list(model.classes_)
        if 1 in classes:
            scores = model.predict_proba(xtest)[:, classes.index(1)]

    # pos_label: the label treated as the positive class
    fpr, tpr, _thresholds = roc_curve(ytest, scores, pos_label=1)
    print("AUC: %.2f" % auc(fpr, tpr))

def show_feature_importance(model):
    """Plot the 25 largest feature importances of ``model`` as horizontal bars.

    Reads ``model.feature_importances_`` and labels the values with the
    columns of the notebook-level feature frame ``X``.
    """
    importances = pd.Series(model.feature_importances_, index=X.columns)
    top_features = importances.nlargest(25)

    ax = top_features.plot(kind='barh', figsize=(10, 8))
    # Flip the axis so the largest importance is drawn at the top.
    ax.invert_yaxis()

    plt.title('feature importance score')
    plt.xlabel('score')
    plt.ylabel('feature')

def show_best_hyperparameter(model, hyperparameters):
    """Print the value ``model`` uses for every key in ``hyperparameters``.

    Parameters
    ----------
    model : an estimator exposing ``get_params()``. If it has a
        ``best_estimator_`` attribute (a fitted search object), the
        parameters are read from that estimator, because the search object
        stores them under prefixed names.
    hyperparameters : mapping whose keys are the parameter names to report;
        the values are ignored.
    """
    estimator = getattr(model, 'best_estimator_', model)
    # Fetch the parameter dict once instead of once per key.
    params = estimator.get_params()
    for key in hyperparameters:
        print('Best ' + key + ':', params[key])

In [1]:
from sklearn.model_selection import train_test_split 
# Hold out 30% of the SMOTE-resampled data as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_over_SMOTE,
    y_over_SMOTE,
    test_size=0.3,
    random_state=42,
)

**Predict the target using various model to see which one has the most satisfying result.**

In [1]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree baseline: fit on the training split, report test metrics and
# the train/test values of the classifier's own score.
model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from sklearn.linear_model import LogisticRegression
# Logistic regression: fit on the training split, report test metrics and the
# train/test values of the classifier's own score.
model = LogisticRegression(random_state=42).fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest neighbours with default settings: fit on the training split,
# report test metrics and the train/test values of the classifier's own score.
model = KNeighborsClassifier().fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble: fit on the training split, report test metrics and the
# train/test values of the classifier's own score.
model = ExtraTreesClassifier(random_state=42).fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from sklearn.ensemble import RandomForestClassifier
# Random forest: fit on the training split, report test metrics and the
# train/test values of the classifier's own score.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from sklearn.ensemble import AdaBoostClassifier
# AdaBoost: fit on the training split, report test metrics and the train/test
# values of the classifier's own score.
model = AdaBoostClassifier(random_state=42).fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

In [1]:
from xgboost import XGBClassifier
# XGBoost with default settings: fit on the training split, report test
# metrics and the train/test values of the classifier's own score.
model = XGBClassifier(random_state=42)
model.fit(X_train, y_train)
print(f'{model} EVALUATION')

y_pred = model.predict(X_test)
eval_classification(model, y_pred, X_train, y_train, X_test, y_test)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score}')
print(f'Test score:{test_score}')

**The XGBoost model has the highest accuracy and recall scores, so it can be said that XGBoost gives the most satisfying result of the bunch. In addition, the train and test scores are not far apart (only about a 2% difference in accuracy), so the model does not appear to be overfitted.**

**Hyperparameter tuning is used to try to improve the model quality. Recall is used as the scoring metric because we do not want to miss patients who have a stroke by predicting that they do not.**

In [1]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import numpy as np

# Search space for the randomized hyperparameter search over XGBoost.
hyperparameters = {
    'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
    'min_child_weight': [int(x) for x in np.linspace(1, 20, num=11)],
    'gamma': [float(x) for x in np.linspace(0, 1, num=11)],
    'tree_method': ['auto', 'exact', 'approx', 'hist'],

    # XGBoost documents colsample_bytree as lying in (0, 1], so the grid
    # starts at 0.1 instead of 0.
    'colsample_bytree': [float(x) for x in np.linspace(0.1, 1, num=10)],
    # eta is the learning rate; 0 would switch learning off, so the grid
    # starts just above zero.
    'eta': [float(x) for x in np.linspace(0.01, 1, num=100)],

    'lambda': [float(x) for x in np.linspace(0, 1, num=11)],
    'alpha': [float(x) for x in np.linspace(0, 1, num=11)],
}

xg = XGBClassifier(random_state=42)
# Candidates are ranked by cross-validated recall (5 folds).
xg_tuned = RandomizedSearchCV(xg, hyperparameters, random_state=42, cv=5, scoring='recall')
xg_tuned.fit(X_train, y_train)

y_pred = xg_tuned.predict(X_test)
eval_classification(xg_tuned, y_pred, X_train, y_train, X_test, y_test)

# Report the scores of the tuned estimator. The previous code printed
# `model.score`, i.e. the untuned classifier left over from an earlier cell.
# best_estimator_.score is used rather than xg_tuned.score so that the number
# is the classifier's own score, as in the other model cells, and not the
# recall the search was configured with.
best_xg = xg_tuned.best_estimator_
print('Train score: ' + str(best_xg.score(X_train, y_train)))
print('Test score:' + str(best_xg.score(X_test, y_test)))

**Since hyperparameter tuning does not provide a better recall score, the original XGBoost model is used instead.**

# CONCLUSION

**A stroke can be predicted with a recall of 83% and an accuracy of 92%.**