<a href="https://colab.research.google.com/github/ranggaadinugraha/stroke-prediction-using-machine-learning/blob/main/stroke_prediction_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stroke Prediction Using Machine Learning
## This project aims to predict stroke risk from healthcare data using two machine learning algorithms: a Multi-Layer Perceptron (MLP) and K-Nearest Neighbors (KNN)

In [None]:
# Environment Setup
# Installing required libraries

!pip install -U imbalanced-learn
!pip install -U scikit-learn
!pip install -U pywaffle

🔷 IMPORT LIBRARIES

In [None]:
# Import Libraries for Data Visualization
import plotly.graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from pywaffle import Waffle

# Import Libraries for Data Processing
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Import Libraries for Model Development
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# Import Libraries for Model Evaluation
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the healthcare stroke CSV (relative path, resolved against the working directory)
csv_path = "healthcare-dataset-stroke-data.csv"
df = pd.read_csv(csv_path)

🔷 EXPLORATORY DATA ANALYSIS (EDA)


In [None]:
# Dataset Structure Information
# Prints the column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# Number of missing entries per column of the raw frame
df.isnull().sum()

🔷 DATA CLEANING

In [None]:
# Remove Irrelevant Columns
# `id` is dropped before any analysis or modelling.
# errors='ignore' keeps this cell re-runnable: once `id` is gone, running the
# cell again is a no-op instead of raising KeyError.
df = df.drop(columns='id', errors='ignore')

🔷 DESCRIPTIVE STATISTICS

In [None]:
# Summary statistics for every non-object column, rounded to two decimals
df.describe(exclude='object').round(2)

In [None]:
df.head()

In [None]:
df['work_type']

In [None]:
# Summary of the remaining columns once float and int64 columns are excluded
df.describe(exclude=['float', 'int64']).round()

🔷 MISSING VALUE VISUALIZATION

In [None]:
# Visualize Missing Values
# One bar colour per column: columns that contain missing values are highlighted,
# complete columns stay grey. The colours are derived from the data rather than a
# hard-coded 11-entry list (which highlighted position 8), so the highlight stays
# correct if columns are added, removed or reordered.
missing_color, complete_color = '#FE1143', 'grey'
colors = [missing_color if df[col].isna().any() else complete_color
          for col in df.columns]
msno.bar(df, color=colors)

🔷 TARGET DISTRIBUTION ANALYSIS

In [None]:
# Target Variable Distribution (Stroke)
# Name the count column explicitly: pandas >= 2.0 calls the value_counts() result
# 'count', older versions name it after the source column ('stroke'), which would
# make `values='count'` below fail.
stroke_data = df['stroke'].value_counts().rename('count').to_frame()

# Derive each slice name from the class value in the index instead of relying on
# the row order of value_counts().
# Assumes the target is coded 0 = no stroke, 1 = stroke — TODO confirm.
class_names = {0: 'healthy', 1: 'stroke'}
slice_names = [class_names.get(cls, str(cls)) for cls in stroke_data.index]

figures = px.pie(data_frame=stroke_data,
             values='count',
             names=slice_names,
             hole=0.38,
             color_discrete_sequence=["#9FD8DF","#FF7171"],
             labels={'label':'stroke','count':'No. Of Samples'})

figures.update_layout(font_family='Open Sans',
                      title=dict(text='Distribusi kelas pada kolom stroke',
                                 x=0.49,
                                 y=0.98,
                                 font=dict(color="#333",size=20)),
                      hoverlabel=dict(bgcolor='white'))

# Show percentage and class name next to each slice
figures.update_traces(textposition='outside', textinfo='percent+label')

figures.show()

🔷 FEATURE & TARGET SEPARATION

In [None]:
# Target column first, then every remaining column as the feature matrix
y = df['stroke']
X = df.drop(columns='stroke')

In [None]:
# Instantiate the OneHotEncoder and MinMaxScaler used for preprocessing.
# NOTE(review): `ohe` is not referenced by any other visible cell — categorical
# encoding below is done with pd.get_dummies; confirm whether it can be removed.
ohe = OneHotEncoder()
mms = MinMaxScaler()

In [None]:
# Column groups consumed by the preprocessing cells:
# `category` -> columns that get dummy-encoded, `num_con` -> columns that get min-max scaled
category = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]
num_con = [
    'age',
    'bmi',
    'avg_glucose_level',
]

🔷 DATA PREPROCESSING

In [None]:
# Feature Scaling using Min-Max Normalization
# Overwrites the columns listed in `num_con` with their min-max scaled values.
# NOTE(review): the scaler is fit on the full dataset, before the train/test split
# further down, so test rows influence the scaling range — consider fitting on the
# training portion only.
X[num_con] = mms.fit_transform(X[num_con])

In [None]:
# Dummy-encode the categorical columns and put them back next to the other features
# (non-categorical columns first, dummy columns after them)
dummies = pd.get_dummies(X[category])
temp = X.drop(columns=category)
X = pd.concat([temp, dummies], axis=1)

In [None]:
# Fill the remaining missing values with a 5-neighbour KNN imputer.
# fit_transform returns a plain array, so the column names are re-attached afterwards.
imputer = KNNImputer(n_neighbors=5)
imputed_values = imputer.fit_transform(X)
X = pd.DataFrame(imputed_values, columns=X.columns)

In [None]:
X.isna().sum()

In [None]:
print("Label '1': {}".format(sum(y==1)))
print("Label '0': {} \n".format(sum(y==0)))

🔷 HANDLING IMBALANCED DATA

In [None]:
# Handle Class Imbalance using SMOTE
sm = SMOTE(random_state = 42)

In [None]:
X_res, y_res = sm.fit_resample(X, y.ravel())

In [None]:
print('Shape of Training features: {}'.format(X_res.shape))
print('Shape of Training targets: {}'.format(y_res.shape))
print("Label '1': {}".format(sum(y_res==1)))
print("Label '0': {} \n".format(sum(y_res==0)))

In [None]:
Y_ser = pd.Series(y_res, name='is_stroke')
Y_ser

In [None]:
result = pd.merge(X_res,Y_ser, left_index=True, right_index=True)

In [None]:
# Split Dataset into Training and Testing Sets
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.2, random_state=42)

In [None]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

🔷 MODEL DEVELOPMENT

In [None]:
# Build and Train MLP Classifier
# hidden_layer_sizes takes one entry per hidden layer. `(100)` is just the int
# 100, so the one-element tuple is written explicitly: a single hidden layer of
# 100 units. random_state fixes the weight initialisation for reproducibility.
model_mlp = MLPClassifier(hidden_layer_sizes=(100,), random_state=2021)
model_mlp.fit(X_train, y_train)

In [None]:
y_pred_mlp = model_mlp.predict(X_test)

In [None]:
mlp_report = classification_report(y_test, y_pred_mlp, output_dict=True, target_names=['Healthy',"Stroke"])
pd.DataFrame(mlp_report).transpose()

In [None]:
mlp_cf = confusion_matrix(y_test, y_pred_mlp)

In [None]:
from keras import models
from keras import layers
from tensorflow.keras.utils import to_categorical

In [None]:
# Convert the train/test labels with Keras' to_categorical.
# NOTE(review): `train_labels` / `test_labels` are not referenced by any other
# visible cell — confirm whether this cell (and the Keras imports) is still needed.
train_labels = to_categorical(y_train)
test_labels = to_categorical(y_test)

In [None]:
# Build and Train KNN Classifier
# Uses the estimator's default hyperparameters and the same training data as the MLP.
model_knn = KNeighborsClassifier()
model_knn.fit(X_train, y_train)

🔷 MODEL EVALUATION

In [None]:
# Predict the test-set labels with the trained KNN model
y_pred_knn = model_knn.predict(X_test)

In [None]:
# Display the KNN predictions for the test set
y_pred_knn

In [None]:
# Per-class precision / recall / f1 for the KNN model, kept as a dict for the
# final comparison table and displayed here as a table
knn_report = classification_report(
    y_test,
    y_pred_knn,
    target_names=['Healthy', "Stroke"],
    output_dict=True,
)
pd.DataFrame(knn_report).T

In [None]:
# Confusion matrix of the KNN predictions on the test set.
# NOTE(review): `knn_cf` is computed but not displayed or used in the visible cells.
knn_cf = confusion_matrix(y_test, y_pred_knn)

In [None]:
# Side-by-side comparison of both models: one row per model, one column per metric.
# Every column takes one value from each report (MLP first, then KNN) so that all
# columns match the two-entry index. Previously six columns held only the KNN
# value, which makes the DataFrame constructor raise on mismatched lengths.
model_reports = [mlp_report, knn_report]
metrics = pd.DataFrame({'accuracy' : [rep['accuracy'] for rep in model_reports],
                        'precision_0' : [rep['Healthy']['precision'] for rep in model_reports],
                        'recall_0' : [rep['Healthy']['recall'] for rep in model_reports],
                        'f1-score_0' : [rep['Healthy']['f1-score'] for rep in model_reports],
                        'precision_1' : [rep['Stroke']['precision'] for rep in model_reports],
                        'recall_1' : [rep['Stroke']['recall'] for rep in model_reports],
                        'f1-score_1' : [rep['Stroke']['f1-score'] for rep in model_reports]},
                        index=['Model Multi-Layer Perceptron','Model K-Nearest Neighbors'])
# Two-level header: class name on top, metric name below (accuracy has no class)
multiheader = [('','accuracy'),
               ('Healthy', 'precision'),
               ('Healthy', 'recall'),
               ('Healthy', 'f1-score'),
               ('Stroke', 'precision'),
               ('Stroke', 'recall'),
               ('Stroke', 'f1-score')]
metrics.columns = pd.MultiIndex.from_tuples(multiheader)
metrics

🔷 MODEL COMPARISON & EXPORT RESULTS

In [None]:
# Write the comparison table to disk; the model names in the index are written
# as well (pandas' default for to_csv)
metrics.to_csv("Final_Model_Results.csv")