In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import RandomOverSampler

from sqlalchemy import create_engine

In [2]:
# Reading in data
df = pd.read_csv('Resources/healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Removing outlier values in 'gender' column, converting to numerical values
df = df[df['gender'] != 'Other']
df.gender = df.gender.replace({'Male':0, 'Female':1})

In [4]:
# Removing the 'id' column
df = df.drop('id', axis=1)

In [5]:
# Converting other binary data points into numerical values
df.ever_married = df.ever_married.replace({'No':0, 'Yes':1})
df.Residence_type = df.Residence_type.replace({'Urban':0, 'Rural':1})
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,0,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,1,Self-employed,1,202.21,,never smoked,1
2,0,80.0,0,1,1,Private,1,105.92,32.5,never smoked,1
3,1,49.0,0,0,1,Private,0,171.23,34.4,smokes,1
4,1,79.0,1,0,1,Self-employed,1,174.12,24.0,never smoked,1


In [6]:
# Filling in missing values from 'bmi' column, using a decision tree model that predicts the missing values
# Code originally written by Thomas Konstantin
DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=1))
                              ])
X = df[['age','gender','bmi']].copy()

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi

# Previewing the data
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,Private,0,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,1,Self-employed,1,202.21,29.879487,never smoked,1
2,0,80.0,0,1,1,Private,1,105.92,32.5,never smoked,1
3,1,49.0,0,0,1,Private,0,171.23,34.4,smokes,1
4,1,79.0,1,0,1,Self-employed,1,174.12,24.0,never smoked,1


In [7]:
# Converting dataset into numerical values
df_num = pd.get_dummies(df)
df_num.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,stroke,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,67.0,0,1,1,0,228.69,36.6,1,0,0,1,0,0,0,1,0,0
1,1,61.0,0,0,1,1,202.21,29.879487,1,0,0,0,1,0,0,0,1,0
2,0,80.0,0,1,1,1,105.92,32.5,1,0,0,1,0,0,0,0,1,0
3,1,49.0,0,0,1,0,171.23,34.4,1,0,0,1,0,0,0,0,0,1
4,1,79.0,1,0,1,1,174.12,24.0,1,0,0,0,1,0,0,0,1,0


In [8]:
# Viewing columns
df_num.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'Residence_type', 'avg_glucose_level', 'bmi', 'stroke',
       'work_type_Govt_job', 'work_type_Never_worked', 'work_type_Private',
       'work_type_Self-employed', 'work_type_children',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')

In [9]:
# Checking how many of each stroke result is in dataset, confirming imbalance
df_num['stroke'].value_counts()

0    4860
1     249
Name: stroke, dtype: int64

In [10]:
# Upsampling data so the dataset is not skewed towards 'no stroke' values
y = df_num['stroke']
X = df_num.drop('stroke', axis=1)

ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X, y)
ros_df = X_resampled.assign(Stroke = y_resampled)

# Verifying counts of target values
ros_df['Stroke'].value_counts()
#ros_df.head()

1    4860
0    4860
Name: Stroke, dtype: int64

In [11]:
# Creating connection to sqlite
engine = create_engine('sqlite:///stroke_prediction_data.sqlite', echo=False)

# Exporting encoded data to sqlite
ros_df.to_sql(name='stroke_prediction_data', con=engine, if_exists='append', index=True)

In [12]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=1)

In [13]:
# Setting up StandardScaler
scaler = StandardScaler()

# Fitting training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Pipelines of different models, to check accuracy after upsampling data
svm_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',SVC(random_state=1))])
lrg_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',LogisticRegression(random_state=1))])
dtc_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',DecisionTreeClassifier(random_state=1))])

# Fitting data to pipelines
svm_pipe.fit(X_train_scaled, y_train)
lrg_pipe.fit(X_train_scaled, y_train)
dtc_pipe.fit(X_train_scaled, y_train)

In [15]:
# Classification report for SVC model
svmpred = svm_pipe.predict(X_test_scaled)
print(classification_report(y_test, svmpred, target_names=['no_stroke','stroke']))

              precision    recall  f1-score   support

   no_stroke       0.88      0.78      0.83      1209
      stroke       0.81      0.90      0.85      1221

    accuracy                           0.84      2430
   macro avg       0.84      0.84      0.84      2430
weighted avg       0.84      0.84      0.84      2430



In [16]:
# Classification report for Logistic Regression model
lrgpred = lrg_pipe.predict(X_test_scaled)
print(classification_report(y_test, lrgpred, target_names=['no_stroke','stroke']))

              precision    recall  f1-score   support

   no_stroke       0.81      0.73      0.77      1209
      stroke       0.75      0.83      0.79      1221

    accuracy                           0.78      2430
   macro avg       0.78      0.78      0.78      2430
weighted avg       0.78      0.78      0.78      2430



In [17]:
# Classification report for Decision Tree Classifier model
dtcpred = dtc_pipe.predict(X_test_scaled)
print(classification_report(y_test, dtcpred, target_names=['no_stroke','stroke']))

              precision    recall  f1-score   support

   no_stroke       1.00      0.95      0.97      1209
      stroke       0.95      1.00      0.97      1221

    accuracy                           0.97      2430
   macro avg       0.97      0.97      0.97      2430
weighted avg       0.97      0.97      0.97      2430



In [18]:
# Test accuracy for Decision Tree Classifier model
print('Test Acc: %.3f' % dtc_pipe.score(X_test_scaled, y_test))

Test Acc: 0.973
