In [1]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

from imblearn.over_sampling import SMOTE

In [10]:
# Reading in data
df = pd.read_csv('Resources/healthcare-dataset-stroke-data.csv')
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [11]:
# Removing outlier values in 'gender' column
df = df[df['gender'] != 'Other']
df.gender.value_counts()

Female    2994
Male      2115
Name: gender, dtype: int64

In [12]:
# Removing the 'id' column
df = df.drop('id', axis=1)

In [13]:
# Filling in missing values from 'bmi' column, using a decision tree model that predicts the missing values
# Code originally written by Thomas Konstantin

DT_bmi_pipe = Pipeline( steps=[ 
                               ('scale',StandardScaler()),
                               ('lr',DecisionTreeRegressor(random_state=1))
                              ])
X = df[['age','gender','bmi']].copy()
X.gender = X.gender.replace({'Male':0,'Female':1,'Other':-1}).astype(np.uint8)

Missing = X[X.bmi.isna()]
X = X[~X.bmi.isna()]
Y = X.pop('bmi')
DT_bmi_pipe.fit(X,Y)
predicted_bmi = pd.Series(DT_bmi_pipe.predict(Missing[['age','gender']]),index=Missing.index)
df.loc[Missing.index,'bmi'] = predicted_bmi

In [14]:
# Previewing the data
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,29.879487,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [15]:
df['gender'] = df['gender'].replace(to_replace=['Male', 'Female'], value=[0, 1])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,Yes,Self-employed,Rural,202.21,29.879487,never smoked,1
2,0,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,1,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [24]:
df['ever_married'] = df['ever_married'].replace(to_replace=['Yes', 'No'], value=[0, 1])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,Private,Urban,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,0,Self-employed,Rural,202.21,29.879487,never smoked,1
2,0,80.0,0,1,0,Private,Rural,105.92,32.5,never smoked,1
3,1,49.0,0,0,0,Private,Urban,171.23,34.4,smokes,1
4,1,79.0,1,0,0,Self-employed,Rural,174.12,24.0,never smoked,1


In [25]:
df['Residence_type'] = df['Residence_type'].replace(to_replace=['Urban', 'Rural'], value=[0, 1])
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,Private,0,228.69,36.6,formerly smoked,1
1,1,61.0,0,0,0,Self-employed,1,202.21,29.879487,never smoked,1
2,0,80.0,0,1,0,Private,1,105.92,32.5,never smoked,1
3,1,49.0,0,0,0,Private,0,171.23,34.4,smokes,1
4,1,79.0,1,0,0,Self-employed,1,174.12,24.0,never smoked,1


In [None]:
df['work_type'] = df['work_type'].replace(to_replace=['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'], value=[0, 1])
df.head()

In [23]:
#df.smoking_status.unique()
#array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],


#'df.work_type.unique()'
#worktype = '''array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      #dtype=object)'''

#df.Residence_type.unique()
#array(['Urban', 'Rural'], dtype=object)


array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [7]:
# Upsampling the data to resolve imbalance
upsample = SMOTE()

eval_df = df.sample(int(df.shape[0]*0.2), random_state=1)
train_df = df.drop(index=eval_df.index)

X_eval, y_eval = eval_df.drop('stroke',axis=1), eval_df['stroke']
X, y = train_df.drop('stroke',axis=1), train_df['stroke']

X, y = upsample.fit_resample(X, y)
up_df = X.assign(Stroke = y)

X_eval, y_eval = upsample.fit_resample(X_eval, y_eval)
up_eval_df = X_eval.assign(Stroke = y_eval)

ValueError: could not convert string to float: 'Male'

In [10]:
df_num = pd.get_dummies(df)
df_num.shape

(5109, 21)

In [11]:
y = df_num ['stroke']
X = df_num.drop('stroke', axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [13]:
# Setting up StandardScaler
scaler = StandardScaler()

# Fitting training data
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
model = SVC(kernel='linear')
model.fit(X_train_scaled, y_train)

SVC(kernel='linear')

In [15]:
print('Test Acc: %.3f' % model.score(X_test_scaled, y_test))

Test Acc: 0.941


In [16]:
from sklearn.metrics import classification_report
predictions = model.predict(X_test_scaled)
print(classification_report(y_test, predictions,
                            target_names=['no_stroke','stroke']))

              precision    recall  f1-score   support

   no_stroke       0.94      1.00      0.97      1203
      stroke       0.00      0.00      0.00        75

    accuracy                           0.94      1278
   macro avg       0.47      0.50      0.48      1278
weighted avg       0.89      0.94      0.91      1278



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#Next steps: upsample data so the dataset is not skewed towards 'no stroke' values

SMOTE()

In [None]:
#Pipelines of different models, to check accuracy after upsampling data
svm_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',SVC(random_state=42))])
lrg_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',LogisticRegression(random_state=42))])
dtc_pipe = Pipeline(steps=[('scale',StandardScaler()),('DT',DecisionTreeClassifier(random_state=42))])