In [1]:
# Bring in dependencies
import numpy as np
import pandas as pd
from pathlib import Path

# ML models and train split dependenceis
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Preprocessing dependencies
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Model accuracy evaluation dependency
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, recall_score



In [2]:
# Fetch data from CSV on local drive
file_path = Path("../Project_4/Resources/healthcare-dataset-stroke-data.csv")
df = pd.read_csv(file_path)
df.head()
                 

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Preprocessing Step #1 Review:  Identify Null values?
null = df.isnull().sum()
null



id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [5]:
# Preprocessing Step #1: Drop null values
df = df.dropna(how='any', axis=0)
null = df.isnull().sum()
null

id                   0
gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [6]:
# Preprocessing Step #2: Reduce features by dropping id and work_type, 
df = df.drop(['id', 'work_type'], axis=1)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Rural,166.29,25.6,formerly smoked,0


In [7]:
# Drop the labels to create the X data (features), create y labels
X = df.drop('stroke', axis=1)
X

y = df["stroke"].values
y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [8]:
# Preprocessing Step #2:  one-hot-encode the entire dataframe
X_dummies = pd.get_dummies(X)
print(X_dummies.columns)
X_dummies

Index(['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'bmi',
       'gender_Female', 'gender_Male', 'gender_Other', 'ever_married_No',
       'ever_married_Yes', 'Residence_type_Rural', 'Residence_type_Urban',
       'smoking_status_Unknown', 'smoking_status_formerly smoked',
       'smoking_status_never smoked', 'smoking_status_smokes'],
      dtype='object')


Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,0,1,0,0,1,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,0,1,0,0,1,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,0,0,0,1,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,0,0,0,1,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,0,1,0,0,1,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5104,13.0,0,0,103.08,18.6,1,0,0,1,0,1,0,1,0,0,0
5106,81.0,0,0,125.20,40.0,1,0,0,0,1,0,1,0,0,1,0
5107,35.0,0,0,82.99,30.6,1,0,0,0,1,1,0,0,0,1,0
5108,51.0,0,0,166.29,25.6,0,1,0,0,1,1,0,0,1,0,0


In [9]:
# Split data into Train and Test data
X_train, X_test, y_train, y_test = train_test_split(X_dummies, y, random_state=1)

In [10]:
# Create a StandardScaler model and fit it to the train and test data
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
#Train a Logistic Regression model with scaled data and print the model score
clasgistisifier_scaled = LocRegression(max_iter=10000)
classifier_scaled.fit(X_train_scaled, y_train)

y_true = y_test
y_pred = classifier_scaled.predict(X_test_scaled)

print(f'Actual:\t\t{list(y_test[:20])}')
print(f'Predicted:\t{list(classifier_scaled.predict(X_test_scaled[:20]))}')
print(f"Training Data Score: {classifier_scaled.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier_scaled.score(X_test_scaled, y_test)}")
print(f'Area Under Curve: {roc_auc_score(y_test, y_pred)}')






Actual:		[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Training Data Score: 0.9592502037489813
Testing Data Score: 0.9527687296416938
Area Under Curve: 0.5


In [18]:
# Evaluate the accuracy of the Logistic Regression model based on a classification report
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, recall_score
print(classification_report(y_test, classifier_scaled.predict(X_test_scaled)))

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1170
           1       0.00      0.00      0.00        58

    accuracy                           0.95      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.91      0.95      0.93      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true, y_pred)

array([[1170,    0],
       [  58,    0]], dtype=int64)