In [38]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import tensorflow as tf

In [2]:
# Load the stroke dataset from a CSV file addressed by a relative path
# (resolved against the notebook's working directory) and preview the
# first rows to eyeball the available columns.
stroke_df = pd.read_csv('healthcare-dataset-stroke-data.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [3]:
# Drop every row that contains at least one missing value (the preview
# shows a NaN bmi, for example). The result is rebound to the same name
# instead of using inplace=True, so the cell does not mutate the loaded
# frame in place and downstream cells still see `stroke_df`.
stroke_df = stroke_df.dropna()

In [4]:
# Preview the frame again after the null-row removal. The index labels are
# not renumbered by the drop, so gaps in the index are expected here.
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [5]:
# Remove the 'id' column: it is a record identifier, not a feature that
# helps predict stroke vs. no stroke.
# errors='ignore' makes the cell safe to re-run; the previous in-place drop
# raised KeyError on a second execution because 'id' was already gone.
stroke_df = stroke_df.drop(columns='id', errors='ignore')

In [6]:
# Inspect column dtypes and non-null counts to identify the string (object)
# columns that must be converted to numeric values before modelling.
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   Residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.2+ KB


In [7]:
# List the distinct Residence_type values to confirm the number of
# categories is small enough that no binning is needed before encoding.
stroke_df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [8]:
# List the distinct work_type values to confirm the number of categories
# is small enough that no binning is needed before encoding.
stroke_df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [10]:
# Names of the string-valued columns that need one-hot encoding.
columns_of_interest = [
    'gender',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
]

# One indicator column is produced per (column, category) pair; the
# numeric columns are left out here and joined back in a later cell.
encoded = pd.get_dummies(data=stroke_df[columns_of_interest])
encoded.head(10)

Unnamed: 0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
2,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
3,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
4,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
5,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
6,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
7,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
9,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0
10,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
11,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1


In [11]:
# Concatenate the original dataset and the encoded dataset side by side
# (aligned on the row index) and save the result under a new name.
encoded_df = pd.concat([stroke_df, encoded], axis="columns")

In [14]:
# Drop the original string columns from the concatenated dataset now that
# their one-hot encoded counterparts are present.
# `columns=` already selects the axis, so the redundant axis=1 is removed.
# errors='ignore' lets this self-referential rebinding be re-run without a
# KeyError once the columns are already gone.
encoded_df = encoded_df.drop(columns=columns_of_interest, errors='ignore')

In [15]:
# View the new dataset to confirm the string columns are gone and that no
# column appears twice after the concatenate-then-drop steps.
encoded_df.head(10)

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
0,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
2,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
3,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
4,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
5,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
6,74.0,1,1,70.09,27.4,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
7,69.0,0,0,94.39,22.8,1,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
9,78.0,0,0,58.57,24.2,1,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0
10,81.0,1,0,80.43,29.7,1,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0
11,61.0,0,1,120.46,36.8,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


In [16]:
# Separate features from the 'stroke' target and hold out a test split.
# Stratifying on the target keeps the stroke / no-stroke ratio the same in
# both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    encoded_df.drop(columns='stroke'),
    encoded_df['stroke'],
    stratify=encoded_df['stroke'],
    random_state=42,
)

# Learn the scaling statistics from the training split only, so nothing
# about the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted scaler to both splits.
X_test_scaled = scaler.transform(X_test)
X_train_scaled = scaler.transform(X_train)

In [18]:
# Display the scaled training features (a NumPy array produced by
# scaler.transform) as a sanity check on the standardisation.
X_train_scaled

array([[ 0.6665675 , -0.32568345,  4.35952231, ..., -0.4548464 ,
        -0.78252934,  2.32896624],
       [-0.67171363, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       [-0.27022929, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       ...,
       [-0.44866678, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       [ 0.53273938, -0.32568345, -0.22938293, ..., -0.4548464 ,
        -0.78252934,  2.32896624],
       [-1.89222602, -0.32568345, -0.22938293, ..., -0.4548464 ,
        -0.78252934, -0.42937505]])

In [19]:
# Display the scaled test features, transformed with the scaler that was
# fitted on the training split.
X_test_scaled

array([[-0.58249489, -0.32568345, -0.22938293, ..., -0.4548464 ,
        -0.78252934, -0.42937505],
       [ 0.26508316, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       [ 1.60336429, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       ...,
       [-0.58249489, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505],
       [-0.00257307, -0.32568345, -0.22938293, ..., -0.4548464 ,
        -0.78252934, -0.42937505],
       [ 0.75578624, -0.32568345, -0.22938293, ..., -0.4548464 ,
         1.27790736, -0.42937505]])

In [41]:
# Set up the logistic regression estimator: lbfgs solver, an iteration cap
# of 800, and a fixed random_state. The bare name on the last line shows
# the estimator's repr as the cell output.
classifier = LogisticRegression(random_state=1, max_iter=800, solver='lbfgs')
classifier

In [42]:
# Fit (train) the logistic regression on the training split.
# NOTE(review): this uses the unscaled X_train, not X_train_scaled; the
# scoring and prediction cells below use the matching unscaled X_test.
classifier.fit(X_train, y_train)

In [43]:
# Score the fitted model on both splits (unscaled features, matching how
# the model was trained), then report the two numbers.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.9573485465906004
Testing Data Score: 0.9576547231270358


In [44]:
# Predict the outcome for every row of the test split.
predictions = classifier.predict(X_test)

# Pair each prediction with the true label; the original row labels are
# discarded so the comparison table is numbered from 0.
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [45]:
# Calculate and display the accuracy score for the test dataset, using the
# logistic-regression `predictions` from the previous cell. The value
# matches the "Testing Data Score" printed by classifier.score above.
accuracy_score(y_test, predictions)

0.9576547231270358

# Move to Neural Network Code

In [47]:
# Number of feature columns, used as the input width of the neural network.
# Read from the unscaled X_train; the network is later trained on
# X_train_scaled, which is presumably the same width — TODO confirm.
number_input_features = X_train.shape[1]

In [48]:
# Display the feature count as a sanity check before building the network.
number_input_features

21

In [69]:
# Seed Python, NumPy and TensorFlow so weight initialisation and the
# shuffling done during training start from a fixed state on a fresh
# kernel. 42 mirrors the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

nn = tf.keras.models.Sequential()

# Declare the input width explicitly rather than through the legacy
# `input_dim` keyword on the first Dense layer.
nn.add(tf.keras.Input(shape=(number_input_features,)))

# First hidden layer
nn.add(tf.keras.layers.Dense(units=75, activation="relu"))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=60, activation="relu"))

# Output layer: a single sigmoid unit for the binary stroke target
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_18 (Dense)            (None, 75)                1650      
                                                                 
 dense_19 (Dense)            (None, 60)                4560      
                                                                 
 dense_20 (Dense)            (None, 1)                 61        
                                                                 
Total params: 6271 (24.50 KB)
Trainable params: 6271 (24.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [70]:
# Configure training: Adam optimizer, binary cross-entropy loss (the
# output layer is a single sigmoid unit), and accuracy as the metric.
nn.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [71]:
# Run 80 epochs over the scaled training features; the returned object is
# kept in `fit_model` for later inspection.
fit_model = nn.fit(x=X_train_scaled, y=y_train, epochs=80)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [72]:
# Measure loss and accuracy on the held-out, scaled test split.
model_loss, model_accuracy = nn.evaluate(
    x=X_test_scaled,
    y=y_test,
    verbose=2,
)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.2855 - accuracy: 0.9414 - 174ms/epoch - 4ms/step
Loss: 0.2854765057563782, Accuracy: 0.9413681030273438


# RandomForest Model - we ended up not using this code

In [31]:
# Create a random forest classifier with 500 trees and a fixed random_state.
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [32]:
# Fit the forest on the scaled training features and rebind the name to
# the value returned by fit.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [33]:
# Predict the test split with the random forest.
# NOTE: this rebinds `predictions`, replacing the logistic-regression
# predictions stored under the same name earlier in the notebook.
predictions = rf_model.predict(X_test_scaled)

In [34]:
# Show the (truncated) array of random-forest predictions.
print(predictions)

[0 0 0 ... 0 0 0]


In [35]:
# Calculating the confusion matrix.
# labels=[0, 1] pins the matrix to 2x2 in class order 0, 1 so it always
# matches the hard-coded row/column labels below, even if one class were
# absent from both y_test and predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [36]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
# zero_division=0 reports 0.0 for a class that is never predicted (the same
# value as before) without emitting an UndefinedMetricWarning per metric.
print(classification_report(y_test, predictions, zero_division=0))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1176,0
Actual 1,52,0


Accuracy Score : 0.9576547231270358
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       0.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.92      0.96      0.94      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
