In [95]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd
import tensorflow as tf

In [96]:
# Import our input dataset
stroke_df = pd.read_csv('healthcare-dataset-stroke-data.csv')
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [97]:
stroke_df.dropna(inplace=True)

In [98]:
stroke_df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [99]:
stroke_df.set_index(stroke_df['id'], inplace= True)

In [100]:
stroke_df.drop('id', axis=1, inplace=True)

In [101]:
stroke_df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [102]:
stroke_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4909 entries, 9046 to 44679
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             4909 non-null   object 
 1   age                4909 non-null   float64
 2   hypertension       4909 non-null   int64  
 3   heart_disease      4909 non-null   int64  
 4   ever_married       4909 non-null   object 
 5   work_type          4909 non-null   object 
 6   Residence_type     4909 non-null   object 
 7   avg_glucose_level  4909 non-null   float64
 8   bmi                4909 non-null   float64
 9   smoking_status     4909 non-null   object 
 10  stroke             4909 non-null   int64  
dtypes: float64(3), int64(3), object(5)
memory usage: 460.2+ KB


In [103]:
stroke_df.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


In [104]:
stroke_df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [105]:
stroke_df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [106]:
columns_of_interest = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status']
encoded = pd.get_dummies(stroke_df[columns_of_interest])
encoded.head(10)

Unnamed: 0_level_0,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9046,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
31112,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
60182,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
1665,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
56669,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
53882,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
10434,1,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
60491,1,0,0,0,1,0,0,1,0,0,0,1,1,0,0,0
12109,1,0,0,0,1,0,0,1,0,0,1,0,0,0,1,0
12095,1,0,0,0,1,1,0,0,0,0,1,0,0,0,0,1


In [107]:
encoded_df = pd.concat([stroke_df,encoded], axis=1)

In [108]:
encoded_df = encoded_df.drop(columns=columns_of_interest, axis=1)

In [109]:
encoded_df.head(10)

Unnamed: 0_level_0,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9046,67.0,0,1,228.69,36.6,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
31112,80.0,0,1,105.92,32.5,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
60182,49.0,0,0,171.23,34.4,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1665,79.0,1,0,174.12,24.0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
56669,81.0,0,0,186.21,29.0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
53882,74.0,1,1,70.09,27.4,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
10434,69.0,0,0,94.39,22.8,1,1,0,0,1,...,0,1,0,0,0,1,0,0,1,0
60491,78.0,0,0,58.57,24.2,1,1,0,0,0,...,0,1,0,0,0,1,1,0,0,0
12109,81.0,1,0,80.43,29.7,1,1,0,0,0,...,0,1,0,0,1,0,0,0,1,0
12095,61.0,0,1,120.46,36.8,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1


# Machine Learning




## PCA


In [110]:
to_scale = ['age','avg_glucose_level','bmi',]
stroke_scaled = StandardScaler().fit_transform(
    stroke_df[to_scale]
)

In [111]:
stroke_scaled_df = pd.DataFrame(stroke_scaled, columns = to_scale)
stroke_scaled_df['id'] = stroke_df.index
stroke_scaled_df.set_index(stroke_scaled_df['id'], inplace= True)
stroke_scaled_df.drop('id', axis=1, inplace=True)
stroke_scaled_df.head()

Unnamed: 0_level_0,age,avg_glucose_level,bmi
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
9046,1.070138,2.777698,0.981345
31112,1.646563,0.013842,0.459269
60182,0.272012,1.484132,0.701207
1665,1.602222,1.549193,-0.623083
56669,1.690903,1.821368,0.013595


In [112]:
to_concat = stroke_df[['hypertension','heart_disease','stroke']]
to_concat = pd.concat([to_concat,encoded], axis = 1)
to_concat.head()

Unnamed: 0_level_0,hypertension,heart_disease,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
9046,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0
31112,0,1,1,0,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0
60182,0,0,1,1,0,0,0,1,0,0,1,0,0,0,1,0,0,0,1
1665,1,0,1,1,0,0,0,1,0,0,0,1,0,1,0,0,0,1,0
56669,0,0,1,0,1,0,0,1,0,0,1,0,0,0,1,0,1,0,0


In [114]:
final_scaled = pd.concat([stroke_scaled_df,to_concat], axis=1)
final_scaled.head()

Unnamed: 0_level_0,age,avg_glucose_level,bmi,hypertension,heart_disease,stroke,gender_Female,gender_Male,gender_Other,ever_married_No,...,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban,smoking_status_Unknown,smoking_status_formerly smoked,smoking_status_never smoked,smoking_status_smokes
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9046,1.070138,2.777698,0.981345,0,1,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0
31112,1.646563,0.013842,0.459269,0,1,1,0,1,0,0,...,0,1,0,0,1,0,0,0,1,0
60182,0.272012,1.484132,0.701207,0,0,1,1,0,0,0,...,0,1,0,0,0,1,0,0,0,1
1665,1.602222,1.549193,-0.623083,1,0,1,1,0,0,0,...,0,0,1,0,1,0,0,0,1,0
56669,1.690903,1.821368,0.013595,0,0,1,0,1,0,0,...,0,1,0,0,0,1,0,1,0,0


In [117]:
pca = PCA(n_components = 2)
stroke_pca_data = pca.fit_transform(final_scaled)
pca.explained_variance_ratio_

array([0.30866108, 0.15001373])

In [119]:
df_stroke_pca = pd.DataFrame(stroke_pca_data, columns = ['PC1','PC2'])
df_stroke_pca.set_index(final_scaled.index, inplace=True)
df_stroke_pca.head()

Unnamed: 0_level_0,PC1,PC2
id,Unnamed: 1_level_1,Unnamed: 2_level_1
9046,2.506454,2.130528
31112,1.60629,-0.410911
60182,1.353443,0.938284
1665,1.620506,1.141683
56669,2.024892,1.337229


In [122]:
def pca_cor(pca_df, og_df):
    for pca in pca_df:
        print(pca)
        print('*'*50)
        print(
            pd.DataFrame(
                [
                    [x, pca_df[pca].corr(og_df[x])]
                    for x in og_df
                ],
                columns=['col','corr']
            ).sort_values('corr', ascending=False).reset_index(drop=True).head()
        )
        print()
pca_cor(df_stroke_pca,final_scaled)

PC1
**************************************************
                 col      corr
0                age  0.849556
1   ever_married_Yes  0.727200
2                bmi  0.691064
3  avg_glucose_level  0.485275
4       hypertension  0.301003

PC2
**************************************************
                      col      corr
0       avg_glucose_level  0.868306
1      work_type_children  0.286447
2         ever_married_No  0.227023
3             gender_Male  0.171028
4  smoking_status_Unknown  0.166195



## Train-test splitting


In [43]:
X_train, X_test, y_train, y_test = train_test_split(encoded_df.drop('stroke', axis=1), encoded_df['stroke'], stratify=encoded_df['stroke'], random_state=42)

scaler = StandardScaler()

X_train_encoded = X_train.drop(columns=['age','avg_glucose_level','bmi'])
X_test_encoded = X_test.drop(columns=['age','avg_glucose_level','bmi'])

train_to_scale = X_train[['age','avg_glucose_level','bmi']]
test_to_scale = X_test[['age','avg_glucose_level','bmi']]

# Fit the scaler on training data
scaler.fit(train_to_scale)

# Transform the training and testing data
X_train_scaled = scaler.transform(train_to_scale)
X_test_scaled = scaler.transform(test_to_scale)

X_train_scaled = pd.DataFrame(X_train_scaled, columns= ['age','avg_glucose_level','bmi'])
X_train_scaled.set_index(X_train_encoded.index, inplace=True)
X_train_scaled = pd.concat([X_train_scaled, X_train_encoded], axis=1)

X_test_scaled = pd.DataFrame(X_test_scaled, columns= ['age','avg_glucose_level','bmi'])
X_test_scaled.set_index(X_test_encoded.index, inplace=True)
X_test_scaled = pd.concat([X_test_scaled, X_test_encoded], axis=1)


### Logistic Regression Model

In [45]:
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=800,
                                random_state=1)
classifier

In [46]:
classifier.fit(X_train, y_train)

In [47]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.9573485465906004
Testing Data Score: 0.9576547231270358


In [48]:
predictions = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [49]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, predictions)

0.9576547231270358

In [51]:
scaled_classifier = LogisticRegression(solver='lbfgs',
                                       max_iter=800,
                                       random_state=1)
scaled_classifier

In [52]:
scaled_classifier.fit(X_train_scaled,y_train)

In [54]:
print(f"Training Data Score: {scaled_classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {scaled_classifier.score(X_test_scaled, y_test)}")

Training Data Score: 0.9573485465906004
Testing Data Score: 0.9576547231270358


In [55]:
scaled_predictions = scaled_classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": scaled_predictions, "Actual": y_test}).reset_index(drop=True)
results.head(10)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [56]:
# Display the accuracy score for the test dataset.
accuracy_score(y_test, scaled_predictions)

0.9576547231270358

### Neural Networking

In [91]:
number_input_features = X_train.shape[1]
nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=75, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=60, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_22 (Dense)            (None, 75)                1650      
                                                                 
 dense_23 (Dense)            (None, 60)                4560      
                                                                 
 dense_24 (Dense)            (None, 1)                 61        
                                                                 
Total params: 6271 (24.50 KB)
Trainable params: 6271 (24.50 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [92]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

In [93]:
# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=80)

Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80
Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80
Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80


In [94]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

39/39 - 0s - loss: 0.3069 - accuracy: 0.9397 - 190ms/epoch - 5ms/step
Loss: 0.3069401979446411, Accuracy: 0.9397394061088562


### Random Forests (Attempt)

In [62]:
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [63]:
rf_model = rf_model.fit(X_train_scaled, y_train)

In [64]:
predictions = rf_model.predict(X_test_scaled)

In [66]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [67]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,1176,0
Actual 1,52,0


Accuracy Score : 0.9576547231270358
Classification Report
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       0.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.92      0.96      0.94      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
