In [189]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [190]:
# read csv file
df = pd.read_csv("./Datasets/stroke_data_cleaned.csv")
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,3,36.6,2,1
1,0,80.0,0,1,1,3,1,2,32.5,0,1
2,1,49.0,0,0,1,3,0,3,34.4,1,1
3,1,79.0,1,0,1,2,1,3,24.0,0,1
4,0,81.0,0,0,1,3,0,3,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
4904,1,13.0,0,0,0,0,1,2,18.6,4,0
4905,1,81.0,0,0,1,2,0,2,40.0,0,0
4906,1,35.0,0,0,1,2,1,1,30.6,0,0
4907,0,51.0,0,0,1,3,1,3,25.6,2,0


### Building Classification model (preprocessing)


In [180]:
model = RandomForestClassifier()

In [181]:
# define the features set
X = df.drop(columns=['stroke'], axis=1)
y = df['stroke']
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,67.0,0,1,3,0,3,36.6,2
1,0,80.0,0,1,3,1,2,32.5,0
2,1,49.0,0,0,3,0,3,34.4,1
3,1,79.0,1,0,2,1,3,24.0,0
4,0,81.0,0,0,3,0,3,29.0,2


In [182]:
# Splitting into Train and Test sets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=10000)


In [183]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [184]:
print(f'"X_train" {X_train_scaled.shape}')
print(f'"X_test" {X_test_scaled.shape}')
print(f'"y_train" {y_train.shape}')
print(f'"y_test" {y_test.shape}')
#85,15 split

"X_train" (4172, 9)
"X_test" (737, 9)
"y_train" (4172,)
"y_test" (737,)


In [185]:
#fitting the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [186]:
# making predictions using the test data
preds = model.predict(X_test_scaled)


In [187]:
print(classification_report(y_test, preds, zero_division=1)) # test_size=0.15

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       707
           1       0.00      0.00      0.00        30

    accuracy                           0.96       737
   macro avg       0.48      0.50      0.49       737
weighted avg       0.92      0.96      0.94       737



### Evaluating the model 

In [188]:
# calculating the confusion matrix test_size=0.15
#random_state=10000
cm = confusion_matrix(y_test, preds)

# create DataFrame from confusion matrix
cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,705,2
Actual 1,30,0


In [115]:
# calculating the confusion matrix test_size=0.15
#random_state=10000
cm = confusion_matrix(y_test, preds)

# create DataFrame from confusion matrix
cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,704,3
Actual 1,29,1


In [94]:
# calculating the confusion matrix test_size=0.20
#random_state=10000
cm = confusion_matrix(y_test, preds)

# create DataFrame from confusion matrix
cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,940,1
Actual 1,40,1


In [73]:
# calculating the confusion matrix test_size=0.20
#random_state=1000
cm = confusion_matrix(y_test, preds)

# create DataFrame from confusion matrix
cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,927,2
Actual 1,52,1


In [65]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, preds)

In [66]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, preds))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,944,4
Actual 1,34,0


Accuracy Score : 0.9613034623217923
Classification Report
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       948
           1       0.00      0.00      0.00        34

    accuracy                           0.96       982
   macro avg       0.48      0.50      0.49       982
weighted avg       0.93      0.96      0.95       982



### Rank importance of features

In [134]:
# calculate importance of features in Random Forest model
importances = model.feature_importances_
importances

array([0.04229099, 0.2842503 , 0.03109188, 0.0284288 , 0.01739746,
       0.05873765, 0.04404376, 0.07240987, 0.33839531, 0.08295398])

In [135]:
# sort features by their importance (most import first)
sorted(zip(model.feature_importances_, X.columns), reverse=True)

[(0.3383953087356457, 'bmi'),
 (0.28425029824705356, 'age'),
 (0.08295398481256129, 'smoking_status'),
 (0.072409867951349, 'avg_glucose_level'),
 (0.058737654120824316, 'work_type'),
 (0.04404376346381904, 'Residence_type'),
 (0.04229098593487658, 'gender'),
 (0.031091884676481697, 'hypertension'),
 (0.028428796468858646, 'heart_disease'),
 (0.0173974555885301, 'ever_married')]

### TESTING OTHER MODELS

### Building Logistic Regression Model


In [20]:
#separate features
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [21]:
# split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(3681, 10)

In [22]:
# create logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [23]:
# train model using train data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

### Making predicitons

In [25]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)


In [26]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9576547231270358


In [27]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1176    0]
 [  52    0]]


In [29]:
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       1.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.98      0.50      0.49      1228
weighted avg       0.96      0.96      0.94      1228



In [30]:
#zero_division“warn”, 0 or 1, default=”warn”
#Sets the value to return when there is a zero division. If set to “warn”, this acts as 0, but warnings are also raised.