In [5]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [6]:
# read csv file
df = pd.read_csv("./Datasets/stroke_data_cleaned.csv")
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,3,36.6,2,1
1,0,80.0,0,1,1,3,1,2,32.5,0,1
2,1,49.0,0,0,1,3,0,3,34.4,1,1
3,1,79.0,1,0,1,2,1,3,24.0,0,1
4,0,81.0,0,0,1,3,0,3,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
4904,1,13.0,0,0,0,0,1,2,18.6,4,0
4905,1,81.0,0,0,1,2,0,2,40.0,0,0
4906,1,35.0,0,0,1,2,1,1,30.6,0,0
4907,0,51.0,0,0,1,3,1,3,25.6,2,0


### Building Classification model (preprocessing)


In [9]:
model = RandomForestClassifier()

In [10]:
# define the features set
X = df.drop(['stroke'], axis=1)
y = df['stroke']
X.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,0,67.0,0,1,1,3,0,3,36.6,2
1,0,80.0,0,1,1,3,1,2,32.5,0
2,1,49.0,0,0,1,3,0,3,34.4,1
3,1,79.0,1,0,1,2,1,3,24.0,0
4,0,81.0,0,0,1,3,0,3,29.0,2


In [11]:
# Splitting into Train and Test sets.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)


In [12]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
print(f'"X_train" {X_train_scaled.shape}')
print(f'"X_test" {X_test_scaled.shape}')
print(f'"y_train" {y_train.shape}')
print(f'"y_test" {y_test.shape}')
#80,20 split

"X_train" (3927, 10)
"X_test" (982, 10)
"y_train" (3927,)
"y_test" (982,)


In [14]:
#fitting the model
model.fit(X_train_scaled, y_train)

RandomForestClassifier()

In [15]:
# making predictions using the test data
preds = model.predict(X_test_scaled)


In [16]:
print(classification_report(y_test, preds, zero_division=1)) # test_size=0.20

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       931
           1       0.00      0.00      0.00        51

    accuracy                           0.95       982
   macro avg       0.47      0.50      0.49       982
weighted avg       0.90      0.95      0.92       982



### Evaluating the model 

In [17]:
# calculating the confusion matrix test_size=0.25
cm = confusion_matrix(y_test, preds)

# create DataFrame from confusion matrix
cm_df = pd.DataFrame(
            cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,929,2
Actual 1,51,0


### TESTING OTHER MODELS

### Building Logistic Regression Model


In [20]:
#separate features
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [21]:
# split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(3681, 10)

In [22]:
# create logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [23]:
# train model using train data
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=200, random_state=1)

### Making predicitons

In [25]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)


In [26]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9576547231270358


In [27]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1176    0]
 [  52    0]]


In [29]:
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       1.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.98      0.50      0.49      1228
weighted avg       0.96      0.96      0.94      1228



In [30]:
#zero_division“warn”, 0 or 1, default=”warn”
#Sets the value to return when there is a zero division. If set to “warn”, this acts as 0, but warnings are also raised.