### Data Exploration

In [351]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [352]:
# read csv file
df = pd.read_csv("./Datasets/healthcare-dataset-stroke-data.csv")


In [353]:
# display dataframe
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [354]:
# drop "id" column
df.drop(['id'], axis=1, inplace=True)

In [355]:
df
## key: 
## gender: 'Male':0, 'Female':1, 'Other':2
## hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
## heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
## ever_married: 0 no, 1 yes
## work_type: 'Private':4, 'Self-employed':3, 'Govt_job':2, 'children':1, 'Never_worked':0
## residence_type: 'Urban': 0, 'Rural':1
## avg_glucose_level: '<70':0, '70-100':1, '101-125':2, '>126':3
## smoking_status: 'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4
## Stroke:  0 = no stroke, 1 = stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [356]:
df.info

<bound method DataFrame.info of       gender   age  hypertension  heart_disease ever_married      work_type  \
0       Male  67.0             0              1          Yes        Private   
1     Female  61.0             0              0          Yes  Self-employed   
2       Male  80.0             0              1          Yes        Private   
3     Female  49.0             0              0          Yes        Private   
4     Female  79.0             1              0          Yes  Self-employed   
...      ...   ...           ...            ...          ...            ...   
5105  Female  80.0             1              0          Yes        Private   
5106  Female  81.0             0              0          Yes  Self-employed   
5107  Female  35.0             0              0          Yes  Self-employed   
5108    Male  51.0             0              0          Yes        Private   
5109  Female  44.0             0              0          Yes       Govt_job   

     Residence_type

In [357]:
# check data type
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [358]:
# check if there are null values
df['bmi'].isnull().values.any()

True

In [359]:
# drop nan values
df = df.dropna(axis=0)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [360]:
# check unique values
df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [361]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'no_work', 'children': 'no_work'}})


In [362]:
# check unique values

df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'no_work'], dtype=object)

In [363]:
# check unique values
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [364]:
# check unique values
df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [365]:
# check unique values
df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [366]:
# check unique values

df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [367]:
# make dicts
work_type_dict = {'Private':3, 'Self-employed':2, 'Govt_job':1, 'no_work':0}
smoke_dict = {'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4}
ever_married_dict = {'Yes':1, 'No':0}
resi_type_dict = {'Urban': 0, 'Rural':1}
gender_dict = {'Male':0, 'Female':1, 'Other':2}

In [368]:
# map series

df['work_type'] = df['work_type'].map(work_type_dict)
df['smoking_status'] = df['smoking_status'].map(smoke_dict)
df['ever_married'] = df['ever_married'].map(ever_married_dict)
df['Residence_type'] = df['Residence_type'].map(resi_type_dict)
df['gender'] = df['gender'].map(gender_dict)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,228.69,36.6,2,1
2,0,80.0,0,1,1,3,1,105.92,32.5,0,1
3,1,49.0,0,0,1,3,0,171.23,34.4,1,1
4,1,79.0,1,0,1,2,1,174.12,24.0,0,1
5,0,81.0,0,0,1,3,0,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,103.08,18.6,4,0
5106,1,81.0,0,0,1,2,0,125.20,40.0,0,0
5107,1,35.0,0,0,1,2,1,82.99,30.6,0,0
5108,0,51.0,0,0,1,3,1,166.29,25.6,2,0


In [388]:
# bin avg_glucose_level
avg_glucose_lvl = df['avg_glucose_level']


In [392]:
glucose_lvl_bins = [0, 69, 100, 126, 272]
glucose_lvl_labels = ['<70','70-100','101-125','>126']
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,>126,36.6,2,1
2,0,80.0,0,1,1,3,1,101-125,32.5,0,1
3,1,49.0,0,0,1,3,0,>126,34.4,1,1
4,1,79.0,1,0,1,2,1,>126,24.0,0,1
5,0,81.0,0,0,1,3,0,>126,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,101-125,18.6,4,0
5106,1,81.0,0,0,1,2,0,101-125,40.0,0,0
5107,1,35.0,0,0,1,2,1,70-100,30.6,0,0
5108,0,51.0,0,0,1,3,1,>126,25.6,2,0


In [391]:
df['avg_glucose_level'] = pd.cut(avg_glucose_lvl, bins=glucose_lvl_bins, labels=glucose_lvl_labels)
df

TypeError: '<' not supported between instances of 'int' and 'str'

In [394]:
glucose_lvl_dict = {'<70':0,'70-100':1,'101-125':2,'>126':3}
df['avg_glucose_level'] = df['avg_glucose_level'].map(glucose_lvl_dict)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,3,36.6,2,1
2,0,80.0,0,1,1,3,1,2,32.5,0,1
3,1,49.0,0,0,1,3,0,3,34.4,1,1
4,1,79.0,1,0,1,2,1,3,24.0,0,1
5,0,81.0,0,0,1,3,0,3,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,2,18.6,4,0
5106,1,81.0,0,0,1,2,0,2,40.0,0,0
5107,1,35.0,0,0,1,2,1,1,30.6,0,0
5108,0,51.0,0,0,1,3,1,3,25.6,2,0


In [395]:
df.dtypes


gender                  int64
age                   float64
hypertension            int64
heart_disease           int64
ever_married            int64
work_type               int64
Residence_type          int64
avg_glucose_level    category
bmi                   float64
smoking_status          int64
stroke                  int64
dtype: object

In [396]:
# how many people did not have a stroke (0) vs how many people had a stroke (1)
df['stroke'].value_counts()

0    4700
1     209
Name: stroke, dtype: int64

In [398]:
# save csv

df.to_csv('stroke_data_cleaned.csv', index=False)

### Building Classification model

In [399]:
from sklearn.ensemble import RandomForestClassifier

In [400]:
model = RandomForestClassifier()

In [401]:
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [402]:
from sklearn.model_selection import train_test_split

In [403]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)


In [404]:
print(f'"X_train" {X_train.shape}')
print(f'"X_test" {X_test.shape}')
print(f'"y_train" {y_train.shape}')
print(f'"y_test" {y_test.shape}')
#85,15 split

"X_train" (3927, 10)
"X_test" (982, 10)
"y_train" (3927,)
"y_test" (982,)


### Training the Model

In [405]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [406]:
preds = model.predict(X_test)

In [407]:
from sklearn.metrics import classification_report

In [348]:
print(classification_report(y_test, preds, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       931
           1       1.00      0.00      0.00        51

    accuracy                           0.95       982
   macro avg       0.97      0.50      0.49       982
weighted avg       0.95      0.95      0.92       982



### Building Logistic Regression Model

In [329]:
#separate features
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [330]:
# split data into train and test
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(3681, 10)

In [331]:
# create logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=200,
                                random_state=1)

In [332]:
# train model using train data
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=200, random_state=1)

### Making predicitons

In [333]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [334]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.9576547231270358


In [335]:
from sklearn.metrics import confusion_matrix, classification_report

In [336]:
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

[[1176    0]
 [  52    0]]


In [337]:
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       0.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.48      0.50      0.49      1228
weighted avg       0.92      0.96      0.94      1228



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [338]:
report = classification_report(y_test, y_pred, zero_division=1)
print(report)

              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1176
           1       1.00      0.00      0.00        52

    accuracy                           0.96      1228
   macro avg       0.98      0.50      0.49      1228
weighted avg       0.96      0.96      0.94      1228



In [None]:
#zero_division“warn”, 0 or 1, default=”warn”
#Sets the value to return when there is a zero division. If set to “warn”, this acts as 0, but warnings are also raised.