# Stroke

In [1]:
# Importing Required Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

### Importing dataset 

In [2]:
# Importing the dataset

stroke = pd.read_csv(r"D:\Capstone_project_2\healthcare-dataset-stroke-data.csv")
stroke.shape

(5110, 12)

In [3]:
stroke.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [4]:
# Value counts of the Target Variable
# Here we can see that there is high class imbalance in our target variable

stroke.stroke.value_counts()

No     4861
Yes     249
Name: stroke, dtype: int64

### Missing value imputation 

In [5]:
# Checking the null values in the dataset

stroke.isnull().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [6]:
# We have nulls present in only one column i.e. bmi
# bmi column is of float type so we will fill the nulls by median value.

stroke.bmi = stroke.bmi.fillna(stroke.bmi.median())

In [7]:
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                5110 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   object 
dtypes: float64(3), int64(3), object(6)
memory usage: 479.2+ KB


In [8]:
# We do not want id column while creating a model
# So we will drop the id column

stroke = stroke.iloc[:,1:]

### Converting categorical data into numeric

In [9]:
# Checking the datatypes of ever column in the dataset

stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   object 
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   object 
 5   work_type          5110 non-null   object 
 6   Residence_type     5110 non-null   object 
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   object 
 10  stroke             5110 non-null   object 
dtypes: float64(3), int64(2), object(6)
memory usage: 439.3+ KB


In [10]:
stroke.gender.value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [11]:
stroke.ever_married.value_counts()

Yes    3353
No     1757
Name: ever_married, dtype: int64

In [12]:
stroke.work_type.value_counts()

Private          2925
Self-employed     819
children          687
Govt_job          657
Never_worked       22
Name: work_type, dtype: int64

In [13]:
stroke.Residence_type.value_counts()

Urban    2596
Rural    2514
Name: Residence_type, dtype: int64

In [14]:
stroke.smoking_status.value_counts()

never smoked       1892
Unknown            1544
formerly smoked     885
smokes              789
Name: smoking_status, dtype: int64

In [15]:
stroke.stroke.value_counts()

No     4861
Yes     249
Name: stroke, dtype: int64

In [16]:
# Now we will convert categorical data into numeric

stroke.gender.replace({'Female':0,'Male':1,'Other':2},inplace=True)
stroke.ever_married.replace({'No':0,'Yes':1},inplace=True)
stroke.work_type.replace({'Private':0,'Self-employed':1,'children':2,'Govt_job':3,'Never_worked':4},inplace=True)
stroke.Residence_type.replace({'Urban':0,'Rural':1},inplace=True)
stroke.smoking_status.replace({'never smoked':0,'Unknown':1,'formerly smoked':2,'smokes':3},inplace=True)
stroke.stroke.replace({'No':0, 'Yes':1},inplace=True)

In [17]:
stroke.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             5110 non-null   int64  
 1   age                5110 non-null   float64
 2   hypertension       5110 non-null   int64  
 3   heart_disease      5110 non-null   int64  
 4   ever_married       5110 non-null   int64  
 5   work_type          5110 non-null   int64  
 6   Residence_type     5110 non-null   int64  
 7   avg_glucose_level  5110 non-null   float64
 8   bmi                5110 non-null   float64
 9   smoking_status     5110 non-null   int64  
 10  stroke             5110 non-null   int64  
dtypes: float64(3), int64(8)
memory usage: 439.3 KB


### Splitting the data into train and test 

In [18]:
# Splitting the dataset into train and test 

from sklearn.model_selection import train_test_split
train, test = train_test_split(stroke, test_size=.2,stratify=stroke.stroke)

In [19]:
# value counts of the target variable in train 

train.stroke.value_counts()

0    3889
1     199
Name: stroke, dtype: int64

In [20]:
# Dividing train and test into independent and target variables.
# i.e. train_x, train_y, test_x, test_y

train_x = train.iloc[:,:-1]
train_y = train.iloc[:,-1]

test_x = test.iloc[:,:-1]
test_y = test.iloc[:,-1]

In [21]:
# As we can see that there is a high class imbalance in our dataset
# So we will do over-sampling using SMOTE(Synthetic Minority Over-sampling Technique)

from imblearn.over_sampling import SMOTE
sm = SMOTE()

In [22]:
# We will only do over-sampling on train dataframe only

train_x, train_y = sm.fit_resample(train_x, train_y)

In [23]:
train_y.value_counts()

0    3889
1    3889
Name: stroke, dtype: int64

In [24]:
# We will create a list of classification algorithm for voting classifier

estimators = []

In [25]:
algorithm = []
accuracy = []
Recall0 = []
Recall1 = []

### Model Selection 

#### Decision Tree 

In [26]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [27]:
estimators.append(('DT',dt))

In [28]:
dt.fit(train_x, train_y)

DecisionTreeClassifier()

In [29]:
pred_dt = dt.predict(test_x)
pred_dt

array([0, 1, 1, ..., 0, 0, 0], dtype=int64)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report
tab_dt = confusion_matrix(test_y, pred_dt)
tab_dt

array([[877,  95],
       [ 38,  12]], dtype=int64)

In [31]:
print(classification_report(test_y, pred_dt))

              precision    recall  f1-score   support

           0       0.96      0.90      0.93       972
           1       0.11      0.24      0.15        50

    accuracy                           0.87      1022
   macro avg       0.54      0.57      0.54      1022
weighted avg       0.92      0.87      0.89      1022



In [32]:
acc_dt = tab_dt.diagonal().sum()*100/tab_dt.sum()
recall0_dt = tab_dt[0][0]/(tab_dt[0][1] + tab_dt[0][0])
recall1_dt = tab_dt[1][1]/(tab_dt[1][0] + tab_dt[1][1])

In [33]:
algorithm.append('Decision Tree')
accuracy.append(acc_dt)
Recall0.append(recall0_dt)
Recall1.append(recall1_dt)

--------------------------------------------------------------------------------------------------------------------------

#### Random Forest 

In [34]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [35]:
estimators.append(('RF',rf))

In [36]:
rf.fit(train_x, train_y)

RandomForestClassifier()

In [37]:
pred_rf = rf.predict(test_x)
pred_rf

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [38]:
tab_rf = confusion_matrix(test_y, pred_rf)
tab_rf

array([[919,  53],
       [ 36,  14]], dtype=int64)

In [39]:
print(classification_report(test_y, pred_rf))

              precision    recall  f1-score   support

           0       0.96      0.95      0.95       972
           1       0.21      0.28      0.24        50

    accuracy                           0.91      1022
   macro avg       0.59      0.61      0.60      1022
weighted avg       0.93      0.91      0.92      1022



In [40]:
acc_rf = tab_rf.diagonal().sum()*100/tab_rf.sum()
recall0_rf = tab_rf[0][0]/(tab_rf[0][1] + tab_rf[0][0])
recall1_rf = tab_rf[1][1]/(tab_rf[1][0] + tab_rf[1][1])

In [41]:
algorithm.append('Random Forest')
accuracy.append(acc_rf)
Recall0.append(recall0_rf)
Recall1.append(recall1_rf)

-------------------------------------------------------------------------------------------------------------------------

#### KNN

In [42]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [43]:
estimators.append(('KNN',knn))

In [44]:
knn.fit(train_x, train_y)

KNeighborsClassifier()

In [45]:
pred_knn = knn.predict(test_x)
pred_knn

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [46]:
tab_knn = confusion_matrix(test_y, pred_knn)
tab_knn

array([[787, 185],
       [ 22,  28]], dtype=int64)

In [47]:
print(classification_report(test_y, pred_knn))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       972
           1       0.13      0.56      0.21        50

    accuracy                           0.80      1022
   macro avg       0.55      0.68      0.55      1022
weighted avg       0.93      0.80      0.85      1022



In [48]:
acc_knn = tab_knn.diagonal().sum()*100/tab_knn.sum()
recall0_knn = tab_knn[0][0]/(tab_knn[0][1] + tab_knn[0][0])
recall1_knn = tab_knn[1][1]/(tab_knn[1][0] + tab_knn[1][1])

In [49]:
algorithm.append('KNN')
accuracy.append(acc_knn)
Recall0.append(recall0_knn)
Recall1.append(recall1_knn)

--------------------------------------------------------------------------------------------------------------------------

#### SVM 

In [50]:
from sklearn.svm import SVC
svm = SVC()

In [51]:
estimators.append(('SVM',svm))

In [52]:
svm.fit(train_x, train_y)

SVC()

In [53]:
pred_svm = svm.predict(test_x)
pred_svm

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [54]:
tab_svm = confusion_matrix(test_y, pred_svm)
tab_svm

array([[712, 260],
       [ 10,  40]], dtype=int64)

In [55]:
print(classification_report(test_y,pred_svm))

              precision    recall  f1-score   support

           0       0.99      0.73      0.84       972
           1       0.13      0.80      0.23        50

    accuracy                           0.74      1022
   macro avg       0.56      0.77      0.53      1022
weighted avg       0.94      0.74      0.81      1022



In [56]:
acc_svm = tab_svm.diagonal().sum()*100/tab_svm.sum()
recall0_svm = tab_svm[0][0]/(tab_svm[0][1] + tab_svm[0][0])
recall1_svm = tab_svm[1][1]/(tab_svm[1][0] + tab_svm[1][1])

In [57]:
algorithm.append('SVM')
accuracy.append(acc_svm)
Recall0.append(recall0_svm)
Recall1.append(recall1_svm)

---------------------------------------------------------------------------------------------------------------------------

#### Logistic Regression 

In [58]:
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()

In [59]:
estimators.append(('LR',lg))

In [60]:
lg.fit(train_x, train_y)

pred_lg = lg.predict(test_x)
pred_lg

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [61]:
tab_lg = confusion_matrix(test_y, pred_lg)
tab_lg

array([[785, 187],
       [ 24,  26]], dtype=int64)

In [62]:
print(classification_report(test_y, pred_lg))

              precision    recall  f1-score   support

           0       0.97      0.81      0.88       972
           1       0.12      0.52      0.20        50

    accuracy                           0.79      1022
   macro avg       0.55      0.66      0.54      1022
weighted avg       0.93      0.79      0.85      1022



In [63]:
acc_lg = tab_lg.diagonal().sum()*100/tab_lg.sum()
recall0_lg = tab_lg[0][0]/(tab_lg[0][1] + tab_lg[0][0])
recall1_lg = tab_lg[1][1]/(tab_lg[1][0] + tab_lg[1][1])

In [64]:
algorithm.append('Logistic Regression')
accuracy.append(acc_lg)
Recall0.append(recall0_lg)
Recall1.append(recall1_lg)

--------------------------------------------------------------------------------------------------------------------------

#### Voting Classifier 

In [65]:
from sklearn.ensemble import VotingClassifier
vc= VotingClassifier(estimators=estimators)

In [66]:
vc.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


VotingClassifier(estimators=[('DT', DecisionTreeClassifier()),
                             ('RF', RandomForestClassifier()),
                             ('KNN', KNeighborsClassifier()), ('SVM', SVC()),
                             ('LR', LogisticRegression())])

In [67]:
pred_vc = vc.predict(test_x)
pred_vc

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [68]:
tab_vc = confusion_matrix(test_y, pred_vc)
tab_vc

array([[848, 124],
       [ 28,  22]], dtype=int64)

In [69]:
print(classification_report(test_y, pred_vc))

              precision    recall  f1-score   support

           0       0.97      0.87      0.92       972
           1       0.15      0.44      0.22        50

    accuracy                           0.85      1022
   macro avg       0.56      0.66      0.57      1022
weighted avg       0.93      0.85      0.88      1022



In [70]:
acc_vc = tab_vc.diagonal().sum()*100/tab_vc.sum()
recall0_vc = tab_vc[0][0]/(tab_vc[0][1] + tab_vc[0][0])
recall1_vc = tab_vc[1][1]/(tab_vc[1][0] + tab_vc[1][1])

In [71]:
algorithm.append('Voting Classifier')
accuracy.append(acc_vc)
Recall0.append(recall0_vc)
Recall1.append(recall1_vc)

### Model Evaluation

In [72]:
Eval = pd.DataFrame({'Algorithms':algorithm, 'Accuracy':accuracy, 'Recall 0':Recall0, 'Recall 1':Recall1})
Eval

Unnamed: 0,Algorithms,Accuracy,Recall 0,Recall 1
0,Decision Tree,86.986301,0.902263,0.24
1,Random Forest,91.291585,0.945473,0.28
2,KNN,79.745597,0.809671,0.56
3,SVM,73.581213,0.73251,0.8
4,Logistic Regression,79.354207,0.807613,0.52
5,Voting Classifier,85.127202,0.872428,0.44


# * **We can see that all the algorithms are giving good accuracy but their recall for class 1 is very low except for SVM**
# * **And in this problem statement we want model who is giving good value of Recall 1**
# * **Here, SVM is giving good values of Recall 0 as well as Recall 1**