In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np        
import pandas as pd     
import matplotlib.pyplot as plt       

# Machine learning libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
from sklearn.preprocessing import MinMaxScaler

In [3]:
# Download the data

# !wget -O stroke-data.csv https://www.dropbox.com/s/zgburk3yces5tee/healthcare-dataset-stroke-data.csv?dl=0

In [4]:
"""importing the dataset """

dataset = pd.read_csv('stroke-data_2.csv')
dataset

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly_smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never_smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never_smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never_smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never_smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never_smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never_smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly_smoked,0


In [5]:
dataset.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [7]:
# features
feature = dataset[['gender', 'age', 'hypertension', 'heart_disease', 'ever_married', 'work_type', 
                   'Residence_type', 'avg_glucose_level', 'bmi', 'smoking_status']] 

# target variable
target = dataset[['stroke']]

In [8]:
feature.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly_smoked
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never_smoked
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never_smoked
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never_smoked


In [9]:
target.head()

Unnamed: 0,stroke
0,1
1,1
2,1
3,1
4,1


### One Hot Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [11]:
encoder = OneHotEncoder(sparse=False)
# encoded_labels = pd.DataFrame (encoder.fit_transform(feature[['gender', 'ever_married', 'work_type', 
#                    'Residence_type', 'smoking_status']]))


encoded_labels = pd.DataFrame (encoder.fit_transform(feature[['gender', 'ever_married', 'work_type', 
                   'Residence_type']]))

In [12]:
# encoded_labels.columns = encoder.get_feature_names(['gender', 'ever_married', 'work_type', 
#                    'Residence_type', 'smoking_status'])

encoded_labels.columns = encoder.get_feature_names(['gender', 'ever_married', 'work_type', 
                   'Residence_type'])

dataset = pd.concat([feature, encoded_labels ], axis=1)


In [13]:
dataset

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,...,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly_smoked,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never_smoked,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never_smoked,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never_smoked,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never_smoked,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never_smoked,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never_smoked,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly_smoked,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [14]:
new_features = dataset[['age', 'hypertension', 'heart_disease', 'avg_glucose_level', 'gender_Female',
                        'gender_Male', 'gender_Other', 'ever_married_No', 'ever_married_Yes', 'work_type_Govt_job',
                        'work_type_Never_worked', 'work_type_Private', 'work_type_Self-employed', 'work_type_children',
                        'Residence_type_Rural','Residence_type_Urban']]

In [15]:
new_features.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
0,67.0,0,1,228.69,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,61.0,0,0,202.21,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2,80.0,0,1,105.92,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,49.0,0,0,171.23,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,79.0,1,0,174.12,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [16]:
"""Spliting the Dataset into Training Set and Test Set """

X_train,X_test,y_train,y_test=train_test_split(new_features,target,test_size = 0.2,random_state = 0)

In [17]:
print(X_train.shape)
print(X_test.shape)

(4088, 16)
(1022, 16)


In [18]:
X_train.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
3029,36.0,0,0,67.29,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2277,34.0,0,0,83.53,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3002,60.0,0,0,65.16,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
246,75.0,0,0,78.8,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
2825,76.0,0,0,58.65,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
X_test.head()

Unnamed: 0,age,hypertension,heart_disease,avg_glucose_level,gender_Female,gender_Male,gender_Other,ever_married_No,ever_married_Yes,work_type_Govt_job,work_type_Never_worked,work_type_Private,work_type_Self-employed,work_type_children,Residence_type_Rural,Residence_type_Urban
42,82.0,0,1,144.9,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
380,4.0,0,0,106.22,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3524,58.0,0,0,79.95,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
352,20.0,0,0,96.57,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4927,10.0,0,0,69.84,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


In [20]:
from sklearn.linear_model import LogisticRegression

# Fitting Logistic Regression to the training dataset
lr = LogisticRegression()

# training
lr.fit(X_train,y_train)

LogisticRegression()

In [21]:
# prediction / testing

y_pred = lr.predict(X_test)

In [22]:
# Making confusing matrix (actual,prediction)

confusion_matrix(y_test,y_pred,)

array([[968,   0],
       [ 54,   0]], dtype=int64)

In [28]:
# Making confusing matrix
# it is used to check the accuracy of the classification
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       968
           1       0.00      0.00      0.00        54

    accuracy                           0.95      1022
   macro avg       0.47      0.50      0.49      1022
weighted avg       0.90      0.95      0.92      1022

