### Data Exploration

In [90]:
# import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [61]:
# read csv file
df = pd.read_csv("healthcare-dataset-stroke-data.csv")


In [62]:
# display dataframe
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [63]:
# drop "id" column
df.drop(['id'], axis=1, inplace=True)

In [64]:
df
## key: 
## gender: 'Male':0, 'Female':1, 'Other':2
## hypertension: 0 if the patient doesn't have hypertension, 1 if the patient has hypertension
## heart_disease: 0 if the patient doesn't have any heart diseases, 1 if the patient has a heart disease
## ever_married: 0 no, 1 yes
## work_type: 'Private':4, 'Self-employed':3, 'Govt_job':2, 'children':1, 'Never_worked':0
## residence_type: 'Urban': 0, 'Rural':1
## smoking_status: 'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4
## Stroke:  0 = no stroke, 1 = stroke

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [65]:
df.info

<bound method DataFrame.info of       gender   age  hypertension  heart_disease ever_married      work_type  \
0       Male  67.0             0              1          Yes        Private   
1     Female  61.0             0              0          Yes  Self-employed   
2       Male  80.0             0              1          Yes        Private   
3     Female  49.0             0              0          Yes        Private   
4     Female  79.0             1              0          Yes  Self-employed   
...      ...   ...           ...            ...          ...            ...   
5105  Female  80.0             1              0          Yes        Private   
5106  Female  81.0             0              0          Yes  Self-employed   
5107  Female  35.0             0              0          Yes  Self-employed   
5108    Male  51.0             0              0          Yes        Private   
5109  Female  44.0             0              0          Yes       Govt_job   

     Residence_type

In [66]:
# check data type
df.dtypes

gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [67]:
# check if there are null values
df['bmi'].isnull().values.any()

True

In [68]:
# drop nan values
df = df.dropna(axis=0)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,Unknown,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [69]:
# check unique values
df['smoking_status'].unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [70]:
# make a dict to turn column values into numerical values
target_dict = {'formerly smoked': 2, 'never smoked':0, 'smokes':1, 'Unknown':4}

In [71]:
# map series from target_dict
df['smoking_status'] = df['smoking_status'].map(target_dict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [72]:
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,2,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,0,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,1,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,0,1
5,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,children,Rural,103.08,18.6,4,0
5106,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,0,0
5107,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,0,0
5108,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,2,0


In [73]:
# check unique values

df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [74]:
# merge 'children' and 'Never_worked' categories
df = df.replace({'work_type': {'Never_worked':'no_work', 'children': 'no_work'}})


In [75]:
# check unique values

df['work_type'].unique()

array(['Private', 'Self-employed', 'Govt_job', 'no_work'], dtype=object)

In [76]:
# make a dict
work_type_dict = {'Private':3, 'Self-employed':2, 'Govt_job':1, 'no_work':0}

In [77]:
# map series

df['work_type'] = df['work_type'].map(work_type_dict)
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,3,Urban,228.69,36.6,2,1
2,Male,80.0,0,1,Yes,3,Rural,105.92,32.5,0,1
3,Female,49.0,0,0,Yes,3,Urban,171.23,34.4,1,1
4,Female,79.0,1,0,Yes,2,Rural,174.12,24.0,0,1
5,Male,81.0,0,0,Yes,3,Urban,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,Female,13.0,0,0,No,0,Rural,103.08,18.6,4,0
5106,Female,81.0,0,0,Yes,2,Urban,125.20,40.0,0,0
5107,Female,35.0,0,0,Yes,2,Rural,82.99,30.6,0,0
5108,Male,51.0,0,0,Yes,3,Rural,166.29,25.6,2,0


In [78]:
# check unique values

df['ever_married'].unique()

array(['Yes', 'No'], dtype=object)

In [79]:
# make a dict
ever_married_dict = {'Yes':1, 'No':0}

In [80]:
# map series
df['ever_married'] = df['ever_married'].map(ever_married_dict)

In [81]:
# check unique values

df['Residence_type'].unique()

array(['Urban', 'Rural'], dtype=object)

In [82]:
# make a dict
resi_type_dict = {'Urban': 0, 'Rural':1}

In [83]:
# map series
df['Residence_type'] = df['Residence_type'].map(resi_type_dict)

In [84]:
# check unique values

df['gender'].unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [85]:
# make a dict
gender_dict = {'Male':0, 'Female':1, 'Other':2}

In [86]:
# map series
df['gender'] = df['gender'].map(gender_dict)

In [87]:
df.info

<bound method DataFrame.info of       gender   age  hypertension  heart_disease  ever_married  work_type  \
0          0  67.0             0              1             1          3   
2          0  80.0             0              1             1          3   
3          1  49.0             0              0             1          3   
4          1  79.0             1              0             1          2   
5          0  81.0             0              0             1          3   
...      ...   ...           ...            ...           ...        ...   
5104       1  13.0             0              0             0          0   
5106       1  81.0             0              0             1          2   
5107       1  35.0             0              0             1          2   
5108       0  51.0             0              0             1          3   
5109       1  44.0             0              0             1          1   

      Residence_type  avg_glucose_level   bmi  smoking_

In [88]:
df.apply(pd.to_numeric, errors='ignore')


Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,1,3,0,228.69,36.6,2,1
2,0,80.0,0,1,1,3,1,105.92,32.5,0,1
3,1,49.0,0,0,1,3,0,171.23,34.4,1,1
4,1,79.0,1,0,1,2,1,174.12,24.0,0,1
5,0,81.0,0,0,1,3,0,186.21,29.0,2,1
...,...,...,...,...,...,...,...,...,...,...,...
5104,1,13.0,0,0,0,0,1,103.08,18.6,4,0
5106,1,81.0,0,0,1,2,0,125.20,40.0,0,0
5107,1,35.0,0,0,1,2,1,82.99,30.6,0,0
5108,0,51.0,0,0,1,3,1,166.29,25.6,2,0


In [89]:
df.dtypes


gender                 int64
age                  float64
hypertension           int64
heart_disease          int64
ever_married           int64
work_type              int64
Residence_type         int64
avg_glucose_level    float64
bmi                  float64
smoking_status         int64
stroke                 int64
dtype: object

### Building Classification model

In [91]:
from sklearn.ensemble import RandomForestClassifier

In [92]:
model = RandomForestClassifier()

In [93]:
X = df.drop(['stroke'], axis=1)
y = df['stroke']

In [94]:
from sklearn.model_selection import train_test_split

In [95]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=1)

### Training the Model

In [96]:
model.fit(X_train, y_train)

RandomForestClassifier()

In [97]:
preds = model.predict(X_test)

In [98]:
from sklearn.metrics import classification_report

In [99]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       698
           1       0.00      0.00      0.00        39

    accuracy                           0.95       737
   macro avg       0.47      0.50      0.49       737
weighted avg       0.90      0.95      0.92       737

