In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import r2_score

In [25]:
df= pd.read_csv("stroke.csv")
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40.0,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [26]:
df.stroke.value_counts()

0    4861
1     249
Name: stroke, dtype: int64

In [27]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [28]:
df.nunique()

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [29]:
df.isnull().sum().sort_values(ascending=False)

bmi                  201
id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
smoking_status         0
stroke                 0
dtype: int64

In [30]:
df.dtypes

id                     int64
gender                object
age                  float64
hypertension           int64
heart_disease          int64
ever_married          object
work_type             object
Residence_type        object
avg_glucose_level    float64
bmi                  float64
smoking_status        object
stroke                 int64
dtype: object

In [31]:
df['gender'].unique(), df['work_type'].unique(), df['Residence_type'].unique(), df['smoking_status'].unique()

(array(['Male', 'Female', 'Other'], dtype=object),
 array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
       dtype=object),
 array(['Urban', 'Rural'], dtype=object),
 array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
       dtype=object))

In [32]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,4909.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,28.893237,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,7.854067,0.21532
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17741.25,25.0,0.0,0.0,77.245,23.5,0.0
50%,36932.0,45.0,0.0,0.0,91.885,28.1,0.0
75%,54682.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [33]:
df['gender']= df['gender'].astype('category').cat.codes
df['ever_married']= df['ever_married'].astype('category').cat.codes
df['work_type']= df['work_type'].astype('category').cat.codes
df['Residence_type']= df['Residence_type'].astype('category').cat.codes
df['smoking_status']= df['smoking_status'].astype('category').cat.codes

In [34]:
#bins= [0,10,20,30,40,50,60,70,80,90]
#labels= [1,2,3,4,5,6,7,8,9]
#df['binned_age']= pd.cut(df['age'], bins, labels= labels).astype('int')

In [35]:
df['age']= df['age'].astype('int')

In [36]:
df['bmi'].fillna(df['bmi'].mean().round(2), inplace= True)

In [37]:
#bins= [10,20,30,40,50,60,70,80,90,100]
#labels= [2,3,4,5,6,7,8,9,10]
#df['binned_bmi']= pd.cut(df['bmi'], bins, labels= labels).astype('int')

bins= [50, 100, 150, 200, 250,300]
labels= [5,10,15,20,25]
df['binned_avg_glucose_level']= pd.cut(df['avg_glucose_level'], bins, labels= labels).astype('int')

In [38]:
df

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,binned_avg_glucose_level
0,9046,1,67,0,1,1,2,1,228.69,36.60,1,1,20
1,51676,0,61,0,0,1,3,0,202.21,28.89,2,1,20
2,31112,1,80,0,1,1,2,0,105.92,32.50,2,1,10
3,60182,0,49,0,0,1,2,1,171.23,34.40,3,1,15
4,1665,0,79,1,0,1,3,0,174.12,24.00,2,1,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,0,80,1,0,1,2,1,83.75,28.89,2,0,5
5106,44873,0,81,0,0,1,3,1,125.20,40.00,2,0,10
5107,19723,0,35,0,0,1,3,0,82.99,30.60,2,0,5
5108,37544,1,51,0,0,1,2,0,166.29,25.60,1,0,15


In [39]:
#df= df.drop(columns=['age', 'bmi', 'avg_glucose_level','id'])
df= df.drop(columns=['id','avg_glucose_level'])
df

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,bmi,smoking_status,stroke,binned_avg_glucose_level
0,1,67,0,1,1,2,1,36.60,1,1,20
1,0,61,0,0,1,3,0,28.89,2,1,20
2,1,80,0,1,1,2,0,32.50,2,1,10
3,0,49,0,0,1,2,1,34.40,3,1,15
4,0,79,1,0,1,3,0,24.00,2,1,15
...,...,...,...,...,...,...,...,...,...,...,...
5105,0,80,1,0,1,2,1,28.89,2,0,5
5106,0,81,0,0,1,3,1,40.00,2,0,10
5107,0,35,0,0,1,3,0,30.60,2,0,5
5108,1,51,0,0,1,2,0,25.60,1,0,15


In [40]:
Target= df['stroke']
df.drop(columns=['stroke'], axis=1, inplace=True)
X_train, X_test, y_train, y_test = train_test_split(df, Target, test_size=0.25, random_state=20)

In [41]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((3832, 10), (3832,), (1278, 10), (1278,))

In [42]:
classifier =  DecisionTreeClassifier()
classifier.fit(X_train, y_train)

DecisionTreeClassifier()

In [43]:
y_pred = classifier.predict(X_train)

In [47]:
confusion_matrix(y_train, y_pred),r2_score(y_train, y_pred)

(array([[3650,    0],
        [   0,  182]], dtype=int64),
 1.0)

In [45]:
y_pred1 = classifier.predict(X_test)
confusion_matrix(y_test, y_pred1)

array([[1138,   73],
       [  53,   14]], dtype=int64)

In [46]:
r2_score(y_test, y_pred1)

-0.9846432576999395