In [19]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [20]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,roc_auc_score,confusion_matrix



In [21]:
### load dataset
data=pd.read_csv(r"C:\Users\neses\OneDrive\Documents\Heart_Disease_Dataset.csv")

In [22]:
## check top rows
data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,Yes,14.96,Yes,Yes,No,7,6,Yes,Female,25-29,White,Yes,Yes,Fair,17.1,Yes,Yes,Yes
1,Yes,36.93,Yes,Yes,No,3,10,No,Female,30-34,Hispanic,"No, borderline diabetes",Yes,Poor,2.7,Yes,No,No
2,No,18.7,Yes,No,Yes,26,15,No,Male,70-74,Hispanic,No,No,Poor,15.6,Yes,Yes,Yes
3,Yes,31.43,Yes,Yes,Yes,24,20,No,Female,40-44,American Indian/Alaskan Native,"No, borderline diabetes",No,Fair,1.4,No,No,Yes
4,Yes,75.64,No,No,Yes,2,29,No,Male,35-39,White,No,Yes,Excellent,8.8,Yes,Yes,Yes


In [23]:
## check the bottom rows
data.tail()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
100196,No,48.02,No,No,Yes,13,17,No,Female,45-49,White,"No, borderline diabetes",No,Fair,6.4,Yes,No,No
100197,Yes,82.88,No,No,No,29,23,No,Male,75-79,Other,"No, borderline diabetes",Yes,Very good,18.9,Yes,Yes,No
100198,No,39.46,No,Yes,Yes,23,6,Yes,Male,80 or older,Hispanic,Yes,No,Excellent,18.5,Yes,No,Yes
100199,No,46.25,Yes,Yes,Yes,9,14,No,Male,25-29,American Indian/Alaskan Native,No,No,Very good,22.7,Yes,Yes,No
100200,No,39.26,Yes,No,No,10,5,Yes,Female,25-29,Other,"No, borderline diabetes",Yes,Fair,4.1,No,No,Yes


In [24]:
data.shape

(100201, 18)

In [25]:
## information about data
data.info

<bound method DataFrame.info of        HeartDisease    BMI Smoking AlcoholDrinking Stroke  PhysicalHealth  \
0               Yes  14.96     Yes             Yes     No               7   
1               Yes  36.93     Yes             Yes     No               3   
2                No  18.70     Yes              No    Yes              26   
3               Yes  31.43     Yes             Yes    Yes              24   
4               Yes  75.64      No              No    Yes               2   
...             ...    ...     ...             ...    ...             ...   
100196           No  48.02      No              No    Yes              13   
100197          Yes  82.88      No              No     No              29   
100198           No  39.46      No             Yes    Yes              23   
100199           No  46.25     Yes             Yes    Yes               9   
100200           No  39.26     Yes              No     No              10   

        MentalHealth DiffWalking     Sex  A

In [26]:
## statistical analysis
data.describe()

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime
count,100201.0,100201.0,100201.0,100201.0
mean,53.373302,14.503937,14.498239,12.490389
std,23.878712,8.640423,8.658639,6.632122
min,12.02,0.0,0.0,1.0
25%,32.76,7.0,7.0,6.7
50%,53.38,14.0,15.0,12.5
75%,73.95,22.0,22.0,18.2
max,94.85,29.0,29.0,24.0


In [27]:
## missing values
data.isnull().sum()

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [28]:
data.duplicated()

0         False
1         False
2         False
3         False
4         False
          ...  
100196    False
100197    False
100198    False
100199    False
100200    False
Length: 100201, dtype: bool

In [29]:
len(data[data.duplicated()])

0

In [30]:
data.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [31]:
##checking for counting of class in each column
for column in data.columns:
    print(data[column].value_counts())

HeartDisease
Yes    50134
No     50067
Name: count, dtype: int64
BMI
56.38    26
91.47    26
27.95    26
88.49    26
66.39    25
         ..
23.75     3
29.15     3
65.14     3
34.33     2
91.36     2
Name: count, Length: 8284, dtype: int64
Smoking
Yes    50445
No     49756
Name: count, dtype: int64
AlcoholDrinking
Yes    50435
No     49766
Name: count, dtype: int64
Stroke
Yes    50265
No     49936
Name: count, dtype: int64
PhysicalHealth
8     3430
20    3401
9     3395
16    3392
19    3386
13    3385
12    3377
26    3369
29    3363
15    3355
23    3354
22    3347
28    3345
14    3343
2     3341
1     3337
10    3334
11    3332
6     3329
0     3328
17    3327
7     3323
24    3316
5     3305
18    3301
3     3287
21    3285
27    3277
25    3270
4     3267
Name: count, dtype: int64
MentalHealth
27    3442
16    3442
13    3419
5     3403
26    3397
21    3394
22    3390
7     3382
2     3377
8     3376
1     3374
18    3367
14    3362
28    3361
6     3360
3     3357
20    3342
9

In [32]:
data['Diabetic'].value_counts()

Diabetic
Yes (during pregnancy)     25171
No, borderline diabetes    25077
Yes                        25001
No                         24952
Name: count, dtype: int64

In [33]:
data=pd.get_dummies(data,columns=['Diabetic'],drop_first=True)

In [34]:
data.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,Yes,14.96,Yes,Yes,No,7,6,Yes,Female,25-29,White,Yes,Fair,17.1,Yes,Yes,Yes,False,True,False
1,Yes,36.93,Yes,Yes,No,3,10,No,Female,30-34,Hispanic,Yes,Poor,2.7,Yes,No,No,True,False,False
2,No,18.7,Yes,No,Yes,26,15,No,Male,70-74,Hispanic,No,Poor,15.6,Yes,Yes,Yes,False,False,False
3,Yes,31.43,Yes,Yes,Yes,24,20,No,Female,40-44,American Indian/Alaskan Native,No,Fair,1.4,No,No,Yes,True,False,False
4,Yes,75.64,No,No,Yes,2,29,No,Male,35-39,White,Yes,Excellent,8.8,Yes,Yes,Yes,False,False,False


In [37]:
binary_columns=["Smoking","AlcoholDrinking","Stroke","DiffWalking","PhysicalActivity","Asthma","KidneyDisease","SkinCancer"]
for col in binary_columns:
    data[col]=data[col].map({'Yes':1,"No":0})


In [38]:
##applying the get dummies method
data+pd.get_dummies(data,columns=['Sex','AgeCategory','Race','GenHealth'],drop_first=True)
print(data.head())

  HeartDisease    BMI  Smoking  AlcoholDrinking  Stroke  PhysicalHealth  \
0          Yes  14.96      NaN              NaN     NaN               7   
1          Yes  36.93      NaN              NaN     NaN               3   
2           No  18.70      NaN              NaN     NaN              26   
3          Yes  31.43      NaN              NaN     NaN              24   
4          Yes  75.64      NaN              NaN     NaN               2   

   MentalHealth  DiffWalking     Sex AgeCategory  \
0             6          NaN  Female       25-29   
1            10          NaN  Female       30-34   
2            15          NaN    Male       70-74   
3            20          NaN  Female       40-44   
4            29          NaN    Male       35-39   

                             Race  PhysicalActivity  GenHealth  SleepTime  \
0                           White               NaN       Fair       17.1   
1                        Hispanic               NaN       Poor        2.7   
2    

In [41]:
##define the x and target
X=data.drop(columns=['HeartDisease'])
y=data['HeartDisease'].map({'Yes':1,"No":0})

In [42]:
X.head()

Unnamed: 0,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer,"Diabetic_No, borderline diabetes",Diabetic_Yes,Diabetic_Yes (during pregnancy)
0,14.96,,,,7,6,,Female,25-29,White,,Fair,17.1,,,,False,True,False
1,36.93,,,,3,10,,Female,30-34,Hispanic,,Poor,2.7,,,,True,False,False
2,18.7,,,,26,15,,Male,70-74,Hispanic,,Poor,15.6,,,,False,False,False
3,31.43,,,,24,20,,Female,40-44,American Indian/Alaskan Native,,Fair,1.4,,,,True,False,False
4,75.64,,,,2,29,,Male,35-39,White,,Excellent,8.8,,,,False,False,False


In [44]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.20,random_state=42)