Attribute Information
Age: age of the patient [years]
Sex: sex of the patient [M: Male, F: Female]
ChestPainType: chest pain type [TA: Typical Angina, ATA: Atypical Angina, NAP: Non-Anginal Pain, ASY: Asymptomatic]
RestingBP: resting blood pressure [mm Hg]
Cholesterol: serum cholesterol [mg/dl]
FastingBS: fasting blood sugar [1: if FastingBS > 120 mg/dl, 0: otherwise]
RestingECG: resting electrocardiogram results [Normal: Normal, ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
MaxHR: maximum heart rate achieved [Numeric value between 60 and 202]
ExerciseAngina: exercise-induced angina [Y: Yes, N: No]
Oldpeak: ST depression induced by exercise relative to rest [Numeric value]
ST_Slope: the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]
HeartDisease: output class [1: heart disease, 0: Normal]

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [57]:
# Read the heart-disease dataset from the working directory and preview
# ten randomly chosen rows.
df = pd.read_csv("heart.csv")
df.sample(n=10)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
798,42,F,ASY,102,265,0,LVH,122,N,0.6,Flat,0
426,56,M,ATA,126,166,0,ST,140,N,0.0,Up,0
635,67,M,ASY,120,229,0,LVH,129,Y,2.6,Flat,1
832,51,M,NAP,94,227,0,Normal,154,Y,0.0,Up,0
91,39,M,ASY,130,307,0,Normal,140,N,0.0,Up,0
802,52,M,ASY,108,233,1,Normal,147,N,0.1,Up,0
834,44,M,ATA,120,220,0,Normal,170,N,0.0,Up,0
83,52,M,ATA,160,196,0,Normal,165,N,0.0,Up,0
52,45,M,ATA,140,224,1,Normal,122,N,0.0,Up,0
880,52,M,NAP,172,199,1,Normal,162,N,0.5,Up,0


In [58]:
# Print the schema first, then leave describe() as the cell's last expression.
# A notebook only renders the value of the final expression, so calling
# describe() before info() computed the summary statistics and threw them away.
df.info()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [59]:
# Count the missing values in each column (isnull is an alias of isna).
df.isnull().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

No null values: every column has 0 missing entries.

In [60]:
# Encode Sex as an integer: 1 = male ("M"), 0 = female ("F").
# The guard keeps the cell idempotent: the original lambda, when re-run on the
# already-encoded column, compared integers with 'M' and reset every row to 0.
if not pd.api.types.is_numeric_dtype(df["Sex"]):
    sex_codes = df["Sex"].map({"M": 1, "F": 0})
    # Labels other than "M"/"F" map to NaN; fail loudly instead of silently
    # coding them as female.
    if sex_codes.isna().any():
        raise ValueError("Sex column contains values other than 'M' and 'F'")
    df["Sex"] = sex_codes.astype("int64")

In [61]:
# Show the DataFrame after the Sex encoding step.
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,1,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,1,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,0,ATA,130,236,0,LVH,174,N,0.0,Flat,1


Sex encoding: 1 = male, 0 = female.

In [62]:
# One-hot encode ChestPainType: one integer 0/1 column per category, named
# "ChestPainType_<category>", appended in place of the original text column.
chest_pain_dummies = pd.get_dummies(df["ChestPainType"], prefix="ChestPainType", dtype=int)
df = pd.concat([df.drop(columns="ChestPainType"), chest_pain_dummies], axis=1)

In [63]:
# Show the DataFrame after ChestPainType has been one-hot encoded.
df


Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,1,140,289,0,Normal,172,N,0.0,Up,0,0,1,0,0
1,49,0,160,180,0,Normal,156,N,1.0,Flat,1,0,0,1,0
2,37,1,130,283,0,ST,98,N,0.0,Up,0,0,1,0,0
3,48,0,138,214,0,Normal,108,Y,1.5,Flat,1,1,0,0,0
4,54,1,150,195,0,Normal,122,N,0.0,Up,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,Normal,132,N,1.2,Flat,1,0,0,0,1
914,68,1,144,193,1,Normal,141,N,3.4,Flat,1,1,0,0,0
915,57,1,130,131,0,Normal,115,Y,1.2,Flat,1,1,0,0,0
916,57,0,130,236,0,LVH,174,N,0.0,Flat,1,0,1,0,0


In [64]:
# Preview the RestingECG column before encoding it.
df.loc[:, "RestingECG"]

0      Normal
1      Normal
2          ST
3      Normal
4      Normal
        ...  
913    Normal
914    Normal
915    Normal
916       LVH
917    Normal
Name: RestingECG, Length: 918, dtype: object

In [65]:
# One-hot encode RestingECG: one integer 0/1 column per category, named
# "RestingECG_<category>", appended in place of the original text column.
resting_ecg_dummies = pd.get_dummies(df["RestingECG"], prefix="RestingECG", dtype=int)
df = pd.concat([df.drop(columns="RestingECG"), resting_ecg_dummies], axis=1)

In [66]:
# Show the DataFrame after RestingECG has been one-hot encoded.
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,1,140,289,0,172,N,0.0,Up,0,0,1,0,0,0,1,0
1,49,0,160,180,0,156,N,1.0,Flat,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,N,0.0,Up,0,0,1,0,0,0,0,1
3,48,0,138,214,0,108,Y,1.5,Flat,1,1,0,0,0,0,1,0
4,54,1,150,195,0,122,N,0.0,Up,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,N,1.2,Flat,1,0,0,0,1,0,1,0
914,68,1,144,193,1,141,N,3.4,Flat,1,1,0,0,0,0,1,0
915,57,1,130,131,0,115,Y,1.2,Flat,1,1,0,0,0,0,1,0
916,57,0,130,236,0,174,N,0.0,Flat,1,0,1,0,0,1,0,0


In [67]:
# Preview the ExerciseAngina column before encoding it.
df.loc[:, "ExerciseAngina"]

0      N
1      N
2      N
3      Y
4      N
      ..
913    N
914    N
915    Y
916    N
917    N
Name: ExerciseAngina, Length: 918, dtype: object

In [68]:
# Encode ExerciseAngina as an integer: 1 = yes ("Y"), 0 = no ("N").
# The guard keeps the cell idempotent: the original lambda, when re-run on the
# already-encoded column, compared integers with "Y" and reset every row to 0.
if not pd.api.types.is_numeric_dtype(df["ExerciseAngina"]):
    angina_codes = df["ExerciseAngina"].map({"Y": 1, "N": 0})
    # Labels other than "Y"/"N" map to NaN; fail loudly instead of silently
    # coding them as "no angina".
    if angina_codes.isna().any():
        raise ValueError("ExerciseAngina column contains values other than 'Y' and 'N'")
    df["ExerciseAngina"] = angina_codes.astype("int64")


ExerciseAngina encoding: 1 = Yes, 0 = No.

In [69]:
# Show the DataFrame after the ExerciseAngina encoding step.
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST
0,40,1,140,289,0,172,0,0.0,Up,0,0,1,0,0,0,1,0
1,49,0,160,180,0,156,0,1.0,Flat,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,Up,0,0,1,0,0,0,0,1
3,48,0,138,214,0,108,1,1.5,Flat,1,1,0,0,0,0,1,0
4,54,1,150,195,0,122,0,0.0,Up,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,Flat,1,0,0,0,1,0,1,0
914,68,1,144,193,1,141,0,3.4,Flat,1,1,0,0,0,0,1,0
915,57,1,130,131,0,115,1,1.2,Flat,1,1,0,0,0,0,1,0
916,57,0,130,236,0,174,0,0.0,Flat,1,0,1,0,0,1,0,0


In [70]:
# One-hot encode ST_Slope: one integer 0/1 column per category, named
# "ST_Slope_<category>", appended in place of the original text column.
st_slope_dummies = pd.get_dummies(df["ST_Slope"], prefix="ST_Slope", dtype=int)
df = pd.concat([df.drop(columns="ST_Slope"), st_slope_dummies], axis=1)

In [71]:
# Show the DataFrame after ST_Slope has been one-hot encoded.
df

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ChestPainType_ASY,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_LVH,RestingECG_Normal,RestingECG_ST,ST_Slope_Down,ST_Slope_Flat,ST_Slope_Up
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,0,1,0,0,0,1
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,0,1,0,0,1,0
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,1,0,0,1
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,0,1,0,0,1,0
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,110,264,0,132,0,1.2,1,0,0,0,1,0,1,0,0,1,0
914,68,1,144,193,1,141,0,3.4,1,1,0,0,0,0,1,0,0,1,0
915,57,1,130,131,0,115,1,1.2,1,1,0,0,0,0,1,0,0,1,0
916,57,0,130,236,0,174,0,0.0,1,0,1,0,0,1,0,0,0,1,0


In [72]:
# Print column names, non-null counts and dtypes of the encoded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                918 non-null    int64  
 1   Sex                918 non-null    int64  
 2   RestingBP          918 non-null    int64  
 3   Cholesterol        918 non-null    int64  
 4   FastingBS          918 non-null    int64  
 5   MaxHR              918 non-null    int64  
 6   ExerciseAngina     918 non-null    int64  
 7   Oldpeak            918 non-null    float64
 8   HeartDisease       918 non-null    int64  
 9   ChestPainType_ASY  918 non-null    int32  
 10  ChestPainType_ATA  918 non-null    int32  
 11  ChestPainType_NAP  918 non-null    int32  
 12  ChestPainType_TA   918 non-null    int32  
 13  RestingECG_LVH     918 non-null    int32  
 14  RestingECG_Normal  918 non-null    int32  
 15  RestingECG_ST      918 non-null    int32  
 16  ST_Slope_Down      918 non

NEXT


In [73]:
# sns.pairplot(df,hue="HeartDisease")

Train and test the model 

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
# Separate the label column (HeartDisease) from the predictor columns.
df_target = df["HeartDisease"]
df_features = df.drop(columns=["HeartDisease"])

In [76]:
# Hold out 20% of the rows for testing; random_state is fixed so the same
# shuffle is reused on every run.
trainX, testX, trainY, testY = train_test_split(
    df_features,
    df_target,
    test_size=0.2,
    random_state=2,
)

In [77]:
# Alternative split method (disabled): stratified shuffle split
# from sklearn.model_selection import StratifiedShuffleSplit
# sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2,random_state=2)
# for train_index, test_index in sss.split(df_features,df_target):
#     trainX,testX = df_features.iloc[train_index], df_features.iloc[test_index]
#     trainY,testY = df_target.iloc[train_index], df_target.iloc[test_index]

In [78]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with k = 6.
model = KNeighborsClassifier(n_neighbors=6)

# Fit and predict on the same container type (DataFrames). The original fitted
# on a DataFrame but predicted on np.array(testX), which drops the column names
# and makes scikit-learn warn that X does not have valid feature names.
model.fit(trainX, trainY)
predicted = model.predict(testX)

# Table of true and predicted labels for the test split, built in one step.
result_df = pd.DataFrame({"actual": np.array(testY), "predicted": predicted})




In [79]:
# Mean accuracy of the fitted model on the test split. The DataFrame/Series are
# passed directly: the model was fitted on a DataFrame, and converting testX to
# a bare ndarray drops the column names and triggers scikit-learn's
# feature-name mismatch warning.
acc = model.score(testX, testY)
acc



0.7608695652173914

In [80]:
# KNN test accuracy by split method:
# 0.7010869565217391 - StratifiedShuffleSplit (sss)
# 0.7608695652173914 - train_test_split

We use the train_test_split split.

In [81]:
from sklearn import svm

# Support-vector classifier with scikit-learn's default settings;
# fit() returns the estimator itself, so construction and fitting are chained.
model = svm.SVC().fit(trainX, trainY)
predicted = model.predict(testX)

# Refresh the comparison table with this model's predictions and display it.
result_df = result_df.assign(actual=testY.to_numpy(), predicted=predicted)
result_df

Unnamed: 0,actual,predicted
0,1,1
1,1,1
2,1,1
3,0,0
4,1,1
...,...,...
179,0,1
180,1,1
181,1,0
182,1,1


In [82]:
# Mean accuracy of the current `model` on the held-out test split.
acc=model.score(testX,testY)
acc  # NOTE(review): 0.6630434782608695 was presumably noted from an earlier run; the recorded output of this cell differs — confirm before quoting either number.

0.7336956521739131

In [83]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. random_state is pinned (same
# seed as the train/test split) so the reported accuracy is repeatable; an
# unseeded forest gives a different score on every run.
model = RandomForestClassifier(random_state=2)
model.fit(trainX, trainY)

# Bug fix: this line used to read `redicted=...`, so `predicted` still held the
# previous model's output and the table below showed stale predictions.
predicted = model.predict(testX)

# Refresh the comparison table with this model's predictions and display it.
result_df["actual"] = np.array(testY)
result_df["predicted"] = predicted
result_df

Unnamed: 0,actual,predicted
0,1,1
1,1,1
2,1,1
3,0,0
4,1,1
...,...,...
179,0,1
180,1,1
181,1,0
182,1,1


In [84]:
# Mean accuracy of the current `model` on the held-out test split.
acc=model.score(testX,testY)
acc
# NOTE(review): 0.8478260869565217 was presumably noted from an earlier run; the recorded output of this cell differs — confirm before quoting either number.

0.8586956521739131

In [85]:
# 5-fold cross-validation of the current model on the full feature set,
# reported as mean ± standard deviation of the per-fold test scores.
from sklearn.model_selection import cross_validate

cv_results = cross_validate(model, df_features, df_target, cv=5)
fold_scores = cv_results["test_score"]
mean = f"{fold_scores.mean():.3f} ± {fold_scores.std():.3f}"
print("mean accuracy is ", mean)

mean accuracy is  0.830 ± 0.042
