In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/heart-failure-prediction/heart.csv


This notebook carries out a comprehensive data-mining project that follows the CRISP-DM cycle, with the aim of predicting heart disease in individuals.
It covers data preparation, transformation, and modeling, and uses stratified K-fold cross-validation with Decision Trees
('gini' and 'entropy' criteria) for rigorous model evaluation. The project is coded and executed in a Jupyter notebook environment.


In [2]:
from sklearn.datasets import load_iris
import numpy as rrr
import matplotlib.pyplot as plt
import pandas as rm
from sklearn.metrics import accuracy_score
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold,GridSearchCV,train_test_split
# Load the heart-failure dataset from the Kaggle input directory into a DataFrame.
# NOTE(review): the name `set` shadows Python's built-in `set` type from this cell onwards.
set=rm.read_csv('/kaggle/input/heart-failure-prediction/heart.csv')

In [3]:
# Preview the first rows of the loaded dataset
set.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
# Inspect the data type pandas inferred for each column
set.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol         int64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
HeartDisease        int64
dtype: object

In [5]:
# (number of rows, number of columns) of the loaded dataset
set.shape

(918, 12)

In [6]:
# Data Preparation
# Flag, per column, whether any value is missing
set.isna().any()

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

In [7]:
# Count the missing values in each column
set.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [8]:
# Drop every row that contains a missing value. The name is rebound to the
# result instead of using inplace=True, which avoids mutating the loaded frame
# in place and keeps the cell safe to re-run.
set=set.dropna()

In [9]:
# Count, per column, the cells whose value is the placeholder string 'Unknown'
set.isin(['Unknown']).sum(axis=0)

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [10]:
# Creating a dataframe for the transformed attributes
# It starts empty; the encoded and scaled columns are added to it in later cells
processing=rm.DataFrame()

In [11]:
# Nominal (categorical) attributes to one-hot encode
nom=['ChestPainType','RestingECG','Sex','ST_Slope']
# Encode every attribute first, then join all indicator frames with a single
# concat: calling concat inside the loop re-copies the growing frame on each pass.
encoded=[rm.get_dummies(set[l]) for l in nom]
processing=rm.concat([processing,*encoded],axis=1)


In [12]:
# Encode the exercise angina attribute as a 0/1 flag: 'Y' becomes 1 and 'N' becomes 0
exercise_a=dict(Y=1,N=0)
processing=processing.assign(ExerciseAngina=set['ExerciseAngina'].map(exercise_a))


In [13]:
# Append the remaining attribute (FastingBS) to the feature frame unchanged,
# then preview the result
remain=set[['FastingBS']].copy()
processing=rm.concat([processing,remain],axis=1)
processing.head()


Unnamed: 0,ASY,ATA,NAP,TA,LVH,Normal,ST,F,M,Down,Flat,Up,ExerciseAngina,FastingBS
0,False,True,False,False,False,True,False,False,True,False,False,True,0,0
1,False,False,True,False,False,True,False,True,False,False,True,False,0,0
2,False,True,False,False,False,False,True,False,True,False,False,True,0,0
3,True,False,False,False,False,True,False,True,False,False,True,False,1,0
4,False,False,True,False,False,True,False,False,True,False,False,True,0,0


In [14]:
#For Ratio attributes
def set_std_scale(item):
    """Z-score standardise one ratio attribute of the dataset.

    Reads column ``item`` from the notebook-level ``set`` frame, subtracts the
    column mean, divides by the column's population standard deviation
    (``ddof=0``), and stores the result as column ``item`` of the
    notebook-level ``processing`` frame.

    Parameters
    ----------
    item
        Label of the column in ``set`` to standardise; the same label is used
        for the column written to ``processing``.

    Returns
    -------
    None
        The function works by side effect on ``processing``.

    Notes
    -----
    A column whose standard deviation is 0 is divided by zero here, so its
    scaled values are not finite numbers.
    """
    column=set[item]
    aveg=column.mean()
    # ddof=0 is stated explicitly: pandas' own default for std() is ddof=1,
    # while the population standard deviation is what is wanted here.
    dev=column.std(ddof=0)
    processing[item]=(column-aveg)/dev
# Standardise each ratio attribute in turn, then report the size of the
# resulting feature frame and preview its first rows
ratio_attributes=['Age','RestingBP','Cholesterol','MaxHR','Oldpeak']
for item in ratio_attributes:
    set_std_scale(item)
print(processing.shape)
processing.head(15)

(918, 19)


Unnamed: 0,ASY,ATA,NAP,TA,LVH,Normal,ST,F,M,Down,Flat,Up,ExerciseAngina,FastingBS,Age,RestingBP,Cholesterol,MaxHR,Oldpeak
0,False,True,False,False,False,True,False,False,True,False,False,True,0,0,-1.43314,0.410909,0.82507,1.382928,-0.832432
1,False,False,True,False,False,True,False,True,False,False,True,False,0,0,-0.478484,1.491752,-0.171961,0.754157,0.105664
2,False,True,False,False,False,False,True,False,True,False,False,True,0,0,-1.751359,-0.129513,0.770188,-1.525138,-0.832432
3,True,False,False,False,False,True,False,True,False,False,True,False,1,0,-0.584556,0.302825,0.13904,-1.132156,0.574711
4,False,False,True,False,False,True,False,False,True,False,False,True,0,0,0.051881,0.951331,-0.034755,-0.581981,-0.832432
5,False,False,True,False,False,True,False,False,True,False,False,True,0,0,-1.539213,-0.669935,1.282424,1.304332,-0.832432
6,False,True,False,False,False,True,False,True,False,False,False,True,0,0,-0.902775,-0.129513,0.349422,1.304332,-0.832432
7,False,True,False,False,False,True,False,False,True,False,False,True,0,0,0.051881,-1.210356,0.084157,0.203982,-0.832432
8,True,False,False,False,False,True,False,False,True,False,True,False,1,0,-1.751359,0.410909,0.07501,-0.267596,0.574711
9,False,True,False,False,False,True,False,True,False,False,False,True,0,0,-0.584556,-0.669935,0.779335,-0.660578,-0.832432


In [15]:
# Data modeling - stratified K-fold cross-validation
# Per-fold accuracy collectors: accuracy_1 for the gini criterion and
# accuracy_2 for the entropy criterion (two independent empty lists)
accuracy_1,accuracy_2=[],[]


In [16]:
#Using the stratified k-fold in built function
rm1=StratifiedKFold(n_splits=5,shuffle=True,random_state=42)
#K is the feature matrix: every transformed attribute held in `processing`
K=processing
#k is the target: the 'HeartDisease' label the trees are trained to predict
k=set['HeartDisease']
#Start from empty lists so that re-running this cell does not append a second
#batch of fold scores to the results of an earlier run
accuracy_1=[]
accuracy_2=[]
for ind_training,ind_testing in rm1.split(K.values,k.values):
    # Split features and target into this fold's training and testing parts
    K_training,K_testing=K.iloc[ind_training],K.iloc[ind_testing]
    k_training,k_testing=k.iloc[ind_training],k.iloc[ind_testing]
    # Decision tree with the gini criterion. max_depth is ten times the number
    # of rows, so depth never limits tree growth in practice. random_state
    # fixes the tree's internal randomness so the scores are repeatable.
    rm2_gini=DecisionTreeClassifier(criterion='gini',max_depth=10*processing.shape[0],random_state=42)
    rm2_gini.fit(K_training,k_training)
    # Accuracy of the gini tree on this fold's held-out rows
    k_forecast=rm2_gini.predict(K_testing)
    accuracy_1.append(accuracy_score(k_testing,k_forecast))
    # Same procedure with the entropy criterion
    rm2_entropy=DecisionTreeClassifier(criterion='entropy',max_depth=10*processing.shape[0],random_state=42)
    rm2_entropy.fit(K_training,k_training)
    k_forecast=rm2_entropy.predict(K_testing)
    # Accuracy of the entropy tree on this fold's held-out rows
    accuracy_2.append(accuracy_score(k_testing,k_forecast))
# Mean accuracy over the folds, computed once per criterion
gini_average=rrr.average(accuracy_1)
entropy_average=rrr.average(accuracy_2)
# Report both means as percentages, naming the better criterion first
if gini_average>entropy_average:
    print('Gini Best_average= ',(gini_average*100))
    print('Entropy average= ',(entropy_average*100))
else:
    print('Entropy best_average= ',(entropy_average*100))
    print('Gini average= ',(gini_average*100))


Entropy best_average=  78.31729626989784
Gini average=  76.46709432169162


In [17]:
#Accuracy of the gini decision tree on each cross-validation fold
accuracy_1

[0.7880434782608695,
 0.7771739130434783,
 0.7608695652173914,
 0.7486338797814208,
 0.7486338797814208]

In [18]:
#Accuracy of the entropy decision tree on each cross-validation fold
accuracy_2

[0.8260869565217391,
 0.8043478260869565,
 0.7663043478260869,
 0.7704918032786885,
 0.7486338797814208]

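Both decision tree classifiers were trained with a maximum depth of 10 * processing.shape[0], which is far above the number of rows and so does not limit tree growth in practice. Under 5-fold stratified cross-validation, the entropy criterion achieved the higher mean accuracy (about 78.3%, versus about 76.5% for gini).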