In [2]:
import pandas as pd
import numpy as np
# Read the heart-disease table from disk and preview its first rows.
# NOTE(review): the preview shows blank 'Alcohol Intake' entries. If the raw
# file stores the literal text "None" there, pandas' default NA parsing will
# have turned it into NaN -- confirm against the CSV before imputing.
csv_path = 'data/heart_disease_dataset.csv'
data = pd.read_csv(csv_path)
df = pd.DataFrame(data)
df.head()

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [4]:
# Separate the predictors (X) from the 0/1 'Heart Disease' label (y).
target_column = 'Heart Disease'
y = df[target_column]
X = df.drop(columns=[target_column])

In [5]:
# Preview the first few target labels to confirm they are integer-coded.
y.head()

0    1
1    0
2    1
3    0
4    0
Name: Heart Disease, dtype: int64

In [6]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, everything else as numerical.
is_object_column = X.dtypes == 'object'
categorical_cols = X.columns[is_object_column]
numerical_cols = X.columns[~is_object_column]

In [7]:
# Display both column groups to verify the dtype-based partition.
categorical_cols,numerical_cols

(Index(['Gender', 'Smoking', 'Alcohol Intake', 'Family History', 'Diabetes',
        'Obesity', 'Exercise Induced Angina', 'Chest Pain Type'],
       dtype='object'),
 Index(['Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Exercise Hours',
        'Stress Level', 'Blood Sugar'],
       dtype='object'))

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder,StandardScaler

In [9]:
# Numerical features: fill missing values with the column mean, then
# standardise to zero mean / unit variance.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill missing values with the most frequent label,
# map labels to integer codes, then standardise those codes.
# NOTE(review): 'Alcohol Intake' has blank entries in the data preview; if
# those are really a "None" category parsed as NaN, most-frequent imputation
# relabels them as another category -- confirm against the raw CSV.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # A category that was not present when the encoder was fitted is encoded
    # as -1 instead of raising during transform on held-out/new data.
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1)),
    ('scaler', StandardScaler()),
])

# Route each column group through its own pipeline. The transformer names
# become the prefixes of the output feature names.
preprocessor = ColumnTransformer(transformers=[
    ('numerical_pipeline', numerical_pipeline, numerical_cols),
    ('categorical_pipeline', categorical_pipeline, categorical_cols),
])

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Fit the preprocessor on the training rows only, then apply the fitted
# transformation to both splits so no test-set statistics leak into training.
# NOTE: this cell overwrites X_train/X_test with the transformed frames, so it
# must be preceded by the train/test split cell whenever it is re-run.
train_matrix = preprocessor.fit_transform(X_train)
test_matrix = preprocessor.transform(X_test)
feature_names = preprocessor.get_feature_names_out()

# Keep the original row index so the transformed features stay aligned with
# y_train / y_test in any later index-based pandas operation.
X_train = pd.DataFrame(train_matrix, columns=feature_names, index=X_train.index)
X_test = pd.DataFrame(test_matrix, columns=feature_names, index=X_test.index)

In [12]:
# Preview the transformed training features (scaled numerics + encoded categoricals).
X_train.head()

Unnamed: 0,numerical_pipeline__Age,numerical_pipeline__Cholesterol,numerical_pipeline__Blood Pressure,numerical_pipeline__Heart Rate,numerical_pipeline__Exercise Hours,numerical_pipeline__Stress Level,numerical_pipeline__Blood Sugar,categorical_pipeline__Gender,categorical_pipeline__Smoking,categorical_pipeline__Alcohol Intake,categorical_pipeline__Family History,categorical_pipeline__Diabetes,categorical_pipeline__Obesity,categorical_pipeline__Exercise Induced Angina,categorical_pipeline__Chest Pain Type
0,1.145853,1.403526,1.597409,1.208373,-0.201322,0.065785,-0.665065,-0.994302,1.220841,1.452966,-0.988636,1.031938,1.026054,1.052822,0.46278
1,-1.717189,0.312376,0.73226,-1.343642,1.500677,0.065785,-1.263872,1.005731,-1.217358,-0.688247,-0.988636,1.031938,1.026054,1.052822,-1.346973
2,-0.508349,0.797331,-1.37419,-1.431643,0.479477,0.422758,1.349105,-0.994302,1.220841,-0.688247,1.011495,1.031938,-0.974608,1.052822,-1.346973
3,0.445998,1.178368,-0.396195,-0.375636,-1.222521,0.422758,0.015398,-0.994302,1.220841,-0.688247,1.011495,-0.96905,-0.974608,-0.949828,-1.346973
4,0.445998,1.438166,-1.712726,-0.375636,-0.541722,0.065785,1.131357,-0.994302,-1.217358,1.452966,-0.988636,1.031938,-0.974608,-0.949828,-1.346973


In [13]:
# Preview the transformed test features; columns match the training frame.
X_test.head()

Unnamed: 0,numerical_pipeline__Age,numerical_pipeline__Cholesterol,numerical_pipeline__Blood Pressure,numerical_pipeline__Heart Rate,numerical_pipeline__Exercise Hours,numerical_pipeline__Stress Level,numerical_pipeline__Blood Sugar,categorical_pipeline__Gender,categorical_pipeline__Smoking,categorical_pipeline__Alcohol Intake,categorical_pipeline__Family History,categorical_pipeline__Diabetes,categorical_pipeline__Obesity,categorical_pipeline__Exercise Induced Angina,categorical_pipeline__Chest Pain Type
0,1.591215,-0.034021,-1.487035,-0.639638,-0.882122,1.136704,-1.046124,-0.994302,-1.217358,1.452966,-0.988636,1.031938,1.026054,-0.949828,-1.346973
1,-0.635595,0.745372,0.280878,0.064366,0.479477,1.493677,-0.229569,1.005731,1.220841,-0.688247,1.011495,-0.96905,1.026054,-0.949828,0.46278
2,-0.635595,-0.761455,-0.320965,1.560375,-0.882122,-0.648161,0.750298,-0.994302,-1.217358,-0.688247,1.011495,1.031938,1.026054,-0.949828,1.367656
3,-0.762842,0.658773,-1.298959,0.416368,1.500677,0.422758,0.478113,-0.994302,1.220841,-0.688247,1.011495,1.031938,-0.974608,1.052822,0.46278
4,1.654838,-0.640216,-0.621886,1.736376,-0.882122,-1.005134,1.131357,-0.994302,-1.217358,-0.688247,-0.988636,-0.96905,1.026054,-0.949828,0.46278


In [14]:
#model training
from sklearn.linear_model import LogisticRegression,RidgeClassifier,Perceptron
from sklearn.svm import SVC, LinearSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [15]:
# Seed shared by every estimator that uses randomness, so repeated runs of
# this cell report the same metrics.
SEED = 42

# Candidate classifiers, keyed by the label used in the printed report.
models = {
    'LogisticRegression': LogisticRegression(),
    'RidgeClassifier': RidgeClassifier(),
    'Perceptron': Perceptron(),
    'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=SEED),
    'RandomForestClassifier': RandomForestClassifier(random_state=SEED),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=SEED),
    'AdaBoostClassifier': AdaBoostClassifier(random_state=SEED),
    'BaggingClassifier': BaggingClassifier(random_state=SEED),
    'XGBClassifier': XGBClassifier(random_state=SEED),
}

# Fit each model on the training split and score it on the held-out split.
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results[model_name] = {
        'model': model,
        'accuracy_score': accuracy_score(y_test, y_pred),
        'classification_report': classification_report(y_test, y_pred),
        'confusion_matrix': confusion_matrix(y_test, y_pred),
    }

# Print the metrics per model. The loop variable is deliberately NOT named
# `results`: rebinding that name would discard the dict of all models and
# leave only the last model's entry available to later cells.
for model_name, model_result in results.items():
    print(f'{model_name}')
    print(f"accuracy_score:{model_result['accuracy_score']}")
    print(f"classification_report:{model_result['classification_report']}")
    print(f"confusion_matrix:{model_result['confusion_matrix']}")
    print('=='*40)



LogisticRegression
accuracy_score:0.8333333333333334
classification_report:              precision    recall  f1-score   support

           0       0.83      0.89      0.86       171
           1       0.84      0.76      0.80       129

    accuracy                           0.83       300
   macro avg       0.83      0.82      0.83       300
weighted avg       0.83      0.83      0.83       300

confusion_matrix:[[152  19]
 [ 31  98]]
RidgeClassifier
accuracy_score:0.8333333333333334
classification_report:              precision    recall  f1-score   support

           0       0.84      0.88      0.86       171
           1       0.83      0.78      0.80       129

    accuracy                           0.83       300
   macro avg       0.83      0.83      0.83       300
weighted avg       0.83      0.83      0.83       300

confusion_matrix:[[150  21]
 [ 29 100]]
Perceptron
accuracy_score:0.7633333333333333
classification_report:              precision    recall  f1-score   suppor

In [4]:
# Class balance of the target: count of rows per 'Heart Disease' label.
# NOTE(review): this cell's execution count (In [4]) duplicates an earlier
# cell, which suggests it was run out of order -- verify with Restart & Run All.
df['Heart Disease'].value_counts()

Heart Disease
0    608
1    392
Name: count, dtype: int64