In [1]:
## importing important libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [2]:
df = pd.read_csv("/content/heart.csv")

In [3]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [5]:
X = df.iloc[:,0:-1]
y = df.iloc[:,-1]

In [6]:
X.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up


In [7]:
y.value_counts()

Unnamed: 0_level_0,count
HeartDisease,Unnamed: 1_level_1
1,508
0,410


In [13]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Create Column Transformer with 4 types of transformers
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer(
    [
         ("OneHotEncoder", oh_transformer, cat_features),
          ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [15]:
preprocessor

In [16]:
## applying Trnsformation in training(fit_transform)
X_train=preprocessor.fit_transform(X_train)

In [17]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.245067,-0.708985,0.372803,1.842609,2.284353,-0.097061
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.886236,-0.166285,0.086146,-0.542709,1.652241,-0.836286
2,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.250993,0.919115,0.123134,1.842609,-0.441628,0.087745
3,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,-1.779375,-0.166285,0.104640,-0.542709,0.229991,-0.836286
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,-0.283314,-0.708985,-1.846478,1.842609,-1.271274,-0.836286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
729,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,-0.603898,-0.708985,0.502261,-0.542709,-1.034232,-0.836286
730,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,-0.924483,-0.708985,0.234098,-0.542709,0.150977,-0.836286
731,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.678439,-0.166285,0.493014,-0.542709,0.309005,0.457358
732,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.678439,1.027656,-1.846478,-0.542709,-0.718176,-0.836286


In [18]:
## apply tansformation on test(transform)
X_test=preprocessor.transform(X_test)

In [19]:
X_test

array([[ 0.        ,  1.        ,  0.        , ..., -0.5427086 ,
         1.69174843, -0.83628643],
       [ 1.        ,  0.        ,  1.        , ..., -0.5427086 ,
        -0.24409275, -0.83628643],
       [ 1.        ,  0.        ,  0.        , ...,  1.84260945,
        -0.56014845,  0.27255158],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  1.84260945,
        -0.79719023,  0.54976108],
       [ 1.        ,  1.        ,  0.        , ..., -0.5427086 ,
         1.37569273, -0.28186743],
       [ 1.        ,  0.        ,  0.        , ..., -0.5427086 ,
        -0.56014845,  0.08774524]])

In [20]:
rf = RandomForestClassifier()
gb = GradientBoostingClassifier()
svm = SVC()
lr = LogisticRegression()

In [21]:
# RandomForest Classifier
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,rf_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,rf_pred))
print("Classification Report : ",classification_report(y_test,rf_pred))

Accuracy Score :  0.8695652173913043
Confusion Matrix :  [[66 11]
 [13 94]]
Classification Report :                precision    recall  f1-score   support

           0       0.84      0.86      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



In [22]:
# Gradient Boosting Classifier
gb.fit(X_train,y_train)
gb_pred = gb.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,gb_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,gb_pred))
print("Classification Report : ",classification_report(y_test,gb_pred))

Accuracy Score :  0.8641304347826086
Confusion Matrix :  [[65 12]
 [13 94]]
Classification Report :                precision    recall  f1-score   support

           0       0.83      0.84      0.84        77
           1       0.89      0.88      0.88       107

    accuracy                           0.86       184
   macro avg       0.86      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



In [23]:
# Support Vector Classifier
svm.fit(X_train,y_train)
svm_pred = svm.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,svm_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,svm_pred))
print("Classification Report : ",classification_report(y_test,svm_pred))

Accuracy Score :  0.8586956521739131
Confusion Matrix :  [[65 12]
 [14 93]]
Classification Report :                precision    recall  f1-score   support

           0       0.82      0.84      0.83        77
           1       0.89      0.87      0.88       107

    accuracy                           0.86       184
   macro avg       0.85      0.86      0.86       184
weighted avg       0.86      0.86      0.86       184



In [24]:
# Logistic Regression
lr.fit(X_train,y_train)
lr_pred = lr.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,lr_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,lr_pred))
print("Classification Report : ",classification_report(y_test,lr_pred))

Accuracy Score :  0.8532608695652174
Confusion Matrix :  [[67 10]
 [17 90]]
Classification Report :                precision    recall  f1-score   support

           0       0.80      0.87      0.83        77
           1       0.90      0.84      0.87       107

    accuracy                           0.85       184
   macro avg       0.85      0.86      0.85       184
weighted avg       0.86      0.85      0.85       184



In [25]:
# Random Forest Classifier -
rf2 = RandomForestClassifier(max_samples=0.75,random_state=42)
rf2.fit(X_train,y_train)
rf2_pred = rf.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,rf2_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,rf2_pred))
print("Classification Report : ",classification_report(y_test,rf2_pred))

Accuracy Score :  0.8695652173913043
Confusion Matrix :  [[66 11]
 [13 94]]
Classification Report :                precision    recall  f1-score   support

           0       0.84      0.86      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184



In [26]:
# cross validation
from sklearn.model_selection import cross_val_score

print(cross_val_score(RandomForestClassifier(max_samples=0.75),X_train,y_train,cv=10,scoring='accuracy'))

[0.89189189 0.89189189 0.89189189 0.89189189 0.82191781 0.84931507
 0.8630137  0.83561644 0.87671233 0.83561644]


### GridSearchCV

In [27]:
# Hyperameter tuning with GridSearchCV
from sklearn.model_selection import GridSearchCV

In [28]:
n_estimators = [20,60,100,120]
max_features = [0.2,0.6,1.0]
max_depth = [2,8,None]
max_samples = [0.5,0.75,1.0]

In [30]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples

}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0]}


In [31]:
rf3 = RandomForestClassifier()

In [32]:
rf_grid = GridSearchCV(estimator=rf3,param_grid=param_grid,cv=10,verbose=True,n_jobs=-1)
rf_grid.fit(X_train,y_train)

Fitting 10 folds for each of 108 candidates, totalling 1080 fits


In [33]:
rf_grid.best_params_

{'max_depth': 8, 'max_features': 0.2, 'max_samples': 1.0, 'n_estimators': 100}

In [34]:
rf4 = RandomForestClassifier(n_estimators=100,max_samples=1.0,max_features=0.2,max_depth=8)
rf4.fit(X_train,y_train)
rf4_pred = rf4.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,rf4_pred))
print("Confusion Matrix : ",confusion_matrix(y_test,rf4_pred))
print("Classification Report : ",classification_report(y_test,rf4_pred))
print("Best Score:",rf_grid.best_score_)


Accuracy Score :  0.8695652173913043
Confusion Matrix :  [[66 11]
 [13 94]]
Classification Report :                precision    recall  f1-score   support

           0       0.84      0.86      0.85        77
           1       0.90      0.88      0.89       107

    accuracy                           0.87       184
   macro avg       0.87      0.87      0.87       184
weighted avg       0.87      0.87      0.87       184

Best Score: 0.875897815623843


### RandomSeachCV

In [35]:
n_estimators = [20,60,100,120]
max_features = [0.2,0.6,1.0]
max_depth = [2,8,None]
max_samples = [0.5,0.75,1.0]
bootstrap = [True,False]
min_samples_leaf = [1,2,3]
min_samples_split = [2,5]

In [36]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'max_samples': max_samples,
    'bootstrap': bootstrap,
    'min_samples_leaf': min_samples_leaf,
    'min_samples_split': min_samples_split

}
print(param_grid)

{'n_estimators': [20, 60, 100, 120], 'max_features': [0.2, 0.6, 1.0], 'max_depth': [2, 8, None], 'max_samples': [0.5, 0.75, 1.0], 'bootstrap': [True, False], 'min_samples_leaf': [1, 2, 3], 'min_samples_split': [2, 5]}


In [37]:
rf5 = RandomForestClassifier()

In [38]:
from sklearn.model_selection import RandomizedSearchCV
rf_rscv = RandomizedSearchCV(estimator=rf5,param_distributions=param_grid,cv=10,verbose=True,n_jobs=-1)
rf_rscv.fit(X_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [39]:
rf_rscv.best_params_

{'n_estimators': 60,
 'min_samples_split': 2,
 'min_samples_leaf': 2,
 'max_samples': 0.75,
 'max_features': 0.6,
 'max_depth': None,
 'bootstrap': True}

In [40]:
rf6 = RandomForestClassifier(n_estimators=60,min_samples_split=2,min_samples_leaf=2,max_samples=0.75,max_features=0.6,max_depth=None,bootstrap=True)
rf6.fit(X_train,y_train)
rf6_pred = rf6.predict(X_test)
print("Accuracy Score : ",accuracy_score(y_test,rf6_pred))

Accuracy Score :  0.8641304347826086
