## Import Libraries

In [1]:
!pip install  xgboost



In [2]:
!pip install lightgbm



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.metrics import accuracy_score,precision_score,f1_score,recall_score,confusion_matrix,classification_report

In [4]:
df = pd.read_csv('Data/heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


## Data Preprocessing

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [6]:
df.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(723)

In [8]:
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434146,0.69561,0.942439,131.611707,246.0,0.149268,0.529756,149.114146,0.336585,1.071512,1.385366,0.754146,2.323902,0.513171
std,9.07229,0.460373,1.029641,17.516718,51.59251,0.356527,0.527878,23.005724,0.472772,1.175053,0.617755,1.030798,0.62066,0.50007
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Feature Selection , Data Splitting & Scaling

In [11]:
x = df.drop(columns='target')
y = df['target']

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### PCA

In [14]:
data = PCA(n_components=0.95,random_state=42)
PCA_X_train = data.fit_transform(X_train_scaled)
PCA_X_test = data.transform(X_test_scaled)

## Ada Boost

In [15]:
weak_learner = DecisionTreeClassifier(max_depth=10)
ada = AdaBoostClassifier(estimator=weak_learner, n_estimators=50, learning_rate=0.01, random_state=42)
ada.fit(PCA_X_train, Y_train)

In [16]:
# Testing Data
y_pred = ada.predict(PCA_X_test)
accuracy_score(Y_test, y_pred)

0.9707317073170731

In [17]:
confusion_matrix(Y_test, y_pred)

array([[102,   0],
       [  6,  97]])

In [18]:
print(classification_report(Y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)']))

                precision    recall  f1-score   support

No Disease (0)       0.94      1.00      0.97       102
   Disease (1)       1.00      0.94      0.97       103

      accuracy                           0.97       205
     macro avg       0.97      0.97      0.97       205
  weighted avg       0.97      0.97      0.97       205



In [20]:
# Training Data
Y_pred = ada.predict(PCA_X_train)
accuracy_score(Y_train, Y_pred)

1.0

In [21]:
confusion_matrix(Y_train, Y_pred)

array([[397,   0],
       [  0, 423]])

In [22]:
print(classification_report(Y_train, Y_pred, target_names=['No Disease (0)', 'Disease (1)']))

                precision    recall  f1-score   support

No Disease (0)       1.00      1.00      1.00       397
   Disease (1)       1.00      1.00      1.00       423

      accuracy                           1.00       820
     macro avg       1.00      1.00      1.00       820
  weighted avg       1.00      1.00      1.00       820



## Gradient Boost

In [23]:
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb.fit(PCA_X_train, Y_train)

In [24]:
# Testing data
y_pred = gb.predict(PCA_X_test)
accuracy_score(Y_test, y_pred)

1.0

In [25]:
confusion_matrix(Y_test, y_pred)

array([[102,   0],
       [  0, 103]])

In [26]:
print(classification_report(Y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)']))

                precision    recall  f1-score   support

No Disease (0)       1.00      1.00      1.00       102
   Disease (1)       1.00      1.00      1.00       103

      accuracy                           1.00       205
     macro avg       1.00      1.00      1.00       205
  weighted avg       1.00      1.00      1.00       205



In [27]:
# Training Data
Y_pred = gb.predict(PCA_X_train)
accuracy_score(Y_train, Y_pred)

1.0

In [28]:
confusion_matrix(Y_train, Y_pred)

array([[397,   0],
       [  0, 423]])

## XG Boost

In [29]:
# xgb boost use lambda and alpha for trees not for linear coefficient like in logistic regression
# it uses logistic  to turn output of trees into probabilities for classification
# eval_metric = evaluation metric ( it tells XGBoost how to measure the model's performance during training and validation)(it not use for training it use for monitoring)
# logloss = (Logistic Loss) it shows how wrong your predicted probabilities are ( if model predicts probabilities close to true values - logloss is small and if far to true values - logloass large)
# Small logloass is Better
xgb_model = xgb.XGBClassifier(
    objective = 'binary:logistic', #(binary : yes or no ; logistic = sigmoid function)
    n_estimators = 100,
    learning_rate = 0.1,
    max_depth = 5,
    reg_lambda = 1.0, # (L2- Ridge Regression (reduces overfitting))
    reg_alpha = 0.0,  # (L1- Lasso Regression (feature selection effect))
    random_state = 42,
    eval_metric = 'logloss'  
)
xgb_model.fit(PCA_X_train,Y_train)

In [30]:
# Testing data
y_pred = xgb_model.predict(PCA_X_test)
accuracy_score(Y_test, y_pred)

0.9853658536585366

In [31]:
confusion_matrix(Y_test, y_pred)

array([[102,   0],
       [  3, 100]])

In [32]:
print(classification_report(Y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)']))

                precision    recall  f1-score   support

No Disease (0)       0.97      1.00      0.99       102
   Disease (1)       1.00      0.97      0.99       103

      accuracy                           0.99       205
     macro avg       0.99      0.99      0.99       205
  weighted avg       0.99      0.99      0.99       205



In [33]:
# Training Data
Y_pred = xgb_model.predict(PCA_X_train)
accuracy_score(Y_train, Y_pred)

1.0

In [34]:
confusion_matrix(Y_train, Y_pred)

array([[397,   0],
       [  0, 423]])

## Light GBM

In [36]:
# It also use logistic but internally so we don't need to write it 
lgb_model = lgb.LGBMClassifier(
    objective='binary',
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=7,
    reg_lambda=1.0,
    reg_alpha=0.0,
    random_state=42,
    # metric = 'binary_logloss' (you can also do this)
)
lgb_model.fit(PCA_X_train,Y_train)

[LightGBM] [Info] Number of positive: 423, number of negative: 397
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000465 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2928
[LightGBM] [Info] Number of data points in the train set: 820, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.515854 -> initscore=0.063436
[LightGBM] [Info] Start training from score 0.063436


In [37]:
# Testing data
y_pred = lgb_model.predict(PCA_X_test)
accuracy_score(Y_test, y_pred)



0.9707317073170731

In [38]:
confusion_matrix(Y_test, y_pred)

array([[ 99,   3],
       [  3, 100]])

In [135]:
print(classification_report(Y_test, y_pred, target_names=['No Disease (0)', 'Disease (1)']))

                precision    recall  f1-score   support

No Disease (0)       0.93      0.99      0.96       102
   Disease (1)       0.99      0.92      0.95       103

      accuracy                           0.96       205
     macro avg       0.96      0.96      0.96       205
  weighted avg       0.96      0.96      0.96       205



In [39]:
# Training Data
Y_pred = lgb_model.predict(PCA_X_train)
accuracy_score(Y_train, Y_pred)



0.998780487804878

In [40]:
confusion_matrix(Y_train, Y_pred)

array([[396,   1],
       [  0, 423]])