In [6]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN



In [15]:
# load data
random_state=123
data = pd.read_csv("../Dataset/cleaned_hypertension_data.csv")
data

Unnamed: 0,Class,Level_of_Hemoglobin,Age,BMI,Sex,Smoking,Physical_activity,salt_content_in_the_diet,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders,Genetic_Pedigree_Coefficient,alcohol_consumption_per_day
0,1,11.28,34,23,1,0,45961,48071,2,1,1,0.90,336.333333
1,0,9.75,54,33,1,0,26106,25333,3,0,0,0.23,205.000000
2,1,10.79,70,49,0,0,9995,29465,2,1,0,0.91,67.000000
3,0,11.00,71,50,0,0,10635,7439,1,1,0,0.43,242.000000
4,1,14.17,52,19,0,0,15619,49644,2,0,0,0.83,397.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,10.14,69,26,1,1,26118,47568,3,1,0,0.02,144.000000
1996,1,11.77,24,45,1,1,2572,8063,3,1,1,1.00,299.666667
1997,1,16.91,18,42,0,0,14933,24753,2,1,1,0.22,369.000000
1998,0,11.15,46,45,1,1,18157,15275,3,0,1,0.72,253.000000


In [8]:
from imblearn.under_sampling import ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler

def splitting_data(df, sampling):
    X = df.drop(['Class'], axis=1)
    y = df['Class']

    if sampling == 'none':
        return X, y
    elif sampling == 'SMOTEENN':
        sampler = SMOTEENN(random_state=0)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=0)
    elif sampling == 'under':
        sampler = RandomUnderSampler(random_state=0)
    elif sampling == 'over':
        sampler = RandomOverSampler(random_state=0)
    elif sampling == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=0)
    elif sampling == 'tomek_links':
        sampler = TomekLinks()

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [9]:

def training(X_train, y_train):

    HGB = HistGradientBoostingClassifier()
    # Fit the classifier to the data
    HGB.fit(X_train, y_train)
    return HGB

In [10]:
def predict(HGB, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = HGB.predict(X_test)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [11]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def optimize_with_grid(X_train, y_train):

    # Initialize the LGBMClassifier
    HGB = HistGradientBoostingClassifier()
    # Define the parameter grid
    param_grid = {
        'max_iter': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2],
        'min_samples_leaf': [20, 40, 60]
    }

    # Initialize GridSearchCV
    HGB_cv = GridSearchCV(HGB, param_grid, cv=5)

    # Fit the grid search to the data
    HGB_cv.fit(X_train, y_train)

    # Best parameters and best score
    best_params = HGB_cv.best_params_
    best_score = HGB_cv.best_score_
    best_estimator = HGB_cv.best_estimator_
    print(best_params)
    print(best_score)

    return best_estimator

# Original Dataset

In [16]:
# using function with no sampling
X, y= splitting_data(data, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [17]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
0    1013
1     987
Name: Class, dtype: int64


In [18]:
HGB1 = training(X_train, y_train)
y_pred = predict(HGB1, X_test, y_test)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91       191
           1       0.91      0.92      0.92       209

    accuracy                           0.91       400
   macro avg       0.91      0.91      0.91       400
weighted avg       0.91      0.91      0.91       400



In [19]:
best_HGB1 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB1, X_test, y_test)

{'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 100, 'min_samples_leaf': 40}
0.89625
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       191
           1       0.93      0.91      0.92       209

    accuracy                           0.92       400
   macro avg       0.91      0.92      0.91       400
weighted avg       0.92      0.92      0.92       400



# SMOTE for over-sampling

In [21]:
X,y = splitting_data(data, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
1    1013
0    1013
Name: Class, dtype: int64


In [23]:
HGB2 =training(X_train, y_train)
y_pred = predict(HGB2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.87      0.90      0.89       211
           1       0.89      0.86      0.87       195

    accuracy                           0.88       406
   macro avg       0.88      0.88      0.88       406
weighted avg       0.88      0.88      0.88       406



In [24]:
best_HGB2 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB2, X_test, y_test)

{'learning_rate': 0.1, 'max_depth': 3, 'max_iter': 100, 'min_samples_leaf': 60}
0.8925925925925926
              precision    recall  f1-score   support

           0       0.86      0.90      0.88       211
           1       0.89      0.85      0.87       195

    accuracy                           0.87       406
   macro avg       0.88      0.87      0.87       406
weighted avg       0.87      0.87      0.87       406



# HGB using SMOTEENN sampling 

In [25]:
X,y = splitting_data(data, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
1    151
0    137
Name: Class, dtype: int64


In [27]:
HGB3 =training(X_train, y_train)
y_pred = predict(HGB3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.81      0.84      0.82        25
           1       0.88      0.85      0.86        33

    accuracy                           0.84        58
   macro avg       0.84      0.84      0.84        58
weighted avg       0.85      0.84      0.85        58



In [28]:
best_HGB3 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB3, X_test, y_test)

{'learning_rate': 0.01, 'max_depth': 3, 'max_iter': 200, 'min_samples_leaf': 20}
0.8869565217391304
              precision    recall  f1-score   support

           0       0.79      0.88      0.83        25
           1       0.90      0.82      0.86        33

    accuracy                           0.84        58
   macro avg       0.84      0.85      0.84        58
weighted avg       0.85      0.84      0.85        58



# HGB on Random undersampling

In [29]:
X,y = splitting_data(data, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
0    987
1    987
Name: Class, dtype: int64


In [31]:
HGB4 =training(X_train, y_train)
y_pred = predict(HGB4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.87      0.90      0.88       200
           1       0.89      0.86      0.88       195

    accuracy                           0.88       395
   macro avg       0.88      0.88      0.88       395
weighted avg       0.88      0.88      0.88       395



In [32]:
best_HGB4 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB4, X_test, y_test)

KeyboardInterrupt: 

# HGB on Random Oversampling

In [None]:
X,y = splitting_data(data, 'over')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

In [None]:
HGB5 =training(X_train, y_train)
y_pred = predict(HGB5, X_test, y_test)

In [None]:
best_HGB5 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB5, X_test, y_test)

# HGB on Cluster Centroids

In [None]:
X,y = splitting_data(data, 'cluster_centroids')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

In [None]:
HGB6 =training(X_train, y_train)
y_pred = predict(HGB6, X_test, y_test)

In [None]:
best_HGB6 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB6, X_test, y_test)

# HGB on Tomek Links

In [None]:
X,y = splitting_data(data, 'tomek_links')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

In [None]:
HGB7 =training(X_train, y_train)
y_pred = predict(HGB7, X_test, y_test)

In [None]:
best_HGB7 = optimize_with_grid(X_train, y_train)
prediction = predict(best_HGB7, X_test, y_test)