In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from joblib import dump


In [2]:
random_state=45
best_model = {}

In [3]:
# Read in the data
df = pd.read_csv('../Dataset/cleaned_hypertension_data.csv')
df

Unnamed: 0,Class,Level_of_Hemoglobin,Age,BMI,Sex,Smoking,Physical_activity,salt_content_in_the_diet,Level_of_Stress,Chronic_kidney_disease,Adrenal_and_thyroid_disorders,Genetic_Pedigree_Coefficient,alcohol_consumption_per_day
0,1,11.28,34,23,1,0,45961,48071,2,1,1,0.90,336.333333
1,0,9.75,54,33,1,0,26106,25333,3,0,0,0.23,205.000000
2,1,10.79,70,49,0,0,9995,29465,2,1,0,0.91,67.000000
3,0,11.00,71,50,0,0,10635,7439,1,1,0,0.43,242.000000
4,1,14.17,52,19,0,0,15619,49644,2,0,0,0.83,397.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,1,10.14,69,26,1,1,26118,47568,3,1,0,0.02,144.000000
1996,1,11.77,24,45,1,1,2572,8063,3,1,1,1.00,299.666667
1997,1,16.91,18,42,0,0,14933,24753,2,1,1,0.22,369.000000
1998,0,11.15,46,45,1,1,18157,15275,3,0,1,0.72,253.000000


In [4]:
from imblearn.under_sampling import ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler

def splitting_data(df, sampling):
    X = df.drop(['Class'], axis=1)
    y = df['Class']

    if sampling == 'none':
        return X, y
    elif sampling == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    elif sampling == 'under':
        sampler = RandomUnderSampler(random_state=random_state)
    elif sampling == 'over':
        sampler = RandomOverSampler(random_state=random_state)
    elif sampling == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=random_state)
    elif sampling == 'tomek_links':
        sampler = TomekLinks()

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [5]:
def training(X_train, y_train):
    # Create a KNN classifier with 5 neighbors
    DT = DecisionTreeClassifier(random_state=random_state)
    # Fit the classifier to the data
    DT.fit(X_train, y_train)
    return DT

In [6]:
def best_model(modelName, accuracy, precision, recall, f1):
    best_model[modelName] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [7]:
def predict(modleName, DT, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = DT.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cr=classification_report(y_test, y_pred, output_dict=True)
    precision = cr['weighted avg']['precision']
    recall = cr['weighted avg']['recall']
    f1 = cr['weighted avg']['f1-score']
    best_model(modleName,accuracy,precision,recall,f1)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [8]:




from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def optimize_with_grid(X_train, y_train):
    # Define a pipeline that first scales the data and then applies the classifier
    pipe = Pipeline([
        # ('scaler', StandardScaler()),
        ('dt', DecisionTreeClassifier(random_state=random_state))
    ])

    # Define the parameter grid to search
    param_grid = {
        'dt__max_depth': [None, 10, 20, 30, 40, 50],
        'dt__min_samples_split': [2, 5, 10],
        'dt__min_samples_leaf': [1, 2, 4],
        'dt__criterion': ['gini', 'entropy']
    }

    # Create the GridSearchCV object
    DT_cv = GridSearchCV(pipe,param_grid, cv=5, verbose=1, n_jobs=-1)

    # Perform the grid search on the provided data
    DT_cv.fit(X_train, y_train)

    # Best parameters and best score
    best_params = DT_cv.best_params_
    best_score = DT_cv.best_score_
    best_estimator = DT_cv.best_estimator_
    print(best_params)
    print(best_score)

    return best_estimator


<h1> DT on original data with optimization </h1>

In [9]:
# using function with no sampling 
X, y= splitting_data(df, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
0    1013
1     987
Name: Class, dtype: int64


In [11]:
DT1 =training(X_train, y_train)
y_pred = predict('original',DT1, X_test, y_test)

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       202
           1       0.82      0.82      0.82       198

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400



In [12]:
best_DT1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_DT1, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.8543749999999999
              precision    recall  f1-score   support

           0       0.83      0.88      0.85       202
           1       0.87      0.81      0.84       198

    accuracy                           0.85       400
   macro avg       0.85      0.85      0.85       400
weighted avg       0.85      0.85      0.85       400



<h1> DT using SMOTE sampling </h1>

In [13]:
X,y = splitting_data(df, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
1    1013
0    1013
Name: Class, dtype: int64


In [15]:
DT2 =training(X_train, y_train)
y_pred = predict('SMOTE',DT2, X_test, y_test)

# Assume 'model' is your trained model
dump(DT2, '../Models/DT_SMOTE.joblib')


              precision    recall  f1-score   support

           0       0.79      0.81      0.80       205
           1       0.81      0.78      0.79       201

    accuracy                           0.80       406
   macro avg       0.80      0.80      0.80       406
weighted avg       0.80      0.80      0.80       406



['../Models/DT_SMOTE.joblib']

In [16]:
best_DT2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_DT2, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.8555555555555555
              precision    recall  f1-score   support

           0       0.84      0.87      0.85       205
           1       0.86      0.83      0.84       201

    accuracy                           0.85       406
   macro avg       0.85      0.85      0.85       406
weighted avg       0.85      0.85      0.85       406



<h1> DT using SMOTEENN sampling </h1>

In [17]:
X,y = splitting_data(df, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
1    153
0    137
Name: Class, dtype: int64


In [19]:
DT3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',DT3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.85      0.79      0.81        28
           1       0.81      0.87      0.84        30

    accuracy                           0.83        58
   macro avg       0.83      0.83      0.83        58
weighted avg       0.83      0.83      0.83        58



In [20]:
best_DT3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_DT3, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.7890841813135985
              precision    recall  f1-score   support

           0       0.96      0.82      0.88        28
           1       0.85      0.97      0.91        30

    accuracy                           0.90        58
   macro avg       0.91      0.89      0.90        58
weighted avg       0.90      0.90      0.90        58



<h1> DT on Random undersampling </h1>

In [21]:
X,y = splitting_data(df, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [22]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
0    987
1    987
Name: Class, dtype: int64


In [23]:
DT4 =training(X_train, y_train)
y_pred = predict('undersampling',DT4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       200
           1       0.83      0.75      0.79       195

    accuracy                           0.80       395
   macro avg       0.81      0.80      0.80       395
weighted avg       0.80      0.80      0.80       395



In [24]:
best_DT4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_DT4, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.8613039983926061
              precision    recall  f1-score   support

           0       0.80      0.88      0.84       200
           1       0.86      0.78      0.82       195

    accuracy                           0.83       395
   macro avg       0.83      0.83      0.83       395
weighted avg       0.83      0.83      0.83       395



<h1> DT on Random Oversampling </h1>

In [25]:
X,y = splitting_data(df,'over')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
1    1013
0    1013
Name: Class, dtype: int64


In [27]:
DT5 =training(X_train, y_train)
y_pred = predict('oversampling',DT5, X_test, y_test)

              precision    recall  f1-score   support

           0       0.86      0.82      0.83       211
           1       0.81      0.85      0.83       195

    accuracy                           0.83       406
   macro avg       0.83      0.83      0.83       406
weighted avg       0.83      0.83      0.83       406



In [28]:
best_DT5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_DT5, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 10}
0.8567901234567902
              precision    recall  f1-score   support

           0       0.85      0.86      0.85       211
           1       0.84      0.83      0.84       195

    accuracy                           0.84       406
   macro avg       0.84      0.84      0.84       406
weighted avg       0.84      0.84      0.84       406



<h1> DT on Cluster Centroids </h1>

In [29]:
X,y = splitting_data(df, 'cluster_centroids')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)



In [30]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
0    987
1    987
Name: Class, dtype: int64


In [31]:
DT6 = training(X_train, y_train)
y_pred = predict('cluster_centroids',DT6, X_test, y_test)

              precision    recall  f1-score   support

           0       0.82      0.86      0.84       200
           1       0.85      0.81      0.83       195

    accuracy                           0.84       395
   macro avg       0.84      0.84      0.84       395
weighted avg       0.84      0.84      0.84       395



In [32]:
best_DT6 = optimize_with_grid(X_train, y_train)
prediction = predict('cluster_centroids_grid',best_DT6, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': 10, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 5}
0.8562366887683343
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       200
           1       0.87      0.81      0.84       195

    accuracy                           0.85       395
   macro avg       0.85      0.85      0.85       395
weighted avg       0.85      0.85      0.85       395



<h1> DT on Tomek Links </h1>

In [33]:
X,y = splitting_data(df, 'tomek_links')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
1    987
0    694
Name: Class, dtype: int64


In [35]:
DT7 =training(X_train, y_train)
y_pred = predict('tomek_links',DT7, X_test, y_test)

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       139
           1       0.86      0.86      0.86       198

    accuracy                           0.84       337
   macro avg       0.83      0.83      0.83       337
weighted avg       0.84      0.84      0.84       337



In [36]:
best_DT7 = optimize_with_grid(X_train, y_train)
prediction = predict('tomek_links_grid',best_DT7, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'entropy', 'dt__max_depth': 10, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.8534345003606502
              precision    recall  f1-score   support

           0       0.81      0.89      0.85       139
           1       0.92      0.85      0.88       198

    accuracy                           0.87       337
   macro avg       0.86      0.87      0.86       337
weighted avg       0.87      0.87      0.87       337



In [37]:
best_model

{'original': {'accuracy': 0.825,
  'precision': 0.825,
  'recall': 0.825,
  'f1': 0.825},
 'original_grid': {'accuracy': 0.8475,
  'precision': 0.8488768070395978,
  'recall': 0.8475,
  'f1': 0.8472889782169042},
 'SMOTE': {'accuracy': 0.7980295566502463,
  'precision': 0.7982314139529371,
  'recall': 0.7980295566502463,
  'f1': 0.7979559951542711},
 'SMOTE_grid': {'accuracy': 0.8497536945812808,
  'precision': 0.8503773709773952,
  'recall': 0.8497536945812808,
  'f1': 0.8496469408435314},
 'SMOTEENN': {'accuracy': 0.8275862068965517,
  'precision': 0.8287466843501327,
  'recall': 0.8275862068965517,
  'f1': 0.8271742265068184},
 'SMOTEENN_grid': {'accuracy': 0.896551724137931,
  'precision': 0.9038201487491548,
  'recall': 0.896551724137931,
  'f1': 0.895805702917772},
 'undersampling': {'accuracy': 0.8025316455696202,
  'precision': 0.8048416589738391,
  'recall': 0.8025316455696202,
  'f1': 0.8020058963623469},
 'undersampling_grid': {'accuracy': 0.830379746835443,
  'precision': 0

In [38]:
best_model_df = pd.DataFrame.from_dict(best_model, orient='index')
best_model_df.sort_values(by='accuracy', ascending=False, inplace=True)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
SMOTEENN_grid,0.896552,0.90382,0.896552,0.895806
tomek_links_grid,0.866469,0.871491,0.866469,0.867258
SMOTE_grid,0.849754,0.850377,0.849754,0.849647
original_grid,0.8475,0.848877,0.8475,0.847289
cluster_centroids_grid,0.84557,0.847429,0.84557,0.845272
oversampling_grid,0.844828,0.844812,0.844828,0.844774
tomek_links,0.839763,0.839763,0.839763,0.839763
cluster_centroids,0.835443,0.836047,0.835443,0.83531
oversampling,0.832512,0.833644,0.832512,0.832573
undersampling_grid,0.83038,0.833265,0.83038,0.829882
