In [1]:
# installing package
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from numpy import random
from numpy import mean
from numpy import std
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import warnings
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectFromModel
# importing data

# data import -- google colab
#from google.colab import drive
#drive.mount('/content/drive')
#file_path = '/content/drive/MyDrive/ObesityDataSet_raw_and_data_sinthetic.csv'
#df = pd.read_csv(file_path)
# data import -- jupyter notebook
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
df = df.rename(columns={'family_history_with_overweight': 'FHWO', 'NObeyesdad' : 'Obesity Level',})
cols = df.columns
num_cols = df._get_numeric_data().columns
cat_cols = list(set(cols) - set(num_cols))
for i in cat_cols:
    col_val = sorted(list(set(df[i].tolist())))
    replace_num = []
    for j in range(len(col_val)):
        replace_num.append(j)
    df[i].replace(col_val,replace_num, inplace=True)
outlier_index = [18, 21, 25, 30, 68, 92, 119, 132, 133, 142, 152, 188, 191, 200, 217, 232, 236, 245, 252, 277, 333, 495]
df_remove_outliers = df
df_remove_outliers = df_remove_outliers.drop(outlier_index)
df_remove_outliers = df_remove_outliers.reset_index(drop=True)

columns = ['Height', 'Weight']
df_remove_outliers.drop(columns, inplace=True, axis=1)
df.drop(columns, inplace=True, axis=1)

In [2]:
features = ['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']
scaler = MinMaxScaler()
X = scaler.fit_transform(df[features])

target_name = 'Obesity Level'
y = df[target_name]

In [3]:
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, train_size=0.8, random_state=42)

rfc = RandomForestClassifier()
rfc.fit(train_features, train_labels)

train_score = rfc.score(train_features, train_labels)
test_score = rfc.score(test_features, test_labels)

rfc_pred = rfc.predict(test_features)

cm = confusion_matrix(test_labels,rfc_pred)
mcc = matthews_corrcoef(test_labels, rfc_pred)
report_best = classification_report(test_labels, rfc_pred, digits=4)

print('Random Forest Model(before outliers removal):')
print('Train accuracy:', train_score)
print('Test accuracy:', test_score)
print('Classification Report of Random Forest Classifier : \n', report_best)
print('Confusion Matrix: ')
print(cm)

Random Forest Model(before outliers removal):
Train accuracy: 1.0
Test accuracy: 0.8392434988179669
Classification Report of Random Forest Classifier : 
               precision    recall  f1-score   support

           0     0.8814    0.9286    0.9043        56
           1     0.6716    0.7258    0.6977        62
           2     0.8395    0.8718    0.8553        78
           3     0.8333    0.9483    0.8871        58
           4     1.0000    1.0000    1.0000        63
           5     0.7959    0.6964    0.7429        56
           6     0.8684    0.6600    0.7500        50

    accuracy                         0.8392       423
   macro avg     0.8415    0.8330    0.8339       423
weighted avg     0.8411    0.8392    0.8373       423

Confusion Matrix: 
[[52  2  0  0  0  2  0]
 [ 5 45  4  1  0  4  3]
 [ 0  5 68  3  0  1  1]
 [ 0  3  0 55  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 1  9  5  1  0 39  1]
 [ 1  3  4  6  0  3 33]]


In [4]:
features = ['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']
scaler = MinMaxScaler()
X = scaler.fit_transform(df_remove_outliers[features])

target_name = 'Obesity Level'
y = df_remove_outliers[target_name]

In [5]:
train_features, test_features, train_labels, test_labels = train_test_split(
    X, y, train_size=0.8, random_state=42)

rfc_best = RandomForestClassifier()
rfc_best.fit(train_features, train_labels)

train_score = rfc_best.score(train_features, train_labels)
test_score = rfc_best.score(test_features, test_labels)

rfc_pred = rfc_best.predict(test_features)

cm = confusion_matrix(test_labels,rfc_pred)

report_best = classification_report(test_labels, rfc_pred, digits=4)

print('Random Forest Model (after outliers removal):')
print('Train accuracy:', train_score)
print('Test accuracy:', test_score)
print('Classification Report of Random Forest Classifier : \n', report_best)
print('Confusion Matrix: ')
print(cm)

Random Forest Model (after outliers removal):
Train accuracy: 1.0
Test accuracy: 0.8516746411483254
Classification Report of Random Forest Classifier : 
               precision    recall  f1-score   support

           0     0.9434    0.8333    0.8850        60
           1     0.5970    0.7692    0.6723        52
           2     0.8947    0.7969    0.8430        64
           3     0.9077    0.9365    0.9219        63
           4     1.0000    0.9865    0.9932        74
           5     0.7885    0.7885    0.7885        52
           6     0.8235    0.7925    0.8077        53

    accuracy                         0.8517       418
   macro avg     0.8507    0.8433    0.8445       418
weighted avg     0.8630    0.8517    0.8550       418

Confusion Matrix: 
[[50 10  0  0  0  0  0]
 [ 3 40  1  0  0  5  3]
 [ 0  6 51  2  0  3  2]
 [ 0  3  0 59  0  0  1]
 [ 0  0  0  0 73  0  1]
 [ 0  4  3  2  0 41  2]
 [ 0  4  2  2  0  3 42]]


In [6]:
# Random Forest Grid Search
param_grid = {
    'n_estimators': [100, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [10, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'criterion' : ['gini', 'entropy'],
}

CV_rfc = GridSearchCV(estimator=rfc_best, param_grid=param_grid, cv=5)
CV_rfc.fit(train_features, train_labels)
best_model = CV_rfc.best_estimator_

train_preds = best_model.predict(train_features)
train_acc = accuracy_score(train_labels, train_preds)

test_preds = best_model.predict(test_features)
test_acc = accuracy_score(test_labels, test_preds)

rfc_hyperGS = best_model.predict(test_features)
cm_hyper = confusion_matrix(test_labels,rfc_hyperGS)

report_hyperGS = classification_report(test_labels, rfc_hyperGS, digits=4)

print('Random Forest Model with Grid Search:')
print('Best hyperparameters:', CV_rfc.best_params_)
print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)

print('Classification Report:')
print(report_hyperGS)
print('Confusion Matrix: ')
print(cm_hyper)

Random Forest Model with Grid Search:
Best hyperparameters: {'criterion': 'gini', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
Train accuracy: 1.0
Test accuracy: 0.8660287081339713
Classification Report:
              precision    recall  f1-score   support

           0     0.9434    0.8333    0.8850        60
           1     0.6269    0.8077    0.7059        52
           2     0.9123    0.8125    0.8595        64
           3     0.9365    0.9365    0.9365        63
           4     1.0000    0.9865    0.9932        74
           5     0.8077    0.8077    0.8077        52
           6     0.8302    0.8302    0.8302        53

    accuracy                         0.8660       418
   macro avg     0.8653    0.8592    0.8597       418
weighted avg     0.8770    0.8660    0.8692       418

Confusion Matrix: 
[[50  9  0  0  0  1  0]
 [ 2 42  1  0  0  5  2]
 [ 0  6 52  2  0  2  2]
 [ 0  3  0 59  0  0  1]
 [ 0  0  0  0 73  0

In [7]:
# Random Forest Random Search
param_random = {
    'n_estimators': [int(x) for x in np.linspace(100, 300, num = 10)],
    'max_features': ['sqrt', 'log2', 10, 12, 15],
    'max_depth' : [int(x) for x in np.linspace(10, 50, num = 10)],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1,2,3],
    'criterion' : ['gini','entropy'],
    'bootstrap' : [True,False]
}
random_search = RandomizedSearchCV(estimator=rfc_best,
                                   param_distributions=param_random,
                                   n_iter = 50,
                                   cv=10, 
                                   verbose=True, 
                                   n_jobs = -1)

random_search.fit(train_features, train_labels)
best_model = random_search.best_estimator_
train_acc = best_model.score(train_features, train_labels)
test_acc = best_model.score(test_features, test_labels)
rfc_hyperRS = best_model.predict(test_features)
cm_hyper = confusion_matrix(test_labels,rfc_hyperRS)

report_hyperRS = classification_report(test_labels, rfc_hyperRS, digits=4)

print('Random Forest Model with Random Search:')
print('Best hyperparameters:', random_search.best_params_)
print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)

print('Classification Report:')
print(report_hyperRS)
print('Confusion Matrix: ')
print(cm_hyper)

Fitting 10 folds for each of 50 candidates, totalling 500 fits
Random Forest Model with Random Search:
Best hyperparameters: {'n_estimators': 255, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 45, 'criterion': 'entropy', 'bootstrap': False}
Train accuracy: 1.0
Test accuracy: 0.8708133971291866
Classification Report:
              precision    recall  f1-score   support

           0     0.9608    0.8167    0.8829        60
           1     0.6087    0.8077    0.6942        52
           2     0.9273    0.7969    0.8571        64
           3     0.9524    0.9524    0.9524        63
           4     1.0000    0.9865    0.9932        74
           5     0.7818    0.8269    0.8037        52
           6     0.8846    0.8679    0.8762        53

    accuracy                         0.8708       418
   macro avg     0.8737    0.8650    0.8657       418
weighted avg     0.8856    0.8708    0.8748       418

Confusion Matrix: 
[[49 10  0  0  0  1  0]
 [ 2

In [8]:
# Random Forest Bayes Optimization
warnings.filterwarnings("ignore", category=UserWarning, module="skopt.optimizer.optimizer")
param_grid = {
    'n_estimators': [100, 1000],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [10, 50],
    'min_samples_split': [2, 10],
    'min_samples_leaf': [1, 4],
    'criterion' : ['gini', 'entropy'],
}

opt = BayesSearchCV(
    estimator = rfc_best,
    search_spaces=param_grid,
    n_iter=100,  # Number of evaluations
    cv=5,  # Cross-validation folds
    n_jobs=-1,
)

opt.fit(train_features, train_labels)


best_rfc_model = opt.best_estimator_
train_acc = best_rfc_model.score(train_features, train_labels)
test_preds = best_rfc_model.predict(test_features)
test_acc = accuracy_score(test_labels, test_preds)
report_hyperBO = classification_report(test_labels, test_preds, digits=4)

rfc_hyperBO = best_model.predict(test_features)
cm_hyper = confusion_matrix(test_labels,rfc_hyperBO)

print('Random Forest Model with Bayes Optimization:')
print('Best hyperparameters:', opt.best_params_)
print('Train accuracy:', train_acc)
print('Test accuracy:', test_acc)

print('Classification Report:')
print(report_hyperBO)
print('Confusion Matrix: ')
print(cm_hyper)

Random Forest Model with Bayes Optimization:
Best hyperparameters: OrderedDict([('criterion', 'entropy'), ('max_depth', 39), ('max_features', 'log2'), ('min_samples_leaf', 1), ('min_samples_split', 2), ('n_estimators', 863)])
Train accuracy: 1.0
Test accuracy: 0.8779904306220095
Classification Report:
              precision    recall  f1-score   support

           0     0.9623    0.8500    0.9027        60
           1     0.6515    0.8269    0.7288        52
           2     0.9298    0.8281    0.8760        64
           3     0.9375    0.9524    0.9449        63
           4     1.0000    0.9865    0.9932        74
           5     0.8039    0.7885    0.7961        52
           6     0.8519    0.8679    0.8598        53

    accuracy                         0.8780       418
   macro avg     0.8767    0.8715    0.8716       418
weighted avg     0.8879    0.8780    0.8807       418

Confusion Matrix: 
[[49 10  0  0  0  1  0]
 [ 2 42  1  0  0  5  2]
 [ 0  6 51  2  0  3  2]
 [ 0  3  

In [10]:
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)

outer_results = list()

selected_feature_names = []

for train_ix, test_ix in cv_outer.split(X):
    X_train, X_test = X[train_ix, :], X[test_ix, :]
    y_train, y_test = y[train_ix], y[test_ix]

    cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)
    model = RandomForestClassifier()
    param_grid = {
        'n_estimators': [100, 1000],
        'max_features': ['sqrt', 'log2'],
        'max_depth': [10, 50],
        'min_samples_split': [2, 10],
        'min_samples_leaf': [1, 4],
        'criterion': ['gini', 'entropy'],
    }
    search = GridSearchCV(model, param_grid, scoring='accuracy', cv=cv_inner, refit=True)
    result = search.fit(X_train, y_train)
    best_model = result.best_estimator_

    feature_selector = SelectFromModel(best_model, threshold='median')
    feature_selector.fit(X_train, y_train)
    X_train_selected = feature_selector.transform(X_train)
    X_test_selected = feature_selector.transform(X_test)

    selected_feature_indices = feature_selector.get_support(indices=True)

    selected_features = [features[i] for i in selected_feature_indices if i < len(features)]
    selected_feature_names.append(selected_features)

    best_model.fit(X_train_selected, y_train)
    yhat = best_model.predict(X_test_selected)
    acc = accuracy_score(y_test, yhat)
    outer_results.append(acc)
    print('> accuracy = %f, Best_params = %s' % (acc, result.best_params_))

for fold, features in enumerate(selected_feature_names):
    print(f'Fold {fold+1} - Selected Features: {features}')

print('Accuracy: %.4f (%.4f)' % (mean(outer_results), std(outer_results)))



> accuracy = 0.837321, Best_params = {'criterion': 'gini', 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
> accuracy = 0.775120, Best_params = {'criterion': 'gini', 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
> accuracy = 0.822967, Best_params = {'criterion': 'entropy', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
> accuracy = 0.813397, Best_params = {'criterion': 'entropy', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000}
> accuracy = 0.779904, Best_params = {'criterion': 'entropy', 'max_depth': 50, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
> accuracy = 0.784689, Best_params = {'criterion': 'entropy', 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 