In [1]:
# importing packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from numpy import random
from numpy import mean
from numpy import std
from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
import warnings
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.feature_selection import SelectKBest, f_classif
# importing data

# data import -- google colab
#from google.colab import drive
#drive.mount('/content/drive')
#file_path = '/content/drive/MyDrive/ObesityDataSet_raw_and_data_sinthetic.csv'
#df = pd.read_csv(file_path)
# data import -- jupyter notebook
# Load the raw dataset and shorten the two unwieldy column names.
df = pd.read_csv("ObesityDataSet_raw_and_data_sinthetic.csv")
df = df.rename(columns={'family_history_with_overweight': 'FHWO', 'NObeyesdad': 'Obesity Level'})

# Split the columns into numeric and categorical ones using the public dtype
# API (bool is included so the split matches what the private
# `_get_numeric_data()` helper treated as numeric).
cols = df.columns
num_cols = df.select_dtypes(include=['number', 'bool']).columns
cat_cols = [c for c in cols if c not in num_cols]  # keeps column order -> deterministic

# Ordinal-encode every categorical column: the sorted unique values of the
# column are mapped to 0, 1, 2, ...
# The result is assigned back to the frame instead of calling
# `df[i].replace(..., inplace=True)`; that chained in-place form operates on a
# temporary under pandas copy-on-write and would leave `df` unchanged.
for i in cat_cols:
    col_val = sorted(df[i].unique())
    replace_num = list(range(len(col_val)))
    df[i] = df[i].map(dict(zip(col_val, replace_num)))

# Row labels to exclude as outliers.
# NOTE(review): the origin of this list is not shown in this notebook section --
# TODO document how these rows were identified.
outlier_index = [18, 21, 25, 30, 68, 92, 119, 132, 133, 142, 152, 188, 191, 200, 217, 232, 236, 245, 252, 277, 333, 495]

# Build the outlier-free frame as a new object (no aliasing of `df`) and
# renumber its rows 0..n-1 so positional and label indexing agree.
df_remove_outliers = df.drop(index=outlier_index).reset_index(drop=True)

# Height and Weight are removed from both frames. Reassigning (rather than
# dropping in place) keeps the cell safe to reason about step by step.
columns = ['Height', 'Weight']
df_remove_outliers = df_remove_outliers.drop(columns=columns)
df = df.drop(columns=columns)

In [2]:
# Predictor columns and target column for the full (outliers included) dataset.
features = ['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']
target_name = 'Obesity Level'

# Keep the feature matrix unscaled here. The scaler is fitted after the
# train/test split, on the training rows only; fitting a MinMaxScaler on the
# whole dataset first would let test-set minima/maxima leak into preprocessing
# and is redundant with the train-only scaler (min-max scaling is affine, so a
# second min-max fit fully determines the final values).
X = df[features].to_numpy()
y = df[target_name]

In [3]:
# 80/20 hold-out split with a fixed seed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42
)

# Learn the min-max range from the training rows only, then apply that same
# transformation to both splits.
mm = MinMaxScaler().fit(X_train)
X_train_mm_scaled = mm.transform(X_train)
X_test_mm_scaled = mm.transform(X_test)

# Baseline k-nearest-neighbours classifier with default hyper-parameters.
knn_best = KNeighborsClassifier()
clf = knn_best.fit(X_train_mm_scaled, y_train)

# Predictions on the held-out rows and the accuracy figures derived from them.
y_pred = clf.predict(X_test_mm_scaled)
accuracy = round(accuracy_score(y_test, y_pred), 5)
train_score = clf.score(X_train_mm_scaled, y_train)
test_score = clf.score(X_test_mm_scaled, y_test)

print('KNN Model (before outliers removal):')
print('Train accuracy:', train_score)
print('Test accuracy:', test_score)
print("Classification Report: \n", classification_report(y_test, y_pred, digits=4))

KNN Model (before outliers removal):
Train accuracy: 0.816350710900474
Test accuracy: 0.6903073286052009
Classification Report: 
               precision    recall  f1-score   support

           0     0.6267    0.8393    0.7176        56
           1     0.5294    0.2903    0.3750        62
           2     0.6860    0.7564    0.7195        78
           3     0.6806    0.8448    0.7538        58
           4     0.9545    1.0000    0.9767        63
           5     0.7250    0.5179    0.6042        56
           6     0.5400    0.5400    0.5400        50

    accuracy                         0.6903       423
   macro avg     0.6775    0.6841    0.6695       423
weighted avg     0.6824    0.6903    0.6753       423



In [4]:
# Predictor columns and target column, now taken from the outlier-free frame.
target_name = 'Obesity Level'
features = ['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS']

# Min-max scale every feature using the range observed over all rows of the
# outlier-free frame; X is the resulting NumPy array.
scaler = MinMaxScaler()
scaler.fit(df_remove_outliers[features])
X = scaler.transform(df_remove_outliers[features])

# Target labels, kept as a pandas Series.
y = df_remove_outliers[target_name]

In [5]:
# 80/20 hold-out split with a fixed seed so the reported numbers are reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

# Refit the min-max scaler on the training rows only, mirroring the
# "before outliers removal" experiment so the two results are comparable.
# X was already min-max scaled using every row (test rows included); because
# min-max scaling is affine, refitting on the training split makes the final
# feature values depend on training-set statistics alone.
mm = MinMaxScaler()
X_train_mm_scaled = mm.fit_transform(X_train)
X_test_mm_scaled = mm.transform(X_test)

# Baseline k-nearest-neighbours classifier with default hyper-parameters.
knn_best = KNeighborsClassifier()

clf = knn_best.fit(X_train_mm_scaled, y_train)
y_pred = clf.predict(X_test_mm_scaled)
accuracy = round(metrics.accuracy_score(y_test, y_pred), 5)
train_score = clf.score(X_train_mm_scaled, y_train)
test_score = clf.score(X_test_mm_scaled, y_test)

print('KNN Model (after outliers removal):')
print('Train accuracy:', train_score)
print('Test accuracy:', test_score)
print("Classification Report: \n", metrics.classification_report(y_test, y_pred, digits=4))

KNN Model (after outliers removal):
Train accuracy: 0.8132854578096947
Test accuracy: 0.7679425837320574
Classification Report: 
               precision    recall  f1-score   support

           0     0.7260    0.8833    0.7970        60
           1     0.5758    0.3654    0.4471        52
           2     0.6667    0.7500    0.7059        64
           3     0.8406    0.9206    0.8788        63
           4     0.9733    0.9865    0.9799        74
           5     0.6923    0.6923    0.6923        52
           6     0.7727    0.6415    0.7010        53

    accuracy                         0.7679       418
   macro avg     0.7496    0.7485    0.7431       418
weighted avg     0.7610    0.7679    0.7590       418



In [6]:
# KNN LOOCV
# For every candidate k, estimate accuracy with leave-one-out cross-validation
# (one model fit per sample) and keep the k with the highest mean accuracy.
k_values = range(1, 15)

best_score = 0
best_k = None

loo = LeaveOneOut()

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)

    # One 0/1 accuracy value per left-out sample.
    scores = []

    for train_ix, test_ix in loo.split(X):
        X_train, X_test = X[train_ix], X[test_ix]
        # The splitter yields positions, so index the Series positionally.
        # Plain `y[train_ix]` is label-based and is only correct while the
        # Series index happens to be exactly 0..n-1.
        y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        scores.append(accuracy)

    average_accuracy = np.mean(scores)

    # Update best_score and best_k if this model is better
    # (strict '>' means the smallest k wins ties).
    if average_accuracy > best_score:
        best_score = average_accuracy
        best_k = k
print('KNN Model with LOOCV:')
print(f"Accuracy: {best_score:.4f}")
print(f"Best k: {best_k}")

KNN Model with LOOCV:
Accuracy: 0.7779
Best k: 1


In [7]:
# Nested cross-validation for KNN.
#   outer loop (10 folds): unbiased estimate of generalisation accuracy
#   inner loop (3 folds):  grid search over the KNN hyper-parameters
cv_outer = KFold(n_splits=10, shuffle=True, random_state=42)
outer_scores = []
selected_feature_indices_per_fold = []

# The inner splitter, search grid and pipeline template do not depend on the
# outer fold, so they are built once. GridSearchCV clones the estimator for
# every fit, so sharing the template across folds does not carry fitted state.
cv_inner = KFold(n_splits=3, shuffle=True, random_state=42)
knn = KNeighborsClassifier()

param_grid = {
    'kneighborsclassifier__n_neighbors': [1, 3, 5, 7, 9],
    'kneighborsclassifier__weights': ['uniform', 'distance'],
    'kneighborsclassifier__metric': ['euclidean', 'manhattan', 'chebyshev']
}

# Scaling and feature selection live inside the pipeline so they are refit on
# the training part of every inner/outer split.
# NOTE(review): `selectkbest__k` is not part of the grid, so SelectKBest keeps
# its default number of features in every fold -- add it to `param_grid` if the
# number of selected features is meant to be tuned.
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('selectkbest', SelectKBest(f_classif)),
    ('kneighborsclassifier', knn)
])

for fold_number, (train_ix, test_ix) in enumerate(cv_outer.split(X), start=1):
    X_train, X_test = X[train_ix], X[test_ix]
    # The splitter yields positions, so index the Series positionally.
    # Plain `y[train_ix]` is label-based and is only correct while the
    # Series index happens to be exactly 0..n-1.
    y_train, y_test = y.iloc[train_ix], y.iloc[test_ix]

    # Tune on the outer-training rows only; refit=True retrains the best
    # configuration on all of them.
    search = GridSearchCV(pipeline, param_grid, scoring='accuracy', cv=cv_inner, refit=True)
    result = search.fit(X_train, y_train)
    best_model = result.best_estimator_

    # Record which feature columns the fitted selector kept in this fold.
    selected_feature_indices = best_model.named_steps['selectkbest'].get_support(indices=True)
    selected_feature_indices_per_fold.append(selected_feature_indices)

    selected_feature_names = [features[i] for i in selected_feature_indices]

    # Score the tuned model on the untouched outer-test rows.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    outer_scores.append(accuracy)

    print(f'Fold {fold_number}: Test Accuracy={accuracy:.4f}, Best Params={search.best_params_}, Selected Features={selected_feature_names}')

print(f'KNN Nested CV Accuracy: Mean={np.mean(outer_scores):.4f}, Std={np.std(outer_scores):.4f}')


Fold 1: Test Accuracy=0.7943, Best Params={'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 5, 'kneighborsclassifier__weights': 'distance'}, Selected Features=['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SCC', 'CALC', 'MTRANS']
Fold 2: Test Accuracy=0.7799, Best Params={'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 1, 'kneighborsclassifier__weights': 'uniform'}, Selected Features=['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SCC', 'CALC', 'MTRANS']
Fold 3: Test Accuracy=0.7799, Best Params={'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 5, 'kneighborsclassifier__weights': 'distance'}, Selected Features=['Age', 'Gender', 'FHWO', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SCC', 'CALC', 'MTRANS']
Fold 4: Test Accuracy=0.7512, Best Params={'kneighborsclassifier__metric': 'manhattan', 'kneighborsclassifier__n_neighbors': 5, 'kneighborsclassifier__weights': 'distance'}, Selec