## Importing Dataset

In [75]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns

In [76]:
!pip install xgboost



In [77]:
import xgboost as xgb

In [78]:
data = pd.read_csv('ObesitydataSet.csv')

## Preprocessing dataset features

In [79]:
data_1 = data[['Gender', 'Age', 'family_history_with_overweight',
          'FAVC', 'FCVC', 'NCP','SMOKE', 'SCC']]
data['family_history_with_overweight'] = data_1['family_history_with_overweight'].map({'yes':1,'no':0})
data['Gender'] = data_1['Gender'].map({'Male':1,'Female':0})
data['FAVC'] = data_1['FAVC'].map({'yes':1,'no':0})
data['SMOKE'] = data_1['SMOKE'].map({'yes':1,'no':0})
data['SCC'] = data_1['SCC'].map({'yes':1,'no':0})

## Converting Categorical Features into Numerical

In [80]:
ord_encoders = {}
ord_vars = ('CAEC','CALC','NObeyesdad')
ord_vals = [(('no','Sometimes','Frequently','Always'),),
           (('no','Sometimes','Frequently','Always'),),
            (('Insufficient_Weight','Normal_Weight','Overweight_Level_I',
             'Overweight_Level_II','Obesity_Type_I','Obesity_Type_II','Obesity_Type_III'),)]
for i,key in enumerate(ord_vars):
    print(key,ord_vals[i])
    ord_encoders[key] = OrdinalEncoder(categories=ord_vals[i])
    ord_encoders[key].fit(np.asarray(data[key]).reshape(-1,1))
    col = 'ord_'+key
    data[col] = ord_encoders[key].transform(np.asarray(data[key]).reshape(-1,1))

CAEC (('no', 'Sometimes', 'Frequently', 'Always'),)
CALC (('no', 'Sometimes', 'Frequently', 'Always'),)
NObeyesdad (('Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III'),)


In [81]:
data = pd.concat([data,pd.get_dummies(data.MTRANS).add_prefix('MTRANS_')],axis=1)

In [82]:
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,...,MTRANS,NObeyesdad,ord_CAEC,ord_CALC,ord_NObeyesdad,MTRANS_Automobile,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,0,21.0,1.62,64.0,1,0,2.0,3.0,Sometimes,0,...,Public_Transportation,Normal_Weight,1.0,0.0,1.0,0,0,0,1,0
1,0,21.0,1.52,56.0,1,0,3.0,3.0,Sometimes,1,...,Public_Transportation,Normal_Weight,1.0,1.0,1.0,0,0,0,1,0
2,1,23.0,1.8,77.0,1,0,2.0,3.0,Sometimes,0,...,Public_Transportation,Normal_Weight,1.0,2.0,1.0,0,0,0,1,0
3,1,27.0,1.8,87.0,0,0,3.0,3.0,Sometimes,0,...,Walking,Overweight_Level_I,1.0,2.0,2.0,0,0,0,0,1
4,1,22.0,1.78,89.8,0,0,2.0,1.0,Sometimes,0,...,Public_Transportation,Overweight_Level_II,1.0,1.0,3.0,0,0,0,1,0


In [83]:
X_columns = ['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
          'FAVC', 'FCVC', 'NCP','SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'ord_CAEC',
          'ord_CALC','MTRANS_Automobile', 'MTRANS_Bike', 'MTRANS_Motorbike',
          'MTRANS_Public_Transportation', 'MTRANS_Walking']
X = data[X_columns]
y = data['ord_NObeyesdad']

## XG Boost Implementation

In [84]:
import xgboost as xgb

from sklearn import datasets

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder 

from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
   

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
model = XGBClassifier() 
model


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, gamma=None,
              gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
              max_leaves=None, min_child_weight=None, missing=nan,
              monotone_constraints=None, n_estimators=100, n_jobs=None,
              num_parallel_tree=None, predictor=None, random_state=None,
              reg_alpha=None, reg_lambda=None, ...)

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=5)
model = XGBClassifier()
model.fit(X_train, y_train)

y_pred = model.predict(X_test) 

#predictions = [round(value) for value in y_pred]

accuracy = accuracy_score(y_test,y_pred) 

print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 97.40%


In [89]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result3 = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(result3)
result4 = classification_report(y_test, y_pred)
print("Classification Report:")
print (result4)
result5 = accuracy_score(y_test,y_pred)
print("Accuracy:",result5)

Confusion Matrix:
[[54  0  0  0  0  0  0]
 [ 3 63  2  0  0  0  0]
 [ 0  2 50  2  0  0  0]
 [ 0  0  0 56  1  0  0]
 [ 0  0  0  0 76  0  0]
 [ 0  0  0  0  0 46  0]
 [ 0  0  0  0  0  1 67]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.95      1.00      0.97        54
         1.0       0.97      0.93      0.95        68
         2.0       0.96      0.93      0.94        54
         3.0       0.97      0.98      0.97        57
         4.0       0.99      1.00      0.99        76
         5.0       0.98      1.00      0.99        46
         6.0       1.00      0.99      0.99        68

    accuracy                           0.97       423
   macro avg       0.97      0.97      0.97       423
weighted avg       0.97      0.97      0.97       423

Accuracy: 0.9739952718676123


# Grid Search XG Boost

# 1st parameter test

In [13]:
from sklearn.model_selection import GridSearchCV

In [18]:

param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_train,y_train)

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     gamma=0, gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_bin=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=5,
                                     max_leaves=None, min_child_weight=1,
                                     missing=nan, monotone_constraints=None,
               

In [20]:
#grid_search.fit(X_train, y_train)
gsearch1.best_params_, gsearch1.best_score_

({'max_depth': 3, 'min_child_weight': 1}, nan)

In [22]:
gsearch1.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.8,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=3, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=140,
              n_jobs=4, nthread=4, num_parallel_tree=1,
              objective='multi:softprob', predictor='auto', random_state=27, ...)

In [25]:
y_pred3=gsearch1.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred3)

0.9645390070921985

# 2nd parameter test

In [28]:
param_test2 = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
gsearch2 = GridSearchCV(estimator = XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test2, scoring='roc_auc',n_jobs=4, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.best_params_, gsearch2.best_score_

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




({'max_depth': 4, 'min_child_weight': 4}, nan)

In [29]:
gsearch2.best_params_

{'max_depth': 4, 'min_child_weight': 4}

In [30]:
y_pred4=gsearch2.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred4)


0.9692671394799054

# 3rd parameter test

In [31]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4, cv=5)
gsearch3.fit(X_train,y_train)
gsearch3.best_params_, gsearch3.best_score_

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




({'gamma': 0.0}, nan)

In [47]:
y_pred5=gsearch3.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred5)

0.9550827423167849

#  4th parameter test

In [62]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4, cv=5)
gsearch4.fit(X_train,y_train)
gsearch4.best_params_, gsearch4.best_score_

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




({'colsample_bytree': 0.6, 'subsample': 0.6}, nan)

In [63]:
y_pred6=gsearch4.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred6)

0.950354609929078

# 5th parameter test

In [64]:
param_test5 = {
 'subsample':[i/100.0 for i in range(75,90,5)],
 'colsample_bytree':[i/100.0 for i in range(75,90,5)]
}
gsearch5 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test5, scoring='roc_auc',n_jobs=4, cv=5)
gsearch5.fit(X_train,y_train)
gsearch5.best_params_, gsearch4.best_score_

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




({'colsample_bytree': 0.75, 'subsample': 0.75}, nan)

In [65]:
y_pred7=gsearch5.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred7)

0.9527186761229315

# 6th parameter test (Tuning Regularization Parameters)

In [66]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=4,
 min_child_weight=6, gamma=0.1, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4, cv=5)
gsearch6.fit(X_train,y_train)
gsearch6.best_params_, gsearch6.best_score_

Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




({'reg_alpha': 1e-05}, nan)

In [67]:
y_pred8=gsearch6.best_estimator_.predict(X_test)
accuracy_score(y_test, y_pred8)

0.9574468085106383

In [72]:
xgb3 = XGBClassifier(
 learning_rate =0.01,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=4,
 gamma=0,
 subsample=0.75,
 colsample_bytree=0.75
    ,
 reg_alpha=0.005,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)

In [74]:
xgb3.fit(X_train,y_train)
y_pred9 = xgb3.predict(X_test) 

predictions2 = [round(value) for value in y_pred9]

accuracy = accuracy_score(y_test,predictions) 

print("Accuracy: %.2f%%" % (accuracy * 100.0))



Parameters: { "scale_pos_weight" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Accuracy: 95.74%


# K Nearest Neighbours

In [35]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_train,y_train)
ypred=knn_clf.predict(X_test) 


In [36]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
result = confusion_matrix(y_test, ypred)
print("Confusion Matrix:")
print(result)
result1 = classification_report(y_test, ypred)
print("Classification Report:")
print (result1)
result2 = accuracy_score(y_test,ypred)
print("Accuracy:",result2)

Confusion Matrix:
[[52  2  0  0  0  0  0]
 [14 36 13  4  1  0  0]
 [ 0  4 44  4  2  0  0]
 [ 0  0  2 50  5  0  0]
 [ 0  0  0  1 74  0  1]
 [ 0  0  0  0  2 44  0]
 [ 0  0  0  0  0  0 68]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.79      0.96      0.87        54
         1.0       0.86      0.53      0.65        68
         2.0       0.75      0.81      0.78        54
         3.0       0.85      0.88      0.86        57
         4.0       0.88      0.97      0.93        76
         5.0       1.00      0.96      0.98        46
         6.0       0.99      1.00      0.99        68

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.87      0.87      0.86       423

Accuracy: 0.8699763593380615
