In [1]:
from sklearn import set_config
set_config(display="diagram")

# Import Library

In [2]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

# Read Data

In [3]:
df = pd.read_csv("ObesityDataSet1.csv")

# Split Columns

In [4]:
numeric_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
target_column = 'NObeyesdad'

In [5]:
df['Age'] = df['Age'].astype(str).str.replace(' years', '', regex=False)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
# Handle any NaNs that might arise from coercion if original data had non-numeric entries
df['Age'] = df['Age'].fillna(df['Age'].median()) # Impute any NaNs in Age after cleaning

In [6]:
X = df.drop(target_column, axis=1)
y = df[target_column]

# EDA

### Checking Unique Values

In [7]:
# check unique values for each categorical column
for col in categorical_columns:
    unique_values = df[col].unique()
    print(f"Unique values in {col}: {unique_values}")


Unique values in Gender: ['Male' 'Female']
Unique values in family_history_with_overweight: ['yes' 'no']
Unique values in FAVC: ['yes' 'no']
Unique values in CAEC: ['Sometimes' 'Frequently' nan 'Always' 'no']
Unique values in SMOKE: ['yes' 'no']
Unique values in SCC: ['no' 'yes']
Unique values in CALC: ['Sometimes' 'no' 'Frequently']
Unique values in MTRANS: ['Automobile' 'Public_Transportation' 'Walking' 'Bike' 'Motorbike']


there is missing values in CAEC, later we will input with simpleImputer most frequent

In [8]:
# check unique values for the numerical columns
for col in numeric_columns:
    unique_values = df[col].unique()
    print(f"Unique values in {col}: {unique_values}")

Unique values in Age: [31 18 44 22 21 25 56 24 19 23 41 20 26 35 28 17 33 27 40 39 30 32 37 38
 42 34 29 16 52 61 43 36 55 51 45 47]
Unique values in Height: [1.87 1.59 1.68 1.74 1.69 1.51 1.79 1.7  1.82 1.56 1.62 1.76 1.54 1.88
 1.6  1.81 1.86 1.67 1.65 1.93 1.63 1.53 1.61 1.85 1.66 1.8  1.75 1.91
 1.78 1.84 1.83 1.89 1.64 1.52 1.57 1.72 1.9  1.77 1.55 1.71 1.73 1.58
 1.5  1.98 1.46 1.49 1.48 1.94 1.92]
Unique values in Weight: [128.87  40.    77.   102.    75.    99.53  63.72  90.   141.92  49.
  58.    79.99  82.58  80.    45.    86.75 126.42  79.75  67.   118.56
  53.66 110.07  84.85  47.   125.42  84.49  60.    85.    86.24  94.45
 120.42  64.   107.01  68.   133.74 102.78 128.83 109.96 119.62  78.43
 103.19  50.95  55.01 105.26 106.69 101.78  43.53  99.61  56.    53.
  66.4  129.16 111.83  79.84  99.98 109.41  46.66 121.   111.64 121.31
  70.    44.24 118.42 120.98 120.75 121.24 111.94 112.28 108.93  49.93
 104.55  84.78  54.17  50.    99.   118.07  99.62  78.    73.94 118.67
 10

as you can see there is some imputation that goes wrong by user like 44 years and more, later we will handle it by making new classes 

# Encoding

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

In [10]:
feature_categories_for_encoder = [
    ['Male', 'Female'],  # Gender
    ['no', 'yes'],       # family_history_with_overweight
    ['no', 'yes'],       # FAVC
    ['no', 'Sometimes', 'Frequently', 'Always'],  # CAEC
    ['no', 'yes'],       # SMOKE
    ['no', 'yes'],       # SCC
    ['no', 'Sometimes', 'Frequently'],  # CALC
    ['Automobile', 'Motorbike', 'Bike', 'Public_Transportation', 'Walking']  # MTRANS
]

In [11]:
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal_encoder', OrdinalEncoder(categories=feature_categories_for_encoder, handle_unknown='use_encoded_value', unknown_value=-1))
])

numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

## Target Encoded

In [12]:
target_categories = ['Insufficient_Weight', 'Normal_Weight', 'Overweight_Level_I', 'Overweight_Level_II', 'Obesity_Type_I', 'Obesity_Type_II', 'Obesity_Type_III']
target_encoder = OrdinalEncoder(categories=[target_categories])
y_encoded = target_encoder.fit_transform(y.to_frame())
y_encoded = y_encoded.ravel()

## Combine All Precessor

In [13]:
from sklearn.compose import ColumnTransformer

In [14]:
preprocessor = ColumnTransformer(
    transformers=[
        # 'Age' is now processed by the general 'num' transformer
        ('num', numerical_transformer, ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']),
        ('cat', categorical_transformer, categorical_columns)
    ],
    remainder='passthrough'
)

## Model Training

In [15]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.pipeline import Pipeline as ImbPipeline 

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded)

## Model 1: Random Forest Classifier

In [17]:
rf_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))
])

## Model 2: XGBoost Classifier

In [18]:
xgboost_model_pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor), # Use the same preprocessor
    ('classifier', XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')) # Add XGBoost
])

In [19]:
rf_model_pipeline.fit(X_train, y_train)

In [20]:
y_pred = rf_model_pipeline.predict(X_test)

In [21]:
print("Random Forest Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, y_pred, target_names=target_categories))

Random Forest Model Accuracy: 0.9146919431279621

Random Forest Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.93      1.00      0.96        27
      Normal_Weight       0.84      0.90      0.87        29
 Overweight_Level_I       0.82      0.79      0.81        29
Overweight_Level_II       0.88      0.79      0.84        29
     Obesity_Type_I       0.92      0.97      0.94        35
    Obesity_Type_II       1.00      0.93      0.97        30
   Obesity_Type_III       1.00      1.00      1.00        32

           accuracy                           0.91       211
          macro avg       0.91      0.91      0.91       211
       weighted avg       0.92      0.91      0.91       211



In [22]:
xgboost_model_pipeline.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [23]:
y_pred = xgboost_model_pipeline.predict(X_test)

In [24]:
print("XGBoost Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nXGBoost Classification Report:\n", classification_report(y_test, y_pred, target_names=target_categories))

XGBoost Model Accuracy: 0.943127962085308

XGBoost Classification Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.90      1.00      0.95        27
      Normal_Weight       0.93      0.86      0.89        29
 Overweight_Level_I       0.96      0.86      0.91        29
Overweight_Level_II       0.90      0.97      0.93        29
     Obesity_Type_I       0.92      0.97      0.94        35
    Obesity_Type_II       1.00      0.93      0.97        30
   Obesity_Type_III       1.00      1.00      1.00        32

           accuracy                           0.94       211
          macro avg       0.94      0.94      0.94       211
       weighted avg       0.94      0.94      0.94       211



It turns out that the XGBoost model is the better model compared to Random Forest

## Export PKL

In [25]:
import pickle

with open('final_model.pkl', 'wb') as f:
    pickle.dump(xgboost_model_pipeline, f)

with open('target_encoder.pkl', 'wb') as f:
    pickle.dump(target_encoder, f)