In [14]:
from sklearn import set_config
set_config(display="diagram")

In [15]:
import pandas as pd
df = pd.read_csv("ObesityDataSet2.csv")

In [16]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
import xgboost as xgb
import pickle

#### Preprocessing

In [17]:
df['Age'] = df['Age'].str.replace('years', '', case=False).str.strip()
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Age'] = df['Age'].astype(int)

#### Impute Missing Value

In [18]:
df['FCVC'] = df['FCVC'].fillna(df['FCVC'].mean())
df['MTRANS'] = df['MTRANS'].fillna(df['MTRANS'].mode()[0])

#### Encode

In [19]:
binary_mapping = {
    "Gender": {"Female": 0, "Male": 1},
    "family_history_with_overweight": {"no": 0, "yes": 1},
    "FAVC": {"no": 0, "yes": 1},
    "SMOKE": {"no": 0, "yes": 1},
    "SCC": {"no": 0, "yes": 1}
}
df = df.replace(binary_mapping)

caec_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
df['CAEC'] = df['CAEC'].astype(str).str.strip().replace(caec_mapping)

calc_mapping = {'no': 0, 'Sometimes': 1, 'Frequently': 2, 'Always': 3}
df['CALC'] = df['CALC'].astype(str).str.strip().replace(calc_mapping)

mtrans_mapping = {'Automobile': 0, 'Bike': 1, 'Motorbike': 2, 'Public_Transportation': 3, 'Walking': 4}
df['MTRANS'] = df['MTRANS'].astype(str).str.strip().replace(mtrans_mapping)

In [20]:
target_mapping = {
    'Insufficient_Weight': 0,
    'Normal_Weight': 1,
    'Overweight_Level_I': 2,
    'Overweight_Level_II': 3,
    'Obesity_Type_I': 4,
    'Obesity_Type_II': 5,
    'Obesity_Type_III': 6
}
df['NObeyesdad'] = df['NObeyesdad'].replace(target_mapping)

#### Splitting

In [21]:
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#### Preprocessing Pipeline

In [22]:
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer([
    ('scaler', StandardScaler(), numeric_features)
])
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', xgb.XGBClassifier(
        n_estimators=100,
        min_child_weight=2,
        max_depth=10,
        learning_rate=0.1,
        subsample=1.0,
        use_label_encoder=False,
        eval_metric='mlogloss',
        random_state=42
    ))
])

#### Fit Pipeline

In [23]:
pipeline.fit(X_train, y_train)

with open('final_pipeline_model.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

print("Pipeline berhasil disimpan ke final_pipeline_model.pkl")

Pipeline berhasil disimpan ke final_pipeline_model.pkl


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056 entries, 0 to 1055
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1056 non-null   int64  
 1   Age                             1056 non-null   int32  
 2   Height                          1056 non-null   float64
 3   Weight                          1056 non-null   float64
 4   family_history_with_overweight  1056 non-null   int64  
 5   FAVC                            1056 non-null   int64  
 6   FCVC                            1056 non-null   float64
 7   NCP                             1056 non-null   float64
 8   CAEC                            1056 non-null   int64  
 9   SMOKE                           1056 non-null   int64  
 10  CH2O                            1056 non-null   float64
 11  SCC                             1056 non-null   int64  
 12  FAF                             10