# Multi-Class Prediction of Obesity Risk
Run this notebook after the following notebooks have been run:
1. **01 Data Cleaning**
2. **02 EDA**

## Development Notes


In [42]:
## set up libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb

## Load Data

In [3]:
## load the train / test pickles
train = pd.read_pickle('train_clean.pkl')
test = pd.read_pickle('test_clean.pkl')

## separate train into response variable ('NObeyesdad') and features (every other column)
y_train = train['NObeyesdad']
x_train = train.drop(columns='NObeyesdad')

## test is used as the feature set as-is (x_test is the same object as test, not a copy)
x_test = test

## One Hot Encoding

In [17]:
## 'CALC' needs special handling: only 3 of its levels occur in train, while test contains all 4
print('train: ', x_train['CALC'].unique())
print('test: ', x_test['CALC'].unique())

## complete, ordered list of CALC levels, read from the categorical dtype of the test column
CALC_levels = list(x_test['CALC'].cat.categories)

train:  ['Sometimes', 'no', 'Frequently']
Categories (4, object): ['no' < 'Sometimes' < 'Frequently' < 'Always']
test:  ['Sometimes', 'no', 'Frequently', 'Always']
Categories (4, object): ['no' < 'Sometimes' < 'Frequently' < 'Always']


In [24]:
## list categorical columns, excluding CALC, that can be assigned automatically
cols_onehot_auto = x_train.columns[x_train.dtypes=='category'].drop('CALC')

## use ColumnTransformer so only categorical columns are affected:
##   - 'One_Hot_Cat' infers the levels of each categorical column from the training data
##   - 'One_Hot_Cat_Manual' encodes CALC with the explicit CALC_levels list, because not
##     every CALC level occurs in train
##   - remaining (non-categorical) columns are passed through unchanged
## sparse_threshold=0 forces a dense array; with the default threshold the result can be a
## scipy sparse matrix (when overall density is low), which cannot be wrapped in a
## DataFrame the way it is done below
ct = ColumnTransformer(
    [('One_Hot_Cat', OneHotEncoder(drop='first'), cols_onehot_auto),
     ('One_Hot_Cat_Manual', OneHotEncoder(drop='first', categories=[CALC_levels]), ['CALC'])],
    remainder='passthrough', sparse_threshold=0, verbose_feature_names_out=False)

## fit the encoders on train only, then apply the fitted transformer to test;
## keep the original row index so encoded rows stay aligned with their source rows
x_train_onehot = pd.DataFrame(ct.fit_transform(x_train),
                              columns=ct.get_feature_names_out(),
                              index=x_train.index)
x_test_onehot = pd.DataFrame(ct.transform(x_test),
                             columns=ct.get_feature_names_out(),
                             index=x_test.index)

## show the resulting feature names
x_train_onehot.columns.tolist()



['Gender_Male',
 'family_history_with_overweight_yes',
 'FAVC_yes',
 'CAEC_Frequently',
 'CAEC_Sometimes',
 'CAEC_no',
 'SMOKE_yes',
 'SCC_yes',
 'MTRANS_Bike',
 'MTRANS_Motorbike',
 'MTRANS_Public_Transportation',
 'MTRANS_Walking',
 'CALC_Sometimes',
 'CALC_Frequently',
 'CALC_Always',
 'Age',
 'Height',
 'Weight',
 'FCVC',
 'NCP',
 'CH2O',
 'FAF',
 'TUE']

In [49]:
## label encode response variable: fit the encoder on the training labels,
## then map each label to its integer code
le = LabelEncoder().fit(y_train)
y_train_le = le.transform(y_train)

## display the learned classes (a label's position in this array is its integer code)
le.classes_

array(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
       'Overweight_Level_II'], dtype=object)

## Modelling

### XGBoost

In [80]:
## convert data to DMatrix
x_train_onehot_D = xgb.DMatrix(data=x_train_onehot, label=y_train_le)

## num_class is derived from the fitted label encoder instead of being hard-coded,
## so it always matches the number of distinct labels in y_train_le
params = {
    'objective': 'multi:softprob',
    'num_class': len(le.classes_),
    'max_depth': 8,
    'eta': 0.1,
}

## 5-fold cross-validation with early stopping; result is a DataFrame with one row per boosting round
## NOTE(review): the recorded output's last row is round 99, i.e. the num_boost_round cap,
## so early stopping apparently never triggered - consider a larger num_boost_round (confirm)
xgb_tuning = xgb.cv(
    dtrain=x_train_onehot_D,
    params=params,
    num_boost_round=100,
    early_stopping_rounds=20,
    nfold=5,
    as_pandas=True,
    seed=123,
)

In [81]:
## show the metrics of the final boosting round kept by the CV run
xgb_tuning.iloc[-1:]

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
99,0.102056,0.002044,0.280454,0.00725
