# Multi-Class Prediction of Obesity Risk
Run this notebook after the following notebooks have been run:
1. **01 Data Cleaning**
2. **02 EDA**

## Development Notes


In [33]:
## set up libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

## Load Data

In [2]:
## read the cleaned frames from disk
## NOTE(review): presumably these pickles are written by the '01 Data Cleaning' notebook;
## pickle files must only ever be loaded from a trusted source
test = pd.read_pickle('test_clean.pkl')
train = pd.read_pickle('train_clean.pkl')

## the test frame is used as the feature matrix as-is (nothing is dropped from it)
x_test = test

## split the training frame into the response column and the remaining feature columns
y_train = train['NObeyesdad']
x_train = train.drop(columns='NObeyesdad')

## One Hot Encoding

In [3]:
## 'CALC' needs special care: only 3 of its levels occur in train, while test contains all 4
print('train: ', x_train['CALC'].unique())
print('test: ', x_test['CALC'].unique())

## keep the full, ordered list of levels declared on the categorical dtype
CALC_levels = list(x_test['CALC'].cat.categories)

train:  ['Sometimes', 'no', 'Frequently']
Categories (4, object): ['no' < 'Sometimes' < 'Frequently' < 'Always']
test:  ['Sometimes', 'no', 'Frequently', 'Always']
Categories (4, object): ['no' < 'Sometimes' < 'Frequently' < 'Always']


In [4]:
## categorical columns whose levels can be learned from the training data
## ('CALC' is excluded because its levels are passed in explicitly below)
cols_onehot_auto = x_train.columns[x_train.dtypes=='category'].drop('CALC')

## use ColumnTransformer so only categorical columns are encoded; all other columns pass through
## sparse_threshold=0 forces a dense array, so the DataFrame construction below never receives
## a scipy sparse matrix, however sparse the encoded data turns out to be
ct = ColumnTransformer(
    transformers=[
        ('One_Hot_Cat', OneHotEncoder(drop='first'), cols_onehot_auto),
        ('One_Hot_Cat_Manual', OneHotEncoder(drop='first', categories=[CALC_levels]), ['CALC']),
    ],
    remainder='passthrough',
    sparse_threshold=0,
    verbose_feature_names_out=False,
)

## fit on train only, then apply the same fitted encoding to test
x_train_onehot = pd.DataFrame(ct.fit_transform(x_train), columns=ct.get_feature_names_out())
x_test_onehot = pd.DataFrame(ct.transform(x_test), columns=ct.get_feature_names_out())

## show the resulting feature names
x_train_onehot.columns.tolist()

['Gender_Male',
 'family_history_with_overweight_yes',
 'FAVC_yes',
 'CAEC_Frequently',
 'CAEC_Sometimes',
 'CAEC_no',
 'SMOKE_yes',
 'SCC_yes',
 'MTRANS_Bike',
 'MTRANS_Motorbike',
 'MTRANS_Public_Transportation',
 'MTRANS_Walking',
 'CALC_Sometimes',
 'CALC_Frequently',
 'CALC_Always',
 'Age',
 'Height',
 'Weight',
 'FCVC',
 'NCP',
 'CH2O',
 'FAF',
 'TUE']

In [5]:
## encode the response labels as integers; le.classes_ lists the labels in encoded order
le = LabelEncoder().fit(y_train)
y_train_le = le.transform(y_train)
le.classes_

array(['Insufficient_Weight', 'Normal_Weight', 'Obesity_Type_I',
       'Obesity_Type_II', 'Obesity_Type_III', 'Overweight_Level_I',
       'Overweight_Level_II'], dtype=object)

## Modelling

### XGBoost

In [6]:
## wrap the encoded features and integer labels in an XGBoost DMatrix
x_train_onehot_D = xgb.DMatrix(data=x_train_onehot, label=y_train_le)

## 5-fold cross-validation over up to 100 boosting rounds (early_stopping_rounds=20)
params = {
    'objective': 'multi:softprob',
    'num_class': 7,
    'max_depth': 8,
    'eta': 0.1,
}
xgb_tuning = xgb.cv(
    params=params,
    dtrain=x_train_onehot_D,
    num_boost_round=100,
    nfold=5,
    early_stopping_rounds=20,
    as_pandas=True,
    seed=123,
)

## result of the last boosting round that was kept
xgb_tuning.tail(1)

Unnamed: 0,train-mlogloss-mean,train-mlogloss-std,test-mlogloss-mean,test-mlogloss-std
99,0.102056,0.002044,0.280454,0.00725


In [None]:
## hyperparameter tuning: grid search over n_estimators, colsample_bytree and reg_alpha
param_grid = {
    # parameters to tune (max_depth and eta are currently pinned to a single value)
    'n_estimators': np.arange(100, 250, 50),
    'max_depth': [8],
    'eta': [0.1],
    'colsample_bytree': [0.5, 1],
    'reg_alpha': [0.5, 1],
    # fixed parameters
    'objective': ['multi:softprob'],
    'num_class': [7],
    'n_jobs': [-1],
}
kf = KFold(n_splits=10, shuffle=True, random_state=8)

xgb_tuning = GridSearchCV(
    estimator=xgb.XGBClassifier(random_state=8),
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
)
xgb_tuning.fit(x_train_onehot.to_numpy(), y_train_le)

## report the best combination, then show the mean CV accuracy of every combination
xgb_tuning_accr = xgb_tuning.cv_results_['mean_test_score']
print('best_params =', xgb_tuning.best_params_)
print('best_score =', xgb_tuning.best_score_)
xgb_tuning_accr

In [24]:
## build the xgb classifier with the best tuned parameters (hard-coded here)
xgbclass = xgb.XGBClassifier(
    n_estimators=150,
    max_depth=8,
    eta=0.1,
    colsample_bytree=0.5,
    reg_alpha=1,
    objective='multi:softprob',
    num_class=7,
    random_state=8,
    n_jobs=-1,
)

## train on the full training set; the trailing ';' keeps the estimator repr out of the cell output
xgbclass.fit(x_train_onehot.to_numpy(), y_train_le);

### Random Forest

In [37]:
## hyperparameter tuning: grid search over n_estimators and min_samples_split
param_grid = {
    'n_estimators': np.arange(240, 310, 20),
    'min_samples_split': [5, 10],
    'n_jobs': [-1],
}
kf = KFold(n_splits=10, shuffle=True, random_state=8)

rf_tuning = GridSearchCV(
    estimator=RandomForestClassifier(random_state=8),
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
)
rf_tuning.fit(x_train_onehot.to_numpy(), y_train_le)

## report the best combination, then show the mean CV accuracy of every combination
rf_tuning_accr = rf_tuning.cv_results_['mean_test_score']
print('best_params =', rf_tuning.best_params_)
print('best_score =', rf_tuning.best_score_)
rf_tuning_accr

best_params = {'min_samples_split': 10, 'n_estimators': 280, 'n_jobs': -1}
best_score = 0.8978225735311188


array([0.89743726, 0.89729275, 0.89705165, 0.89782241, 0.89666627,
       0.89714808, 0.89782257, 0.89762987])

In [38]:
## build the random forest with the best tuned parameters (hard-coded here)
rf = RandomForestClassifier(
    n_estimators=280,
    min_samples_split=10,
    random_state=8,
    n_jobs=-1,
)

## train on the full training set; the trailing ';' keeps the estimator repr out of the cell output
rf.fit(x_train_onehot.to_numpy(), y_train_le);

In [39]:
## tabulate the forest's feature importances as percentages, one row per encoded feature
rf_ft_imp = pd.DataFrame({
    'Features': x_train_onehot.columns,
    'Gini Importance': rf.feature_importances_ * 100,
})

## show the 15 most important features, largest first
rf_ft_imp.sort_values('Gini Importance', ascending=False).head(15)

Unnamed: 0,Features,Gini Importance
17,Weight,38.160275
15,Age,9.588418
18,FCVC,9.480501
16,Height,8.678373
0,Gender_Male,6.761056
20,CH2O,4.437775
22,TUE,4.363152
21,FAF,3.520261
19,NCP,3.187509
1,family_history_with_overweight_yes,3.008211


## Test Predictions

In [25]:
## set up submission format - i.e. id & NObeyesdad
## the id range is built inline instead of being stored in a variable called `id`, which would
## shadow Python's built-in id() for every cell that runs afterwards in this kernel
## NOTE(review): assumes the test ids continue directly after the train ids - confirm against the raw test file
submission = pd.DataFrame(data={
    'id': np.arange(len(train), len(train) + len(test)),
    'NObeyesdad': '',   # placeholder, filled in by each model's prediction cell
})

In [26]:
## xgb prediction: predict the encoded classes, then map them back to the original labels
y_testpred_xgb = le.inverse_transform(xgbclass.predict(x_test_onehot.values))
submission['NObeyesdad'] = y_testpred_xgb

## make sure the output folder exists - to_csv does not create missing directories,
## so a fresh checkout would otherwise fail on the export
os.makedirs('Submissions', exist_ok=True)
submission.to_csv('Submissions/test_pred_xgb_v2.csv', index=False)