In [80]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
import xgboost as xgb
from xgboost import XGBRFClassifier
from sklearn.metrics import jaccard_score, accuracy_score, log_loss, f1_score, confusion_matrix, classification_report
file='ObesityDataSet_raw_and_data_sinthetic.csv'


In [2]:
df=pd.read_csv(file)
df.sample(5)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
458,Male,19.0,1.69,60.0,no,yes,2.0,3.0,Always,no,1.0,no,1.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1111,Male,18.011718,1.680991,79.752916,yes,yes,2.413156,2.521546,Sometimes,no,1.985312,no,0.00705,0.965464,Sometimes,Public_Transportation,Overweight_Level_II
1908,Female,20.871667,1.782453,137.852618,yes,yes,3.0,3.0,Sometimes,no,2.748909,no,1.989171,0.832515,Sometimes,Public_Transportation,Obesity_Type_III
634,Female,23.0,1.710129,50.079991,yes,yes,2.0,3.0,Frequently,no,2.685842,no,0.373186,2.0,no,Public_Transportation,Insufficient_Weight
1663,Male,24.079971,1.61981,98.54302,yes,yes,2.95841,2.434347,Sometimes,no,1.0,no,1.930033,0.754023,no,Public_Transportation,Obesity_Type_II


In [5]:
df.shape
df.isnull().sum()
df.Gender.replace(to_replace=['Male', 'Female'], value=[1,0], inplace=True)

In [9]:
df.family_history_with_overweight.replace(to_replace=['yes', 'no'], value=[1,0], inplace=True)
df.SMOKE.replace(to_replace=['yes', 'no'], value=[1,0], inplace=True)
df.SCC.replace(to_replace=['yes', 'no'], value=[1,0], inplace=True)
df.FAVC.replace(to_replace=['yes', 'no'], value=[1,0], inplace=True)


In [17]:
df.CALC.replace(to_replace=['Sometimes', 'Frequently', 'Always', 'no'], value=[1,2,3,0], inplace=True)
df.CAEC.replace(to_replace=['Sometimes', 'Frequently', 'Always', 'no'], value=[1,2,3,0], inplace=True)
df.MTRANS.replace(to_replace=['Automobile', 'Public_Transportation', 'Walking', 'Motorbike', 'Bike'], value=[1,2,4,3,5], inplace=True)

In [21]:
df.rename(columns={'NObeyesdad':'NObesity'},  inplace=True)

In [24]:
df.NObesity.value_counts()
df.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObesity'],
      dtype='object')

In [74]:
y=df.NObesity.values
X=df[['Height', 'Weight', 'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', ]]

In [None]:
##'family_history_with_overweight', 'Gender', 'Age', -> dont have any impact on the predictions

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=21)

In [72]:
rt=RandomForestClassifier(max_depth=12, class_weight="balanced")
rt.fit(X_train, y_train)
yhat_r=rt.predict(X_test)
print(accuracy_score(y_test, yhat_r))
print(f1_score(y_test, yhat_r, average="weighted"))
#print(confusion_matrix(y_test, yhat_r))
print(cross_val_score(rt, X_train, y_train, cv=4))

0.950354609929078
0.9500959856230714
[0.93601896 0.93127962 0.94549763 0.92890995]


GridsearchCV to tune my hyperparameters.

In [82]:
rf=RandomForestClassifier()
param_grid = {
    'bootstrap': [True],
    'max_depth': [8, 9, 10, 11],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300]
}

grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train, y_train)
ypred=grid_search.predict(X_test)
grid_search.score(X_test,y_test)

grid_search.best_params_
print(classification_report(y_test, ypred))

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.4min finished


                     precision    recall  f1-score   support

Insufficient_Weight       0.97      0.94      0.95        65
      Normal_Weight       0.84      0.92      0.88        52
     Obesity_Type_I       1.00      0.97      0.98        59
    Obesity_Type_II       1.00      1.00      1.00        64
   Obesity_Type_III       1.00      1.00      1.00        75
 Overweight_Level_I       0.95      0.74      0.83        53
Overweight_Level_II       0.82      0.98      0.89        55

           accuracy                           0.94       423
          macro avg       0.94      0.94      0.93       423
       weighted avg       0.95      0.94      0.94       423



In [76]:

xg_model=xgb.XGBRFClassifier(max_depth=12,n_estimators=190)
xg_model.fit(X_train, y_train)
y_pred = xg_model.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(f1_score(y_test, y_pred, average="weighted"))
print(confusion_matrix(y_test, y_pred))
print(cross_val_score(xg_model, X_train, y_train,cv=4))


0.9716312056737588
0.9717112778474689
[[63  2  0  0  0  0  0]
 [ 1 50  0  0  0  1  0]
 [ 0  0 57  1  0  1  0]
 [ 0  0  0 64  0  0  0]
 [ 0  0  0  1 74  0  0]
 [ 0  4  0  0  0 48  1]
 [ 0  0  0  0  0  0 55]]
[0.95260664 0.93838863 0.93838863 0.94549763]
