In [156]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, f1_score, recall_score, accuracy_score
from sklearn.pipeline import Pipeline


In [157]:
obesity = pd.read_csv('obesity_level.csv')
obesity.head()

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,0be1dad
0,0,Male,24.443011,1.699998,81.66995,1,1,2.0,2.983297,Sometimes,0,2.763573,0,0.0,0.976473,Sometimes,Public_Transportation,Overweight_Level_II
1,1,Female,18.0,1.56,57.0,1,1,2.0,3.0,Frequently,0,2.0,0,1.0,1.0,0,Automobile,0rmal_Weight
2,2,Female,18.0,1.71146,50.165754,1,1,1.880534,1.411685,Sometimes,0,1.910378,0,0.866045,1.673584,0,Public_Transportation,Insufficient_Weight
3,3,Female,20.952737,1.71073,131.274851,1,1,3.0,3.0,Sometimes,0,1.674061,0,1.467863,0.780199,Sometimes,Public_Transportation,Obesity_Type_III
4,4,Male,31.641081,1.914186,93.798055,1,1,2.679664,1.971472,Sometimes,0,1.979848,0,1.967973,0.931721,Sometimes,Public_Transportation,Overweight_Level_II


- FAVC (Frequent consumption of high-caloric food)
- FCVC (Frequency of consumption of vegetables)
- NCP (Number of main meals)
- CAEC (Consumption of food between meals)
- SMOKE
- CH2O (Daily water consumption)
- SCC (Caloric beverages consumption)
- FAF (Physical activity frequency)
- TUE (Time spent using technological devices)
- CALC (Consumption of alcohol)
- MTRANS (Mode of transportation)
- 0be1dad (Target variable representing obesity level)

In [158]:
obesity.shape

(20758, 18)

In [159]:
obesity.rename(columns={"0be1dad": "OB_LEVEL"}, inplace=True)
obesity.columns

Index(['id', 'Gender', 'Age', 'Height', 'Weight',
       'family_history_with_overweight', 'FAVC', 'FCVC', 'NCP', 'CAEC',
       'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE', 'CALC', 'MTRANS', 'OB_LEVEL'],
      dtype='object')

In [160]:
categorical_columns = ['Gender', 'MTRANS', 'OB_LEVEL']
obesity[categorical_columns] = obesity[categorical_columns].astype('category')

In [161]:
caec_mapping = {"0": 0, "Sometimes": 1, "Frequently": 2, "Always": 3}
calc_mapping = {"0": 0, "Sometimes": 1, "Frequently": 2}
ob_level_mapping = {"Insufficient_Weight": 0, "0rmal_Weight": 1, "Overweight_Level_I": 2, "Overweight_Level_II": 3, "Obesity_Type_I": 4, "Obesity_Type_II": 5, "Obesity_Type_III": 6}
obesity['CAEC'] = obesity['CAEC'].map(caec_mapping)
obesity['CALC'] = obesity['CALC'].map(calc_mapping)
obesity['OB_LEVEL'] = obesity['OB_LEVEL'].map(ob_level_mapping)
obesity['CAEC'] = pd.Categorical(obesity['CAEC'], categories=[0, 1, 2, 3], ordered=True)
obesity['CALC'] = pd.Categorical(obesity['CALC'], categories=[0, 1, 2], ordered=True)
obesity['OB_LEVEL'] = pd.Categorical(obesity['OB_LEVEL'], categories=[0, 1, 2, 3, 4, 5, 6], ordered=True)

In [162]:
obesity.isnull().sum()

id                                0
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
OB_LEVEL                          0
dtype: int64

In [163]:
numeric_columns = ['Age', 'Height', 'Weight', 'FCVC', 'CH2O', 'NCP', 'FAF', 'TUE']
z_scores = stats.zscore(obesity[numeric_columns])
threshold = 3
outliers = (abs(z_scores) > threshold)
outliers.any()

Age        True
Height     True
Weight    False
FCVC      False
CH2O      False
NCP       False
FAF       False
TUE       False
dtype: bool

In [164]:
X, y = obesity.drop('OB_LEVEL', axis=1), obesity['OB_LEVEL']
X = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
rfc = RandomForestClassifier(random_state=42, max_depth=30, min_samples_leaf=5, min_samples_split=2, n_estimators=100, max_features='log2')
score = cross_val_score(rfc, X_train, y_train, cv=10)
np.mean(score)

0.8824504761973844

In [165]:
# grid_search.best_estimator_.named_steps["classification"].feature_importances_
# grid_search.best_estimator_.feature_names_in_

In [166]:
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix


array([[476,  33,   2,   1,   0,   1,   0],
       [ 31, 567,  33,   6,   2,   0,   0],
       [  6,  48, 351,  71,  13,   0,   0],
       [  0,  14,  34, 423,  39,   2,   0],
       [  1,   0,  12,  30, 484,  28,   2],
       [  0,   0,   0,   6,  17, 638,   1],
       [  0,   0,   1,   0,   2,   0, 777]])

In [167]:
print(f"Precision: {accuracy_score(y_test, y_pred)}")
print(f"Recall: {recall_score(y_test, y_pred, average='macro')}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

Precision: 0.894990366088632
Recall: 0.884000471018276
F1 score: 0.884321700171592
