# 1. Data Visualization

In [None]:
# link: https://pubmed.ncbi.nlm.nih.gov/31467953/

In [1]:
import pandas as pd

In [2]:
obesity = pd.read_csv("obesity.csv")

In [3]:
obesity.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [4]:
obesity.Gender.value_counts()

Gender
Male      1068
Female    1043
Name: count, dtype: int64

In [5]:
obesity.family_history_with_overweight.value_counts()

family_history_with_overweight
yes    1726
no      385
Name: count, dtype: int64

In [6]:
obesity.CAEC.value_counts()

CAEC
Sometimes     1765
Frequently     242
Always          53
no              51
Name: count, dtype: int64

In [7]:
obesity.SMOKE.value_counts()

SMOKE
no     2067
yes      44
Name: count, dtype: int64

In [8]:
obesity.SCC.value_counts()

SCC
no     2015
yes      96
Name: count, dtype: int64

In [9]:
obesity.CALC.value_counts()

CALC
Sometimes     1401
no             639
Frequently      70
Always           1
Name: count, dtype: int64

In [10]:
obesity.MTRANS.value_counts()

MTRANS
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: count, dtype: int64

In [11]:
obesity.NObeyesdad.value_counts()

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64

In [12]:
obesity.columns

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

# 2. Data Analysis

## 2.1. Import libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

## 2.2. Map target label

In [None]:
label_encoder = LabelEncoder()
obesity['target_encoded'] = label_encoder.fit_transform(obesity['NObeyesdad'])

## 2.3. Define features and target

In [None]:
X = obesity.drop(columns=['NObeyesdad'])
y = obesity['NObeyesdad']

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

## 2.4. Divide to train test dataset

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

## 2.5 Create preprocessing data pipeline

In [None]:
categorical_transformer = OneHotEncoder(drop='first', handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

## 2.6. Model pipeline

In [None]:
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        solver='lbfgs',
        max_iter=1000,
        random_state=42
    ))
])

## 2.7. Train the modelcwith train dataset

In [None]:
model.fit(X_train, y_train)

## 2.8. Evaluate the model with test dataset

In [None]:
y_pred = model.predict(X_test)

## 2.9  Confusion matrix as metrics for accuracy, f1, recall, precision

In [15]:
print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

print("\n Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Target mapping: {'Insufficient_Weight': 0, 'Normal_Weight': 1, 'Obesity_Type_I': 2, 'Obesity_Type_II': 3, 'Obesity_Type_III': 4, 'Overweight_Level_I': 5, 'Overweight_Level_II': 6}

 Classification Report:
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      1.00      1.00        54
      Normal_Weight       0.98      0.98      0.98        58
     Obesity_Type_I       0.99      0.99      0.99        70
    Obesity_Type_II       0.98      1.00      0.99        60
   Obesity_Type_III       1.00      0.98      0.99        65
 Overweight_Level_I       1.00      1.00      1.00        58
Overweight_Level_II       1.00      1.00      1.00        58

           accuracy                           0.99       423
          macro avg       0.99      0.99      0.99       423
       weighted avg       0.99      0.99      0.99       423


 Confusion Matrix:
[[54  0  0  0  0  0  0]
 [ 0 57  1  0  0  0  0]
 [ 0  1 69  0  0  0  0]
 [ 0  0  0 60  0  0  0]
 [ 0 