In [24]:
# Load the dataset
import pandas as pd
df = pd.read_csv("../data-set/ObesityDataSet.csv")

In [25]:
# Display the first 5 rows of the dataset
# print(df.head())

# Check for missing values
print(df.isnull().sum())

print(df.info())

Gender                            0
Age                               1
Height                            4
Weight                            0
family_history_with_overweight    1
FAVC                              2
FCVC                              1
NCP                               1
CAEC                              1
SMOKE                             1
CH2O                              0
SCC                               1
FAF                               2
TUE                               1
CALC                              3
MTRANS                            2
NObeyesdad                        0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2110 non-null   float64
 2   Height             

In [26]:
# Fill missing values in 'Age' and 'Weight' columns with their mean values
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Weight'].fillna(df['Weight'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Weight'].fillna(df['Weight'].mean(), inplace=True)


In [27]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical variables using LabelEncoder
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save the encoder for future use

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Normalize numerical features using MinMaxScaler
numeric_columns = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
scaler = MinMaxScaler()
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

In [29]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets (80% training, 20% testing)
X = df.drop('NObeyesdad', axis=1)  # Features
y = df['NObeyesdad']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Train a Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Train a Random Forest model
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [32]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the Decision Tree model
y_pred_dt = dt_model.predict(X_test)
print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))
print("Decision Tree Accuracy:", accuracy_score(y_test, y_pred_dt))

Decision Tree Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        56
           1       0.87      0.87      0.87        62
           2       0.95      0.92      0.94        78
           3       0.93      0.95      0.94        58
           4       1.00      1.00      1.00        63
           5       0.89      0.89      0.89        56
           6       0.98      0.94      0.96        50

    accuracy                           0.93       423
   macro avg       0.93      0.93      0.93       423
weighted avg       0.93      0.93      0.93       423

Decision Tree Accuracy: 0.933806146572104


In [33]:
# Evaluate the Random Forest model
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.96      0.98        56
           1       0.88      0.90      0.89        62
           2       0.99      0.96      0.97        78
           3       0.97      0.98      0.97        58
           4       1.00      1.00      1.00        63
           5       0.88      0.88      0.88        56
           6       0.96      0.98      0.97        50

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423

Random Forest Accuracy: 0.9527186761229315


In [None]:
# import joblib

# joblib.dump(dt_model, 'decision_tree_model.pkl')
# joblib.dump(rf_model, 'random_forest_model.pkl')