In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
df=pd.read_csv("/content/ObesityDataSet_raw_and_data_sinthetic.csv")
df.head()

Unnamed: 0,Age,Gender,Height,Weight,CALC,FAVC,FCVC,NCP,SCC,SMOKE,CH2O,family_history_with_overweight,FAF,TUE,CAEC,MTRANS,NObeyesdad
0,21.0,Female,1.62,64.0,no,no,2.0,3.0,no,no,2.0,yes,0.0,1.0,Sometimes,Public_Transportation,Normal_Weight
1,21.0,Female,1.52,56.0,Sometimes,no,3.0,3.0,yes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,23.0,Male,1.8,77.0,Frequently,no,2.0,3.0,no,no,2.0,yes,2.0,1.0,Sometimes,Public_Transportation,Normal_Weight
3,27.0,Male,1.8,87.0,Frequently,no,3.0,3.0,no,no,2.0,no,2.0,0.0,Sometimes,Walking,Overweight_Level_I
4,22.0,Male,1.78,89.8,Sometimes,no,2.0,1.0,no,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# List of categorical features (columns with text values)
categorical_features = ['Gender', 'family_history_with_overweight', 'FAVC',
                        'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

print("\nCategorical features to be encoded:")
for column in categorical_features:
    print(f"{column}: {df[column].unique()}")  # Show the unique values before encoding

# Dictionary to save encoders for each column (useful later for new data)
label_encoders = {}

# Encode each categorical feature into numbers
for column in categorical_features:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])  # Convert text to numbers
    label_encoders[column] = le  # Save the encoder
    print(f"Encoding {column}: {list(le.classes_)} -> {list(range(len(le.classes_)))}")

# Encode the target column (obesity class)
target_encoder = LabelEncoder()
df['NObeyesdad'] = target_encoder.fit_transform(df['NObeyesdad'])

print("\nTarget classes after encoding:")
for i, label in enumerate(target_encoder.classes_):
    print(f"{label} -> {i}")



Categorical features to be encoded:
Gender: ['Female' 'Male']
family_history_with_overweight: ['yes' 'no']
FAVC: ['no' 'yes']
CAEC: ['Sometimes' 'Frequently' 'Always' 'no']
SMOKE: ['no' 'yes']
SCC: ['no' 'yes']
CALC: ['no' 'Sometimes' 'Frequently' 'Always']
MTRANS: ['Public_Transportation' 'Walking' 'Automobile' 'Motorbike' 'Bike']
Encoding Gender: ['Female', 'Male'] -> [0, 1]
Encoding family_history_with_overweight: ['no', 'yes'] -> [0, 1]
Encoding FAVC: ['no', 'yes'] -> [0, 1]
Encoding CAEC: ['Always', 'Frequently', 'Sometimes', 'no'] -> [0, 1, 2, 3]
Encoding SMOKE: ['no', 'yes'] -> [0, 1]
Encoding SCC: ['no', 'yes'] -> [0, 1]
Encoding CALC: ['Always', 'Frequently', 'Sometimes', 'no'] -> [0, 1, 2, 3]
Encoding MTRANS: ['Automobile', 'Bike', 'Motorbike', 'Public_Transportation', 'Walking'] -> [0, 1, 2, 3, 4]

Target classes after encoding:
Insufficient_Weight -> 0
Normal_Weight -> 1
Obesity_Type_I -> 2
Obesity_Type_II -> 3
Obesity_Type_III -> 4
Overweight_Level_I -> 5
Overweight_Level

In [5]:
# Standardize numerical features to improve model performance
print("\nStandardizing numerical features...")

# Create a StandardScaler object
scaler = StandardScaler()
numerical_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
# Apply the scaler to the numerical features
# This transforms the data to have mean = 0 and standard deviation = 1
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Show the first few rows of the updated data
print("\nData after preprocessing:")
print(df.head())


Standardizing numerical features...

Data after preprocessing:
        Age  Gender    Height    Weight  CALC  FAVC      FCVC       NCP  SCC  \
0 -0.522124       0 -0.875589 -0.862558     3     0 -0.785019  0.404153    0   
1 -0.522124       0 -1.947599 -1.168077     2     0  1.088342  0.404153    1   
2 -0.206889       1  1.054029 -0.366090     1     0 -0.785019  0.404153    0   
3  0.423582       1  1.054029  0.015808     1     0  1.088342  0.404153    0   
4 -0.364507       1  0.839627  0.122740     2     0 -0.785019 -2.167023    0   

   SMOKE      CH2O  family_history_with_overweight       FAF       TUE  CAEC  \
0      0 -0.013073                               1 -1.188039  0.561997     2   
1      1  1.618759                               1  2.339750 -1.080625     2   
2      0 -0.013073                               1  1.163820  0.561997     2   
3      0 -0.013073                               0  1.163820 -1.080625     2   
4      0 -0.013073                               0 -1.1

In [6]:
X = df.drop('NObeyesdad', axis=1)
y = df['NObeyesdad']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Count how many unique target classes are in the data
num_classes = len(y.unique())

# Print the number of target classes
print(f"Number of target classes: {num_classes}")


Number of target classes: 7


In [9]:
model = Sequential([
    # Input layer: expects input with shape = number of features in training data
    tf.keras.Input(shape=(X_train.shape[1],)),

    # First hidden layer with 128 neurons and ReLU activation
    Dense(128, activation='relu'),

    # Dropout layer: randomly turns off 30% of the neurons to reduce overfitting
    Dropout(0.3),

    # Second hidden layer with 64 neurons
    Dense(64, activation='relu'),
    Dropout(0.3),

    # Third hidden layer with 32 neurons
    Dense(32, activation='relu'),
    Dropout(0.2),

    # Output layer with 'num_classes' units (one for each class), using softmax for multi-class classification
    Dense(num_classes, activation='softmax')
])

In [10]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [11]:
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1
)

In [12]:
print("\nTraining the model...")

# Train the model using training data
history = model.fit(
    X_train, y_train,        # Training features and labels
    epochs=100,              # Maximum number of times the model will see the data
    batch_size=32,           # Number of samples per update (smaller = slower but more accurate)
    validation_split=0.2,    # Use 20% of training data for validation (to check performance during training)
    callbacks=[early_stop],  # Stop training early if validation performance stops improving
    verbose=1                # Print training progress (1 = show full output)
)



Training the model...
Epoch 1/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.1625 - loss: 2.0307 - val_accuracy: 0.4497 - val_loss: 1.7352
Epoch 2/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3374 - loss: 1.7628 - val_accuracy: 0.5473 - val_loss: 1.3961
Epoch 3/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4322 - loss: 1.4940 - val_accuracy: 0.5769 - val_loss: 1.0836
Epoch 4/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4938 - loss: 1.2724 - val_accuracy: 0.6509 - val_loss: 0.9089
Epoch 5/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5363 - loss: 1.1734 - val_accuracy: 0.6716 - val_loss: 0.8131
Epoch 6/100
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5834 - loss: 1.0265 - val_accuracy: 0.7337 - val_loss: 0.7216
Epoch 7/100

In [13]:
# Evaluate the model on test data
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)

# Print the accuracy and loss on test data
print(f'Test Accuracy: {test_acc:.4f}')
print(f'Test Loss: {test_loss:.4f}')

Test Accuracy: 0.9433
Test Loss: 0.1555


In [14]:
# Make predictions on the test data
y_pred_proba = model.predict(X_test)  # Get prediction probabilities for each class
y_pred_classes = np.argmax(y_pred_proba, axis=1)  # Choose the class with the highest probability

# Calculate accuracy using scikit-learn
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Accuracy (using sklearn): {accuracy:.4f}')


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 28ms/step
Accuracy (using sklearn): 0.9433


In [17]:
# Create a new data sample for prediction
new_data = pd.DataFrame([{
    'Age': 22,
    'Gender': label_encoders['Gender'].transform(['Male'])[0],  # Encode 'Male' using the saved LabelEncoder
    'Height': 1.75,
    'Weight': 85,
    'family_history_with_overweight': label_encoders['family_history_with_overweight'].transform(['yes'])[0],
    'FAVC': label_encoders['FAVC'].transform(['yes'])[0],
    'FCVC': 2.0,  # Frequency of vegetable consumption
    'NCP': 3.0,   # Number of main meals per day
    'CAEC': label_encoders['CAEC'].transform(['Sometimes'])[0],  # Eating between meals
    'SMOKE': label_encoders['SMOKE'].transform(['no'])[0],
    'CH2O': 2.0,  # Water consumption
    'SCC': label_encoders['SCC'].transform(['no'])[0],
    'FAF': 1.0,   # Physical activity frequency
    'TUE': 1.0,   # Time using technology (in hours)
    'CALC': label_encoders['CALC'].transform(['Sometimes'])[0],
    'MTRANS': label_encoders['MTRANS'].transform(['Public_Transportation'])[0]
}])

# Display the new data before preprocessing (e.g., scaling)
print("Inference data before preprocessing:")
print(new_data)

Inference data before preprocessing:
   Age  Gender  Height  Weight  family_history_with_overweight  FAVC  FCVC  \
0   22       1    1.75      85                               1     1   2.0   

   NCP  CAEC  SMOKE  CH2O  SCC  FAF  TUE  CALC  MTRANS  
0  3.0     2      0   2.0    0  1.0  1.0     2       3  


In [18]:
new_data[numerical_features] = scaler.fit_transform(new_data[numerical_features])

In [23]:
# === Make a prediction on the new data ===
prediction_proba = model.predict(new_data)  # Get class probabilities
predicted_class_idx = np.argmax(prediction_proba[0])  # Get the index of the highest probability
predicted_class = target_encoder.inverse_transform([predicted_class_idx])[0]  # Convert index back to original label

# Print the predicted class
print(f"\nPredicted Obesity Class: {predicted_class}")

# Print the probability for each class
print(f"\nClass Probabilities:")
for i, class_name in enumerate(target_encoder.classes_):
    print(f"{class_name}: {prediction_proba[0][i]:.4f}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step

Predicted Obesity Class: Overweight_Level_II

Class Probabilities:
Insufficient_Weight: 0.0000
Normal_Weight: 0.0005
Obesity_Type_I: 0.0082
Obesity_Type_II: 0.0000
Obesity_Type_III: 0.0000
Overweight_Level_I: 0.3414
Overweight_Level_II: 0.6499


In [25]:
# Save model
model.save('model/obesity_model.keras')

# Save label encoders and scaler
import joblib
joblib.dump(label_encoders, 'model/label_encoders.pkl')
joblib.dump(target_encoder, 'model/target_encoder.pkl')
joblib.dump(scaler, 'model/scaler.pkl')


['model/scaler.pkl']