<a href="https://colab.research.google.com/github/superv13/Obesity-Classification-Model/blob/main/LightGBM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna


Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [3]:

#  OBESITY CLASSIFICATION

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from lightgbm import LGBMClassifier, plot_importance
import optuna
import warnings
warnings.filterwarnings('ignore')


# Load your dataset

train_df = pd.read_csv('/content/drive/MyDrive/MLP/train.csv')
test_df  = pd.read_csv('/content/drive/MyDrive/MLP/test.csv')

print(f"Train shape: {train_df.shape}")
print(f"Test shape:  {test_df.shape}")

# Encode categorical columns

categorical_cols = [
    'Gender', 'family_history_with_overweight', 'FAVC', 'CAEC',
    'SMOKE', 'SCC', 'CALC', 'MTRANS'
]

le = LabelEncoder()
for col in categorical_cols:
    train_df[col] = le.fit_transform(train_df[col])
    test_df[col]  = le.transform(test_df[col])


# Encode target label
target_le = LabelEncoder()
train_df['WeightCategory'] = target_le.fit_transform(train_df['WeightCategory'])


# Define features and target
X = train_df.drop(['id', 'WeightCategory'], axis=1)
y = train_df['WeightCategory']
test = test_df.drop(['id'], axis=1)


# Train-test split
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


Train shape: (15533, 18)
Test shape:  (5225, 17)


In [4]:
pip show lightgbm


Name: lightgbm
Version: 4.6.0
Summary: LightGBM Python-package
Home-page: https://github.com/microsoft/LightGBM
Author: 
Author-email: 
License: The MIT License (MIT)

 Copyright (c) Microsoft Corporation

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL T

In [None]:

#  OPTUNA TUNING

def objective(trial):
    param = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "random_state": 42,
        "num_class": len(np.unique(y)),

        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.05),
        "n_estimators": trial.suggest_int("n_estimators", 400, 600),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.005, 0.015),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.02, 0.06),
        "max_depth": trial.suggest_int("max_depth", 6, 14),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.9),
        "subsample": trial.suggest_float("subsample", 0.8, 1.0),
        "min_child_samples": trial.suggest_int("min_child_samples", 10, 50),
    }
    model = LGBMClassifier(**param)
    model.fit(X_train, y_train)
    return model.score(X_valid, y_valid)

print("\n Starting Optuna hyperparameter tuning... (may take time)")
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=100)

best_params = study.best_params
print("\n Best Parameters found by Optuna:")
print(best_params)


[I 2025-10-26 14:54:07,876] A new study created in memory with name: no-name-7af24686-9b7c-48e8-83d2-6328f1bb3db5



 Starting Optuna hyperparameter tuning... (may take time)


[I 2025-10-26 14:54:23,749] Trial 0 finished with value: 0.9044093981332475 and parameters: {'learning_rate': 0.0249816047538945, 'n_estimators': 591, 'lambda_l1': 0.012319939418114049, 'lambda_l2': 0.04394633936788146, 'max_depth': 7, 'colsample_bytree': 0.3935967122017216, 'subsample': 0.8116167224336399, 'min_child_samples': 45}. Best is trial 0 with value: 0.9044093981332475.
[I 2025-10-26 14:54:37,789] Trial 1 finished with value: 0.9034438364982298 and parameters: {'learning_rate': 0.034044600469728355, 'n_estimators': 542, 'lambda_l1': 0.005205844942958024, 'lambda_l2': 0.05879639408647977, 'max_depth': 13, 'colsample_bytree': 0.4274034664069657, 'subsample': 0.8363649934414201, 'min_child_samples': 17}. Best is trial 0 with value: 0.9044093981332475.
[I 2025-10-26 14:54:50,931] Trial 2 finished with value: 0.9047312520115868 and parameters: {'learning_rate': 0.02216968971838151, 'n_estimators': 505, 'lambda_l1': 0.009319450186421156, 'lambda_l2': 0.03164916560792168, 'max_depth

In [None]:
#  Generate Submission (with decoded labels)

print("\n Generating final submission file...")
final_predictions = model.predict(test)

# Decode numeric predictions back to string labels
final_predictions_labels = target_le.inverse_transform(final_predictions)

submission = pd.DataFrame({
    "id": test_df["id"],
    "WeightCategory": final_predictions_labels
})

submission.to_csv("submission3.csv", index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print(submission.head())

In [None]:

y_pred = model.predict(X_valid)

#  Evaluation

acc = accuracy_score(y_valid, y_pred)
print(f"\n Final Hold-out Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_valid, y_pred, target_names=target_le.classes_))

# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='accuracy')
print(f"\nCross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")


#  Confusion Matrix and Feature Importance

plt.figure(figsize=(8, 6))
sns.heatmap(confusion_matrix(y_valid, y_pred), annot=True, fmt='d', cmap='Blues')
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - LightGBM (Optuna Tuned)")
plt.show()

plt.figure(figsize=(10, 6))
plot_importance(final_model, max_num_features=10)
plt.title("Top 10 Feature Importances - LightGBM")
plt.show()


In [None]:
final_model = LGBMClassifier(**best_params, n_jobs=-1)
final_model.fit(X, y)