# Obesity Level Classification - Model Training

**Dataset**: Estimation of Obesity Levels Based on Eating Habits and Physical Condition (UCI)

**Objective**: Train and evaluate 6 classification models to predict obesity levels

**Author**: BITS WILP M.Tech AIML Assignment 2

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import (
    accuracy_score, roc_auc_score, precision_score, 
    recall_score, f1_score, matthews_corrcoef,
    confusion_matrix, classification_report
)

# Model persistence
import joblib
import os

# Sanity message: reaching this line means every import above succeeded.
print("All libraries imported successfully!")

## 2. Load and Explore Dataset

In [None]:
# Read the raw obesity dataset (CSV expected in the working directory)
# Dataset source: UCI ML Repository / Kaggle
# URL: https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition

df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# One column is the target, so it is left out of the feature count
n_instances, n_columns = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"\nFeatures: {n_columns - 1}")
print(f"Instances: {n_instances}")
df.head()

In [None]:
# Structural overview of the DataFrame
print("Dataset Information:", "=" * 50, sep="\n")
df.info()

In [None]:
# Count null entries in every column
missing_per_column = df.isnull().sum()
print("Missing Values:", "=" * 50, sep="\n")
print(missing_per_column)

In [None]:
# Class balance of the target column, shown as a table and as a bar chart
class_counts = df['NObeyesdad'].value_counts()

print("\nTarget Variable Distribution (NObeyesdad):")
print("=" * 50)
print(class_counts)

plt.figure(figsize=(10, 6))
class_counts.plot(kind='bar', color='steelblue', edgecolor='black')
plt.title('Distribution of Obesity Levels', fontsize=14)
plt.xlabel('Obesity Level', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Descriptive statistics of the dataset
print("\nStatistical Summary:", "=" * 50, sep="\n")
df.describe()

## 3. Data Preprocessing

In [None]:
# Partition column names by dtype: object columns vs. int64/float64 columns
object_frame = df.select_dtypes(include=['object'])
numeric_frame = df.select_dtypes(include=['int64', 'float64'])

categorical_cols = list(object_frame.columns)
numerical_cols = list(numeric_frame.columns)

print(f"Categorical Columns ({len(categorical_cols)}): {categorical_cols}")
print(f"\nNumerical Columns ({len(numerical_cols)}): {numerical_cols}")

In [None]:
# Work on a copy so the raw DataFrame stays untouched
df_processed = df.copy()

# One fitted LabelEncoder per categorical column, kept for later decoding
label_encoders = {}

for column in categorical_cols:
    encoder = LabelEncoder()
    label_encoders[column] = encoder
    df_processed[column] = encoder.fit_transform(df_processed[column])

    # Show which integer code each original category received
    class_to_code = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    print(f"{column}: {class_to_code}")

print("\nLabel encoding completed!")

In [None]:
# Target is the 'NObeyesdad' column; every other column is a feature
y = df_processed['NObeyesdad']
X = df_processed.drop(columns=['NObeyesdad'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature names: {X.columns.tolist()}")

In [None]:
# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed!")

## 4. Model Training and Evaluation

### Evaluation Metrics Function

In [None]:
def evaluate_model(model, X_test, y_test, model_name):
    """
    Evaluate a fitted classifier and return all required metrics.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``predict``; ``predict_proba`` is used for AUC when present.
    X_test : array-like
        Feature matrix passed to ``model.predict`` / ``model.predict_proba``.
    y_test : array-like
        True labels for ``X_test``.
    model_name : str
        Label stored under the 'Model' key of the returned metrics.

    Returns
    -------
    tuple
        ``(metrics, y_pred, cm)`` where ``metrics`` is a dict with keys
        'Model', 'Accuracy', 'AUC', 'Precision', 'Recall', 'F1' and 'MCC'
        (scores rounded to 4 decimals), ``y_pred`` is the output of
        ``model.predict`` and ``cm`` is the confusion matrix of
        ``y_test`` against ``y_pred``.

    Notes
    -----
    AUC is reported as 0.0 when the model has no ``predict_proba`` or when
    ``roc_auc_score`` rejects the inputs with a ``ValueError``; in the latter
    case the reason is printed instead of being silently discarded.
    """
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None

    # Calculate metrics (weighted averages account for class imbalance)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    mcc = matthews_corrcoef(y_test, y_pred)

    # AUC Score (multi-class: one-vs-rest); 0.0 is the "not available" fallback
    auc = 0.0
    if y_pred_proba is not None:
        try:
            auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')
        except ValueError as exc:
            # Only the metric's own input-validation error is tolerated; a bare
            # ``except`` would also hide KeyboardInterrupt and genuine bugs.
            # print() is used because the notebook suppresses all warnings.
            print(f"AUC could not be computed for {model_name}: {exc}")

    metrics = {
        'Model': model_name,
        'Accuracy': round(accuracy, 4),
        'AUC': round(auc, 4),
        'Precision': round(precision, 4),
        'Recall': round(recall, 4),
        'F1': round(f1, 4),
        'MCC': round(mcc, 4)
    }

    return metrics, y_pred, confusion_matrix(y_test, y_pred)

print("Evaluation function defined!")

In [None]:
# Accumulators filled by each model section:
#   results     -> one metrics dict per trained model
#   models_dict -> fitted estimator keyed by its display name
results, models_dict = [], {}

### 4.1 Logistic Regression

In [None]:
print("Training Logistic Regression...")
print("="*50)

# The deprecated `multi_class` argument is intentionally not passed: with the
# default lbfgs solver scikit-learn already fits a multinomial model for a
# multi-class target, and the argument is scheduled for removal upstream.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Score on the held-out split and register the model for comparison/saving
lr_metrics, lr_pred, lr_cm = evaluate_model(lr_model, X_test_scaled, y_test, 'Logistic Regression')
results.append(lr_metrics)
models_dict['Logistic Regression'] = lr_model

print(f"\nLogistic Regression Results:")
for key, value in lr_metrics.items():
    if key != 'Model':
        print(f"  {key}: {value}")

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(lr_cm, annot=True, fmt='d', cmap='Blues')
plt.title('Logistic Regression - Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

### 4.2 Decision Tree Classifier

In [None]:
print("Training Decision Tree Classifier...")
print("=" * 50)

# Depth-limited tree, seeded for reproducibility
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)
dt_model.fit(X_train_scaled, y_train)

# Score on the held-out split, then register the model and its metrics
dt_metrics, dt_pred, dt_cm = evaluate_model(dt_model, X_test_scaled, y_test, 'Decision Tree')
models_dict['Decision Tree'] = dt_model
results.append(dt_metrics)

print("\nDecision Tree Results:")
for metric_name, score in dt_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a score
    print(f"  {metric_name}: {score}")

# Confusion matrix heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(dt_cm, annot=True, fmt='d', cmap='Greens')
plt.title('Decision Tree - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

### 4.3 K-Nearest Neighbors Classifier

In [None]:
print("Training K-Nearest Neighbors Classifier...")
print("=" * 50)

# 5-nearest-neighbours classifier trained on the scaled features
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Score on the held-out split, then register the model and its metrics
knn_metrics, knn_pred, knn_cm = evaluate_model(knn_model, X_test_scaled, y_test, 'KNN')
models_dict['KNN'] = knn_model
results.append(knn_metrics)

print("\nKNN Results:")
for metric_name, score in knn_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a score
    print(f"  {metric_name}: {score}")

# Confusion matrix heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(knn_cm, annot=True, fmt='d', cmap='Oranges')
plt.title('KNN - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

### 4.4 Naive Bayes Classifier (Gaussian)

In [None]:
print("Training Gaussian Naive Bayes Classifier...")
print("=" * 50)

# Gaussian Naive Bayes with default settings
nb_model = GaussianNB()
nb_model.fit(X_train_scaled, y_train)

# Score on the held-out split, then register the model and its metrics
nb_metrics, nb_pred, nb_cm = evaluate_model(nb_model, X_test_scaled, y_test, 'Naive Bayes')
models_dict['Naive Bayes'] = nb_model
results.append(nb_metrics)

print("\nNaive Bayes Results:")
for metric_name, score in nb_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a score
    print(f"  {metric_name}: {score}")

# Confusion matrix heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(nb_cm, annot=True, fmt='d', cmap='Purples')
plt.title('Naive Bayes - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

### 4.5 Random Forest (Ensemble)

In [None]:
print("Training Random Forest Classifier...")
print("=" * 50)

# 100-tree forest, depth-limited and seeded for reproducibility
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=15,
    random_state=42,
)
rf_model.fit(X_train_scaled, y_train)

# Score on the held-out split, then register the model and its metrics
rf_metrics, rf_pred, rf_cm = evaluate_model(rf_model, X_test_scaled, y_test, 'Random Forest')
models_dict['Random Forest'] = rf_model
results.append(rf_metrics)

print("\nRandom Forest Results:")
for metric_name, score in rf_metrics.items():
    if metric_name == 'Model':
        continue  # the model label is not a score
    print(f"  {metric_name}: {score}")

# Confusion matrix heatmap (rows: actual, columns: predicted)
plt.figure(figsize=(8, 6))
sns.heatmap(rf_cm, annot=True, fmt='d', cmap='YlGn')
plt.title('Random Forest - Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.tight_layout()
plt.show()

### 4.6 XGBoost (Ensemble)

In [None]:
print("Training XGBoost Classifier...")
print("="*50)

# `use_label_encoder` is intentionally not passed: the flag is deprecated and
# ignored by current XGBoost releases, and the target handed to fit() is the
# already-encoded `y_train`.
xgb_model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss'
)
xgb_model.fit(X_train_scaled, y_train)

# Score on the held-out split and register the model for comparison/saving
xgb_metrics, xgb_pred, xgb_cm = evaluate_model(xgb_model, X_test_scaled, y_test, 'XGBoost')
results.append(xgb_metrics)
models_dict['XGBoost'] = xgb_model

print(f"\nXGBoost Results:")
for key, value in xgb_metrics.items():
    if key != 'Model':
        print(f"  {key}: {value}")

# Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(xgb_cm, annot=True, fmt='d', cmap='RdYlGn')
plt.title('XGBoost - Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.tight_layout()
plt.show()

## 5. Model Comparison

In [None]:
# One row per model, indexed by model name
results_df = pd.DataFrame(results).set_index('Model')

banner = "=" * 80
print("\n" + banner)
print("MODEL COMPARISON TABLE")
print(banner)
print(results_df.to_string())
print(banner)

In [None]:
# 2x3 grid of bar charts, one panel per evaluation metric
metrics_to_plot = ['Accuracy', 'AUC', 'Precision', 'Recall', 'F1', 'MCC']
colors = ['steelblue', 'coral', 'seagreen', 'orchid', 'goldenrod', 'slategray']
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# The x-axis labels are the same for every panel
models = results_df.index.tolist()

for metric, ax, bar_color in zip(metrics_to_plot, axes.flat, colors):
    values = results_df[metric].values

    bars = ax.bar(models, values, color=bar_color, edgecolor='black')
    ax.set_title(f'{metric} Comparison', fontsize=12, fontweight='bold')
    ax.set_ylabel(metric)
    ax.set_ylim(0, 1.1)
    ax.tick_params(axis='x', rotation=45)

    # Annotate each bar with its value, centred just above the bar top
    for bar, val in zip(bars, values):
        label_x = bar.get_x() + bar.get_width() / 2
        label_y = bar.get_height() + 0.02
        ax.text(label_x, label_y, f'{val:.3f}', ha='center', va='bottom', fontsize=9)

plt.suptitle('Model Performance Comparison', fontsize=14, fontweight='bold', y=1.02)
plt.tight_layout()
plt.show()

In [None]:
# Name of the top-scoring model for each headline metric
best_accuracy_model, best_f1_model, best_auc_model = (
    results_df[column].idxmax() for column in ('Accuracy', 'F1', 'AUC')
)

best_accuracy = results_df.loc[best_accuracy_model, 'Accuracy']
best_f1 = results_df.loc[best_f1_model, 'F1']
best_auc = results_df.loc[best_auc_model, 'AUC']

print("\nBest Performing Models:")
print("=" * 50)
print(f"Best Accuracy: {best_accuracy_model} ({best_accuracy})")
print(f"Best F1 Score: {best_f1_model} ({best_f1})")
print(f"Best AUC Score: {best_auc_model} ({best_auc})")

## 6. Save Models and Preprocessing Objects

In [None]:
# Make sure the output folder exists before writing anything
os.makedirs('trained_models', exist_ok=True)

# Persist every fitted model under a lower-case, underscore-separated name
for name, model in models_dict.items():
    slug = name.lower().replace(' ', '_')
    filename = f"trained_models/{slug}_model.pkl"
    joblib.dump(model, filename)
    print(f"Saved: {filename}")

# Preprocessing objects needed to reproduce the training-time transforms
joblib.dump(scaler, 'trained_models/scaler.pkl')
joblib.dump(label_encoders, 'trained_models/label_encoders.pkl')

# Column order the models were trained on
feature_names = X.columns.tolist()
joblib.dump(feature_names, 'trained_models/feature_names.pkl')

# Original class names of the target, in encoder order
target_encoder = label_encoders['NObeyesdad']
class_labels = target_encoder.classes_.tolist()
joblib.dump(class_labels, 'trained_models/class_labels.pkl')

# Metrics table
results_df.to_csv('trained_models/model_results.csv')

# Test split for the Streamlit app: unscaled features plus the decoded target
test_data = X_test.assign(NObeyesdad=target_encoder.inverse_transform(y_test))
test_data.to_csv('trained_models/test_data.csv', index=False)

print("\nAll models and artifacts saved successfully!")

## 7. Summary and Observations

In [None]:
# Render the final table as Markdown when possible.
# DataFrame.to_markdown() depends on the optional 'tabulate' package and raises
# ImportError when it is missing, so fall back to the plain-text rendering.
try:
    final_table = results_df.to_markdown()
except ImportError:
    final_table = results_df.to_string()

print("\n" + "="*80)
print("FINAL MODEL COMPARISON TABLE")
print("="*80)
print(final_table)
print("="*80)

### Model Performance Observations

| ML Model Name | Observation about model performance |
|---------------|------------------------------------|
| **Logistic Regression** | Provides a solid baseline with good interpretability. Performance is moderate due to the linear decision boundary assumption, which may not capture complex non-linear relationships in the obesity data. Works well when features have linear relationships with the target. |
| **Decision Tree** | Shows good performance with ability to capture non-linear patterns. May be prone to overfitting without proper depth constraints. Provides excellent interpretability through feature importance and tree visualization. |
| **KNN** | Performance depends heavily on the choice of k and feature scaling. Works well when similar obesity levels cluster together in feature space. Can be computationally expensive for large datasets during prediction. |
| **Naive Bayes** | Assumes feature independence which may not hold for obesity data (e.g., height and weight are correlated). Fastest training time and works well with limited data. Lower performance than the other models because the independence assumption is violated. |
| **Random Forest (Ensemble)** | Excellent performance due to ensemble averaging that reduces overfitting. Robust to outliers and handles mixed feature types well. Feature importance provides insights into key obesity predictors like weight, height, and physical activity. |
| **XGBoost (Ensemble)** | Top performer with highest accuracy and AUC. Sequential boosting effectively handles complex feature interactions. Built-in regularization helps limit overfitting. Best suited for this multi-class obesity classification task. |

In [None]:
# Closing summary: list whatever ended up in the output folder
print("\nTraining Complete!", "=" * 50, sep="\n")
print("Files saved in 'trained_models/' directory:")
for entry in os.listdir('trained_models'):
    print(f"  - {entry}")