<a href="https://colab.research.google.com/github/shilpathota/AI_ML/blob/master/Predict_Obesity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
# Importing the dataset
!pip install ucimlrepo



In [49]:
from ucimlrepo import fetch_ucirepo

# fetch dataset
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# data (as pandas dataframes)
X = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.features
y = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data.targets

# metadata
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.metadata)

# variable information
print(estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.variables)

{'uci_id': 544, 'name': 'Estimation of Obesity Levels Based On Eating Habits and Physical Condition ', 'repository_url': 'https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition', 'data_url': 'https://archive.ics.uci.edu/static/public/544/data.csv', 'abstract': 'This dataset include data for the estimation of obesity levels in individuals from the countries of Mexico, Peru and Colombia, based on their eating habits and physical condition. ', 'area': 'Health and Medicine', 'tasks': ['Classification', 'Regression', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 2111, 'num_features': 16, 'feature_types': ['Integer'], 'demographics': ['Gender', 'Age'], 'target_col': ['NObeyesdad'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2019, 'last_updated': 'Tue Sep 10 2024', 'dataset_doi': '10.24432/C5H31Z', 'creators': [], 'intro_paper': {'ID': 358, 'type': 

In [50]:
print(X.shape)
print(X.dtypes)
print(X.head())
print(y.head())

(2111, 16)
Gender                             object
Age                               float64
Height                            float64
Weight                            float64
family_history_with_overweight     object
FAVC                               object
FCVC                              float64
NCP                               float64
CAEC                               object
SMOKE                              object
CH2O                              float64
SCC                                object
FAF                               float64
TUE                               float64
CALC                               object
MTRANS                             object
dtype: object
   Gender   Age  Height  Weight family_history_with_overweight FAVC  FCVC  \
0  Female  21.0    1.62    64.0                            yes   no   2.0   
1  Female  21.0    1.52    56.0                            yes   no   3.0   
2    Male  23.0    1.80    77.0                            yes   no   2.

Data Cleanup
- Removing Duplicates
- Encoding the Categorical Variables
- handling missing values

In [51]:
import pandas as pd

In [52]:
# Dropping duplicates
df = pd.concat([X, y],axis=1)
df_cleaned = df.drop_duplicates(X)

X = df_cleaned.drop(columns = y.columns)
y = df_cleaned[y.columns]

In [53]:
# Count of missing values in each column of X
missing_X = X.isnull().sum()
print(y.isnull().sum())


# Display only columns with missing values
print(missing_X[missing_X > 0])

NObeyesdad    0
dtype: int64
Series([], dtype: int64)


In [54]:
# For a combined DataFrame df
num_cols = df_cleaned.select_dtypes(include=['int64', 'float64']).columns
cat_cols = df_cleaned.select_dtypes(include=['object', 'category']).columns

In [55]:
# Filling the values
from sklearn.impute import SimpleImputer

num_imputer = SimpleImputer(strategy='mean')  # or strategy='median'
df[num_cols] = num_imputer.fit_transform(df[num_cols])

In [56]:
# Categorical variables
cat_imputer = SimpleImputer(strategy='most_frequent')
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

In [57]:
# Label Encoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [58]:
# Saving the model
import joblib

joblib.dump(cat_imputer, 'cat_imputer.pkl')
joblib.dump(le, 'label_encoder_gender.pkl')  # one per column if needed

['label_encoder_gender.pkl']

In [59]:
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   int64  
 5   FAVC                            2111 non-null   int64  
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   int64  
 9   SMOKE                           2111 non-null   int64  
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   int64  
 12  FAF                             21

| Column                                                           | Type            | Encoding Strategy                    | Notes                              |
| ---------------------------------------------------------------- | --------------- | ------------------------------------ | ---------------------------------- |
| `Gender`                                                         | Binary          | Already Encoded (0=female, 1=male)   | ✅ Done                             |
| `Age`, `Height`, `Weight`                                        | Numerical       | No encoding needed                   | Just scale later                   |
| `family_history_with_overweight`, `FAVC`, `CAEC`, `SMOKE`, `SCC` | Binary/Nominal  | Already Encoded                      | ✅ Done                             |
| `FCVC`, `NCP`, `CH2O`, `FAF`, `TUE`, `CALC`                      | Ordinal/Numeric | Keep as is or ensure float → int     | Ordinal → can be scaled            |
| `MTRANS`                                                         | Nominal         | Needs **One-Hot Encoding**           | Multi-class categorical, unordered |
| `NObeyesdad`                                                     | Target          | Keep as **Label Encoded** if already | Multiclass target: fine as numeric |


In [60]:
# One-Hot Encode MTRANS properly
df = pd.get_dummies(df, columns=['MTRANS'], prefix='MTRANS', drop_first=True)

# Check the updated columns
print(df.columns)

Index(['Gender', 'Age', 'Height', 'Weight', 'family_history_with_overweight',
       'FAVC', 'FCVC', 'NCP', 'CAEC', 'SMOKE', 'CH2O', 'SCC', 'FAF', 'TUE',
       'CALC', 'NObeyesdad', 'MTRANS_1', 'MTRANS_2', 'MTRANS_3', 'MTRANS_4'],
      dtype='object')


In [61]:
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 20 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   int64  
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   int64  
 5   FAVC                            2111 non-null   int64  
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   int64  
 9   SMOKE                           2111 non-null   int64  
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   int64  
 12  FAF                             21

In [69]:
# Check if we have all cleaned data
print(df[["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC"]].dtypes)
print(df[["Gender", "family_history_with_overweight", "FAVC", "CAEC", "SMOKE", "SCC"]].head())

Gender                            int64
family_history_with_overweight    int64
FAVC                              int64
CAEC                              int64
SMOKE                             int64
SCC                               int64
dtype: object
   Gender  family_history_with_overweight  FAVC  CAEC  SMOKE  SCC
0       0                               1     0     2      0    0
1       0                               1     0     2      1    1
2       1                               1     0     2      0    0
3       1                               0     0     2      0    0
4       1                               0     0     2      0    0


Feature Scaling & Train test split

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [63]:
# 1. Separate features and target
X = df.drop("NObeyesdad", axis=1)
y = df["NObeyesdad"]

In [64]:
# 2. Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [70]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

In [71]:
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

Train shape: (1688, 19)
Test shape: (423, 19)


Model Training

To choose best model I do stratified sampling where some strats are taken for each of the K fold cross validation

In [72]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from xgboost import XGBClassifier

In [86]:
# Initialize models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "KNN": KNeighborsClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier()
}


In [87]:
# Cross-validation setup
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    results[name] = scores
    print(f"{name}: Mean Accuracy = {scores.mean():.4f} | Std = {scores.std():.4f}")

Logistic Regression: Mean Accuracy = 0.8679 | Std = 0.0150
KNN: Mean Accuracy = 0.8016 | Std = 0.0138
SVM: Mean Accuracy = 0.8578 | Std = 0.0099
Random Forest: Mean Accuracy = 0.9490 | Std = 0.0081
XGBoost: Mean Accuracy = 0.9704 | Std = 0.0068


The best model is Random Forest, XGBoost and SVM which has 94%, 97% and 85% accuracy

Hyper parameter tuning using Grid Search CV using the hyper parameters n_estimators, max_depth, min_samples_split

In [75]:
# Top 1 - Random Forest
from sklearn.model_selection import GridSearchCV

param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

grid_rf = GridSearchCV(RandomForestClassifier(), param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train, y_train)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF Accuracy:", grid_rf.best_score_)


Best RF Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best RF Accuracy: 0.9520200867381876


In [78]:
# Top 2 - SVM
param_grid_svm = {
    'C': [10, 11, 9],
    'kernel': ['linear', 'rbf'],
    'gamma': ['scale', 'auto']
}

grid_svm = GridSearchCV(SVC(), param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm.fit(X_train, y_train)

print("Best SVM Params:", grid_svm.best_params_)
print("Best SVM Accuracy:", grid_svm.best_score_)


Best SVM Params: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best SVM Accuracy: 0.960307622074342


In [88]:
# Xgboost
param_grid_xgb = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
    'reg_lambda': [1, 10]  # L2 regularization
}
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train, y_train)

print("✅ Best Parameters:", grid_xgb.best_params_)
print("🏆 Best CV Accuracy:", grid_xgb.best_score_)


Fitting 5 folds for each of 432 candidates, totalling 2160 fits


Parameters: { "use_label_encoder" } are not used.



✅ Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'reg_lambda': 1, 'subsample': 1.0}
🏆 Best CV Accuracy: 0.9697803451969168


In [82]:
from sklearn.metrics import classification_report, confusion_matrix

#best_model = grid_rf.best_estimator_
best_model =  grid_svm.best_estimator_

y_pred = best_model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Confusion Matrix:
[[54  0  0  0  0  0  0]
 [ 3 53  0  0  0  2  0]
 [ 0  0 67  1  0  0  2]
 [ 0  0  0 60  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  1  0  0  0 56  1]
 [ 0  0  2  0  0  4 52]]

Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97        54
           1       0.98      0.91      0.95        58
           2       0.97      0.96      0.96        70
           3       0.97      1.00      0.98        60
           4       1.00      0.98      0.99        65
           5       0.90      0.97      0.93        58
           6       0.95      0.90      0.92        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423



In [89]:
# Evaluating Xgboost
from sklearn.metrics import classification_report, confusion_matrix

best_xgb = grid_xgb.best_estimator_
y_pred_xgb = best_xgb.predict(X_test)

print("📊 Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("\n📝 Classification Report:\n", classification_report(y_test, y_pred_xgb))


📊 Confusion Matrix:
 [[49  5  0  0  0  0  0]
 [ 4 53  0  0  0  1  0]
 [ 0  0 69  0  0  0  1]
 [ 0  0  1 59  0  0  0]
 [ 0  0  1  0 64  0  0]
 [ 0  5  0  0  0 53  0]
 [ 0  0  1  0  0  1 56]]

📝 Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.92        54
           1       0.84      0.91      0.88        58
           2       0.96      0.99      0.97        70
           3       1.00      0.98      0.99        60
           4       1.00      0.98      0.99        65
           5       0.96      0.91      0.94        58
           6       0.98      0.97      0.97        58

    accuracy                           0.95       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.95      0.95      0.95       423



SAving the model that is best

In [90]:
import joblib

joblib.dump(best_model, 'best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

Loading the model to predict

In [91]:
# Load saved model
model = joblib.load('best_model.pkl')
scaler = joblib.load('scaler.pkl')

# New sample input (replace with real input)
new_data = pd.DataFrame([[1, 25, 1.70, 65, 1, 1, 2.0, 3.0, 1, 0, 2.0, 1, 2.0, 1.0, 2] + [0]*N])  # replace N with MTRANS one-hot features count

# Scale
new_data_scaled = scaler.transform(new_data)

# Predict
prediction = model.predict(new_data_scaled)
print("Predicted class:", prediction[0])


NameError: name 'N' is not defined

DEploying the model using flask

In [None]:
from flask import Flask, request, jsonify
import joblib
import numpy as np

app = Flask(__name__)
model = joblib.load('best_model.pkl')
scaler = joblib.load('scaler.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    input_data = request.get_json(force=True)
    features = np.array(input_data['features']).reshape(1, -1)
    features_scaled = scaler.transform(features)
    prediction = model.predict(features_scaled)
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    app.run(debug=True)
