In [None]:
# Preprocessing: split the target off from the features and integer-encode it.
y = train_df["WeightCategory"]
X = train_df.drop(columns="WeightCategory")
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Group the feature columns by dtype. `id` is removed from the numeric group,
# so it is neither scaled nor passed to the model.
categorical_features = X.select_dtypes(include="object").columns
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns.drop("id")

# Standardize the numeric columns and one-hot encode the categorical ones.
# Categories not seen during fit are ignored at transform time, and any column
# that belongs to neither group is discarded (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ],
    remainder="drop",
)

### Decision Tree

In [None]:
# --- Build and Evaluate ---
# Hold out 20% of the rows for validation, stratified on the encoded target so
# both splits keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

# Baseline model: the shared preprocessor followed by a decision tree with
# default hyperparameters (seeded for reproducibility).
model_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)
y_pred = model_pipeline.fit(X_train, y_train).predict(X_val)
accuracy = accuracy_score(y_val, y_pred)

print("--- Step 1: Baseline Decision Tree ---")
print(f"Validation Accuracy: {accuracy:.4f}\n")
# Report per-class metrics under the original (string) class labels.
print(classification_report(y_val, y_pred, target_names=le.classes_))

--- Step 1: Baseline Decision Tree ---
Validation Accuracy: 0.8378

                     precision    recall  f1-score   support

Insufficient_Weight       0.86      0.89      0.88       374
      Normal_Weight       0.78      0.78      0.78       469
     Obesity_Type_I       0.80      0.80      0.80       441
    Obesity_Type_II       0.93      0.93      0.93       481
   Obesity_Type_III       0.99      1.00      0.99       597
 Overweight_Level_I       0.69      0.63      0.66       369
Overweight_Level_II       0.70      0.74      0.72       376

           accuracy                           0.84      3107
          macro avg       0.82      0.82      0.82      3107
       weighted avg       0.84      0.84      0.84      3107



### XGBoost

In [None]:
# Build the XGBoost Pipeline
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports
# it as an unused parameter, and the target is already integer-encoded by
# `LabelEncoder` before it reaches the classifier.
xgb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, eval_metric='mlogloss'))
])

# --- Train and Evaluate ---
# Fit on the training split and score on the held-out validation split.
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_val)
accuracy_xgb = accuracy_score(y_val, y_pred_xgb)

print("\n--- Step 2: XGBoost Model ---")
print(f"Validation Accuracy: {accuracy_xgb:.4f}")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



--- Step 2: XGBoost Model ---
Validation Accuracy: 0.9002


### **Hyperparameter Tuning with GridSearchCV**

This step uses a cross-validated grid search to find the XGBoost hyperparameter settings that maximize performance. It is the most important step for getting the highest score.

In [None]:
# Create the pipeline that will be tuned: shared preprocessor + XGBoost.
# `use_label_encoder` is deliberately not passed: the installed XGBoost reports
# it as an unused parameter (once per fit), and the target is already
# integer-encoded by `LabelEncoder`.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(random_state=42, eval_metric='mlogloss'))
])

# Define the parameter grid to search. Keys use the `<step>__<param>` form so
# they are routed to the pipeline's 'classifier' step.
# 2 x 3 x 1 = 6 candidate configurations.
param_grid = {
    'classifier__n_estimators': [100, 250],  # Number of trees
    'classifier__max_depth': [3, 5, 7],      # Depth of trees
    'classifier__learning_rate': [0.1]       # Held fixed; only one value is searched
}

# --- Set up and run GridSearchCV ---
# 3-fold cross-validation scored by accuracy, using all available cores.
grid_search = GridSearchCV(pipeline, param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=2)
print("\n--- Step 3: Hyperparameter Tuning ---")
print("Starting search... (This may take a while)")
# NOTE(review): the search is fit on the full X / y_encoded, which includes the
# rows held out as X_val earlier. The CV score is valid on its own, but X_val is
# no longer unseen data for the refit best estimator -- confirm this is intended.
grid_search.fit(X, y_encoded)

print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")


--- Step 3: Hyperparameter Tuning ---
Starting search... (This may take a while)
Fitting 3 folds for each of 6 candidates, totalling 18 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



Best parameters found: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 3, 'classifier__n_estimators': 250}
Best cross-validation accuracy: 0.9041
