In [1]:
from google.colab import files
uploaded = files.upload()

Saving train.csv to train.csv


In [2]:
from google.colab import files
uploaded = files.upload()

Saving test.csv to test.csv


In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.5.0


In [4]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import optuna

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print("Data loaded successfully.")

Data loaded successfully.


In [6]:
test_ids = test_df['id']

In [7]:
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

In [8]:
X = train_df.drop('WeightCategory', axis=1)
y_raw = train_df['WeightCategory']
X_test = test_df

In [9]:
le = LabelEncoder()
y = le.fit_transform(y_raw)
num_classes = len(le.classes_)
print(f"Target variable encoded. Found {num_classes} classes.")

Target variable encoded. Found 7 classes.


In [10]:
combined_df = pd.concat([X, X_test], axis=0)
combined_processed = pd.get_dummies(combined_df, drop_first=False)

In [11]:
X_processed = combined_processed.iloc[:len(X)]
X_test_processed = combined_processed.iloc[len(X):]
print("Categorical features encoded.")

Categorical features encoded.


In [24]:
X_train, X_val, y_train, y_val = train_test_split(
    X_processed,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)
print(f"Training data split for optimization: {len(X_train)} train, {len(X_val)} validation samples.")

Training data split for optimization: 12426 train, 3107 validation samples.


In [17]:
# Optuna Function
def objective(trial):
    param = {
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'eval_metric': 'mlogloss',
        'n_estimators': 1000,
        'random_state': 42,
        'n_jobs': -1,
        'early_stopping_rounds': 50,

        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True),
    }

    # XGBoost model with the suggested parameters
    xgb_model = xgb.XGBClassifier(**param)

    # Train the model
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=False
    )

    # Store the best number of trees found
    trial.set_user_attr("best_iteration", xgb_model.best_iteration)

    preds = xgb_model.predict(X_val)

    # Return the accuracy
    accuracy = accuracy_score(y_val, preds)
    return accuracy

In [25]:
print("Test 100 different hyperparameter combinations.")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2025-10-26 10:30:37,536] A new study created in memory with name: no-name-f866e5c9-79b9-4d21-a81b-d222a8ef0013


Test 100 different hyperparameter combinations.


[I 2025-10-26 10:31:02,707] Trial 0 finished with value: 0.9060186675249436 and parameters: {'learning_rate': 0.014177266453524276, 'max_depth': 7, 'subsample': 0.7013042991230489, 'colsample_bytree': 0.6053374121633057, 'min_child_weight': 2, 'reg_alpha': 0.0010067139664566723, 'reg_lambda': 3.386728773979617e-07}. Best is trial 0 with value: 0.9060186675249436.
[I 2025-10-26 10:31:04,629] Trial 1 finished with value: 0.9015127132281944 and parameters: {'learning_rate': 0.2398404881384723, 'max_depth': 6, 'subsample': 0.617446508730907, 'colsample_bytree': 0.7835306628015303, 'min_child_weight': 1, 'reg_alpha': 6.109336512425741e-07, 'reg_lambda': 2.54221999853969e-05}. Best is trial 0 with value: 0.9060186675249436.
[I 2025-10-26 10:31:20,832] Trial 2 finished with value: 0.8999034438364982 and parameters: {'learning_rate': 0.018172835375588888, 'max_depth': 3, 'subsample': 0.6874302379818947, 'colsample_bytree': 0.6065674380372239, 'min_child_weight': 8, 'reg_alpha': 0.0002823419325

In [26]:
print(f"Best validation accuracy: {study.best_value:.4f}")
print("Found best parameters:")
print(study.best_params)

# best parameters
best_params = study.best_params
# optimal number of trees
best_iteration = study.best_trial.user_attrs["best_iteration"]

# Add back the fixed parameters
best_params.update({
    'objective': 'multi:softmax',
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'random_state': 42,
    'n_jobs': -1,
    'n_estimators': best_iteration # Use the best number of trees
})

print("\nTraining with the best parameters.")
final_model = xgb.XGBClassifier(**best_params)

# Train on the FULL training dataset
final_model.fit(X_processed, y)

Best validation accuracy: 0.9079
Found best parameters:
{'learning_rate': 0.08339261820460793, 'max_depth': 6, 'subsample': 0.9149093401933116, 'colsample_bytree': 0.6834147072815087, 'min_child_weight': 9, 'reg_alpha': 0.030464122447465224, 'reg_lambda': 0.941665147483467}

Training with the best parameters.


In [27]:
predictions_int = final_model.predict(X_test_processed)
predictions_str = le.inverse_transform(predictions_int)

submission_df = pd.DataFrame({
    'id': test_ids,
    'WeightCategory': predictions_str
})

submission_df.to_csv('submission_optimized.csv', index=False)

In [28]:
from sklearn.metrics import accuracy_score, classification_report

# Predictions on validation set
y_val_pred = final_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
val_report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Classification Report:\n", val_report)


Validation Accuracy: 0.9385
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.97       374
           1       0.93      0.94      0.94       469
           2       0.93      0.90      0.91       441
           3       0.97      0.98      0.98       481
           4       0.99      1.00      0.99       597
           5       0.88      0.83      0.86       369
           6       0.87      0.90      0.89       376

    accuracy                           0.94      3107
   macro avg       0.93      0.93      0.93      3107
weighted avg       0.94      0.94      0.94      3107

