In [1]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Read the training table; every column except the identifiers and the two
# targets is treated as a model input.
train = pd.read_csv("train.csv")
features = [
    column
    for column in train.columns
    if column not in ("id", "time", "Y1", "Y2")
]

X, y = train[features], train["Y1"]  # <-- choose y1 or y2

# Hold out 20% of the rows for validation (shuffled split, fixed seed).
# NOTE(review): the table has a "time" column - confirm a shuffled split is
# appropriate for this data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def objective(trial):
    """Optuna objective: train one LightGBM configuration and return its validation R².

    Each trial optionally drops a single feature, samples LightGBM
    hyperparameters, fits with early stopping against the hold-out split and
    scores the hold-out predictions.

    The function reads the module-level ``features``, ``X_train``, ``X_val``,
    ``y_train`` and ``y_val`` at call time, so it tunes whichever target those
    names currently hold.

    Args:
        trial: Optuna trial object used to sample the search space.

    Returns:
        R² of the fitted model's predictions on the validation split. The same
        split drives early stopping, so the score is somewhat optimistic.

    Side effects:
        Stores two user attributes on the trial:
        ``used_features`` (the feature list the model was trained on) and
        ``best_iteration`` (the number of boosting rounds kept by early
        stopping), so that a final refit can reproduce the tuned model instead
        of guessing the number of trees.
    """
    # --- FEATURE SELECTION PART ---
    # Either keep all features, or let the sampler pick one feature to drop.
    feature_subset = list(features)
    if trial.suggest_categorical("drop_one_feature", [True, False]):
        # Offer the untouched feature list as choices; the working copy is
        # the one that gets mutated.
        drop_feature = trial.suggest_categorical("feature_to_drop", list(features))
        feature_subset.remove(drop_feature)

    X_train_sub = X_train[feature_subset]
    X_val_sub = X_val[feature_subset]

    # --- HYPERPARAMETERS PART ---
    params = {
        "objective": "regression",
        "metric": "rmse",  # LightGBM needs a valid metric
        "verbosity": -1,
        "boosting_type": "gbdt",
        # Upper bound only; early stopping decides how many rounds are kept.
        "n_estimators": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        # LightGBM treats max_depth <= 0 as "no limit", so -1 and 0 are the
        # same setting; the range is kept as-is to leave the search space unchanged.
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
    }

    # Train model, stopping once validation RMSE has not improved for 200 rounds.
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train_sub, y_train,
        eval_set=[(X_val_sub, y_val)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    preds = model.predict(X_val_sub)
    r2 = r2_score(y_val, preds)

    # Log which features were used and how many boosting rounds were kept.
    trial.set_user_attr("used_features", feature_subset)
    # Fall back to the configured cap if the model reports no best iteration.
    best_iteration = getattr(model, "best_iteration_", None) or params["n_estimators"]
    trial.set_user_attr("best_iteration", int(best_iteration))

    return r2

# Run optimization
# Seed the sampler so the sequence of suggested configurations is repeatable
# across kernel restarts (TPE is the sampler Optuna uses by default).
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=250, show_progress_bar=True)

# Results
print("Best R²:", study.best_value)
print("Best Params:", study.best_params)
print("Features used:", study.best_trial.user_attrs["used_features"])


[I 2025-09-22 04:51:07,719] A new study created in memory with name: no-name-1d88a59d-090f-48db-bc31-c806d41ce020


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-09-22 04:51:22,055] Trial 0 finished with value: 0.7656994069374025 and parameters: {'drop_one_feature': False, 'learning_rate': 0.018047822643388767, 'num_leaves': 121, 'max_depth': 0, 'feature_fraction': 0.939596700013158, 'bagging_fraction': 0.6520457316402423, 'bagging_freq': 8, 'lambda_l1': 0.6451032717137972, 'lambda_l2': 3.7439395731998086}. Best is trial 0 with value: 0.7656994069374025.
[I 2025-09-22 04:51:39,960] Trial 1 finished with value: 0.7644882940720511 and parameters: {'drop_one_feature': True, 'feature_to_drop': 'E', 'learning_rate': 0.02217515529211953, 'num_leaves': 174, 'max_depth': 14, 'feature_fraction': 0.9203538980029686, 'bagging_fraction': 0.704965699492487, 'bagging_freq': 7, 'lambda_l1': 4.103545460977234, 'lambda_l2': 0.5631252176565432}. Best is trial 0 with value: 0.7656994069374025.
[I 2025-09-22 04:52:10,441] Trial 2 finished with value: 0.7678725242857621 and parameters: {'drop_one_feature': False, 'learning_rate': 0.011798670252857916, 'num_

In [2]:
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Read the training table; every column except the identifiers and the two
# targets is treated as a model input.
train = pd.read_csv("train.csv")
features = [
    column
    for column in train.columns
    if column not in ("id", "time", "Y1", "Y2")
]

X, y = train[features], train["Y2"]  # <-- choose y1 or y2

# Hold out 20% of the rows for validation (shuffled split, fixed seed).
# NOTE(review): the table has a "time" column - confirm a shuffled split is
# appropriate for this data.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

def objective_2(trial):
    """Optuna objective: train one LightGBM configuration and return its validation R².

    Each trial optionally drops a single feature, samples LightGBM
    hyperparameters, fits with early stopping against the hold-out split and
    scores the hold-out predictions.

    The function reads the module-level ``features``, ``X_train``, ``X_val``,
    ``y_train`` and ``y_val`` at call time, so it tunes whichever target those
    names currently hold.

    Args:
        trial: Optuna trial object used to sample the search space.

    Returns:
        R² of the fitted model's predictions on the validation split. The same
        split drives early stopping, so the score is somewhat optimistic.

    Side effects:
        Stores two user attributes on the trial:
        ``used_features`` (the feature list the model was trained on) and
        ``best_iteration`` (the number of boosting rounds kept by early
        stopping), so that a final refit can reproduce the tuned model instead
        of guessing the number of trees.
    """
    # --- FEATURE SELECTION PART ---
    # Either keep all features, or let the sampler pick one feature to drop.
    feature_subset = list(features)
    if trial.suggest_categorical("drop_one_feature", [True, False]):
        # Offer the untouched feature list as choices; the working copy is
        # the one that gets mutated.
        drop_feature = trial.suggest_categorical("feature_to_drop", list(features))
        feature_subset.remove(drop_feature)

    X_train_sub = X_train[feature_subset]
    X_val_sub = X_val[feature_subset]

    # --- HYPERPARAMETERS PART ---
    params = {
        "objective": "regression",
        "metric": "rmse",  # LightGBM needs a valid metric
        "verbosity": -1,
        "boosting_type": "gbdt",
        # Upper bound only; early stopping decides how many rounds are kept.
        "n_estimators": 5000,
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.1, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 31, 256),
        # LightGBM treats max_depth <= 0 as "no limit", so -1 and 0 are the
        # same setting; the range is kept as-is to leave the search space unchanged.
        "max_depth": trial.suggest_int("max_depth", -1, 15),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
    }

    # Train model, stopping once validation RMSE has not improved for 200 rounds.
    model = lgb.LGBMRegressor(**params)
    model.fit(
        X_train_sub, y_train,
        eval_set=[(X_val_sub, y_val)],
        eval_metric="rmse",
        callbacks=[lgb.early_stopping(200, verbose=False)],
    )

    preds = model.predict(X_val_sub)
    r2 = r2_score(y_val, preds)

    # Log which features were used and how many boosting rounds were kept.
    trial.set_user_attr("used_features", feature_subset)
    # Fall back to the configured cap if the model reports no best iteration.
    best_iteration = getattr(model, "best_iteration_", None) or params["n_estimators"]
    trial.set_user_attr("best_iteration", int(best_iteration))

    return r2

# Run optimization
# Seed the sampler so the sequence of suggested configurations is repeatable
# across kernel restarts (TPE is the sampler Optuna uses by default).
study2 = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study2.optimize(objective_2, n_trials=250, show_progress_bar=True)

# Results
print("Best R²:", study2.best_value)
print("Best Params:", study2.best_params)
print("Features used:", study2.best_trial.user_attrs["used_features"])


[I 2025-09-22 05:29:37,683] A new study created in memory with name: no-name-74e5f387-3cd0-4496-9b29-bd61a859d492


  0%|          | 0/250 [00:00<?, ?it/s]

[I 2025-09-22 05:29:40,933] Trial 0 finished with value: 0.712573394198141 and parameters: {'drop_one_feature': True, 'feature_to_drop': 'K', 'learning_rate': 0.0372038652367285, 'num_leaves': 93, 'max_depth': 11, 'feature_fraction': 0.8685275545400712, 'bagging_fraction': 0.8301948142432898, 'bagging_freq': 6, 'lambda_l1': 1.237743502546098, 'lambda_l2': 0.6794650955554682}. Best is trial 0 with value: 0.712573394198141.
[I 2025-09-22 05:29:43,269] Trial 1 finished with value: 0.7223682942025084 and parameters: {'drop_one_feature': False, 'learning_rate': 0.0829781331253707, 'num_leaves': 64, 'max_depth': 15, 'feature_fraction': 0.6374565358294194, 'bagging_fraction': 0.6423376681777567, 'bagging_freq': 7, 'lambda_l1': 4.654875282701928, 'lambda_l2': 1.3140919709568222}. Best is trial 1 with value: 0.7223682942025084.
[I 2025-09-22 05:29:46,364] Trial 2 finished with value: 0.7165948394978833 and parameters: {'drop_one_feature': False, 'learning_rate': 0.03156755108502234, 'num_leaves

In [9]:
import pandas as pd
import lightgbm as lgb

# Load train and test
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Candidate features (exclude id, time, Y1, Y2)
features = [col for col in train.columns if col not in ["id", "time", "Y1", "Y2"]]

X = train[features]
y1 = train["Y1"]
y2 = train["Y2"]

# Keys that steer the Optuna search but are not LightGBM parameters; they
# must not be forwarded to the estimator.
SEARCH_ONLY_KEYS = ("drop_one_feature", "feature_to_drop")
# Tree count used when the best trial did not record where early stopping halted.
DEFAULT_N_ESTIMATORS = 5000


def _final_fit_config(study, all_features):
    """Build the refit configuration for ``study``'s best trial.

    Args:
        study: Finished Optuna study whose best trial should be reproduced.
        all_features: Feature list to fall back on when the best trial did not
            record the subset it was trained on.

    Returns:
        ``(params, feature_list)`` where ``params`` are LightGBM estimator
        keyword arguments (search-only keys removed, fixed settings added) and
        ``feature_list`` is the list of columns the best trial was trained on.
    """
    best_trial = study.best_trial
    params = {
        key: value
        for key, value in study.best_params.items()
        if key not in SEARCH_ONLY_KEYS
    }
    params.update({
        "objective": "regression",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
        # There is no validation split for early stopping in the final fit,
        # so reuse the iteration count the tuned model stopped at when the
        # trial recorded one.
        "n_estimators": int(
            best_trial.user_attrs.get("best_iteration", DEFAULT_N_ESTIMATORS)
        ),
    })
    # Train on the same columns that produced the reported best score.
    feature_list = list(best_trial.user_attrs.get("used_features", all_features))
    return params, feature_list


# Print best params and best R² from Optuna
print("Best R² from tuning Y1:", study.best_value)
print("Best parameters for Y1:", study.best_params)

print("Best R² from tuning Y2:", study2.best_value)
print("Best parameters for Y2:", study2.best_params)

# Prepare final parameters and per-target feature subsets
best_params1, features_y1 = _final_fit_config(study, features)
best_params2, features_y2 = _final_fit_config(study2, features)

# Train model for y1 on all training rows
model_y1 = lgb.LGBMRegressor(**best_params1)
model_y1.fit(X[features_y1], y1, eval_metric="rmse")

# Train model for y2 on all training rows
model_y2 = lgb.LGBMRegressor(**best_params2)
model_y2.fit(X[features_y2], y2, eval_metric="rmse")

# Predictions on test set, using each model's own feature subset
X_test = test[features]
test["Y1"] = model_y1.predict(X_test[features_y1])
test["Y2"] = model_y2.predict(X_test[features_y2])

# Save submission file
submission = test[["id", "Y1", "Y2"]]
submission.to_csv("submission_y1_y2.csv", index=False)

print("✅ File saved as submission_y1_y2.csv")


Best R² from tuning Y1: 0.7740017843097684
Best parameters for Y1: {'drop_one_feature': True, 'feature_to_drop': 'B', 'learning_rate': 0.007112912631634668, 'num_leaves': 207, 'max_depth': 10, 'feature_fraction': 0.6234902817859154, 'bagging_fraction': 0.657725096581343, 'bagging_freq': 6, 'lambda_l1': 3.73635988724836, 'lambda_l2': 1.8236318145877732}
Best R² from tuning Y2: 0.7326741545296769
Best parameters for Y2: {'drop_one_feature': True, 'feature_to_drop': 'E', 'learning_rate': 0.009856325885219324, 'num_leaves': 252, 'max_depth': 11, 'feature_fraction': 0.6754586178291333, 'bagging_fraction': 0.6578550928146473, 'bagging_freq': 9, 'lambda_l1': 0.9570076661480098, 'lambda_l2': 0.7084216252360638}
✅ File saved as submission_y1_y2.csv
