In [None]:
# optuna_lgb_smape_tuning_fixed_callbacks.py
import os
import re
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from scipy.sparse import hstack
import warnings
warnings.filterwarnings("ignore")

RANDOM_SEED = 42
N_TRIALS = 1
NUM_BOOST_ROUND = 1000
EARLY_STOPPING_ROUNDS = 100

# -------------------------
# Load data + feature engineering
# -------------------------
# Read the raw training and test tables from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

def extract_quantity(text):
    """Estimate the total quantity described by a catalog entry.

    Searches ``str(text)`` for a ``Value: <number>`` field (case-sensitive)
    and a ``pack of <n>`` phrase (case-insensitive) and returns their
    product. A factor that is not found defaults to 1, so text with neither
    marker (including ``None`` or NaN input) yields ``1.0``.

    Args:
        text: Catalog content; any object is accepted and stringified.

    Returns:
        float: ``value * pack``.
    """
    content = str(text)

    # Capture exactly one well-formed decimal ("12", "12.5", "12.", ".5").
    # A looser character class such as [\d.]+ also captures "1.5." or "...",
    # which float() rejects with ValueError and would abort the whole apply().
    value_match = re.search(r"Value:\s*(\d+(?:\.\d*)?|\.\d+)", content)
    value = float(value_match.group(1)) if value_match else 1.0

    pack_match = re.search(r"pack of (\d+)", content.lower())
    pack = int(pack_match.group(1)) if pack_match else 1

    return value * pack

# Derive the numeric 'quantity' feature from the catalog text of both splits.
for frame in (train_df, test_df):
    parsed_quantity = frame['catalog_content'].apply(extract_quantity)
    frame['quantity'] = parsed_quantity.fillna(1)

def clean_text(text):
    """Return a lowercase copy of ``str(text)`` reduced to a-z, 0-9 and whitespace.

    Newlines are turned into spaces first; every other character outside
    the allowed set is deleted (not replaced).
    """
    flattened = str(text).lower().replace('\n', ' ')
    return re.sub(r'[^a-z0-9\s]', '', flattened)

# Normalize the catalog text. Missing entries are replaced *before* cleaning:
# clean_text() stringifies its input, so a NaN would otherwise turn into the
# literal token "nan" and a fillna() applied afterwards would never fire.
train_df['clean_text'] = train_df['catalog_content'].fillna("unknown").apply(clean_text)
test_df['clean_text'] = test_df['catalog_content'].fillna("unknown").apply(clean_text)

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training
# text only and then applied unchanged to the test text.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_text_train = vectorizer.fit_transform(train_df['clean_text'])
X_text_test = vectorizer.transform(test_df['clean_text'])

# Append the quantity column to the sparse text features. CSR is requested
# explicitly (hstack defaults to COO) because the matrices are row-sliced by
# train_test_split and passed to LightGBM for training and prediction.
X_train = hstack(
    [X_text_train, train_df[['quantity']].values.astype(np.float64)],
    format='csr',
)
X_test = hstack(
    [X_text_test, test_df[['quantity']].values.astype(np.float64)],
    format='csr',
)
y_train = train_df['price'].values
# Models are fitted on log1p(price); predictions are mapped back with expm1.
y_train_log = np.log1p(y_train)

# Train-validation split for tuning
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train_log, test_size=0.2, random_state=RANDOM_SEED
)

# The validation Dataset references the training Dataset so both share the
# same feature binning.
lgb_tr = lgb.Dataset(X_tr, label=y_tr)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_tr)

# -------------------------
# SMAPE functions
# -------------------------
def smape_np(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-9))``.
    The small constant keeps the ratio finite when both values are zero.
    """
    abs_error = np.abs(y_pred - y_true)
    scale = (np.abs(y_true) + np.abs(y_pred)) + 1e-9
    per_sample = 2.0 * abs_error / scale
    return 100.0 * np.mean(per_sample)

def lgb_smape(preds, data):
    """Custom LightGBM evaluation function reporting SMAPE.

    Both the labels read from ``data.get_label()`` and ``preds`` are passed
    through ``expm1`` before scoring with ``smape_np``.

    Returns:
        tuple: ``('smape', value, False)``; the final ``False`` marks the
        metric as lower-is-better.
    """
    actual = np.expm1(data.get_label())
    predicted = np.expm1(preds)
    return 'smape', smape_np(actual, predicted), False

# -------------------------
# Optuna objective (uses callbacks instead of early_stopping_rounds kw)
# -------------------------
def objective(trial):
    """Optuna objective: train one LightGBM model and return validation SMAPE.

    Samples a hyper-parameter set from ``trial``, trains on the module-level
    ``lgb_tr`` with early stopping monitored on ``lgb_val``, and scores the
    predictions for ``X_val`` against ``y_val`` after mapping both back with
    ``expm1``.

    Args:
        trial: The Optuna trial used to sample hyper-parameters. The iteration
            count used for prediction is stored on it as the
            ``best_iteration`` user attribute.

    Returns:
        float: Validation SMAPE in percent (lower is better), or ``1e6`` if
        training raised an exception.
    """
    param = {
        'objective': 'regression',
        # Register no built-in metric. Otherwise the objective's default l2
        # metric is evaluated alongside the custom SMAPE and the early
        # stopping callback, which watches every metric, can stop on l2
        # rather than on the SMAPE this study minimizes.
        'metric': 'None',
        'boosting_type': trial.suggest_categorical('boosting_type', ['gbdt', 'dart']),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.2, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 16, 512),
        'max_depth': trial.suggest_int('max_depth', 3, 16),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 10),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 200),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 10.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 10.0),
        'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0),
        'verbosity': -1,
        'seed': RANDOM_SEED,
        'deterministic': True,
    }

    # Use callbacks for early stopping & logging (compatible across LightGBM versions)
    callbacks = [
        lgb.early_stopping(EARLY_STOPPING_ROUNDS),
        lgb.log_evaluation(period=0)  # disable built-in logging here; set period>0 to enable
    ]

    try:
        model = lgb.train(
            param,
            lgb_tr,
            num_boost_round=NUM_BOOST_ROUND,
            valid_sets=[lgb_tr, lgb_val],
            valid_names=['train', 'val'],
            feval=lgb_smape,
            callbacks=callbacks,
        )
    except Exception as e:
        # Best effort: report the failure and hand Optuna a very poor score
        # so the study keeps running.
        print("Trial failed:", e)
        return 1e6

    # best_iteration is 0 (not None) when early stopping never fired, e.g.
    # with boosting_type='dart' where LightGBM disables it. Fall back to the
    # configured number of rounds in that case.
    best_iter = model.best_iteration or NUM_BOOST_ROUND
    val_preds_log = model.predict(X_val, num_iteration=best_iter)
    val_preds = np.expm1(val_preds_log)
    y_val_orig = np.expm1(y_val)

    # Bound both sides to [0, 1e9] before scoring.
    val_preds = np.clip(val_preds, 0.0, 1e9)
    y_val_orig = np.clip(y_val_orig, 0.0, 1e9)

    val_smape = smape_np(y_val_orig, val_preds)
    trial.set_user_attr("best_iteration", best_iter)
    return val_smape

# -------------------------
# Run Optuna & final training (use callbacks here too)
# -------------------------
if __name__ == "__main__":
    # Persist the study in a local SQLite file; with load_if_exists=True a
    # rerun resumes the existing study instead of starting a new one.
    storage_name = "sqlite:///optuna_lgb_smape.db"
    study = optuna.create_study(
        study_name="lgb_smape_tuning",
        direction="minimize",
        storage=storage_name,
        load_if_exists=True,
    )

    print("Starting Optuna study ...")
    study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

    print("Study completed.")
    print("Best SMAPE: {:.6f}".format(study.best_value))
    print("Best params:")
    for k, v in study.best_params.items():
        print(f"  {k}: {v}")

    # study.best_params holds only the sampled values; add the fixed settings.
    best_params = study.best_params.copy()
    best_params.update({
        'objective': 'regression',
        # Register no built-in metric so that early stopping below watches
        # only the custom SMAPE feval and not the objective's default l2.
        'metric': 'None',
        'verbosity': -1,
        'seed': RANDOM_SEED,
        'deterministic': True
    })

    # Retrain on almost full data; keep small holdout for early stopping
    X_tr_full, X_holdout, y_tr_full, y_holdout = train_test_split(X_train, y_train_log, test_size=0.05, random_state=RANDOM_SEED)
    lgb_tr_full = lgb.Dataset(X_tr_full, label=y_tr_full)
    lgb_holdout = lgb.Dataset(X_holdout, label=y_holdout, reference=lgb_tr_full)

    # callbacks for final training: show log every 100 rounds
    final_callbacks = [
        lgb.early_stopping(EARLY_STOPPING_ROUNDS),
        lgb.log_evaluation(period=100)
    ]

    print("Retraining final model on full training data with best params...")
    final_model = lgb.train(
        best_params,
        lgb_tr_full,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[lgb_tr_full, lgb_holdout],
        valid_names=['train', 'holdout'],
        feval=lgb_smape,
        callbacks=final_callbacks,
    )

    # best_iteration is 0 (not None) when early stopping never fired, e.g.
    # with boosting_type='dart' where LightGBM disables it. Fall back to the
    # configured number of rounds so the reported value is meaningful.
    best_iter_final = final_model.best_iteration or NUM_BOOST_ROUND
    print("Final model best iteration:", best_iter_final)

    # Evaluate on holdout
    hold_preds_log = final_model.predict(X_holdout, num_iteration=best_iter_final)
    hold_preds = np.expm1(hold_preds_log)
    y_hold_orig = np.expm1(y_holdout)
    hold_smape = smape_np(y_hold_orig, hold_preds)
    hold_mae = mean_absolute_error(y_hold_orig, hold_preds)
    print(f"Holdout MAE: {hold_mae:.4f}")
    print(f"Holdout SMAPE: {hold_smape:.4f}%")

    # Predict test set and save submission
    test_preds_log = final_model.predict(X_test, num_iteration=best_iter_final)
    test_preds = np.expm1(test_preds_log)
    # Keep submitted prices strictly positive and bounded.
    test_preds = np.clip(test_preds, 0.01, 1e9)

    submission = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': np.round(test_preds, 2)
    })

    out_name = 'test_out_optuna_fixed.csv'
    submission.to_csv(out_name, index=False)
    print(f"✅ Submission saved to {out_name}")
    print("Done.")


[I 2025-10-13 20:04:16,999] Using an existing study with name 'lgb_smape_tuning' instead of creating a new one.


Starting Optuna study ...


  0%|          | 0/1 [00:00<?, ?it/s]

[I 2025-10-13 20:11:45,788] Trial 52 finished with value: 68.63368803595849 and parameters: {'boosting_type': 'dart', 'learning_rate': 0.01018545598783915, 'num_leaves': 203, 'max_depth': 7, 'feature_fraction': 0.6093661228916337, 'bagging_fraction': 0.8298913413858434, 'bagging_freq': 5, 'min_child_samples': 109, 'lambda_l1': 8.890341692616618, 'lambda_l2': 2.2067577117742885, 'min_split_gain': 0.8307586532788592}. Best is trial 52 with value: 68.63368803595849.
Study completed.
Best SMAPE: 68.633688
Best params:
  boosting_type: dart
  learning_rate: 0.01018545598783915
  num_leaves: 203
  max_depth: 7
  feature_fraction: 0.6093661228916337
  bagging_fraction: 0.8298913413858434
  bagging_freq: 5
  min_child_samples: 109
  lambda_l1: 8.890341692616618
  lambda_l2: 2.2067577117742885
  min_split_gain: 0.8307586532788592
Retraining final model on full training data with best params...
