In [2]:
# =========================================================
# LightGBM Regressor with SMAPE
# =========================================================

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")

# =========================================================
# SMAPE metric
# =========================================================

# SMAPE
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))


In [4]:

# =========================================================
# Load Data
# =========================================================
data = pd.read_csv("X_train_embeddings_train_tfidf_words.csv")
X = data.drop(columns=["price", "sample_id"])
y = data["price"].values

# =========================================================
# Train / Validation Split
# =========================================================



In [8]:
X_train_scaled, X_val_scaled, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, shuffle=True
)

In [11]:
X_train_scaled.columns
X_train

Index(['Unnamed: 0', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '4991', '4992', '4993', '4994', '4995', '4996', '4997', '4998', '4999',
       '5000'],
      dtype='object', length=5002)

In [9]:

# =========================================================
# Scale features
# =========================================================
# scaler = RobustScaler()
# X_train_scaled = scaler.fit_transform(X_train_raw)
# X_val_scaled = scaler.transform(X_val_raw)

# =========================================================
# Log-transform target
# =========================================================
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)


In [10]:

# =========================================================
# LightGBM model (scikit-learn API)
# =========================================================
lgb_model = lgb.LGBMRegressor(
    objective="regression",
    learning_rate=0.05,
    n_estimators=250,
    num_leaves=32,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=1,
    random_state=42,
    n_jobs=-1
)

# =========================================================
# Fit with early stopping
# =========================================================
print("Training LightGBM...")
lgb_model.fit(
    X_train_scaled, y_train_log,
    eval_set=[(X_val_scaled, y_val_log)],
    eval_metric="l2",
)


Training LightGBM...


LightGBMError: Do not support special JSON characters in feature name.

In [None]:

# =========================================================
# Predict and evaluate
# =========================================================
y_pred_log = lgb_model.predict(X_val_scaled)
y_pred = np.expm1(y_pred_log)

mae = mean_absolute_error(y_val, y_pred)
smape_val = smape(y_val, y_pred)

print("==========================")
print(f"Final MAE: {mae:.4f}")
print(f"Final SMAPE: {smape_val:.2f}%")
print("==========================")