In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import lightgbm as lgb
import warnings

# Install a blanket "ignore" filter: every warning category, from every module,
# is suppressed from this point on.
# NOTE(review): this also hides deprecation notices and numeric RuntimeWarnings
# raised by the libraries used below — consider narrowing the filter.
warnings.filterwarnings("ignore")

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``.
    Elements whose denominator is zero (both values are zero) contribute 0.

    Parameters
    ----------
    y_true, y_pred : scalar or array-like of real numbers
        Ground-truth and predicted values. Shapes must broadcast together.

    Returns
    -------
    float
        Mean of the per-element ratios multiplied by 100. ``nan`` when the
        inputs are empty (mean of an empty array).
    """
    # Coerce to float so unsigned-integer inputs cannot wrap around in the
    # subtraction, and to at least 1-D so scalar inputs take the same path.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0 they were initialised with, so no 0/0 is ever evaluated.
    diff = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(diff) * 100

# --- Load the training table and separate target from features --------------
data = pd.read_csv("final_pca_whole_train_data.csv")
y = data["price"].values
# Features are every column except the target, `image_link` and `sample_id`.
X = data.drop(columns=["price", "image_link", "sample_id"])

# --- Train / validation split ------------------------------------------------
# 20% of the rows are held out; the fixed seed keeps the shuffle repeatable.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=42,
    test_size=0.2,
)

# --- Feature scaling ---------------------------------------------------------
# The scaler is fitted on the training split only, then applied to both
# splits, so no validation statistics enter the fit.
scaler = StandardScaler()
scaler.fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_val_scaled = scaler.transform(X_val_raw)

# --- Target transform --------------------------------------------------------
# The model is trained on log(1 + price); predictions are inverted with expm1.
y_train_log, y_val_log = np.log1p(y_train), np.log1p(y_val)

# Hyperparameters for the gradient-boosted regressor, collected in one place
# so they can be inspected or logged alongside the results.
lgb_params = {
    "objective": "regression",
    "n_estimators": 150,
    "learning_rate": 0.05,
    "num_leaves": 32,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.95,
    "bagging_freq": 3,
    "random_state": 42,
    "n_jobs": -1,
}
lgb_model = lgb.LGBMRegressor(**lgb_params)

# Fit on the scaled features against the log-transformed target. The
# validation split is passed as an evaluation set with the L2 metric; no
# early-stopping callback is supplied, so all 150 estimators are requested.
validation_sets = [(X_val_scaled, y_val_log)]
lgb_model.fit(
    X_train_scaled,
    y_train_log,
    eval_set=validation_sets,
    eval_metric="l2",
)

# Predict in log space, then undo the log1p transform to get back to the
# original price scale before scoring.
y_pred_log = lgb_model.predict(X_val_scaled)
y_pred = np.expm1(y_pred_log)

# Both metrics compare untransformed validation prices with the predictions.
mae, smape_val = mean_absolute_error(y_val, y_pred), smape(y_val, y_pred)

print(f"Final MAE: {mae:.4f}")
print(f"Final SMAPE: {smape_val:.2f}%")

Training LightGBM...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.557147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 153261
[LightGBM] [Info] Number of data points in the train set: 59247, number of used features: 603
[LightGBM] [Info] Start training from score 2.736635
Final MAE: 15.1032
Final SMAPE: 66.19%
