In [None]:
# ============================================================
# Smart Product Pricing 
# (TF-IDF -> SVD, Target Encodings, LGB/XGB + Linear Stack, Stacker, Bias Fix)
# ============================================================

import os, re, gc, time, random, warnings
warnings.filterwarnings("ignore")
from pathlib import Path
from typing import Tuple
import numpy as np
import pandas as pd

T0 = time.time()
def tic(msg): 
    print(f"\n[START] {msg}")
    return time.time()
def toc(t0, msg): 
    print(f"[END] {msg} -> {time.time()-t0:.2f}s")

In [2]:
# ---------------- Tunables (safe defaults) ----------------
SEED        = 42
N_SPLITS    = 5            # 3 if memory tight
N_COMP      = 448          # 384 or 256 if memory tight
TE_ALPHA    = 15.0         # target encoding smoothing
TE_MIN_CNT  = 5            # min category count before shrink
LGB_EST     = 2000         # upper bound; early stop will cut
LGB_ESR     = 160
XGB_ROUNDS  = 2400
XGB_ESR     = 180

random.seed(SEED); np.random.seed(SEED)

In [3]:

# ---------------- Paths ----------------
IS_KAGGLE = Path("/kaggle").exists()
WORK_DIR  = Path("/kaggle/working") if IS_KAGGLE else Path("/content"); WORK_DIR.mkdir(parents=True, exist_ok=True)
DATA_DIR  = Path("/kaggle/input/challenge-ml-data") if IS_KAGGLE else Path("dataset")
TRAIN_CSV = DATA_DIR / "train.csv"; TEST_CSV = DATA_DIR / "test.csv"
print(f"DATA_DIR={DATA_DIR.resolve()}\nWORK_DIR={WORK_DIR.resolve()}")

DATA_DIR=/kaggle/input/challenge-ml-data
WORK_DIR=/kaggle/working


In [4]:

# ---------------- Utils ----------------
def smape(y_true, y_pred):
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    m = denom != 0
    out = np.zeros_like(denom)
    out[m] = np.abs(y_true[m] - y_pred[m]) / denom[m]
    return np.mean(out) * 100.0

def inv(z):
    return np.expm1(np.clip(z, -50, 50))

def clean_text(s: str) -> str:
    if not isinstance(s, str): return ""
    s = s.replace("\n"," ").replace("\r"," ")
    s = re.sub(r"\s+"," ", s)
    return s.strip()

def extract_numeric_value_unit(s: str) -> Tuple[float, str]:
    if not isinstance(s, str): return 0.0, ""
    val = 0.0; unit = ""
    mv = re.search(r"Value:\s*([0-9]*\.?[0-9]+)", s, flags=re.IGNORECASE)
    mu = re.search(r"Unit:\s*([A-Za-z ]+)", s, flags=re.IGNORECASE)
    if mv:
        try: val = float(mv.group(1))
        except: val = 0.0
    if mu: unit = mu.group(1).strip()
    return val, unit

def standardize_quantity(value: float, unit: str) -> Tuple[float, float, float]:
    if unit is None: unit = ""
    u = unit.strip().lower()
    v = float(value) if value is not None else 0.0
    if "count" in u or u == "ct" or "pcs" in u or "piece" in u: return 0.0, 0.0, v
    if "pound" in u or "lb" in u: return v*453.592, 0.0, 0.0
    if "ounce" in u or u == "oz":
        if "fl" in u: return 0.0, v*29.5735, 0.0
        return v*28.3495, 0.0, 0.0
    if "gram" in u or u == "g": return v, 0.0, 0.0
    if "kg" in u or "kilogram" in u: return v*1000.0, 0.0, 0.0
    if "ml" in u: return 0.0, v, 0.0
    if u == "l" or "liter" in u or "litre" in u: return 0.0, v*1000.0, 0.0
    if "fl oz" in u or "fluid ounce" in u or "fl" in u: return 0.0, v*29.5735, 0.0
    return 0.0, 0.0, 0.0

def parse_pack_count(s: str) -> float:
    if not isinstance(s, str): return 1.0
    m = re.search(r"(?:pack of|case of)\s*([0-9]+)", s, flags=re.IGNORECASE)
    if m:
        try: return float(m.group(1))
        except: return 1.0
    return 1.0

def parse_title_itemname(s: str) -> str:
    if not isinstance(s, str): return ""
    m = re.search(r"Item Name:\s*(.+?)(?:\s*Bullet Point|\s*Product Description:|\s*Value:|\s*$)",
                  s, flags=re.IGNORECASE|re.DOTALL)
    return clean_text(m.group(1)) if m else ""

def maybe_brand_from_title(title: str) -> str:
    if not isinstance(title, str) or not title: return ""
    toks = re.split(r"[|,\-\(\)\/\s]+", title)
    return toks[0][:40] if toks else ""

In [5]:

# ---------------- Load & basic features ----------------
t0 = tic("Load & basic features")
train = pd.read_csv(TRAIN_CSV)
test  = pd.read_csv(TEST_CSV)
print("Shapes:", train.shape, test.shape)
assert {"sample_id","catalog_content","image_link","price"}.issubset(train.columns)
assert {"sample_id","catalog_content","image_link"}.issubset(test.columns)

for df in (train, test):
    df["catalog_content"] = df["catalog_content"].astype(str).apply(clean_text)
    df["title"] = df["catalog_content"].apply(parse_title_itemname)
    df["brand_heur"] = df["title"].apply(maybe_brand_from_title)

    v_u = df["catalog_content"].apply(extract_numeric_value_unit)
    df["value_extracted"] = v_u.apply(lambda x: float(x[0]) if x and x[0] is not None else 0.0)
    df["unit_extracted"]  = v_u.apply(lambda x: x[1] if x and x[1] is not None else "")

    q_std = df.apply(lambda r: standardize_quantity(r["value_extracted"], r["unit_extracted"]),
                     axis=1, result_type="expand")
    df["qty_g"]  = q_std[0].astype(float)
    df["qty_ml"] = q_std[1].astype(float)
    df["qty_cnt"]= q_std[2].astype(float)

    df["pack_count"] = df["catalog_content"].apply(parse_pack_count).astype(float)
    df["len_chars"] = df["catalog_content"].str.len()
    df["len_words"] = df["catalog_content"].apply(lambda s: len(s.split()))
    df["upper_ratio"] = df["catalog_content"].apply(lambda s: (sum(1 for ch in s if ch.isupper()) / (len(s)+1e-6)))
toc(t0, "Load & basic features")

# Targets
y = train["price"].astype(float).values
y_log = np.log1p(np.clip(y, 0, None))



[START] Load & basic features
Shapes: (75000, 4) (75000, 3)
[END] Load & basic features -> 28.65s


In [6]:

# ---------------- Enhanced quantity parsing ----------------
t0 = tic("Enhanced quantity parsing")
_q_rgx_multi = re.compile(
    r'(\d+)\s*[xX×]\s*([0-9]*\.?[0-9]+)\s*(kg|g|lb|oz|ml|l|litre|liter|fl\s*oz)\b',
    flags=re.IGNORECASE
)
_unit_map_w = {"kg":1000.0, "g":1.0, "lb":453.592, "oz":28.3495}
_unit_map_v = {"l":1000.0, "litre":1000.0, "liter":1000.0, "ml":1.0, "fl oz":29.5735}
def parse_multi_qty(s: str):
    if not isinstance(s, str): return (0.0, 0.0, 1.0)
    s_ = s.lower().replace("fluid ounce","fl oz").replace("fl. oz","fl oz")
    m = _q_rgx_multi.search(s_)
    w_g = 0.0; v_ml = 0.0; cnt = parse_pack_count(s)
    if m:
        n = float(m.group(1)); val = float(m.group(2)); u = m.group(3).strip().lower()
        if u in _unit_map_w: w_g = n * val * _unit_map_w[u]
        elif u in _unit_map_v: v_ml = n * val * _unit_map_v[u]
    return (w_g, v_ml, cnt)

for df in (train, test):
    df["qty_g2"], df["qty_ml2"], df["pack_cnt2"] = zip(*df["catalog_content"].map(parse_multi_qty))
    df["qty_g_tot"]  = (df["qty_g"].fillna(0)  + df["qty_g2"].fillna(0)).astype(float)
    df["qty_ml_tot"] = (df["qty_ml"].fillna(0) + df["qty_ml2"].fillna(0)).astype(float)
    df["pack_cnt_tot"] = np.maximum(df["pack_count"].fillna(1.0), df["pack_cnt2"].fillna(1.0)).astype(float)
    df["per_unit_g"]  = (df["qty_g_tot"]  / (df["pack_cnt_tot"]+1e-6)).clip(0, 5e6)
    df["per_unit_ml"] = (df["qty_ml_tot"] / (df["pack_cnt_tot"]+1e-6)).clip(0, 5e6)
toc(t0, "Enhanced quantity parsing")

# numeric columns (optimized)
num_cols = [
    "qty_g_tot","qty_ml_tot","pack_cnt_tot","per_unit_g","per_unit_ml",
    "len_chars","len_words","upper_ratio","value_extracted"
]



[START] Enhanced quantity parsing
[END] Enhanced quantity parsing -> 7.03s


In [7]:
# ---------------- TF-IDF (word+char, stronger) ----------------
t0 = tic("TF-IDF fit/transform")
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse

text_tr = (train["title"].fillna("") + " " + train["catalog_content"].fillna("") + " " + train["brand_heur"].fillna("")).tolist()
text_te = (test["title"].fillna("")  + " " + test["catalog_content"].fillna("")  + " " + test["brand_heur"].fillna("")).tolist()

tfidf_word = TfidfVectorizer(
    min_df=4,            # a touch lower to keep useful rare tokens
    max_df=0.95,
    ngram_range=(1,2),
    strip_accents="unicode",
    sublinear_tf=True,   # helps long docs
    norm="l2"
)
tfidf_char = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,6),   # capture more unit/pattern variants
    min_df=4,
    max_df=0.95
)

Xw_tr = tfidf_word.fit_transform(text_tr)
Xw_te = tfidf_word.transform(text_te)
Xc_tr = tfidf_char.fit_transform(text_tr)
Xc_te = tfidf_char.transform(text_te)

Xtxt_tr = sparse.hstack([Xw_tr, Xc_tr], format="csr")
Xtxt_te = sparse.hstack([Xw_te, Xc_te], format="csr")
del Xw_tr, Xw_te, Xc_tr, Xc_te; gc.collect()
toc(t0, "TF-IDF fit/transform")



[START] TF-IDF fit/transform
[END] TF-IDF fit/transform -> 379.35s


In [8]:

# ---------------- Target Encodings (Stratified CV + interactions) ----------------
t0 = tic("Target encodings + interactions")
def first_two_tokens(s):
    if not isinstance(s, str): return ("","")
    t = s.split()
    a = t[0] if len(t) > 0 else ""
    b = t[1] if len(t) > 1 else ""
    return (a, b)
train[["tok1","tok2"]] = train["title"].fillna("").apply(first_two_tokens).apply(pd.Series)
test[["tok1","tok2"]]  = test["title"].fillna("").apply(first_two_tokens).apply(pd.Series)

train["brand_unit"] = (train["brand_heur"].fillna("") + "§" + train["unit_extracted"].fillna("")).astype(str)
test["brand_unit"]  = (test["brand_heur"].fillna("")  + "§" + test["unit_extracted"].fillna("")).astype(str)
train["tok1_unit"]  = (train["tok1"].fillna("")       + "§" + train["unit_extracted"].fillna("")).astype(str)
test["tok1_unit"]   = (test["tok1"].fillna("")        + "§" + test["unit_extracted"].fillna("")).astype(str)

enc_cols = ["brand_heur","tok1","tok2","unit_extracted","brand_unit","tok1_unit"]

from sklearn.model_selection import StratifiedKFold
bins = pd.qcut(y_log, q=10, duplicates="drop").codes
SKF = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

def cv_target_encode_strat(df_tr, df_te, y_log, col, alpha=TE_ALPHA, min_count=TE_MIN_CNT):
    oof = np.zeros(len(df_tr), dtype=np.float32)
    global_mean = float(np.mean(y_log))
    for tr_idx, va_idx in SKF.split(df_tr, bins):
        tr_g = df_tr.iloc[tr_idx][col]; va_g = df_tr.iloc[va_idx][col]; tr_y = y_log[tr_idx]
        s = pd.DataFrame({"g": tr_g.values, "y": tr_y})
        agg = s.groupby("g")["y"].agg(["sum","count"]).reset_index()
        agg["mean_smooth"] = (agg["sum"] + alpha*global_mean) / (agg["count"] + alpha)
        enc = dict(zip(agg["g"], agg["mean_smooth"]))
        oof[va_idx] = va_g.map(enc).fillna(global_mean).values.astype(np.float32)
    sfull = pd.DataFrame({"g": df_tr[col].values, "y": y_log})
    agg = sfull.groupby("g")["y"].agg(["sum","count"]).reset_index()
    agg["mean_smooth"] = (agg["sum"] + alpha*global_mean) / (agg["count"] + alpha)
    enc_full = dict(zip(agg["g"], agg["mean_smooth"]))
    te = df_te[col].map(enc_full).fillna(global_mean).values.astype(np.float32)
    return oof, te

enc_tr_list, enc_te_list = [], []
for c in enc_cols:
    oof_c, te_c = cv_target_encode_strat(train, test, y_log, c)
    train[f"enc_{c}"] = oof_c; test[f"enc_{c}"] = te_c
    enc_tr_list.append(oof_c); enc_te_list.append(te_c)

enc_tr = np.vstack(enc_tr_list).T.astype(np.float32)
enc_te = np.vstack(enc_te_list).T.astype(np.float32)
toc(t0, "Target encodings + interactions")


[START] Target encodings + interactions
[END] Target encodings + interactions -> 15.76s


In [9]:

# ---------------- SVD compact features ----------------
t0 = tic(f"SVD({N_COMP}) + compact matrices")
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=N_COMP, random_state=SEED)
Ztr_txt = svd.fit_transform(Xtxt_tr).astype(np.float32)
Zte_txt = svd.transform(Xtxt_te).astype(np.float32)
print(f"SVD explained variance ~ {svd.explained_variance_ratio_.sum():.3f}")

N_tr_num = train[num_cols].astype(np.float32).values
N_te_num = test[num_cols].astype(np.float32).values

Xr_tr = np.hstack([Ztr_txt, N_tr_num, enc_tr]).astype(np.float32)
Xr_te = np.hstack([Zte_txt, N_te_num, enc_te]).astype(np.float32)
print("Compact shapes:", Xr_tr.shape, Xr_te.shape)

del Ztr_txt, Zte_txt, N_tr_num, N_te_num, enc_tr, enc_te, Xtxt_tr, Xtxt_te
gc.collect()
toc(t0, "SVD + compact")


[START] SVD(448) + compact matrices
SVD explained variance ~ 0.247
Compact shapes: (75000, 463) (75000, 463)
[END] SVD + compact -> 1368.65s


In [10]:
# ---------------- Models: Linear + LGB(GBDT+DART) + XGB ----------------
t0 = tic("Models (Linear + LGB-GBDT + LGB-DART + XGB)")
from sklearn.linear_model import Ridge, ElasticNet, HuberRegressor

oofs, preds, names = [], [], []

# Linear trio
oof_r = np.zeros(len(train), np.float32); pr_r = np.zeros(len(test), np.float32)
oof_e = np.zeros(len(train), np.float32); pr_e = np.zeros(len(test), np.float32)
oof_h = np.zeros(len(train), np.float32); pr_h = np.zeros(len(test), np.float32)

print("\n===== Stratified 5-Fold: Ridge + ENet + Huber =====")
for f, (tr_idx, va_idx) in enumerate(SKF.split(Xr_tr, bins), 1):
    Xtr, Xva = Xr_tr[tr_idx], Xr_tr[va_idx]; ytr, yva = y_log[tr_idx], y_log[va_idx]
    r = Ridge(alpha=1.8, solver="lsqr", max_iter=20000, tol=1e-4).fit(Xtr, ytr)
    e = ElasticNet(alpha=0.001, l1_ratio=0.12, max_iter=4500, tol=1e-3).fit(Xtr, ytr)
    h = HuberRegressor(alpha=1e-4, epsilon=1.25, max_iter=220).fit(Xtr, ytr)
    pv_r, pt_r = r.predict(Xva), r.predict(Xr_te)
    pv_e, pt_e = e.predict(Xva), e.predict(Xr_te)
    pv_h, pt_h = h.predict(Xva), h.predict(Xr_te)
    oof_r[va_idx], oof_e[va_idx], oof_h[va_idx] = pv_r, pv_e, pv_h
    pr_r += pt_r/SKF.n_splits; pr_e += pt_e/SKF.n_splits; pr_h += pt_h/SKF.n_splits
    print(f"Fold {f}  R={smape(inv(yva),inv(pv_r)):.3f}  E={smape(inv(yva),inv(pv_e)):.3f}  H={smape(inv(yva),inv(pv_h)):.3f}")
    del Xtr, Xva; gc.collect()
print("OOF  Ridge:", f"{smape(y, inv(oof_r)):.3f}")
print("OOF  ENet :", f"{smape(y, inv(oof_e)):.3f}")
print("OOF  Huber:", f"{smape(y, inv(oof_h)):.3f}")

oofs += [oof_r, oof_e, oof_h]; preds += [pr_r, pr_e, pr_h]; names += ["Ridge","ENet","Huber"]

# LightGBM (GBDT)
HAVE_LGB = False
try:
    import lightgbm as lgb
    HAVE_LGB = True
except Exception as e:
    print("LightGBM unavailable:", e)

if HAVE_LGB:
    print("\n===== Stratified 5-Fold: LightGBM-GBDT (compact) =====")
    oof_l = np.zeros(len(train), np.float32); pr_l = np.zeros(len(test), np.float32)
    lgb_params_gbdt = dict(
        objective='mae', boosting_type='gbdt',
        learning_rate=0.03, num_leaves=208, max_depth=-1,
        feature_fraction=0.9, bagging_fraction=0.9, bagging_freq=1,
        min_data_in_leaf=24, lambda_l1=0.0, lambda_l2=1.2,
        max_bin=255,
        n_estimators=2200, random_state=SEED, n_jobs=-1, verbose=-1
    )
    for f, (tr_idx, va_idx) in enumerate(SKF.split(Xr_tr, bins), 1):
        Xtr, Xva = Xr_tr[tr_idx], Xr_tr[va_idx]; ytr, yva = y_log[tr_idx], y_log[va_idx]
        m = lgb.LGBMRegressor(**lgb_params_gbdt)
        m.fit(Xtr, ytr, eval_set=[(Xva, yva)], eval_metric='mae',
              callbacks=[lgb.early_stopping(stopping_rounds=180, verbose=False)])
        pv = m.predict(Xva, num_iteration=m.best_iteration_)
        pt = m.predict(Xr_te, num_iteration=m.best_iteration_)
        oof_l[va_idx] = pv.astype(np.float32); pr_l += pt.astype(np.float32)/SKF.n_splits
        print(f"Fold {f}  LGB-GBDT SMAPE={smape(inv(yva), inv(pv)):.3f}")
        del Xtr, Xva; gc.collect()
    print("OOF  LGB-GBDT :", f"{smape(y, inv(oof_l)):.3f}")
    oofs.append(oof_l); preds.append(pr_l); names.append("LGB-GBDT")

    # LightGBM (DART) — adds diversity; often blends well with GBDT
    print("\n===== Stratified 5-Fold: LightGBM-DART (compact) =====")
    oof_ld = np.zeros(len(train), np.float32); pr_ld = np.zeros(len(test), np.float32)
    lgb_params_dart = dict(
        objective='mae', boosting_type='dart',
        learning_rate=0.035, num_leaves=160, max_depth=-1,
        feature_fraction=0.9, bagging_fraction=0.9, bagging_freq=1,
        min_data_in_leaf=20, lambda_l1=0.0, lambda_l2=1.0,
        max_bin=255,
        drop_rate=0.1, skip_drop=0.5, uniform_drop=True,
        n_estimators=2600, random_state=SEED, n_jobs=-1, verbose=-1
    )
    for f, (tr_idx, va_idx) in enumerate(SKF.split(Xr_tr, bins), 1):
        Xtr, Xva = Xr_tr[tr_idx], Xr_tr[va_idx]; ytr, yva = y_log[tr_idx], y_log[va_idx]
        m = lgb.LGBMRegressor(**lgb_params_dart)
        m.fit(Xtr, ytr, eval_set=[(Xva, yva)], eval_metric='mae',
              callbacks=[lgb.early_stopping(stopping_rounds=220, verbose=False)])
        pv = m.predict(Xva, num_iteration=m.best_iteration_)
        pt = m.predict(Xr_te, num_iteration=m.best_iteration_)
        oof_ld[va_idx] = pv.astype(np.float32); pr_ld += pt.astype(np.float32)/SKF.n_splits
        print(f"Fold {f}  LGB-DART SMAPE={smape(inv(yva), inv(pv)):.3f}")
        del Xtr, Xva; gc.collect()
    print("OOF  LGB-DART :", f"{smape(y, inv(oof_ld)):.3f}")
    oofs.append(oof_ld); preds.append(pr_ld); names.append("LGB-DART")

# XGBoost
HAVE_XGB = False
try:
    import xgboost as xgb
    HAVE_XGB = True
except Exception as e:
    print("XGBoost unavailable:", e)

if HAVE_XGB:
    print("\n===== Stratified 5-Fold: XGBoost (compact, tighter) =====")
    oof_x = np.zeros(len(train), np.float32); pr_x = np.zeros(len(test), np.float32)
    params = dict(
        objective='reg:absoluteerror',
        tree_method='gpu_hist' if os.environ.get("CUDA_VISIBLE_DEVICES") else 'hist',
        max_depth=8, min_child_weight=5,
        subsample=0.9, colsample_bytree=0.9,
        learning_rate=0.035, reg_alpha=0.0, reg_lambda=1.2,
        gamma=0.0, random_state=SEED, verbosity=0
    )
    for f, (tr_idx, va_idx) in enumerate(SKF.split(Xr_tr, bins), 1):
        Xtr, Xva = Xr_tr[tr_idx], Xr_tr[va_idx]; ytr, yva = y_log[tr_idx], y_log[va_idx]
        dtr = xgb.DMatrix(Xtr, label=ytr); dva = xgb.DMatrix(Xva, label=yva); dte = xgb.DMatrix(Xr_te)
        try:
            m = xgb.train(params=params, dtrain=dtr, num_boost_round=2600,
                          evals=[(dtr,"train"),(dva,"valid")], early_stopping_rounds=200, verbose_eval=False)
        except Exception:
            params_f = dict(params, max_depth=7)
            m = xgb.train(params=params_f, dtrain=dtr, num_boost_round=1800,
                          evals=[(dtr,"train"),(dva,"valid")], early_stopping_rounds=140, verbose_eval=False)
        pv = m.predict(dva, iteration_range=(0, m.best_iteration+1))
        pt = m.predict(dte, iteration_range=(0, m.best_iteration+1))
        oof_x[va_idx] = pv.astype(np.float32); pr_x += pt.astype(np.float32)/SKF.n_splits
        print(f"Fold {f}  XGB SMAPE={smape(inv(yva), inv(pv)):.3f}")
        del dtr, dva, dte, Xtr, Xva; gc.collect()
    print("OOF  XGB :", f"{smape(y, inv(oof_x)):.3f}")
    oofs.append(oof_x); preds.append(pr_x); names.append("XGB")

print("\nModels available:", names)
toc(t0, "Models (Linear + LGB-GBDT + LGB-DART + XGB)")



[START] Models (Linear + LGB-GBDT + LGB-DART + XGB)

===== Stratified 5-Fold: Ridge + ENet + Huber =====
Fold 1  R=70.658  E=58.734  H=68.602
Fold 2  R=70.108  E=58.026  H=68.919
Fold 3  R=70.222  E=58.153  H=64.393
Fold 4  R=70.009  E=58.218  H=64.031
Fold 5  R=70.193  E=57.907  H=68.178
OOF  Ridge: 70.238
OOF  ENet : 58.208
OOF  Huber: 66.825

===== Stratified 5-Fold: LightGBM-GBDT (compact) =====
Fold 1  LGB-GBDT SMAPE=49.092
Fold 2  LGB-GBDT SMAPE=48.884
Fold 3  LGB-GBDT SMAPE=49.228
Fold 4  LGB-GBDT SMAPE=48.990
Fold 5  LGB-GBDT SMAPE=48.609
OOF  LGB-GBDT : 48.961

===== Stratified 5-Fold: LightGBM-DART (compact) =====
Fold 1  LGB-DART SMAPE=195.760
Fold 2  LGB-DART SMAPE=195.838
Fold 3  LGB-DART SMAPE=195.839
Fold 4  LGB-DART SMAPE=195.832
Fold 5  LGB-DART SMAPE=195.970
OOF  LGB-DART : 195.848

===== Stratified 5-Fold: XGBoost (compact, tighter) =====
Fold 1  XGB SMAPE=50.134
Fold 2  XGB SMAPE=49.894
Fold 3  XGB SMAPE=50.148
Fold 4  XGB SMAPE=50.018
Fold 5  XGB SMAPE=49.502
OOF 

In [11]:
# ---------------- Meta-learner stacker (simplex, non-negative) ----------------
t0 = tic("Meta-learner stacker")
oof_stack  = np.stack(oofs,  axis=1).astype(np.float32)
test_stack = np.stack(preds, axis=1).astype(np.float32)
K = oof_stack.shape[1]

# init by projected gradient on y_log (convex surrogate)
w = np.ones(K, dtype=np.float32)/K
for it in range(300):
    grad = (oof_stack @ w - y_log).astype(np.float64) @ oof_stack
    w = w - 0.2 * grad.astype(np.float32) / max(1.0, np.linalg.norm(grad))
    w = np.maximum(w, 0.0); s = w.sum(); w = w/s if s>0 else np.ones(K, np.float32)/K

def smape_score_w(wvec): return smape(y, inv(oof_stack @ wvec))

best = smape_score_w(w.copy())
for _ in range(4):
    for k in range(K):
        base = w.copy(); others = 1.0 - base[k]
        span = np.linspace(max(0.0, base[k]-0.3), min(1.0, base[k]+0.3), 16, dtype=np.float32)
        best_local, best_s = base.copy(), best
        for ak in span:
            wk = base.copy(); wk[k] = ak
            if K > 1:
                if others > 0:
                    scale = (1.0 - ak)/others
                    for j in range(K):
                        if j != k: wk[j] = max(0.0, wk[j]*scale)
                else:
                    wk[:] = 0.0; wk[k] = 1.0
            s = smape_score_w(wk)
            if s < best_s: best_s, best_local = s, wk
        w, best = best_local, best_s

print(f"Stacker OOF SMAPE={best:.3f}  weights={w.round(4).tolist()}  sum={float(w.sum()):.4f}")
final_log = test_stack @ w
toc(t0, "Meta-learner stacker")

# ---------------- Residual bias corrections (brand + unit + brand×unit) ----------------
t0 = tic("Residual bias corrections (log-space)")
df_oof = pd.DataFrame({
    "brand": train["brand_heur"].fillna(""),
    "unit":  train["unit_extracted"].fillna(""),
    "bu":    (train["brand_heur"].fillna("") + "§" + train["unit_extracted"].fillna("")).astype(str),
    "y": y_log,
    "p": (oof_stack @ w)
})
df_oof["res"] = df_oof["y"] - df_oof["p"]

def smooth_bias(key):
    g = df_oof.groupby(key)["res"].agg(["mean","count"]).reset_index()
    ALPHA = 14.0  # shrinkage
    g["mean_smooth"] = (g["mean"] * g["count"]) / (g["count"] + ALPHA)
    return dict(zip(g[key], g["mean_smooth"]))

bias_brand = smooth_bias("brand")
bias_unit  = smooth_bias("unit")
bias_bu    = smooth_bias("bu")

b_brand = test["brand_heur"].fillna("").map(bias_brand).fillna(0.0).values.astype(np.float32)
b_unit  = test["unit_extracted"].fillna("").map(bias_unit).fillna(0.0).values.astype(np.float32)
b_bu    = (test["brand_heur"].fillna("") + "§" + test["unit_extracted"].fillna("")).map(bias_bu).fillna(0.0).values.astype(np.float32)

# Combine small biases; use conservative weights to avoid overfit
final_log_bias = final_log + 0.7*b_brand + 0.4*b_unit + 0.5*b_bu
toc(t0, "Residual bias corrections")



[START] Meta-learner stacker
Stacker OOF SMAPE=48.961  weights=[0.0, 0.0, 0.0, 1.0, 0.0, 0.0]  sum=1.0000
[END] Meta-learner stacker -> 0.90s

[START] Residual bias corrections (log-space)
[END] Residual bias corrections -> 0.16s


In [12]:
# ---------------- Post-process & Submission ----------------
t0 = tic("Post-process & submission")
preds = inv(final_log_bias)
q1, q3 = np.percentile(y, [25, 75]); iqr = q3 - q1; hi = q3 + 3.0*iqr
preds = np.clip(preds, 0.01, hi*5).astype(np.float64)

sub = pd.DataFrame({"sample_id": test["sample_id"].astype(int).values, "price": preds})[["sample_id","price"]]
OUT_PATH = WORK_DIR / "test_out.csv"
sub.to_csv(OUT_PATH, index=False)
print(f"\nSaved predictions to: {OUT_PATH.resolve()}")
print(sub.head(8))
toc(t0, "Post-process & submission")

print(f"\nTotal runtime: {time.time()-T0:.2f}s")


[START] Post-process & submission

Saved predictions to: /kaggle/working/test_out.csv
   sample_id      price
0     100179  13.531790
1     245611  22.735712
2     146263  17.402027
3      95658   6.051785
4      36806  27.372946
5     148239   5.574152
6      92659  11.784738
7       3780  15.106535
[END] Post-process & submission -> 0.15s

Total runtime: 15389.99s
