# Amazon ML Challenge — Multimodal Price Prediction




In [None]:
# Mount Google Drive at /content/drive.
# force_remount=True asks google.colab to replace any existing mount itself.
# The previous manual `fusermount -u` + `rm -rf /content/drive` sequence was
# hazardous: when the unmount fails while Drive is still attached, the
# recursive delete walks into the user's Drive contents.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


fusermount: failed to unmount /content/drive: No such file or directory
Mounted at /content/drive


In [None]:
# Locate the challenge dataset on Drive, list its files and load the four CSVs.
from pathlib import Path

import pandas as pd

DATA_DIR = "/content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/dataset"
data_root = Path(DATA_DIR)
print("Exists:", data_root.exists())
print("Files:", [entry.name for entry in data_root.iterdir()])

train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")
sample_in = pd.read_csv(f"{DATA_DIR}/sample_test.csv")
sample_out = pd.read_csv(f"{DATA_DIR}/sample_test_out.csv")
print(train.shape, test.shape)


Exists: True
Files: ['sample_test_out.csv', 'sample_test.csv', 'test.csv', 'train.csv']
(75000, 4) (75000, 3)


In [None]:
# Install third-party packages into the notebook runtime (-q keeps pip quiet).
# OpenAI CLIP from GitHub: imported as `clip` by the image-embedding cell.
!pip -q install git+https://github.com/openai/CLIP.git
# open_clip is the fallback the image cell imports when `import clip` fails;
# pillow and tqdm are used by that same cell.
!pip -q install open-clip-torch ftfy regex tqdm pillow
# sentence-transformers (SBERT text embeddings), scikit-learn (PCA/Ridge/KFold/
# StandardScaler) and joblib (PCA/scaler caching) are all imported below.
!pip -q install sentence-transformers scikit-learn joblib
# NOTE(review): none of these four is imported in the visible cells —
# presumably they serve an image-download step elsewhere; confirm before removing.
!pip -q install requests aiohttp aiofiles tenacity



  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for clip (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import random
import time
from pathlib import Path
import numpy as np
import pandas as pd
import joblib
import torch
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error

# Seed Python, NumPy and PyTorch with the same value so runs are repeatable.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Input CSVs live under DATA_DIR; everything this notebook writes goes under BASE.
DATA_DIR = "/content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/dataset"
BASE = "/content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource"
IMG_DIR, EMB_DIR, SUB_DIR = (
    f"{BASE}/{leaf}" for leaf in ("images", "embeddings", "submissions")
)

# Create the output folders up front (no error if they already exist).
for out_dir in (IMG_DIR, EMB_DIR, SUB_DIR):
    os.makedirs(out_dir, exist_ok=True)

print('Dirs ready:', IMG_DIR, EMB_DIR, SUB_DIR)

# Sanity check that the CSV folder is where we expect it.
data_path = Path(DATA_DIR)
data_present = data_path.exists()
print("DATA_DIR exists:", data_present)
if data_present:
    listing = [entry.name for entry in data_path.iterdir()]
else:
    listing = "Not found"
print("Files:", listing)


Dirs ready: /content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/images /content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/embeddings /content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/submissions
DATA_DIR exists: True
Files: ['sample_test_out.csv', 'sample_test.csv', 'test.csv', 'train.csv']


In [None]:
# (Re)load the four challenge CSVs from DATA_DIR into fresh DataFrames.
train, test, sample_in, sample_out = (
    pd.read_csv(f"{DATA_DIR}/{csv_name}")
    for csv_name in ("train.csv", "test.csv", "sample_test.csv", "sample_test_out.csv")
)


In [None]:
##
import re
import numpy as np

# Curly quotes -> ASCII apostrophe, en/em dash -> ASCII hyphen.
_ASCII_PUNCT = str.maketrans({
    '\u2019': "'",
    '\u2018': "'",
    '\u2013': '-',
    '\u2014': '-',
})


def normalize_text(s):
    """Return `s` with typographic quotes/dashes mapped to ASCII and outer whitespace stripped.

    Any non-string input (None, NaN, numbers, ...) yields the empty string.
    """
    if isinstance(s, str):
        return s.translate(_ASCII_PUNCT).strip()
    return ''

def parse_pack_count(s):
    """Extract a pack count from catalog text, defaulting to 1.

    Three patterns are tried in order and the first hit wins:
    "pack of N" / "per case N" / "case of N", then "xN" delimited by
    whitespace or parentheses, then "N-pack" / "N pack" / "Npack".
    Non-string input returns 1.
    """
    if not isinstance(s, str):
        return 1
    text = normalize_text(s)
    # (regex, index of the capture group that holds the count)
    rules = (
        (r'(pack of|per case|case of)\s*(\d+)', 2),
        (r'[\(\s]x\s*(\d+)[\)\s]', 1),
        (r'(\d+)[\-\s]?pack\b', 1),
    )
    for pattern, count_group in rules:
        hit = re.search(pattern, text, flags=re.I)
        if hit is None:
            continue
        try:
            return int(hit.group(count_group))
        except Exception:
            # fall through to the next pattern, as before
            continue
    return 1

# Additional pack-of patterns like "12 ct", "24 count", "6 pcs"
def parse_pack_count_extra(s):
    """Look for secondary pack-count spellings; return the count or None.

    Tried in order:
      1. "<num> ct" / "count" / "pcs" / "pieces"
      2. a generic "<num> x" where the x stands alone as a word

    Returns None for non-string input or when nothing matches, so callers can
    tell "no information" apart from an explicit count.
    """
    if not isinstance(s, str):
        return None
    s = normalize_text(s)
    for pattern in (
        r'(\d+)\s*(?:ct|count|pcs|pieces)\b',
        r'\b(\d+)\s*x\b',
    ):
        m = re.search(pattern, s, flags=re.I)
        if not m:
            continue
        try:
            return int(m.group(1))
        except ValueError:
            # Only a conversion failure should be skipped; a bare `except`
            # would also swallow KeyboardInterrupt/SystemExit.
            continue
    return None

def parse_size(s):
    """Extract the first "<number> <unit>" size from catalog text.

    Returns:
        (value, unit): value is a float; unit is a lower-case key with dots
        and whitespace removed ('floz', 'fluidounce', 'ounce', 'oz', 'ml',
        'l', 'g', 'kg', 'lb', 'lbs'). Returns (np.nan, None) for non-string
        input or when no size is found.

    Plural "ounces" / "fluid ounces" are accepted and folded to the singular
    key. Previously the trailing word boundary made "16 ounces" fail to match
    at all.
    """
    if not isinstance(s, str):
        return np.nan, None
    s = normalize_text(s)
    m = re.search(
        r'(\d+(?:\.\d+)?)\s*(fl\.?\s*oz|fluid\s*ounces?|ounces?|oz|ml|l|g|kg|lbs?)\b',
        s,
        flags=re.I,
    )
    if not m:
        return np.nan, None
    try:
        val = float(m.group(1))
    except ValueError:
        return np.nan, None
    # `\s*` in the pattern can match tabs/newlines too, so strip every kind of
    # whitespace (not only ' ') to land on the canonical key.
    unit = re.sub(r'[.\s]', '', m.group(2).lower())
    if unit.endswith('ounces'):
        unit = unit[:-1]
    return val, unit

# Composite qty patterns: "2x500g", "500 g x 2", "2 x 500 ml"
# Each pattern names its groups so the parser knows which number is the pack
# count and which is the per-item size, instead of guessing from whether the
# first number happens to be an integer (that guess read "500 g x 2" as
# 500 packs of 2 g).
_QTY_SIZE = r'(?P<size>\d+(?:\.\d+)?)'
_QTY_COUNT = r'(?P<count>\d+)'
# Longest alternatives first; where the unit ends the pattern a trailing \b
# stops "5 lb" being read as 5 l and "4 large" as 4 l.
_QTY_UNIT = r'(?P<unit>ml|kg|lbs|lb|oz|l|g)'
_QTY_BLOCKS = [
    # "500 g x 2"
    rf'{_QTY_SIZE}\s*{_QTY_UNIT}\s*x\s*{_QTY_COUNT}',
    # "2 x 500 ml", "2x500g" (the former no-space variants were subsumed by this)
    rf'{_QTY_COUNT}\s*x\s*{_QTY_SIZE}\s*{_QTY_UNIT}\b',
    # "500 g - pack of 2", "500 g - x 2"
    rf'{_QTY_SIZE}\s*{_QTY_UNIT}\s*-\s*(?:pack of|x)\s*{_QTY_COUNT}',
    # "2 pack of 500 g"
    rf'{_QTY_COUNT}\s*pack\s*of\s*{_QTY_SIZE}\s*{_QTY_UNIT}\b',
]


def parse_composite_qty(s):
    """Parse a "count x size unit" expression out of catalog text.

    Returns:
        (count, size_val, unit): int pack count, float per-item size and the
        lower-case unit string, taken from the first pattern in _QTY_BLOCKS
        that matches. Returns (None, None, None) for non-string input or when
        no pattern matches.
    """
    if not isinstance(s, str):
        return None, None, None
    s = normalize_text(s)
    for pat in _QTY_BLOCKS:
        m = re.search(pat, s, flags=re.I)
        if m is None:
            continue
        try:
            count = int(m.group('count'))
            size_val = float(m.group('size'))
        except ValueError:
            # unparseable number: try the next pattern
            continue
        return count, size_val, m.group('unit').lower()
    return None, None, None

# unit key -> (base unit, multiplier converting one such unit into the base unit)
UNIT_TO_BASE = {
    'oz': ('g', 28.3495),
    'ounce': ('g', 28.3495),
    'floz': ('ml', 29.5735),
    'fluidounce': ('ml', 29.5735),
    'ml': ('ml', 1.0),
    'l': ('ml', 1000.0),
    'g': ('g', 1.0),
    'kg': ('g', 1000.0),
    'lb': ('g', 453.592),
    'lbs': ('g', 453.592),
}


def normalize_qty(v, u):
    """Convert value `v` expressed in unit `u` to its base unit via UNIT_TO_BASE.

    Returns (converted_value, base_unit), or (np.nan, None) when the unit is
    None, the value is not finite, or the unit is not in the table.
    """
    unknown = (np.nan, None)
    if u is None:
        return unknown
    if not np.isfinite(v):
        return unknown
    conversion = UNIT_TO_BASE.get(u)
    if conversion is None:
        return unknown
    base_unit, per_unit = conversion
    return v * per_unit, base_unit

def brand_guess(s):
    """Heuristically pick a brand token from catalog text.

    Preference order:
      1. the first whitespace-separated token after an "Item Name:" label
         (the label's value runs up to the next comma or newline);
      2. otherwise the first capitalised word of at least three characters;
      3. otherwise the empty string.
    Non-string input yields ''.
    """
    if not isinstance(s, str):
        return ''
    text = normalize_text(s)

    labelled = re.search(r'Item Name:\s*([^,\n]+)', text, flags=re.I)
    if labelled is not None:
        tokens = labelled.group(1).split()
        if tokens:
            return tokens[0]

    capitalised = re.search(r"\b([A-Z][A-Za-z'-]{2,})\b", text)
    if capitalised is None:
        return ''
    return capitalised.group(1)

# === Apply base parsing ===
# First feature pass, applied in place to both frames: clean the text, then
# derive pack count, size, normalised quantity and a brand guess from it.
for df in [train, test]:
    # Missing descriptions become '' before normalisation.
    df['catalog_content'] = df['catalog_content'].fillna('').apply(normalize_text)
    df['pack_count'] = df['catalog_content'].apply(parse_pack_count)
    # parse_size returns (value, unit) tuples; split them into two columns.
    size_list = df['catalog_content'].apply(parse_size)
    df['size_val']  = [v for v, _ in size_list]
    df['size_unit'] = [u for _, u in size_list]
    # Convert each size to its base unit (see UNIT_TO_BASE); unknown -> (nan, None).
    norm_list = [normalize_qty(v, u) for v, u in zip(df['size_val'], df['size_unit'])]
    df['norm_qty']  = [v for v, _ in norm_list]
    df['norm_unit'] = [u for _, u in norm_list]
    # Rows without a parsed quantity contribute 0 to the total.
    df['total_qty'] = df['pack_count'] * df['norm_qty'].fillna(0)
    df['brand_guess'] = df['catalog_content'].apply(brand_guess)

# === Extend with extra parsing and enriched features ===
# Second pass over both frames: refine pack_count and size using the extra
# parsers, then recompute the normalised and total quantities.
for df in [train, test]:
    # Better pack count if missing: only override when the base parser found
    # nothing (<=1) and the extra parser found a real multiple (>1).
    pc_extra = df['catalog_content'].apply(parse_pack_count_extra).fillna(0).astype(int)
    df['pack_count'] = np.where((df['pack_count']<=1) & (pc_extra>1), pc_extra, df['pack_count'])

    # Composite quantity like "2x500g": unpack (count, size, unit) tuples,
    # mapping a missing count/size to NaN and keeping a missing unit as None.
    comp = df['catalog_content'].apply(parse_composite_qty)
    # NOTE(review): comp_count is built but not read anywhere in this cell —
    # the composite count never feeds pack_count; confirm whether that is intended.
    comp_count = [c if c is not None else np.nan for c,_,_ in comp]
    comp_size  = [v if v is not None else np.nan for _,v,_ in comp]
    comp_unit  = [u if u is not None else None for _,_,u in comp]

    # Prefer composite when present
    df['size_val']  = np.where(np.isfinite(comp_size), comp_size, df['size_val'])
    df['size_unit'] = np.where(pd.Series(comp_unit).notna(), comp_unit, df['size_unit'])

    # Re-normalize quantities and totals with the possibly updated size/unit
    norm_list2 = [normalize_qty(v, u) for v, u in zip(df['size_val'], df['size_unit'])]
    df['norm_qty']  = [v for v,_ in norm_list2]
    df['norm_unit'] = [u for _,u in norm_list2]
    df['total_qty'] = df['pack_count'] * df['norm_qty'].fillna(0)

# === Numeric features list (extended) ===
num_cols = ['pack_count', 'size_val', 'norm_qty', 'total_qty']

# === Additional text-based features for SMAPE boost ===
# Same three text statistics on both frames: sale/discount flag, word count,
# and characters per (word count + 1).
for df in (train, test):
    content = df['catalog_content']
    df['has_sale'] = content.str.contains('sale|discount', case=False).astype(int)
    df['text_word_cnt'] = content.str.split().str.len()
    df['avg_wlen'] = content.str.len() / (df['text_word_cnt'] + 1)

num_cols += ['has_sale', 'text_word_cnt', 'avg_wlen']

# Robust extras used by models: presence flags plus log-scaled quantities.
for df in (train, test):
    df['has_qty'] = df['norm_qty'].notna().astype(int)
    df['log_total_qty'] = np.log1p(df['total_qty'].fillna(0))
    per_pack = df['norm_qty'].fillna(0) / np.maximum(df['pack_count'], 1)
    df['qty_per_pack'] = np.log1p(per_pack)
    df['has_pack'] = (df['pack_count'] > 1).astype(int)

extra_num = ['has_qty', 'log_total_qty', 'qty_per_pack', 'has_pack']
# dict.fromkeys de-duplicates while keeping first-seen order.
num_cols = list(dict.fromkeys(num_cols + extra_num))

# Every numeric feature ends up as a NaN-free float column in both frames.
for df in (train, test):
    for col in num_cols:
        df[col] = df[col].fillna(0).astype(float)

print('Numeric feature summary:')
print(train[num_cols].describe())
print('Brand guess examples:', train['brand_guess'].value_counts().head(10).to_dict())


Numeric feature summary:
         pack_count      size_val      norm_qty     total_qty      has_sale  \
count  75000.000000  75000.000000  7.500000e+04  7.500000e+04  75000.000000   
mean      13.850880     17.755865  6.512404e+02  9.660751e+03      0.007133   
std      509.352042     95.467490  6.397394e+04  1.377993e+06      0.084158   
min        0.000000      0.000000  0.000000e+00  0.000000e+00      0.000000   
25%        1.000000      0.000000  0.000000e+00  0.000000e+00      0.000000   
50%        1.000000      4.250000  1.417475e+02  2.500000e+02      0.000000   
75%        6.000000     12.000000  4.036783e+02  9.071840e+02      0.000000   
max    93207.000000  17400.000000  1.740000e+07  3.740008e+08      1.000000   

       text_word_cnt      avg_wlen       has_qty  log_total_qty  qty_per_pack  \
count   75000.000000  75000.000000  75000.000000   75000.000000  75000.000000   
mean      147.851693      5.949507      0.714240       4.481155      3.347319   
std       137.068731

In [None]:
# Image embedding with CLIP. Uses local images in IMG_DIR named <sample_id>.jpg (or .png)
try:
    import clip
    use_openai_clip = True
except Exception:
    use_openai_clip = False
    try:
        import open_clip
    except Exception:
        pass

from PIL import Image
import tqdm
def load_image_for_sid(sid):
    """Return the path of the local image for `sid`, or None if there is none.

    Looks in IMG_DIR for "<sid>.jpg" first, then "<sid>.png".
    """
    for ext in ('jpg', 'png'):
        candidate = Path(IMG_DIR) / f"{sid}.{ext}"
        if candidate.exists():
            return candidate
    return None

def encode_images(df, out_path):
    """Embed each row's product image with CLIP ViT-B/32 and cache the result.

    Args:
        df: frame with a 'sample_id' column; images are looked up through
            load_image_for_sid.
        out_path: pathlib.Path of the .npy file to write.

    Returns:
        float32 array with one row per entry of df['sample_id']. Rows whose
        image is missing or cannot be encoded are all zeros; the number of
        such rows is printed so a mostly-empty image folder is noticed.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if use_openai_clip:
        model, preprocess = clip.load('ViT-B/32', device=device)
    else:
        import open_clip
        model, _, preprocess = open_clip.create_model_and_transforms(
            'ViT-B-32', pretrained='laion2b_s34b_b79k')
        model.to(device)
    model.eval()
    emb_dim = model.visual.output_dim if hasattr(model, 'visual') else 512
    blank = np.zeros(emb_dim, dtype=np.float32)

    out = []
    n_missing = 0
    n_failed = 0
    for sid in tqdm.tqdm(df['sample_id'].tolist(), desc=f'encode {out_path.name}'):
        p = load_image_for_sid(sid)
        if p is None:
            n_missing += 1
            out.append(blank)
            continue
        try:
            # Context manager closes the file handle; convert() loads the pixels.
            with Image.open(p) as im:
                rgb = im.convert('RGB')
            x = preprocess(rgb).unsqueeze(0).to(device)
            with torch.no_grad():
                emb = model.encode_image(x)
            # Cast so every row is float32 whatever precision the model ran in.
            out.append(emb.float().cpu().numpy().reshape(-1))
        except Exception:
            # Best effort: one unreadable image must not abort the whole run.
            n_failed += 1
            out.append(blank)

    if n_missing or n_failed:
        print(f'{out_path.name}: {n_missing} images missing, '
              f'{n_failed} failed to encode (zero vectors used)')

    out = np.vstack(out) if out else np.zeros((0, emb_dim), dtype=np.float32)
    # Write to a temp file then rename, so an interrupted save cannot leave a
    # truncated .npy that the cache check would later accept.
    tmp_path = out_path.with_suffix('.tmp.npy')
    np.save(tmp_path, out)
    tmp_path.replace(out_path)
    return out

# Image-embedding cache: reuse both .npy files when they exist, otherwise
# encode train and test from scratch (encode_images writes the files).
_emb_root = Path(EMB_DIR)
IMG_EMB_TRAIN = _emb_root / 'img_train.npy'
IMG_EMB_TEST = _emb_root / 'img_test.npy'
if not (IMG_EMB_TRAIN.exists() and IMG_EMB_TEST.exists()):
    img_emb_train = encode_images(train, IMG_EMB_TRAIN)
    img_emb_test = encode_images(test, IMG_EMB_TEST)
else:
    img_emb_train = np.load(IMG_EMB_TRAIN)
    img_emb_test = np.load(IMG_EMB_TEST)
print('img_emb_train', img_emb_train.shape)


img_emb_train (75000, 512)


In [None]:
# Text embeddings (SBERT)
# Loads cached embeddings when both files exist; otherwise encodes the catalog
# text with all-MiniLM-L6-v2, L2-normalises the rows and caches them.
try:
    from sentence_transformers import SentenceTransformer
except Exception as err:
    raise RuntimeError('Please install sentence-transformers') from err
sbert_model = 'all-MiniLM-L6-v2'
TEXT_EMB_TRAIN = Path(EMB_DIR)/'text_train.npy'
TEXT_EMB_TEST  = Path(EMB_DIR)/'text_test.npy'
if TEXT_EMB_TRAIN.exists() and TEXT_EMB_TEST.exists():
    text_emb_train = np.load(TEXT_EMB_TRAIN)
    text_emb_test  = np.load(TEXT_EMB_TEST)
else:
    print('Loading SBERT model', sbert_model)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    sbert = SentenceTransformer(sbert_model, device=device)
    # Half precision is only a win on GPU; keep full precision on CPU.
    if device == 'cuda':
        sbert._first_module().auto_model.half()
    sbert.max_seq_length = 256

    def chunked_encode(texts, chunks=8, bs=64):
        """Encode `texts` in `chunks` consecutive slices; return L2-normalised float32 rows."""
        out = []
        n = len(texts)
        for i in range(chunks):
            sub = texts[i*n//chunks:(i+1)*n//chunks]
            if not sub:
                # fewer texts than chunks: nothing to encode for this slice
                continue
            out.append(sbert.encode(sub, batch_size=bs, convert_to_numpy=True))
        # Normalise inline: the l2_normalize_rows helper is only defined in a
        # later cell, so calling it here raised NameError on a cold cache.
        emb = np.vstack(out).astype(np.float32, copy=False)
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        return emb / (norms + 1e-8)

    text_emb_train = chunked_encode(train['catalog_content'].tolist())
    text_emb_test  = chunked_encode(test['catalog_content'].tolist())

    # Save to a temp name and rename so a partial write is never seen as a valid cache.
    tmp_train = TEXT_EMB_TRAIN.with_suffix('.tmp.npy')
    tmp_test  = TEXT_EMB_TEST.with_suffix('.tmp.npy')
    np.save(tmp_train, text_emb_train); tmp_train.replace(TEXT_EMB_TRAIN)
    np.save(tmp_test,  text_emb_test);  tmp_test.replace(TEXT_EMB_TEST)
print('text_emb_train', text_emb_train.shape)


text_emb_train (75000, 384)


In [None]:
##
# Text embeddings (SBERT) with model fallback + L2 normalization
try:
    from sentence_transformers import SentenceTransformer
except Exception:
    raise RuntimeError('Please install sentence-transformers')

def l2_normalize_rows(a, eps=1e-8):
    """Scale each row of 2-D array `a` to (approximately) unit Euclidean length.

    The input is viewed as float32 (no copy when it already is) and `eps` is
    added to each row norm so all-zero rows divide safely and stay zero.
    """
    rows = a.astype(np.float32, copy=False)
    row_norms = np.linalg.norm(rows, axis=1, keepdims=True)
    safe_norms = row_norms + eps
    return rows / safe_norms

# Prefer stronger checkpoint, with safe fallbacks
candidates = ['all-mpnet-base-v2', 'all-MiniLM-L12-v2', 'all-MiniLM-L6-v2']
TEXT_EMB_TRAIN = Path(EMB_DIR)/'text_train.npy'
TEXT_EMB_TEST  = Path(EMB_DIR)/'text_test.npy'
MODEL_TAG      = Path(EMB_DIR)/'text_model.txt'

if TEXT_EMB_TRAIN.exists() and TEXT_EMB_TEST.exists():
    text_emb_train = np.load(TEXT_EMB_TRAIN)
    text_emb_test  = np.load(TEXT_EMB_TEST)
    # The cache files are shared with any other cell writing text_train.npy /
    # text_test.npy, so say which model they came from instead of silently
    # reusing embeddings from a weaker checkpoint.
    cached_model = MODEL_TAG.read_text().strip() if MODEL_TAG.exists() else None
    if cached_model != candidates[0]:
        print(f'WARNING: cached text embeddings come from '
              f'{cached_model or "an unrecorded model"}, not {candidates[0]}. '
              f'Delete {TEXT_EMB_TRAIN.name} and {TEXT_EMB_TEST.name} '
              f'(and any PCA fitted on them) to re-encode.')
else:
    loaded = False
    last_err = None
    # Try each checkpoint in preference order; the first that loads AND
    # encodes both splits wins.
    for sbert_model in candidates:
        try:
            print('Loading SBERT model', sbert_model)
            sbert = SentenceTransformer(sbert_model)
            text_emb_train = sbert.encode(
                train['catalog_content'].tolist(),
                batch_size=64, show_progress_bar=True, convert_to_numpy=True
            )
            text_emb_test  = sbert.encode(
                test['catalog_content'].tolist(),
                batch_size=64, show_progress_bar=True, convert_to_numpy=True
            )
            # L2 normalize
            text_emb_train = l2_normalize_rows(text_emb_train)
            text_emb_test  = l2_normalize_rows(text_emb_test)
            # Cache the embeddings and record which model produced them
            np.save(TEXT_EMB_TRAIN, text_emb_train)
            np.save(TEXT_EMB_TEST, text_emb_test)
            with open(MODEL_TAG, 'w') as f:
                f.write(sbert_model)
            loaded = True
            break
        except Exception as e:
            print('Failed on', sbert_model, '->', e)
            last_err = e
    if not loaded:
        # Chain the last failure so the root cause shows in the traceback.
        raise RuntimeError('Could not load any Sentence-Transformers model.') from last_err

print('text_emb_train', text_emb_train.shape)


text_emb_train (75000, 384)


In [None]:
# Tabular features
train['brand_guess'] = train['brand_guess'].fillna('').astype(str)
brand_freq = train['brand_guess'].value_counts().to_dict()


def _brand_count(name):
    """Number of training rows sharing this guessed brand (0 when unseen)."""
    return brand_freq.get(name, 0)


train['brand_freq'] = train['brand_guess'].map(_brand_count).astype(float)
test['brand_freq'] = test['brand_guess'].map(_brand_count).fillna(0).astype(float)
tab_cols = ['pack_count', 'size_val', 'norm_qty', 'total_qty', 'brand_freq']
tab_train = train[tab_cols].fillna(0).values.astype(float)
tab_test = test[tab_cols].fillna(0).values.astype(float)

# PCA dims (reduce SBERT & CLIP dims for speed)
PCA_TEXT_DIM = 128
PCA_IMG_DIM = 128
pca_text_path = Path(EMB_DIR) / 'pca_text.joblib'
pca_img_path = Path(EMB_DIR) / 'pca_img.joblib'


def _load_or_fit_pca(cache_path, n_components, emb_train, emb_test):
    """Return (pca, projected_train, projected_test).

    A PCA saved at `cache_path` is loaded and applied as-is; otherwise a new
    one is fitted on `emb_train` and written to `cache_path`.
    """
    if cache_path.exists():
        fitted = joblib.load(cache_path)
        return fitted, fitted.transform(emb_train), fitted.transform(emb_test)
    fitted = PCA(n_components=n_components, random_state=SEED)
    proj_train = fitted.fit_transform(emb_train)
    proj_test = fitted.transform(emb_test)
    joblib.dump(fitted, cache_path)
    return fitted, proj_train, proj_test


pca_text, text_p_train, text_p_test = _load_or_fit_pca(
    pca_text_path, PCA_TEXT_DIM, text_emb_train, text_emb_test)
pca_img, img_p_train, img_p_test = _load_or_fit_pca(
    pca_img_path, PCA_IMG_DIM, img_emb_train, img_emb_test)
print('Reduced shapes:', text_p_train.shape, img_p_train.shape)

# Design matrices: [text PCA | image PCA | tabular]; target is log1p(price).
X_train = np.hstack([text_p_train, img_p_train, tab_train])
X_test = np.hstack([text_p_test, img_p_test, tab_test])
y_train = np.log1p(train['price'].values)
print('Final feature shapes', X_train.shape, X_test.shape)


Reduced shapes: (75000, 128) (75000, 128)
Final feature shapes (75000, 261) (75000, 261)


In [None]:
##
# === Tabular features (extended) ===
# brand_freq = how many training rows share the guessed brand (0 when unseen).
train['brand_guess'] = train['brand_guess'].fillna('').astype(str)
brand_freq = train['brand_guess'].value_counts().to_dict()
train['brand_freq'] = train['brand_guess'].map(lambda b: brand_freq.get(b, 0)).astype(float)
test['brand_freq'] = test['brand_guess'].map(lambda b: brand_freq.get(b, 0)).fillna(0).astype(float)

# Pick up whichever engineered columns exist on train
# ('brand_te' is only there if target encoding was added elsewhere).
extra_num = [
    name
    for name in ('has_qty', 'log_total_qty', 'qty_per_pack', 'has_pack', 'brand_te')
    if name in train.columns
]

tab_cols = ['pack_count', 'size_val', 'norm_qty', 'total_qty', 'brand_freq'] + extra_num
tab_train, tab_test = (
    frame[tab_cols].fillna(0).values.astype(float) for frame in (train, test)
)

# === PCA for SBERT & CLIP ===
PCA_TEXT_DIM = 128
PCA_IMG_DIM  = 128
pca_text_path = Path(EMB_DIR)/'pca_text.joblib'
pca_img_path  = Path(EMB_DIR)/'pca_img.joblib'


def _fit_or_load_pca(cache_path, n_components, emb_train, emb_test):
    """Return (pca, projected_train, projected_test), reusing a cached PCA only if it fits.

    A cached model is accepted only when its components_ matrix has shape
    (n_components, emb_train.shape[1]). A PCA fitted on embeddings of another
    width (e.g. a different text model) or for another target dimension is
    discarded and refitted rather than failing in transform() or silently
    producing a different number of features.
    """
    expected_shape = (n_components, emb_train.shape[1])
    if cache_path.exists():
        pca = joblib.load(cache_path)
        if getattr(pca, 'components_', np.empty((0, 0))).shape == expected_shape:
            return pca, pca.transform(emb_train), pca.transform(emb_test)
        print(f'Cached PCA {cache_path.name} does not match {expected_shape}; refitting.')
    pca = PCA(n_components=n_components, random_state=SEED)
    p_train = pca.fit_transform(emb_train)
    p_test = pca.transform(emb_test)
    joblib.dump(pca, cache_path)
    return pca, p_train, p_test


pca_text, text_p_train, text_p_test = _fit_or_load_pca(
    pca_text_path, PCA_TEXT_DIM, text_emb_train, text_emb_test)
pca_img, img_p_train, img_p_test = _fit_or_load_pca(
    pca_img_path, PCA_IMG_DIM, img_emb_train, img_emb_test)

print('Reduced shapes:', text_p_train.shape, img_p_train.shape)

# === TF-IDF -> SVD channel ===
# Reuse svd_train / svd_test when an earlier cell already produced them;
# otherwise fit word 1-2-gram TF-IDF on train and reduce it to 128 SVD components.
_svd_ready = 'svd_train' in globals() and 'svd_test' in globals()
if not _svd_ready:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.decomposition import TruncatedSVD
    tfv = TfidfVectorizer(min_df=5, max_df=0.9, ngram_range=(1,2), max_features=50000)
    tfv_train = tfv.fit_transform(train['catalog_content'])
    tfv_test = tfv.transform(test['catalog_content'])
    svd = TruncatedSVD(n_components=128, random_state=SEED)
    svd_train_use = svd.fit_transform(tfv_train)
    svd_test_use = svd.transform(tfv_test)
else:
    svd_train_use, svd_test_use = svd_train, svd_test

# === Assemble design matrices ===
# Column blocks, left to right: text PCA | image PCA | TF-IDF SVD | tabular.
X_train = np.concatenate([text_p_train, img_p_train, svd_train_use, tab_train], axis=1)
X_test = np.concatenate([text_p_test, img_p_test, svd_test_use, tab_test], axis=1)

# === Targets in log space with winsorization ===
# Use y_log_clipped if an earlier cell defined it; otherwise clip log1p(price)
# to its 0.5th-99.5th percentile range (lo/hi are then available to later cells).
if 'y_log_clipped' not in globals():
    y_true_price = train['price'].values
    y_log = np.log1p(y_true_price)
    lo, hi = np.percentile(y_log, [0.5, 99.5])
    y_train = np.clip(y_log, lo, hi)
else:
    y_train = y_log_clipped

print('Final feature shapes', X_train.shape, X_test.shape)

# === Standardize X for linear/MLP models ===
# Statistics are fitted on train only and then applied to test.
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)


Reduced shapes: (75000, 128) (75000, 128)
Final feature shapes (75000, 393) (75000, 393)


In [None]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
class SmallRegressor(nn.Module):
    """MLP regressor: in_dim -> 512 -> 128 -> 1 with ReLU and dropout (0.2, 0.1)."""

    def __init__(self, in_dim):
        super().__init__()
        layers = []
        width_in = in_dim
        # Hidden blocks are Linear -> ReLU -> Dropout, built in forward order.
        for width_out, p_drop in ((512, 0.2), (128, 0.1)):
            layers += [nn.Linear(width_in, width_out), nn.ReLU(), nn.Dropout(p_drop)]
            width_in = width_out
        layers.append(nn.Linear(width_in, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions of shape (batch, 1)."""
        return self.net(x)

def train_epoch(model, opt, loader, device):
    """Run one optimisation pass over `loader` with an L1 (mean absolute error) loss.

    Returns the sample-weighted mean batch loss over the epoch.
    """
    model.train()
    running = 0.0
    for features, target in loader:
        features = features.to(device)
        target = target.to(device)
        opt.zero_grad()
        pred = model(features).squeeze(1)
        loss = (pred - target).abs().mean()
        loss.backward()
        opt.step()
        # weight by batch size so a short final batch is averaged correctly
        running += loss.item() * features.size(0)
    return running / len(loader.dataset)

def valid_epoch(model, loader, device):
    """Predict every batch of `loader` in eval mode without gradients.

    Returns (predictions, targets) as 1-D NumPy arrays in loader order.
    """
    model.eval()
    pred_chunks, true_chunks = [], []
    with torch.no_grad():
        for features, target in loader:
            batch_pred = model(features.to(device)).squeeze(1)
            pred_chunks.append(batch_pred.cpu().numpy())
            true_chunks.append(target.to(device).cpu().numpy())
    return np.concatenate(pred_chunks), np.concatenate(true_chunks)

# Baseline 5-fold CV: a SmallRegressor MLP and a Ridge model per fold, both on
# standardized features with a log-space target. Produces out-of-fold (OOF)
# predictions and fold-averaged test predictions for each model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device', device)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)
joblib.dump(scaler, Path(EMB_DIR)/'scaler.joblib')

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)
oof_mlp = np.zeros(len(train)); test_mlp = np.zeros(len(test))
oof_ridge = np.zeros(len(train)); test_ridge = np.zeros(len(test))
for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train_s)):
    print(f'Fold {fold+1}/{n_splits}')
    X_tr, X_va = X_train_s[tr_idx], X_train_s[va_idx]
    y_tr, y_va = y_train[tr_idx], y_train[va_idx]
    # MLP
    model = SmallRegressor(X_tr.shape[1]).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
    # `verbose` is deprecated in recent PyTorch releases, so it is not passed.
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.5, patience=2)
    tr_ds = TensorDataset(torch.tensor(X_tr,dtype=torch.float32), torch.tensor(y_tr,dtype=torch.float32))
    va_ds = TensorDataset(torch.tensor(X_va,dtype=torch.float32), torch.tensor(y_va,dtype=torch.float32))
    tr_loader = DataLoader(tr_ds, batch_size=256, shuffle=True)
    va_loader = DataLoader(va_ds, batch_size=256, shuffle=False)
    best_val = 1e9; best_state=None
    for epoch in range(1,7):
        tr_loss = train_epoch(model, opt, tr_loader, device)
        val_preds_log, val_trues_log = valid_epoch(model, va_loader, device)
        val_mae = np.mean(np.abs(val_preds_log - val_trues_log))
        # The scheduler was created but never stepped, so it had no effect.
        scheduler.step(val_mae)
        print(f' Epoch {epoch}: tr_loss={tr_loss:.5f} val_mae_log={val_mae:.5f}')
        if val_mae < best_val:
            # keep a CPU copy of the best weights seen so far
            best_val = val_mae; best_state={k:v.cpu().clone() for k,v in model.state_dict().items()}
    # best_state stays None if no epoch beat the initial bound (e.g. NaN loss);
    # keep the final weights rather than crashing in load_state_dict(None).
    if best_state is not None:
        model.load_state_dict(best_state)
    # oof
    model.eval()
    with torch.no_grad():
        oof_mlp[va_idx] = model(torch.tensor(X_va,dtype=torch.float32).to(device)).cpu().numpy().squeeze()
        # test preds, in mini-batches, averaged over folds
        preds = []
        bs=512
        for i in range(0, len(X_test_s), bs):
            xb = torch.tensor(X_test_s[i:i+bs], dtype=torch.float32).to(device)
            preds.append(model(xb).cpu().numpy())
        test_mlp += np.concatenate(preds).squeeze() / n_splits
    # Ridge
    ridge = Ridge(alpha=1.0)
    ridge.fit(X_tr, y_tr)
    oof_ridge[va_idx] = ridge.predict(X_va)
    test_ridge += ridge.predict(X_test_s) / n_splits
print('Converting back to price space and computing SMAPE...')
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each term is |y_true - y_pred| / ((|y_true| + |y_pred|) / 2); a zero
    denominator is replaced by 1 so exact 0-vs-0 pairs contribute 0.
    """
    abs_err = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    scale = np.where(scale != 0, scale, 1.0)
    return np.mean(abs_err / scale) * 100
# Undo the log1p target transform, then score each model's OOF predictions.
oof_mlp_price, oof_ridge_price = np.expm1(oof_mlp), np.expm1(oof_ridge)
for _label, _oof_price in (('MLP OOF SMAPE:', oof_mlp_price),
                           ('Ridge OOF SMAPE:', oof_ridge_price)):
    print(_label, smape(train['price'].values, _oof_price))


Device cuda
Fold 1/5
 Epoch 1: tr_loss=0.70308 val_mae_log=0.61639
 Epoch 2: tr_loss=0.58797 val_mae_log=0.59957
 Epoch 3: tr_loss=0.56427 val_mae_log=0.58880
 Epoch 4: tr_loss=0.54974 val_mae_log=0.58203
 Epoch 5: tr_loss=0.53619 val_mae_log=0.58850
 Epoch 6: tr_loss=0.52788 val_mae_log=0.57164
Fold 2/5
 Epoch 1: tr_loss=0.68924 val_mae_log=0.59757
 Epoch 2: tr_loss=0.58852 val_mae_log=0.58872
 Epoch 3: tr_loss=0.56522 val_mae_log=0.56694
 Epoch 4: tr_loss=0.54802 val_mae_log=0.55710
 Epoch 5: tr_loss=0.53916 val_mae_log=0.56297
 Epoch 6: tr_loss=0.52667 val_mae_log=0.55286
Fold 3/5
 Epoch 1: tr_loss=0.69368 val_mae_log=0.60344
 Epoch 2: tr_loss=0.58915 val_mae_log=0.58040
 Epoch 3: tr_loss=0.56573 val_mae_log=0.56735
 Epoch 4: tr_loss=0.54935 val_mae_log=0.55691
 Epoch 5: tr_loss=0.53703 val_mae_log=0.55902
 Epoch 6: tr_loss=0.52984 val_mae_log=0.54927
Fold 4/5
 Epoch 1: tr_loss=0.70127 val_mae_log=0.58104
 Epoch 2: tr_loss=0.59334 val_mae_log=0.57145
 Epoch 3: tr_loss=0.56807 val_ma

### Training Method V1

In [None]:
#
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# --- Small MLP with BatchNorm and dropout ---
# AFTER
class BalancedRegressor(nn.Module):
    """MLP regressor: in_dim -> 768 -> 384 -> 128 -> 1.

    Every hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout, with
    dropout rates 0.15, 0.1 and 0.05 from first to last block.
    """

    def __init__(self, in_dim):
        super().__init__()
        stack = []
        width_in = in_dim
        for width_out, p_drop in ((768, 0.15), (384, 0.1), (128, 0.05)):
            stack.extend([
                nn.Linear(width_in, width_out),
                nn.BatchNorm1d(width_out),
                nn.ReLU(),
                nn.Dropout(p_drop),
            ])
            width_in = width_out
        stack.append(nn.Linear(width_in, 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Return predictions of shape (batch, 1)."""
        return self.net(x)


# --- Train/valid with Huber loss in log-space ---
def train_epoch(model, opt, loader, device, criterion):
    """Run one optimisation pass over `loader`, minimising `criterion(pred, target)`.

    Returns the sample-weighted mean batch loss over the epoch.
    """
    model.train()
    loss_sum = 0.0
    for batch_x, batch_y in loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        opt.zero_grad()
        batch_loss = criterion(model(batch_x).squeeze(1), batch_y)
        batch_loss.backward()
        opt.step()
        loss_sum += batch_loss.item() * batch_x.size(0)
    return loss_sum / len(loader.dataset)

def valid_epoch(model, loader, device):
    """Collect the model's predictions and the targets for every batch in `loader`.

    Runs in eval mode with gradients disabled and returns two 1-D NumPy
    arrays, (predictions, targets), in loader order.
    """
    model.eval()
    collected = []
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            scores = model(batch_x).squeeze(1)
            collected.append((scores.cpu().numpy(), batch_y.cpu().numpy()))
    preds = np.concatenate([p for p, _ in collected])
    trues = np.concatenate([t for _, t in collected])
    return preds, trues

# V1 training: 5-fold CV of three models on the standardized features with a
# winsorized log-price target — BalancedRegressor MLP (Huber loss), Ridge on a
# standardized target, and LightGBM — blended with fixed weights in log space.
import lightgbm as lgb  # imported once here instead of on every fold

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Device', device)

# X_train_s, X_test_s, y_train (winsorized log) and scaler should be defined earlier
joblib.dump(scaler, Path(EMB_DIR)/'scaler.joblib')

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# Upper bound on MLP epochs per fold (same limit as the former range(1, 50));
# early stopping below normally ends training sooner.
MAX_EPOCHS = 49

# Fixed blend weights in log space: MLP / Ridge / LightGBM.
W_MLP, W_RIDGE, W_LGB = 0.4, 0.35, 0.25

lgb_params = {
    'objective': 'regression', 'metric': 'l1', 'learning_rate': 0.03,
    'num_leaves': 63, 'max_depth': 7, 'feature_fraction': 0.85,
    # bagging_fraction only takes effect together with a non-zero bagging_freq;
    # without it the row subsampling was never applied.
    'bagging_fraction': 0.85, 'bagging_freq': 1,
    'min_child_samples': 50, 'verbosity': -1,
    'seed': SEED,  # reproducibility
}

oof_mlp_log   = np.zeros(len(train)); pred_mlp_log   = np.zeros(len(test))
oof_ridge_log = np.zeros(len(train)); pred_ridge_log = np.zeros(len(test))
pred_lgb_log = np.zeros(len(test))
oof_ensemble_log = np.zeros(len(train))

for fold, (tr_idx, va_idx) in enumerate(kf.split(X_train_s)):
    print(f'Fold {fold+1}/{n_splits}')
    X_tr, X_va = X_train_s[tr_idx], X_train_s[va_idx]
    y_tr, y_va = y_train[tr_idx], y_train[va_idx]  # log space

    # --- MLP ---
    model = BalancedRegressor(X_tr.shape[1]).to(device)
    opt = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', factor=0.7, patience=3)

    criterion = nn.SmoothL1Loss(beta=0.1)  # Huber in log-space

    tr_ds = TensorDataset(torch.tensor(X_tr,dtype=torch.float32), torch.tensor(y_tr,dtype=torch.float32))
    va_ds = TensorDataset(torch.tensor(X_va,dtype=torch.float32), torch.tensor(y_va,dtype=torch.float32))
    tr_loader = DataLoader(tr_ds, batch_size=256, shuffle=True, drop_last=False)
    va_loader = DataLoader(va_ds, batch_size=256, shuffle=False, drop_last=False)

    best_val = 1e9; best_state=None; wait=0; patience=4
    for epoch in range(1, MAX_EPOCHS + 1):
        tr_loss = train_epoch(model, opt, tr_loader, device, criterion)
        val_preds_log, val_trues_log = valid_epoch(model, va_loader, device)
        val_mae = np.mean(np.abs(val_preds_log - val_trues_log))
        scheduler.step(val_mae)
        print(f' Epoch {epoch}: tr_loss={tr_loss:.5f} val_mae_log={val_mae:.5f}')
        if val_mae < best_val - 1e-4:
            best_val = val_mae; best_state={k:v.cpu().clone() for k,v in model.state_dict().items()}
            wait=0
        else:
            wait += 1
            # Halve the LR on every non-improving epoch (in addition to the
            # plateau scheduler), then stop after `patience` such epochs in a row.
            for g in opt.param_groups: g['lr'] *= 0.5
            if wait >= patience:
                break
    if best_state is not None:
        model.load_state_dict(best_state)

    # OOF and test predictions (log space)
    model.eval()
    with torch.no_grad():
        oof_mlp_log[va_idx] = model(torch.tensor(X_va,dtype=torch.float32).to(device)).cpu().numpy().squeeze()
        preds = []
        bs=512
        for i in range(0, len(X_test_s), bs):
            xb = torch.tensor(X_test_s[i:i+bs], dtype=torch.float32).to(device)
            preds.append(model(xb).cpu().numpy())
        pred_mlp_log += np.concatenate(preds).squeeze() / n_splits

    # --- Ridge (log-space target standardized per fold) ---
    y_scaler = StandardScaler().fit(y_tr.reshape(-1,1))
    ridge = Ridge(alpha=1.0, random_state=SEED)
    ridge.fit(X_tr, y_scaler.transform(y_tr.reshape(-1,1)).ravel())

    pred_va_std = ridge.predict(X_va)
    oof_ridge_log[va_idx] = y_scaler.inverse_transform(pred_va_std.reshape(-1,1)).ravel()

    # Test predictions are averaged across the fold models.
    pred_test_std = ridge.predict(X_test_s)
    pred_ridge_log += y_scaler.inverse_transform(pred_test_std.reshape(-1,1)).ravel() / n_splits

    # --- LightGBM (same scaled features, log-space target) ---
    lgb_train = lgb.Dataset(X_tr, y_tr)
    lgb_valid = lgb.Dataset(X_va, y_va, reference=lgb_train)
    lgb_model = lgb.train(
        dict(lgb_params),  # fresh copy per fold so the shared dict is never mutated
        lgb_train, num_boost_round=500,
        valid_sets=[lgb_valid],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(0)]
    )

    pred_va_lgb_log = lgb_model.predict(X_va)
    pred_lgb_log += lgb_model.predict(X_test_s) / n_splits

    # === OOF Blending in log-space ===
    oof_ensemble_log[va_idx] = (W_MLP * oof_mlp_log[va_idx] +
                                W_RIDGE * oof_ridge_log[va_idx] +
                                W_LGB * pred_va_lgb_log)

# === Ensemble test predictions in log-space ===
blend_test_log = (W_MLP * pred_mlp_log +
                  W_RIDGE * pred_ridge_log +
                  W_LGB * pred_lgb_log)


print('Converting back to price space and computing SMAPE...')

# --- Safe inverse + SMAPE ---
def safe_expm1_clip(y_log, lo, hi):
    """Invert a log1p transform after bounding the input; floor the result at 0.01.

    Args:
        y_log: array-like of values in log1p space.
        lo: lower bound applied to ``y_log`` before inversion (callers in this
            notebook pass the winsorization bounds).
        hi: upper bound applied to ``y_log`` before inversion.

    Returns:
        float64 array of ``expm1(clip(y_log, lo, hi))`` in which every entry
        below 0.01 has been raised to 0.01.
    """
    bounded_log = np.clip(y_log, lo, hi).astype(np.float64)
    return np.maximum(np.expm1(bounded_log), 0.01)

def smape_safe(y_true, y_pred):
    """Return a floored-denominator SMAPE, in percent.

    Computes ``mean(|y_true - y_pred| / max((|y_true| + |y_pred|) / 2, 1.0)) * 100``.
    The 1.0 floor on the denominator prevents division by (near-)zero; as a
    consequence, pairs whose mean magnitude is below 1 are scored on absolute
    rather than relative error, so the value can differ from textbook SMAPE.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.
            Inputs are converted to plain arrays and compared positionally.

    Returns:
        The metric as a float64 percentage.
    """
    # np.asarray accepts lists, tuples, pandas Series and scalars, whereas
    # calling .astype on the argument only works for objects that expose it.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = np.maximum((np.abs(y_true) + np.abs(y_pred)) / 2.0, 1.0)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

# The log1p-space clipping bounds `lo` / `hi` are reused when they already
# exist in the notebook namespace; if either one is missing they are derived
# here as the 0.5th / 99.5th percentiles of log1p(train price).
if not ('lo' in globals() and 'hi' in globals()):
    y_true_price = train['price'].values
    y_log = np.log1p(y_true_price)
    lo, hi = np.percentile(y_log, [0.5, 99.5])


# Score the fixed-weight OOF ensemble in price space against the raw prices.
oof_ensemble_price = safe_expm1_clip(oof_ensemble_log, lo, hi)
ensemble_smape = smape_safe(train['price'].values, oof_ensemble_price)
print('Ensemble OOF SMAPE:', ensemble_smape)

# Per-model OOF scores are intentionally not printed; re-enable if needed:
# print('MLP OOF SMAPE:', smape_safe(train['price'].values, safe_expm1_clip(oof_mlp_log, lo, hi)))
# print('Ridge OOF SMAPE:', smape_safe(train['price'].values, safe_expm1_clip(oof_ridge_log, lo, hi)))

Device cuda
Fold 1/5
 Epoch 1: tr_loss=0.96619 val_mae_log=0.59676
 Epoch 2: tr_loss=0.52409 val_mae_log=0.56979
 Epoch 3: tr_loss=0.48329 val_mae_log=0.56791
 Epoch 4: tr_loss=0.45854 val_mae_log=0.55266
 Epoch 5: tr_loss=0.43919 val_mae_log=0.54349
 Epoch 6: tr_loss=0.42414 val_mae_log=0.54236
 Epoch 7: tr_loss=0.40917 val_mae_log=0.54109
 Epoch 8: tr_loss=0.39830 val_mae_log=0.54005
 Epoch 9: tr_loss=0.38537 val_mae_log=0.53452
 Epoch 10: tr_loss=0.37705 val_mae_log=0.53402
 Epoch 11: tr_loss=0.36973 val_mae_log=0.53576
 Epoch 12: tr_loss=0.34179 val_mae_log=0.52401
 Epoch 13: tr_loss=0.33173 val_mae_log=0.52603
 Epoch 14: tr_loss=0.31485 val_mae_log=0.52027
 Epoch 15: tr_loss=0.30925 val_mae_log=0.52272
 Epoch 16: tr_loss=0.29981 val_mae_log=0.52172
 Epoch 17: tr_loss=0.29565 val_mae_log=0.52028
 Epoch 18: tr_loss=0.29156 val_mae_log=0.51973
 Epoch 19: tr_loss=0.29123 val_mae_log=0.51981
 Epoch 20: tr_loss=0.28931 val_mae_log=0.51927
 Epoch 21: tr_loss=0.28885 val_mae_log=0.51929
 

In [None]:
# # Ensemble predictions and save submission
# w_mlp = 0.7
# w_ridge = 0.3
# test_pred_price = w_mlp * np.expm1(test_mlp) + w_ridge * np.expm1(test_ridge)
# low_cap, high_cap = np.percentile(train['price'].values, [0.5, 99.5])
# test_pred_price = np.clip(test_pred_price, 0.01, high_cap*3.0)
# sub = pd.DataFrame({'sample_id': test['sample_id'].values, 'price': test_pred_price})
# out_file = Path(SUB_DIR)/'test_out.csv'
# sub.to_csv(out_file, index=False)
# print('Saved submission to', out_file)
# # Save OOF for analysis
# pd.DataFrame({'sample_id':train['sample_id'],'price_true':train['price'],'oof_mlp_log':oof_mlp,'oof_ridge_log':oof_ridge}).to_csv(Path(SUB_DIR)/'oof.csv', index=False)


  test_pred_price = w_mlp * np.expm1(test_mlp) + w_ridge * np.expm1(test_ridge)


Saved submission to /content/drive/MyDrive/amazon ml challenge/68e8d1d70b66d_student_resource/student_resource/submissions/test_out.csv


In [None]:
# === OOF-weighted blend and submission ===

# Prerequisites produced by the training cell:
#   oof_mlp_log, oof_ridge_log, oof_ensemble_log,
#   pred_mlp_log, pred_ridge_log, pred_lgb_log
# `lo` / `hi` are the log1p-space clipping bounds. When either is absent from
# the notebook namespace, fall back to the 0.5th / 99.5th percentiles of
# log1p(train price).
if not all(name in globals() for name in ('lo', 'hi')):
    y_log_temp = np.log1p(train['price'].values)
    lo, hi = np.percentile(y_log_temp, [0.5, 99.5])

def safe_expm1_clip(y_log, lo=-5.0, hi=12.0):
    """Invert a log1p transform after bounding the input; floor the result at 0.01.

    Args:
        y_log: array-like of values in log1p space.
        lo: lower bound applied to ``y_log`` before inversion (default -5.0).
        hi: upper bound applied to ``y_log`` before inversion (default 12.0),
            which keeps ``expm1`` from producing huge values.

    Returns:
        float64 array of ``expm1(clip(y_log, lo, hi))`` in which every entry
        below 0.01 has been raised to 0.01.
    """
    bounded_log = np.clip(y_log, lo, hi).astype(np.float64)
    return np.maximum(np.expm1(bounded_log), 0.01)

def smape_safe(y_true, y_pred):
    """Return a floored-denominator SMAPE, in percent.

    Computes ``mean(|y_true - y_pred| / max((|y_true| + |y_pred|) / 2, 1.0)) * 100``.
    The 1.0 floor on the denominator prevents division by (near-)zero; as a
    consequence, pairs whose mean magnitude is below 1 are scored on absolute
    rather than relative error, so the value can differ from textbook SMAPE.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.
            Inputs are converted to plain arrays and compared positionally.

    Returns:
        The metric as a float64 percentage.
    """
    # np.asarray accepts lists, tuples, pandas Series and scalars, whereas
    # calling .astype on the argument only works for objects that expose it.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = np.maximum((np.abs(y_true) + np.abs(y_pred)) / 2.0, 1.0)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

# === Three-model blend: tune convex weights on OOF SMAPE, then write outputs ===
# Needs from earlier cells: oof_mlp_log, oof_ridge_log, oof_ensemble_log,
# pred_mlp_log, pred_ridge_log, pred_lgb_log (all in log1p space), plus
# train, test, lo, hi, SUB_DIR, safe_expm1_clip and smape_safe.
y_true_price = train['price'].values

# The fold loop does not keep LightGBM's out-of-fold predictions; it only stores
#     oof_ensemble_log = 0.4 * oof_mlp_log + 0.35 * oof_ridge_log + 0.25 * lgb_oof
# Invert that fixed blend to recover them. Searching weights over
# oof_ensemble_log itself (as a stand-in for LightGBM) would tune a mixture
# that is then applied to the pure LightGBM test predictions below, so the
# selected weights and the reported OOF SMAPE would not describe the
# submission that is actually written.
# NOTE: these constants must mirror the blend weights used in the training loop.
_ens_w_mlp, _ens_w_ridge, _ens_w_lgb = 0.4, 0.35, 0.25
oof_lgb_log = (oof_ensemble_log
               - _ens_w_mlp * oof_mlp_log
               - _ens_w_ridge * oof_ridge_log) / _ens_w_lgb

# Grid over the weight simplex (non-negative weights that sum to 1).
grid_size = 11  # 11 -> steps of 0.1; use e.g. 21 for steps of 0.05
grid = np.linspace(0, 1, grid_size)
_last = grid_size - 1

best_err = 1e9
best_w = (1/3, 1/3, 1/3)  # fallback if no candidate beats best_err

# Enumerate index triples (i, j, k) with i + j + k == _last instead of deriving
# w_lgb = 1.0 - w_mlp - w_ridge. The float subtraction can land a hair below
# zero (e.g. 1.0 - 0.7000000000000001 - 0.30000000000000004), and a `>= 0`
# filter then silently drops valid candidates on the edge of the simplex.
for i in range(grid_size):
    for j in range(grid_size - i):
        w_mlp, w_ridge, w_lgb = grid[i], grid[j], grid[_last - i - j]
        blend_oof_log = (w_mlp * oof_mlp_log
                         + w_ridge * oof_ridge_log
                         + w_lgb * oof_lgb_log)
        err = smape_safe(y_true_price, safe_expm1_clip(blend_oof_log, lo, hi))
        if err < best_err:
            best_err = err
            best_w = (w_mlp, w_ridge, w_lgb)

print("Best OOF weights (MLP, Ridge, LGB):", best_w, "OOF SMAPE:", best_err)

# Rebuild the OOF blend with the winning weights. Without this, blend_oof_log
# would still hold whichever candidate the grid visited last, and that is what
# would be saved in the diagnostics file.
w_mlp, w_ridge, w_lgb = best_w
blend_oof_log = w_mlp * oof_mlp_log + w_ridge * oof_ridge_log + w_lgb * oof_lgb_log
blend_oof_price = safe_expm1_clip(blend_oof_log, lo, hi)

# Apply the same weights to the test predictions (log space) and invert safely.
blend_test_log = w_mlp * pred_mlp_log + w_ridge * pred_ridge_log + w_lgb * pred_lgb_log
test_pred_price = safe_expm1_clip(blend_test_log, lo, hi)

# Extra guard in price space: cap at 3x the 99.5th percentile of train prices.
low_cap, high_cap = np.percentile(y_true_price, [0.5, 99.5])
test_pred_price = np.clip(test_pred_price, 0.01, high_cap * 3.0)

# Save submission
sub = pd.DataFrame({'sample_id': test['sample_id'].values, 'price': test_pred_price})
out_file = Path(SUB_DIR) / 'test_out.csv'
sub.to_csv(out_file, index=False)
print('Saved submission to', out_file)

# Save OOF diagnostics (log and price). The LightGBM columns use the recovered
# out-of-fold predictions: predicting every training row with the last fold's
# model would be in-sample for the rows that model was fitted on, so it is not
# an out-of-fold estimate.
oof_mlp_price   = safe_expm1_clip(oof_mlp_log,   lo, hi)
oof_ridge_price = safe_expm1_clip(oof_ridge_log, lo, hi)
oof_lgb_price   = safe_expm1_clip(oof_lgb_log,   lo, hi)

oof_df = pd.DataFrame({
    'sample_id': train['sample_id'],
    'price_true': y_true_price,
    'oof_mlp_log': oof_mlp_log,
    'oof_ridge_log': oof_ridge_log,
    'oof_lgb_log': oof_lgb_log,
    'oof_blend_log': blend_oof_log,
    'oof_mlp_price': oof_mlp_price,
    'oof_ridge_price': oof_ridge_price,
    'oof_lgb_price': oof_lgb_price,
    'oof_blend_price': blend_oof_price,
})
oof_df.to_csv(Path(SUB_DIR) / 'oof.csv', index=False)

Best OOF weights (MLP, Ridge, LGB): (np.float64(0.7000000000000001), np.float64(0.0), np.float64(0.29999999999999993)) OOF SMAPE: 50.95796340947655
Saved submission to /content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/submissions/test_out.csv


In [None]:
##
# === OOF-weighted blend and submission ===

# Prerequisites produced by the training cell:
#   oof_mlp_log, pred_mlp_log, oof_ridge_log, pred_ridge_log
# `lo` / `hi` are the log1p-space clipping bounds. When either is absent from
# the notebook namespace, fall back to the 0.5th / 99.5th percentiles of
# log1p(train price).
if not all(name in globals() for name in ('lo', 'hi')):
    y_log_temp = np.log1p(train['price'].values)
    lo, hi = np.percentile(y_log_temp, [0.5, 99.5])

def safe_expm1_clip(y_log, lo=-5.0, hi=12.0):
    """Invert a log1p transform after bounding the input; floor the result at 0.01.

    Args:
        y_log: array-like of values in log1p space.
        lo: lower bound applied to ``y_log`` before inversion (default -5.0).
        hi: upper bound applied to ``y_log`` before inversion (default 12.0),
            which keeps ``expm1`` from producing huge values.

    Returns:
        float64 array of ``expm1(clip(y_log, lo, hi))`` in which every entry
        below 0.01 has been raised to 0.01.
    """
    bounded_log = np.clip(y_log, lo, hi).astype(np.float64)
    return np.maximum(np.expm1(bounded_log), 0.01)

def smape_safe(y_true, y_pred):
    """Return a floored-denominator SMAPE, in percent.

    Computes ``mean(|y_true - y_pred| / max((|y_true| + |y_pred|) / 2, 1.0)) * 100``.
    The 1.0 floor on the denominator prevents division by (near-)zero; as a
    consequence, pairs whose mean magnitude is below 1 are scored on absolute
    rather than relative error, so the value can differ from textbook SMAPE.

    Args:
        y_true: array-like of ground-truth values.
        y_pred: array-like of predictions, broadcastable against ``y_true``.
            Inputs are converted to plain arrays and compared positionally.

    Returns:
        The metric as a float64 percentage.
    """
    # np.asarray accepts lists, tuples, pandas Series and scalars, whereas
    # calling .astype on the argument only works for objects that expose it.
    y_true = np.asarray(y_true, dtype=np.float64)
    y_pred = np.asarray(y_pred, dtype=np.float64)
    denom = np.maximum((np.abs(y_true) + np.abs(y_pred)) / 2.0, 1.0)
    return np.mean(np.abs(y_true - y_pred) / denom) * 100.0

# === Two-model (MLP + Ridge) blend: choose the convex weight on OOF SMAPE ===
y_true_price = train['price'].values
grid = np.linspace(0, 1, 21)  # candidate MLP weights: 0.00, 0.05, ..., 1.00
best_err, best_w = 1e9, (0.7, 0.3)  # fallback weights if no candidate improves

# Each candidate pairs an MLP weight with its complement for Ridge.
for w_mlp, w_ridge in zip(grid, 1.0 - grid):
    blend_oof_log = w_mlp * oof_mlp_log + w_ridge * oof_ridge_log
    blend_oof_price = safe_expm1_clip(blend_oof_log, lo, hi)
    err = smape_safe(y_true_price, blend_oof_price)
    if err < best_err:
        best_err, best_w = err, (w_mlp, w_ridge)

print("Best OOF weights (MLP, Ridge):", best_w, "OOF SMAPE:", best_err)

# Blend the test predictions in log space with the selected weights, then map
# back to prices with the bounded inverse transform.
w_mlp, w_ridge = best_w
blend_test_log = w_mlp * pred_mlp_log + w_ridge * pred_ridge_log
test_pred_price = safe_expm1_clip(blend_test_log, lo, hi)

# Extra guard in price space: cap at 3x the 99.5th percentile of train prices.
low_cap, high_cap = np.percentile(y_true_price, [0.5, 99.5])
test_pred_price = np.clip(test_pred_price, 0.01, high_cap * 3.0)

# Write the submission file.
out_file = Path(SUB_DIR) / 'test_out.csv'
sub = pd.DataFrame({'sample_id': test['sample_id'].values, 'price': test_pred_price})
sub.to_csv(out_file, index=False)
print('Saved submission to', out_file)

# OOF diagnostics (log and price space), using the selected weights.
best_blend_oof_log = w_mlp * oof_mlp_log + w_ridge * oof_ridge_log
oof_mlp_price = safe_expm1_clip(oof_mlp_log, lo, hi)
oof_ridge_price = safe_expm1_clip(oof_ridge_log, lo, hi)
blend_oof_price = safe_expm1_clip(best_blend_oof_log, lo, hi)

oof_columns = {
    'sample_id': train['sample_id'],
    'price_true': y_true_price,
    'oof_mlp_log': oof_mlp_log,
    'oof_ridge_log': oof_ridge_log,
    'oof_blend_log': best_blend_oof_log,
    'oof_mlp_price': oof_mlp_price,
    'oof_ridge_price': oof_ridge_price,
    'oof_blend_price': blend_oof_price,
}
oof_df = pd.DataFrame(oof_columns)
oof_df.to_csv(Path(SUB_DIR) / 'oof.csv', index=False)


Best OOF weights (MLP, Ridge): (np.float64(0.30000000000000004), np.float64(0.7)) OOF SMAPE: 58.27530762327216
Saved submission to /content/drive/MyDrive/68e8d1d70b66d_student_resource/student_resource/submissions/test_out.csv
