# Solution Notebook

This notebook demonstrates the baseline training and inference scripts provided in `src/`. It is intentionally minimal.

In [None]:


{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# solution.ipynb\n",
    "import os\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.linear_model import Ridge\n",
    "from sklearn.model_selection import StratifiedKFold\n",
    "from sklearn.metrics import mean_absolute_error\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn.isotonic import IsotonicRegression\n",
    "import lightgbm as lgb\n",
    "import catboost as cb\n",
    "import torch\n",
    "import torch.nn as nn\n",
    "import timm\n",
    "from torch.utils.data import Dataset, DataLoader\n",
    "from PIL import Image\n",
    "import torchvision.transforms as transforms\n",
    "from tqdm import tqdm\n",
    "import joblib\n",
    "import re\n",
    "import requests\n",
    "from time import sleep\n",
    "from concurrent.futures import ThreadPoolExecutor\n",
    "import random\n",
    "from scipy.optimize import minimize\n",
    "\n",
    "# Reproducibility: seed every RNG the pipeline touches\n",
    "def set_seed(seed=42):\n",
    "    \"\"\"Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic kernels.\n",
    "\n",
    "    Args:\n",
    "        seed: Integer seed handed to every generator (default 42). It is also\n",
    "            exported as the PYTHONHASHSEED environment variable.\n",
    "    \"\"\"\n",
    "    os.environ['PYTHONHASHSEED'] = str(seed)\n",
    "    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):\n",
    "        seed_fn(seed)\n",
    "    cudnn = torch.backends.cudnn\n",
    "    cudnn.deterministic = True\n",
    "    cudnn.benchmark = False\n",
    "\n",
    "set_seed(42)\n",
    "\n",
    "# SMAPE metric\n",
    "def smape(y_true, y_pred):\n",
    "    \"\"\"Return the symmetric mean absolute percentage error on a 0-200 scale.\n",
    "\n",
    "    Both arguments must support NumPy arithmetic. 1e-8 is added to the\n",
    "    denominator so a pair of zeros does not divide by zero.\n",
    "    \"\"\"\n",
    "    abs_err = np.abs(y_pred - y_true)\n",
    "    scale = np.abs(y_true) + np.abs(y_pred) + 1e-8\n",
    "    return 100 * np.mean(2 * abs_err / scale)\n",
    "\n",
    "# Download images function (from utils.py)\n",
    "def download_images(image_links, out_dir, concurrency=5):\n",
    "    \"\"\"Download every URL in `image_links` into `out_dir` using a thread pool.\n",
    "\n",
    "    Args:\n",
    "        image_links: Iterable of image URLs; the URL at position i is saved as `<out_dir>/<i>.jpg`.\n",
    "        out_dir: Destination directory, created if it does not exist.\n",
    "        concurrency: Number of worker threads.\n",
    "\n",
    "    Returns:\n",
    "        List of target paths, one per URL and in input order. A path is returned\n",
    "        even when its download failed, so callers must tolerate missing files.\n",
    "    \"\"\"\n",
    "    def download_single_image(url, path):\n",
    "        # Stream into a side file and rename at the end, so an interrupted\n",
    "        # transfer never leaves a truncated image under the final name.\n",
    "        tmp_path = path + '.part'\n",
    "        try:\n",
    "            # The context manager releases the streamed connection even on error.\n",
    "            with requests.get(url, stream=True, timeout=10) as response:\n",
    "                response.raise_for_status()\n",
    "                with open(tmp_path, 'wb') as f:\n",
    "                    for chunk in response.iter_content(chunk_size=8192):\n",
    "                        f.write(chunk)\n",
    "            os.replace(tmp_path, path)\n",
    "            return True\n",
    "        except Exception:\n",
    "            # Best effort: a failed download is reported, not raised.\n",
    "            try:\n",
    "                os.remove(tmp_path)\n",
    "            except OSError:\n",
    "                pass  # nothing was written, or it is already gone\n",
    "            return False\n",
    "\n",
    "    os.makedirs(out_dir, exist_ok=True)\n",
    "    paths = []\n",
    "    with ThreadPoolExecutor(max_workers=concurrency) as executor:\n",
    "        futures = []\n",
    "        for i, url in enumerate(image_links):\n",
    "            path = os.path.join(out_dir, f'{i}.jpg')\n",
    "            paths.append(path)\n",
    "            futures.append(executor.submit(download_single_image, url, path))\n",
    "            sleep(0.1)  # throttle the submission rate\n",
    "        n_failed = sum(not future.result() for future in tqdm(futures))\n",
    "    if n_failed:\n",
    "        print(f'download_images: {n_failed}/{len(paths)} downloads failed')\n",
    "    return paths\n",
    "\n",
    "# Text feature extraction\n",
    "def extract_text_features(df):\n",
    "    \"\"\"Add hand-crafted text features derived from `catalog_content` to `df`.\n",
    "\n",
    "    The frame is modified in place and also returned. Columns written: text,\n",
    "    text_len, word_count, digit_count, punct_count, avg_token_len, pack_count,\n",
    "    unit_qty, unit, brand.\n",
    "\n",
    "    NOTE: the unit/brand label encoders and the top-100 brand list are fit on\n",
    "    `df` itself and discarded, so the integer codes in `unit` and `brand` are\n",
    "    only comparable between rows processed in the same call.\n",
    "    \"\"\"\n",
    "    df['text'] = df['catalog_content'].fillna('')\n",
    "    text = df['text']\n",
    "\n",
    "    def _avg_token_len(s):\n",
    "        tokens = s.split()\n",
    "        return np.mean([len(t) for t in tokens]) if tokens else 0\n",
    "\n",
    "    df['text_len'] = text.str.len()\n",
    "    df['word_count'] = text.str.split().str.len()\n",
    "    df['digit_count'] = text.str.findall(r'\\d').str.len()\n",
    "    df['punct_count'] = text.str.findall(r'[\\.,;:!?]').str.len()\n",
    "    df['avg_token_len'] = text.apply(_avg_token_len)\n",
    "\n",
    "    # Pack quantity: explicit pack phrases plus half the number of digit runs (rough proxy)\n",
    "    pack_patterns = r'(pack of \\d+|\\d+ pack|\\d+ count|\\d+ ct|\\d+pk|\\d+ pk)'\n",
    "    df['pack_count'] = text.str.findall(pack_patterns, flags=re.I).str.len() + text.str.count(r'\\d+') // 2  # rough\n",
    "    df['pack_count'] = df['pack_count'].clip(0, 50)\n",
    "\n",
    "    # Unit qty: first '<number> <unit>' occurrence, extracted in a single regex pass\n",
    "    # NOTE(review): the alternation has no trailing word boundary, so short units\n",
    "    # such as 'g' or 'l' also match the start of longer words - confirm this is acceptable.\n",
    "    unit_patterns = r'(\\d+\\.?\\d*)\\s*(oz|ounce|fl oz|g|gram|kg|lb|pound|ml|l|liter|ct|count|pack)'\n",
    "    unit_match = text.str.extract(unit_patterns, flags=re.I)\n",
    "    df['unit_qty'] = unit_match[0].astype(float).fillna(0)\n",
    "    # The match is case-insensitive, so normalise the captured unit; otherwise\n",
    "    # 'OZ', 'Oz' and 'oz' would be encoded as three different categories.\n",
    "    df['unit'] = unit_match[1].fillna('unknown').str.lower()\n",
    "    le_unit = LabelEncoder()\n",
    "    df['unit'] = le_unit.fit_transform(df['unit'])\n",
    "\n",
    "    # Brand heuristic: first capitalised token of three or more letters\n",
    "    def extract_brand(text):\n",
    "        tokens = re.findall(r'\\b[A-Z][a-zA-Z]{2,}\\b', text)\n",
    "        if tokens:\n",
    "            return tokens[0]\n",
    "        return 'unknown'\n",
    "    df['brand'] = text.apply(extract_brand)\n",
    "    # Keep the 100 most frequent brands of this frame; everything else becomes 'other'\n",
    "    top_brands = df['brand'].value_counts().head(100).index\n",
    "    df['brand'] = df['brand'].where(df['brand'].isin(top_brands), 'other')\n",
    "    le_brand = LabelEncoder()\n",
    "    df['brand'] = le_brand.fit_transform(df['brand'])\n",
    "\n",
    "    return df\n",
    "\n",
    "# TF-IDF and Ridge meta\n",
    "def tfidf_ridge_meta(X_text, y_log=None, fit=True, vectorizers=None, ridge=None):\n",
    "    \"\"\"Fit or apply the word + char TF-IDF -> Ridge text model.\n",
    "\n",
    "    Args:\n",
    "        X_text: Iterable of raw documents.\n",
    "        y_log: Regression target; required when `fit` is True.\n",
    "        fit: True to fit new vectorizers and a Ridge model on `X_text`,\n",
    "            False to score `X_text` with previously fitted ones.\n",
    "        vectorizers: `(tfidf_word, tfidf_char, ridge)` tuple as returned by a\n",
    "            fit call; required when `fit` is False.\n",
    "        ridge: Unused on input; accepted for backward compatibility.\n",
    "\n",
    "    Returns:\n",
    "        fit=True: `(predictions, (tfidf_word, tfidf_char, ridge))`, where the\n",
    "        predictions are in-sample (made on the same rows the model was fit on).\n",
    "        fit=False: predictions for `X_text`.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if the argument required by the selected mode is missing.\n",
    "    \"\"\"\n",
    "    # The notebook only imports `scipy.optimize.minimize`, which does not bind\n",
    "    # the name `scipy`; import the sparse module locally instead.\n",
    "    from scipy import sparse\n",
    "\n",
    "    if fit:\n",
    "        if y_log is None:\n",
    "            raise ValueError('y_log is required when fit=True')\n",
    "        tfidf_word = TfidfVectorizer(ngram_range=(1,2), min_df=3, max_features=150000, analyzer='word')\n",
    "        tfidf_char = TfidfVectorizer(ngram_range=(3,5), min_df=3, max_features=150000, analyzer='char')\n",
    "        X_tfidf = sparse.hstack([tfidf_word.fit_transform(X_text), tfidf_char.fit_transform(X_text)], format='csr')\n",
    "        ridge = Ridge(alpha=1.0)\n",
    "        ridge.fit(X_tfidf, y_log)\n",
    "        return ridge.predict(X_tfidf), (tfidf_word, tfidf_char, ridge)\n",
    "\n",
    "    if vectorizers is None:\n",
    "        raise ValueError('vectorizers is required when fit=False')\n",
    "    tfidf_word, tfidf_char, ridge = vectorizers\n",
    "    X_tfidf = sparse.hstack([tfidf_word.transform(X_text), tfidf_char.transform(X_text)], format='csr')\n",
    "    return ridge.predict(X_tfidf)\n",
    "\n",
    "# Image embeddings\n",
    "# Module-level state read by ImageDataset (`transform`) and by\n",
    "# extract_image_embeddings (`model`, `device`) further down.\n",
    "# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'\n",
    "# NOTE(review): num_classes=0 presumably makes timm drop the classifier head so the\n",
    "# model returns pooled features - confirm against the timm documentation.\n",
    "model = timm.create_model('resnet18.a1_in1k', pretrained=True, num_classes=0).to(device)\n",
    "# The model is only used for inference, so put it in eval mode once here.\n",
    "model.eval()\n",
    "# Preprocessing: resize to 256, centre-crop to 224x224, convert to a tensor,\n",
    "# then normalise each channel.\n",
    "# NOTE(review): the mean/std values look like the usual ImageNet statistics -\n",
    "# confirm they match this checkpoint's pretrained config.\n",
    "transform = transforms.Compose([\n",
    "    transforms.Resize(256),\n",
    "    transforms.CenterCrop(224),\n",
    "    transforms.ToTensor(),\n",
    "    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
    "])\n",
    "\n",
    "class ImageDataset(Dataset):\n",
    "    \"\"\"Dataset over image file paths that yields `(tensor, missing_flag)` pairs.\n",
    "\n",
    "    A readable image is passed through the module-level `transform` and returned\n",
    "    with flag 0. A file that cannot be opened or decoded yields an all-zero\n",
    "    3x224x224 tensor with flag 1.\n",
    "    \"\"\"\n",
    "    def __init__(self, paths):\n",
    "        # Materialise as a list so indexing is positional even for a pandas Series.\n",
    "        self.paths = list(paths)\n",
    "\n",
    "    def __len__(self):\n",
    "        return len(self.paths)\n",
    "\n",
    "    def __getitem__(self, idx):\n",
    "        path = self.paths[idx]\n",
    "        try:\n",
    "            # The context manager closes the file handle once the pixels are decoded.\n",
    "            with Image.open(path) as raw:\n",
    "                img = transform(raw.convert('RGB'))\n",
    "            return img, 0  # image present\n",
    "        except Exception:\n",
    "            # Narrower than a bare except, so KeyboardInterrupt/SystemExit still propagate.\n",
    "            return torch.zeros(3, 224, 224), 1  # missing flag\n",
    "\n",
    "def extract_image_embeddings(paths, batch_size=32):\n",
    "    \"\"\"Embed the images at `paths` with the module-level `model`.\n",
    "\n",
    "    Args:\n",
    "        paths: Image file paths, wrapped in an ImageDataset.\n",
    "        batch_size: Number of images per forward pass.\n",
    "\n",
    "    Returns:\n",
    "        `(embeddings, missing)`: a 2-D array with one row per path, and the\n",
    "        per-path missing flags. Rows whose flag is 1 are zeroed.\n",
    "    \"\"\"\n",
    "    loader = DataLoader(ImageDataset(paths), batch_size=batch_size, num_workers=0, shuffle=False)\n",
    "    emb_chunks, flag_chunks = [], []\n",
    "    with torch.no_grad():\n",
    "        for batch, flags in tqdm(loader):\n",
    "            emb_chunks.append(model(batch.to(device)).cpu().numpy())\n",
    "            flag_chunks.append(flags.numpy())\n",
    "    embeddings = np.vstack(emb_chunks)\n",
    "    missing = np.concatenate(flag_chunks)\n",
    "    # Placeholder tensors still pass through the network; blank their rows.\n",
    "    embeddings[missing == 1] = 0\n",
    "    return embeddings, missing\n",
    "\n",
    "# Main pipeline\n",
    "train_df = pd.read_csv('dataset/train.csv')\n",
    "test_df = pd.read_csv('dataset/test.csv')\n",
    "n_train = len(train_df)\n",
    "\n",
    "# Download images\n",
    "os.makedirs('images/train', exist_ok=True)\n",
    "os.makedirs('images/test', exist_ok=True)\n",
    "train_paths = download_images(train_df['image_link'], 'images/train')\n",
    "test_paths = download_images(test_df['image_link'], 'images/test')\n",
    "\n",
    "# Extract features\n",
    "# extract_text_features fits its unit/brand label encoders and its top-brand list on\n",
    "# the frame it receives, so separate train/test calls would give the same category\n",
    "# different integer codes. Run it once on the stacked text and split the result.\n",
    "text_feats = extract_text_features(\n",
    "    pd.concat([train_df[['catalog_content']], test_df[['catalog_content']]], ignore_index=True)\n",
    ").drop(columns=['catalog_content'])\n",
    "text_cols = list(text_feats.columns)\n",
    "train_df = (train_df.reset_index(drop=True)\n",
    "            .drop(columns=text_cols, errors='ignore')\n",
    "            .join(text_feats.iloc[:n_train].reset_index(drop=True)))\n",
    "test_df = (test_df.reset_index(drop=True)\n",
    "           .drop(columns=text_cols, errors='ignore')\n",
    "           .join(text_feats.iloc[n_train:].reset_index(drop=True)))\n",
    "assert len(train_df) == n_train and len(train_df) + len(test_df) == len(text_feats)\n",
    "\n",
    "y = train_df['price'].values\n",
    "y_log = np.log1p(y)\n",
    "\n",
    "# CV setup (defined before the text meta-feature so both stages share the same folds)\n",
    "bins = pd.qcut(y, q=20, labels=False, duplicates='drop')\n",
    "skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)\n",
    "folds = list(skf.split(train_df, bins))\n",
    "n_folds = len(folds)\n",
    "\n",
    "# TF-IDF meta\n",
    "# The train-side meta-feature is built out-of-fold: an in-sample Ridge prediction\n",
    "# has already seen each row's target and would leak it into the downstream models.\n",
    "train_meta = np.zeros(n_train)\n",
    "for trn_idx, val_idx in folds:\n",
    "    _, fold_vectorizers = tfidf_ridge_meta(train_df['text'].iloc[trn_idx], y_log[trn_idx], fit=True)\n",
    "    train_meta[val_idx] = tfidf_ridge_meta(train_df['text'].iloc[val_idx], fit=False, vectorizers=fold_vectorizers)\n",
    "# The test-side feature comes from a model fit on all training rows.\n",
    "_, vectorizers = tfidf_ridge_meta(train_df['text'], y_log, fit=True)\n",
    "test_meta = tfidf_ridge_meta(test_df['text'], fit=False, vectorizers=vectorizers)\n",
    "train_df['ridge_meta'] = train_meta\n",
    "test_df['ridge_meta'] = test_meta\n",
    "\n",
    "# Image embeddings\n",
    "train_emb, train_missing = extract_image_embeddings(train_paths)\n",
    "test_emb, test_missing = extract_image_embeddings(test_paths)\n",
    "train_df['image_missing'] = train_missing\n",
    "test_df['image_missing'] = test_missing\n",
    "\n",
    "# Features\n",
    "num_feats = ['text_len', 'word_count', 'digit_count', 'punct_count', 'avg_token_len', 'pack_count', 'unit_qty', 'ridge_meta', 'image_missing']\n",
    "cat_feats = ['unit', 'brand']\n",
    "feature_cols = num_feats + cat_feats\n",
    "# Keep typed DataFrames: stacking with np.hstack upcasts the integer category codes\n",
    "# to float, which CatBoost does not accept for columns listed in cat_features.\n",
    "cat_dtypes = {col: int for col in cat_feats}\n",
    "X_train = train_df[feature_cols].astype(cat_dtypes)\n",
    "X_test = test_df[feature_cols].astype(cat_dtypes)\n",
    "\n",
    "oof_a = np.zeros(len(train_df))\n",
    "oof_b = np.zeros(len(train_df))\n",
    "oof_c = np.zeros(len(train_df))\n",
    "test_pred_a = np.zeros(len(test_df))\n",
    "test_pred_b = np.zeros(len(test_df))\n",
    "test_pred_c = np.zeros(len(test_df))\n",
    "\n",
    "# bagging_fraction only takes effect in LightGBM when bagging_freq is non-zero.\n",
    "params_a = {'objective': 'regression', 'metric': 'rmse', 'num_leaves': 64, 'feature_fraction': 0.7, 'bagging_fraction': 0.8,\n",
    "            'bagging_freq': 1, 'min_data_in_leaf': 50, 'lambda_l2': 5.0, 'verbose': -1, 'random_state': 42}\n",
    "params_c = {'objective': 'regression', 'metric': 'rmse', 'num_leaves': 31, 'verbose': -1, 'random_state': 42}\n",
    "\n",
    "for trn_idx, val_idx in folds:\n",
    "    X_trn, X_val = X_train.iloc[trn_idx], X_train.iloc[val_idx]\n",
    "    y_trn_log, y_val_log = y_log[trn_idx], y_log[val_idx]\n",
    "    X_trn_emb, X_val_emb = train_emb[trn_idx], train_emb[val_idx]\n",
    "\n",
    "    # Model A: LGBM on tabular\n",
    "    trn_data_a = lgb.Dataset(X_trn, label=y_trn_log)\n",
    "    val_data_a = lgb.Dataset(X_val, label=y_val_log)\n",
    "    model_a = lgb.train(params_a, trn_data_a, num_boost_round=1000, valid_sets=[val_data_a], callbacks=[lgb.early_stopping(50)])\n",
    "    oof_a[val_idx] = model_a.predict(X_val)\n",
    "    test_pred_a += model_a.predict(X_test) / n_folds\n",
    "\n",
    "    # Model B: CatBoost on tabular\n",
    "    model_b = cb.CatBoostRegressor(loss_function='RMSE', cat_features=cat_feats, random_seed=42, verbose=0, iterations=1000, early_stopping_rounds=50)\n",
    "    model_b.fit(X_trn, y_trn_log, eval_set=(X_val, y_val_log))\n",
    "    oof_b[val_idx] = model_b.predict(X_val)\n",
    "    test_pred_b += model_b.predict(X_test) / n_folds\n",
    "\n",
    "    # Model C: LGBM on images\n",
    "    trn_data_c = lgb.Dataset(X_trn_emb, label=y_trn_log)\n",
    "    val_data_c = lgb.Dataset(X_val_emb, label=y_val_log)\n",
    "    model_c = lgb.train(params_c, trn_data_c, num_boost_round=500, valid_sets=[val_data_c], callbacks=[lgb.early_stopping(50)])\n",
    "    oof_c[val_idx] = model_c.predict(X_val_emb)\n",
    "    test_pred_c += model_c.predict(test_emb) / n_folds\n",
    "\n",
    "# Ensemble weights\n",
    "def ensemble_smape(weights):\n",
    "    \"\"\"Return the SMAPE of the weighted OOF blend after mapping back to price space.\"\"\"\n",
    "    oof_ens = weights[0] * oof_a + weights[1] * oof_b + weights[2] * oof_c\n",
    "    pred = np.expm1(oof_ens)\n",
    "    pred = np.clip(pred, 0.01, None)\n",
    "    return smape(y, pred)\n",
    "\n",
    "res = minimize(ensemble_smape, [1/3, 1/3, 1/3], method='Nelder-Mead', bounds=[(0, 1)] * 3)\n",
    "weight_sum = res.x.sum()\n",
    "# Fall back to an equal blend if the optimiser collapses every weight to zero.\n",
    "weights = res.x / weight_sum if weight_sum > 0 else np.full(3, 1 / 3)\n",
    "print('Weights:', weights)\n",
    "oof_ens = weights[0] * oof_a + weights[1] * oof_b + weights[2] * oof_c\n",
    "cv_smape = smape(y, np.clip(np.expm1(oof_ens), 0.01, None))\n",
    "print('CV SMAPE:', cv_smape)\n",
    "\n",
    "# Isotonic calibration\n",
    "# NOTE: the calibrator is fit and scored on the same OOF predictions, so the\n",
    "# calibrated score below is optimistic.\n",
    "iso = IsotonicRegression(out_of_bounds='clip')\n",
    "iso.fit(np.expm1(oof_ens), y)\n",
    "oof_cal = iso.predict(np.expm1(oof_ens))\n",
    "print('Calibrated CV SMAPE:', smape(y, oof_cal))\n",
    "\n",
    "# Test preds\n",
    "test_ens = weights[0] * test_pred_a + weights[1] * test_pred_b + weights[2] * test_pred_c\n",
    "test_pred = np.expm1(test_ens)\n",
    "test_pred = np.clip(test_pred, 0.01, None)\n",
    "p1, p99 = np.percentile(y, [1, 99])\n",
    "test_pred = np.clip(test_pred, p1, p99)\n",
    "test_cal = iso.predict(test_pred)\n",
    "\n",
    "# Save models (for infer)\n",
    "# model_a/model_b/model_c are the boosters from the last CV fold only.\n",
    "# The unit/brand label encoders live inside extract_text_features and are never\n",
    "# returned, so they cannot be saved from here; None keeps the 10-item tuple layout.\n",
    "os.makedirs('model', exist_ok=True)\n",
    "joblib.dump((model_a, model_b, model_c, weights, vectorizers, None, None, iso, num_feats, cat_feats), 'model/models.pkl')\n",
    "\n",
    "# Output\n",
    "out_df = pd.DataFrame({'sample_id': test_df['sample_id'], 'price': test_cal})\n",
    "out_df.to_csv('test_out.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}