### Setup

In [4]:
import re
import os
import gc
import torch
import urllib
import warnings
import open_clip
import numpy as np
import pandas as pd
import xgboost as xgb
import multiprocessing

from tqdm import tqdm
from PIL import Image
from pathlib import Path
from functools import partial
from sklearn.model_selection import KFold
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import RobustScaler, LabelEncoder

warnings.filterwarnings('ignore')

In [6]:
# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

Device: cuda
GPU: NVIDIA H100 80GB HBM3
VRAM: 79.1 GB


In [6]:
# Loading Data
print("[1/7] Loading Data")
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(f"Train: {len(train_df):,}, Test: {len(test_df):,}")

[1/7] Loading Data
Train: 75,000, Test: 75,000


In [8]:
# Extracting item_name, bullet_points and product_description from catalog_content column
print("[2/7] Extracting item_name, bullet_points and product_description from catalog_content column")

def extract_fields(text):
    """Extract Item Name, Bullet Points, and Product Description from the string."""
    item_name_match = re.search(r"Item Name:\s*(.+?)(?:\n|$)", text)
    item_name = item_name_match.group(1).strip() if item_name_match else None

    # Capture all bullet points with flexible numbering or single 'Bullet Point'
    bullets = re.findall(r"Bullet Point\s*\d*:\s*(.+)", text)
    bullet_points = [b.strip() for b in bullets] if bullets else []

    # Capture product description
    prod_desc_match = re.search(r"Product Description:\s*(.+)", text, re.DOTALL)
    prod_desc = prod_desc_match.group(1).strip() if prod_desc_match else None

    return pd.Series({
        "item_name": item_name,
        "bullet_points": bullet_points if bullet_points else None,
        "product_description": prod_desc
    })

# extract fields from product_description of train data
df_extracted = train_df["catalog_content"].apply(extract_fields)
train_df = pd.concat([train_df, df_extracted], axis=1)

# extract fields from product_description of test data
df_extracted = test_df["catalog_content"].apply(extract_fields)
test_df = pd.concat([test_df, df_extracted], axis=1)

[2/7] Extracting item_name, bullet_points and product_description from catalog_content column


In [9]:
# Dowloading Images
print("[3/7] Downloading Images")

def download_image(image_link, savefolder):
    if isinstance(image_link, str):
        filename = Path(image_link).name
        image_save_path = os.path.join(savefolder, filename)
        if not os.path.exists(image_save_path):
            try:
                urllib.request.urlretrieve(image_link, image_save_path)
            except Exception as ex:
                print(f'Warning: Not able to download - {image_link}\n{ex}')
        return image_save_path
    return None

def download_images(image_links, download_folder):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    results = []
    download_image_partial = partial(download_image, savefolder=download_folder)

    with multiprocessing.Pool(100) as pool:
        for result in tqdm(pool.imap(download_image_partial, image_links),
                          total=len(image_links), desc="Downloading"):
            results.append(result)
        pool.close()
        pool.join()

    return results

# Download train images
train_folder = 'train_images'
print(f"Downloading {len(train_df):,} train images...")
train_image_paths = download_images(train_df['image_link'].tolist(), train_folder)

# Download test images
test_folder = 'test_images'
print(f"Downloading {len(test_df):,} test images...")
test_image_paths = download_images(test_df['image_link'].tolist(), test_folder)

print("All images downloaded!")

# Map URLs to local paths
train_df['image_path'] = [os.path.join(train_folder, Path(url).name)
                          if isinstance(url, str) else None
                          for url in train_df['image_link']]
test_df['image_path'] = [os.path.join(test_folder, Path(url).name)
                         if isinstance(url, str) else None
                         for url in test_df['image_link']]

[3/7] Downloading Images
Downloading 75,000 train images...


Downloading:  49%|████▉     | 36880/75000 [00:01<00:01, 33079.30it/s]

HTTP Error 404: Not Found


Downloading: 100%|██████████| 75000/75000 [00:02<00:00, 33292.95it/s]

Downloading 75,000 test images...



Downloading:  54%|█████▍    | 40619/75000 [00:01<00:01, 31668.05it/s]

HTTP Error 404: Not Found


Downloading: 100%|██████████| 75000/75000 [00:02<00:00, 32606.18it/s]


All images downloaded!


### Exploratory Data Analysis

In [10]:
train_df.head()

Unnamed: 0,sample_id,catalog_content,image_link,price,item_name,bullet_points,product_description,image_path
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",,,train_images/51mo8htwTHL.jpg
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12,"Salerno Cookies, The Original Butter Cookies, ...",[Original Butter Cookies: Classic butter cooki...,,train_images/71YtriIHAAL.jpg
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",[Loaded with hearty long grain wild rice and v...,,train_images/51+PFEe-w-L.jpg
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,"[Add to your favorite appetizers, dips & sprea...",Judees Powdered Blue Cheese cheddar cheese pow...,train_images/41mu0HAToDL.jpg
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...","[kedem Sherry Cooking Wine, 12.7 Ounce - 12 pe...",,train_images/41sA037+QvL.jpg


In [11]:
test_df.head()

Unnamed: 0,sample_id,catalog_content,image_link,item_name,bullet_points,product_description,image_path
0,100179,Item Name: Rani 14-Spice Eshamaya's Mango Chut...,https://m.media-amazon.com/images/I/71hoAn78AW...,Rani 14-Spice Eshamaya's Mango Chutney (Indian...,[You'll LOVE our 14-Spice Eshamaya's Mango Chu...,Mango chutney is made from diced green mangoes...,test_images/71hoAn78AWL.jpg
1,245611,Item Name: Natural MILK TEA Flavoring extract ...,https://m.media-amazon.com/images/I/61ex8NHCIj...,Natural MILK TEA Flavoring extract by HALO PAN...,"[Authentic Tasting, Asian-Inspired Natural fla...",Check our popular Milk Tea flavoring extract i...,test_images/61ex8NHCIjL.jpg
2,146263,Item Name: Honey Filled Hard Candy - Bulk Pack...,https://m.media-amazon.com/images/I/61KCM61J8e...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,[Honey Filled Hard Candy; 2-pound bulk pack; a...,Honey Filled Hard Candy - Bulk Pack 2 Pounds -...,test_images/61KCM61J8eL.jpg
3,95658,Item Name: Vlasic Snack'mm's Kosher Dill 16 Oz...,https://m.media-amazon.com/images/I/51Ex6uOH7y...,Vlasic Snack'mm's Kosher Dill 16 Oz (Pack of 2),,,test_images/51Ex6uOH7yL.jpg
4,36806,"Item Name: McCormick Culinary Vanilla Extract,...",https://m.media-amazon.com/images/I/71QYlrOMoS...,"McCormick Culinary Vanilla Extract, 32 fl oz -...",[PREMIUM INGREDIENTS: McCormick Culinary Pure ...,,test_images/71QYlrOMoSL.jpg


In [7]:
# Checking the fields present in content_catalog column
content_catalog = list(train_df.catalog_content)
for i in content_catalog[:5]:
  print(i)

Item Name: La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6)
Value: 72.0
Unit: Fl Oz

Item Name: Salerno Cookies, The Original Butter Cookies, 8 Ounce (Pack of 4)
Bullet Point 1: Original Butter Cookies: Classic butter cookies made with real butter
Bullet Point 2: Variety Pack: Includes 4 boxes with 32 cookies total
Bullet Point 3: Occasion Perfect: Delicious cookies for birthdays, weddings, anniversaries
Bullet Point 4: Shareable Treats: Fun to give and enjoy with friends and family
Bullet Point 5: Salerno Brand: Trusted brand of delicious butter cookies since 1925
Value: 32.0
Unit: Ounce

Item Name: Bear Creek Hearty Soup Bowl, Creamy Chicken with Rice, 1.9 Ounce (Pack of 6)
Bullet Point 1: Loaded with hearty long grain wild rice and vegetables
Bullet Point 2: Full of hearty goodness
Bullet Point 3: Single serve bowls
Bullet Point 4: Easy to prepare mix
Bullet Point 5: 0 grams trans fat
Value: 11.4
Unit: Ounce

Item Name: Judee’s Blue Cheese Powder 11.25 oz - Gluten-Free and Nu

`content_catalog column has Item Name, Bullet Point, Product Description, Value and Unit. All observations do not have all the fields.`

### Feature Engineering

In [13]:
# Feature Engineering
print("[4/7] Engineering Value/Unit features...")

def extract_value_unit_features(text):
    """Extract critical numeric features with high price correlation"""
    features = {}
    text_str = str(text)

    # Extract Value
    value_match = re.search(r'Value:\s*([\d.]+)', text_str)
    value = float(value_match.group(1)) if value_match else 0

    # Extract Unit
    unit_match = re.search(r'Unit:\s*([^\n]+)', text_str)
    unit = unit_match.group(1).strip() if unit_match else 'unknown'

    # Pack quantity patterns
    pack_patterns = [
        r'pack of (\d+)', r'\(pack of (\d+)\)', r'(\d+)[- ]pack',
        r'(\d+) count', r'set of (\d+)', r'case of (\d+)',
    ]
    pack_qty = 1
    for pattern in pack_patterns:
        matches = re.findall(pattern, text_str.lower())
        if matches:
            pack_qty = max(pack_qty, max([int(m) for m in matches]))

    # Create features
    features['value'] = value
    features['log_value'] = np.log1p(value)
    features['sqrt_value'] = np.sqrt(value)
    features['pack_quantity'] = pack_qty
    features['log_pack'] = np.log1p(pack_qty)
    features['total_quantity'] = value * pack_qty
    features['log_total_qty'] = np.log1p(value * pack_qty)
    features['sqrt_total_qty'] = np.sqrt(value * pack_qty)
    features['unit'] = unit

    return features

train_features = pd.DataFrame([extract_value_unit_features(t)
                               for t in tqdm(train_df['catalog_content'],
                                           desc="Train features")])
test_features = pd.DataFrame([extract_value_unit_features(t)
                             for t in tqdm(test_df['catalog_content'],
                                         desc="Test features")])

# Encode units
le = LabelEncoder()
all_units = pd.concat([train_features['unit'], test_features['unit']])
le.fit(all_units.astype(str))
train_features['unit_encoded'] = le.transform(train_features['unit'].astype(str))
test_features['unit_encoded'] = le.transform(test_features['unit'].astype(str))

train_features = train_features.drop('unit', axis=1)
test_features = test_features.drop('unit', axis=1)

print(f"{train_features.shape[1]} engineered features")

# Show correlations
train_with_price = train_features.copy()
train_with_price['price'] = train_df['price']
print("Top correlations with price:")
print(train_with_price.corr()['price'].abs().sort_values(ascending=False).head(8))

[4/7] Engineering Value/Unit features...


Train features: 100%|██████████| 75000/75000 [00:03<00:00, 18950.01it/s]
Test features: 100%|██████████| 75000/75000 [00:03<00:00, 19271.48it/s]


9 engineered features
Top correlations with price:
price             1.000000
sqrt_value        0.163218
log_value         0.126561
log_total_qty     0.107146
unit_encoded      0.089220
value             0.064510
log_pack          0.046921
sqrt_total_qty    0.000456
Name: price, dtype: float64


In [14]:
train_features.head()

Unnamed: 0,value,log_value,sqrt_value,pack_quantity,log_pack,total_quantity,log_total_qty,sqrt_total_qty,unit_encoded
0,72.0,4.290459,8.485281,6,1.94591,432.0,6.070738,20.78461,46
1,32.0,3.496508,5.656854,4,1.609438,128.0,4.859812,11.313708,73
2,11.4,2.517696,3.376389,6,1.94591,68.4,4.239887,8.270429,73
3,11.25,2.505526,3.354102,1,0.693147,11.25,2.505526,3.354102,73
4,12.0,2.564949,3.464102,1,0.693147,12.0,2.564949,3.464102,37


In [15]:
test_features.head()

Unnamed: 0,value,log_value,sqrt_value,pack_quantity,log_pack,total_quantity,log_total_qty,sqrt_total_qty,unit_encoded
0,10.5,2.442347,3.24037,1,0.693147,10.5,2.442347,3.24037,73
1,2.0,1.098612,1.414214,1,0.693147,2.0,1.098612,1.414214,46
2,32.0,3.496508,5.656854,1,0.693147,32.0,3.496508,5.656854,73
3,2.0,1.098612,1.414214,2,1.098612,4.0,1.609438,2.0,37
4,32.0,3.496508,5.656854,1,0.693147,32.0,3.496508,5.656854,46


### Generate Embeddings

In [None]:
# Downloading Model
model_name = 'hf-hub:Marqo/marqo-ecommerce-embeddings-L'
model, preprocess_train, preprocess_val = open_clip.create_model_and_transforms(model_name)
tokenizer = open_clip.get_tokenizer(model_name)

model = model.to(device)
model.eval()

print("Model loaded successfully!")

In [17]:
# Generate Marqo Embeddings
print("[5/7] Generating Marqo Embeddings")

# ==============================
#  Dataset: Separate Text Columns
# ==============================
class EcommerceDataset(Dataset):
    """Optimized dataset for fast parallel loading with multi-text fields"""
    def __init__(self, df, preprocess):
        self.image_paths = df['image_path'].tolist()
        self.item_names = df['item_name'].astype(str).tolist()
        self.bullet_points = df['bullet_points'].astype(str).tolist()
        self.descriptions = df['product_description'].astype(str).tolist()
        self.preprocess = preprocess

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        # Load and preprocess image
        img_path = self.image_paths[idx]
        try:
            img = Image.open(img_path).convert('RGB')
            img_tensor = self.preprocess(img)
        except:
            # Fallback: white image if corrupt/missing
            white_img = Image.new('RGB', (224, 224), color='white')
            img_tensor = self.preprocess(white_img)

        # Return all 3 text fields
        item_name = self.item_names[idx]
        bullet_points = self.bullet_points[idx]
        description = self.descriptions[idx]

        return img_tensor, item_name, bullet_points, description


def collate_fn(batch):
    """Custom collate to handle multiple text fields"""
    images = torch.stack([item[0] for item in batch])
    item_names = [item[1] for item in batch]
    bullet_points = [item[2] for item in batch]
    descriptions = [item[3] for item in batch]
    return images, item_names, bullet_points, descriptions


# ==============================
#  Optimized Embedding Generator
# ==============================
def generate_embeddings_optimized(df, batch_size=512, text_weights=(0.6, 0.3, 0.1)):
    """
    Generate embeddings with DataLoader optimization.
    Now supports separate embeddings for item name, bullet points, and description.
    text_weights controls how to combine them (weighted average).
    """
    dataset = EcommerceDataset(df, preprocess_val)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=16,
        pin_memory=True,
        prefetch_factor=4,
        persistent_workers=True,
        collate_fn=collate_fn
    )

    all_image_features = []
    all_text_features = []

    print(f"Batch size: {batch_size}, Num workers: 16, Prefetch: 4")
    w_name, w_bullet, w_desc = text_weights

    with torch.no_grad():
        for batch_idx, (images, item_names, bullet_points, descriptions) in enumerate(
            tqdm(dataloader, desc="Processing")
        ):
            # Move images to GPU
            images = images.to(device, non_blocking=True)

            # Encode image embeddings
            image_features = model.encode_image(images, normalize=True)

            # Encode text fields separately
            name_tokens = tokenizer(item_names).to(device)
            bullet_tokens = tokenizer(bullet_points).to(device)
            desc_tokens = tokenizer(descriptions).to(device)

            name_features = model.encode_text(name_tokens, normalize=True)
            bullet_features = model.encode_text(bullet_tokens, normalize=True)
            desc_features = model.encode_text(desc_tokens, normalize=True)

            # Weighted text fusion (default 0.6/0.3/0.1)
            text_features = (
                w_name * name_features +
                w_bullet * bullet_features +
                w_desc * desc_features
            )
            text_features = torch.nn.functional.normalize(text_features, dim=-1)

            # Move to CPU
            all_image_features.append(image_features.cpu())
            all_text_features.append(text_features.cpu())

            # Periodic cache cleanup
            if batch_idx % 50 == 0 and batch_idx > 0:
                torch.cuda.empty_cache()

    # Concatenate all batches
    image_embeddings = torch.cat(all_image_features).numpy()
    text_embeddings = torch.cat(all_text_features).numpy()

    return image_embeddings, text_embeddings


# ==============================
#  Generate Train/Test Embeddings
# ==============================
print("Processing training set...")
train_img_emb, train_txt_emb = generate_embeddings_optimized(train_df, batch_size=512)

print("Processing test set...")
test_img_emb, test_txt_emb = generate_embeddings_optimized(test_df, batch_size=512)

print(f"Image embeddings: {train_img_emb.shape}")
print(f"Text embeddings: {train_txt_emb.shape}")

# Combine image + fused text embeddings
train_embeddings = np.hstack([train_img_emb, train_txt_emb])
test_embeddings = np.hstack([test_img_emb, test_txt_emb])

print(f"Combined embeddings: {train_embeddings.shape}")

# Save embeddings
np.save('train_marqo_embeddings.npy', train_embeddings)
np.save('test_marqo_embeddings.npy', test_embeddings)
print("Embeddings saved to disk")

# Memory cleanup
del model, tokenizer, preprocess_val
gc.collect()
torch.cuda.empty_cache()

[5/7] Generating Marqo Embeddings
Processing training set...
Batch size: 512, Num workers: 16, Prefetch: 4


Processing: 100%|██████████| 147/147 [34:11<00:00, 13.96s/it]  


Processing test set...
Batch size: 512, Num workers: 16, Prefetch: 4


Processing: 100%|██████████| 147/147 [24:21<00:00,  9.94s/it] 


Image embeddings: (75000, 1024)
Text embeddings: (75000, 1024)
Combined embeddings: (75000, 2048)
Embeddings saved to disk


In [18]:
# Combining Features
print("[6/7] Combining Marqo embeddings + engineered features")

scaler = RobustScaler()
train_features_scaled = scaler.fit_transform(train_features)
test_features_scaled = scaler.transform(test_features)

X_train = np.hstack([train_embeddings, train_features_scaled])
X_test = np.hstack([test_embeddings, test_features_scaled])
y_train = np.log1p(train_df['price'].values)

print(f"Final feature shape: {X_train.shape}")
print(f"Marqo embeddings: {train_embeddings.shape[1]}")
print(f"Engineered features: {train_features_scaled.shape[1]}")

[6/7] Combining Marqo embeddings + engineered features
Final feature shape: (75000, 2057)
Marqo embeddings: 2048
Engineered features: 9


### Model Training

In [None]:
# Training XGB with 5-FOLD cross-validation on GPU
print("[7/7] Training XGBoost with 5-Fold CV on GPU")

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_preds = np.zeros(len(X_train))
test_preds = np.zeros(len(X_test))
fold_scores = []

# Optimized parameters for XGBoost with GPU
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'max_depth': 8,
    'learning_rate': 0.02,
    'subsample': 0.8,         
    'colsample_bytree': 0.8,  
    'lambda': 0.2,            # L2 regularization (reg_lambda)
    'alpha': 0.2,             # L1 regularization (reg_alpha)
    'min_child_weight': 30,
    'n_jobs': -1,
    'verbosity': 0,

    # --- GPU PARAMETERS ---
    'tree_method': 'gpu_hist',   # Enables GPU acceleration for training
    'predictor': 'gpu_predictor', # Enables GPU acceleration for prediction
    # --------------------------
}

for fold, (tr_idx, val_idx) in enumerate(kf.split(X_train)):
    print(f"\n{'='*60}")
    print(f"  FOLD {fold+1}/5")
    print(f"{'='*60}")

    # Convert to DMatrix format required by xgb.train
    dtrain = xgb.DMatrix(X_train[tr_idx], label=y_train[tr_idx])
    dval = xgb.DMatrix(X_train[val_idx], label=y_train[val_idx])

    evals_result = {}

    #  Early stopping callback
    early_stop_callback = xgb.callback.EarlyStopping(
        rounds=100,
        save_best=True  # Automatically saves the best iteration
    )

    model = xgb.train(
        params,
        dtrain,
        num_boost_round=5000,
        evals=[(dval, 'validation')],
        evals_result=evals_result,
        callbacks=[early_stop_callback],
        verbose_eval=False  # Suppress per-iteration output
    )

    # Predict (OOF and Test)
    # Use best_iteration
    best_iteration = model.best_iteration
    dval_pred = xgb.DMatrix(X_train[val_idx])
    dtest_pred = xgb.DMatrix(X_test)

    # OOF Prediction: Use best_iteration directly
    oof_preds[val_idx] = model.predict(dval_pred, iteration_range=(0, best_iteration))

    # Test Prediction
    test_preds += model.predict(dtest_pred, iteration_range=(0, best_iteration)) / 5

    # Calculate SMAPE
    # Un-log transform and clip prediction (log-transformed target -> price)
    pred = np.maximum(np.expm1(oof_preds[val_idx]), 0.01)
    true = np.expm1(y_train[val_idx])  # Un-log transform true label

    # Correct SMAPE formula: uses average denominator
    smape = 100 * np.mean(2 * np.abs(true - pred) / (np.abs(true) + np.abs(pred)))
    fold_scores.append(smape)

    print(f"  Fold {fold+1} SMAPE: {smape:.4f}%")
    print(f"  Best iteration: {best_iteration}")

# Overall SMAPE
final_oof = np.maximum(np.expm1(oof_preds), 0.01)
y_true = train_df['price'].values  # Use the original target for final SMAPE
overall_smape = 100 * np.mean(2 * np.abs(y_true - final_oof) /
                             (np.abs(y_true) + np.abs(final_oof)))

print(f"\n{'='*80}")
print(f"MARQO ECOMMERCE-L OOF SMAPE: {overall_smape:.4f}%")
print(f"{'='*80}")
print(f"Fold scores: {[f'{s:.4f}%' for s in fold_scores]}")
print(f"Fold std: {np.std(fold_scores):.4f}%")
print(f"Mean fold: {np.mean(fold_scores):.4f}%")
print(f"{'='*80}")


[7/7] Training XGBoost with 5-Fold CV on GPU...

  FOLD 1/5


  Fold 1 SMAPE: 49.2047%
  Best iteration: 4999

  FOLD 2/5
  Fold 2 SMAPE: 48.0522%
  Best iteration: 4997

  FOLD 3/5
  Fold 3 SMAPE: 48.4541%
  Best iteration: 4999

  FOLD 4/5
  Fold 4 SMAPE: 47.4840%
  Best iteration: 4999

  FOLD 5/5
  Fold 5 SMAPE: 48.2107%
  Best iteration: 4998

MARQO ECOMMERCE-L OOF SMAPE: 48.2811%
Fold scores: ['49.2047%', '48.0522%', '48.4541%', '47.4840%', '48.2107%']
Fold std: 0.5614%
Mean fold: 48.2811%


### Submission

In [25]:
# Submission File
print("Generating Submission File")

final_test = np.maximum(np.expm1(test_preds), 0.01)

submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_test
})
submission.to_csv('submission.csv',
                 index=False, float_format='%.2f')

print("\n" + "="*80)
print("PREDICTION STATISTICS")
print("="*80)
print(f"Mean price: ${final_test.mean():.2f}")
print(f"Median price: ${np.median(final_test):.2f}")
print(f"Min price: ${final_test.min():.2f}")
print(f"Max price: ${final_test.max():.2f}")
print(f"Std: ${final_test.std():.2f}")
print(f"OOF SMAPE: {overall_smape:.4f}%")
print("="*80)

Generating Submission File

PREDICTION STATISTICS
Mean price: $18.55
Median price: $13.96
Min price: $0.76
Max price: $240.93
Std: $16.81
OOF SMAPE: 48.2811%
