# CatBoost CV Upgrade
Enhanced notebook with richer feature engineering, robust cross-validation, and calibrated CatBoost inference for the Datathon production challenge.

In [1]:
# Imports and configuration
from pathlib import Path
import warnings

import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from catboost.utils import get_gpu_device_count
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import KFold

warnings.filterwarnings("ignore", category=FutureWarning)

SEED = 42
GPU_DEVICES = "0"  # set to "0,1" to leverage multiple GPUs
GPU_AVAILABLE = get_gpu_device_count() > 0
CATBOOST_TASK_TYPE = "GPU" if GPU_AVAILABLE else "CPU"
if GPU_AVAILABLE:
    print(f"CatBoost GPU detected on device(s): {GPU_DEVICES}")
else:
    print("CatBoost GPU not detected; falling back to CPU training.")

DATA_DIR = Path("data")
SUBMISSION_DIR = Path("submissions")
SUBMISSION_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_PATH = DATA_DIR / "train.csv"
TEST_PATH = DATA_DIR / "test.csv"
SAMPLE_SUB_PATH = DATA_DIR / "sample_submission.csv"

for required_path in [TRAIN_PATH, TEST_PATH]:
    if not required_path.exists():
        raise FileNotFoundError(
            f"Missing required dataset: {required_path.resolve()} â€” "
            "please place the competition CSVs inside data/."
)

def load_dataset(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path, sep=";")
    df = df.loc[:, ~df.columns.str.contains("^Unnamed")]
    return df

train_df = load_dataset(TRAIN_PATH)
test_df = load_dataset(TEST_PATH)
sample_submission = load_dataset(SAMPLE_SUB_PATH) if SAMPLE_SUB_PATH.exists() else None

print(f"Train shape: {train_df.shape} | Test shape: {test_df.shape}")

CatBoost GPU detected on device(s): 0
Train shape: (95339, 33) | Test shape: (2250, 28)


In [2]:
# Helper utilities for feature engineering
def parse_embedding_vector(raw_value):
    if pd.isna(raw_value) or raw_value == "":
        return np.zeros(1, dtype=np.float32)
    text = str(raw_value).replace("[", "").replace("]", "")
    values = [part.strip() for part in text.split(",") if part.strip() != ""]
    if not values:
        return np.zeros(1, dtype=np.float32)
    try:
        return np.array([float(v) for v in values], dtype=np.float32)
    except ValueError:
        return np.zeros(len(values) or 1, dtype=np.float32)

def build_embedding_matrix(series: pd.Series) -> np.ndarray:
    parsed = [parse_embedding_vector(value) for value in series]
    max_dim = max((vec.size for vec in parsed), default=1)
    matrix = np.zeros((len(parsed), max_dim), dtype=np.float32)
    for row_idx, vec in enumerate(parsed):
        matrix[row_idx, : vec.size] = vec
    return matrix

def add_embedding_features(df: pd.DataFrame, column: str = "image_embedding", n_components: int = 48) -> pd.DataFrame:
    if column not in df.columns:
        return df
    df = df.copy()
    emb_matrix = build_embedding_matrix(df[column])
    emb_numeric = np.nan_to_num(emb_matrix)
    df["img_emb_mean"] = emb_numeric.mean(axis=1)
    df["img_emb_std"] = emb_numeric.std(axis=1)
    df["img_emb_abs_mean"] = np.abs(emb_numeric).mean(axis=1)
    df["img_emb_max"] = emb_numeric.max(axis=1)
    df["img_emb_min"] = emb_numeric.min(axis=1)
    df["img_emb_median"] = np.nanmedian(emb_numeric, axis=1)
    df["img_emb_q25"] = np.nanquantile(emb_numeric, 0.25, axis=1)
    df["img_emb_q75"] = np.nanquantile(emb_numeric, 0.75, axis=1)
    df["img_emb_energy"] = (emb_numeric ** 2).sum(axis=1)
    df["img_emb_norm"] = np.linalg.norm(emb_numeric, axis=1)

    usable_components = min(n_components, emb_matrix.shape[1], max(1, emb_matrix.shape[0] - 1))
    emb_pca = None
    if emb_matrix.shape[0] > 1 and usable_components >= 1:
        pca = PCA(n_components=usable_components, random_state=SEED)
        emb_pca = pca.fit_transform(emb_numeric)
        for comp_idx in range(usable_components):
            df[f"img_pca_{comp_idx}"] = emb_pca[:, comp_idx]

    cluster_source = emb_pca if emb_pca is not None and emb_pca.shape[1] >= 2 else emb_numeric
    if cluster_source.shape[0] >= 2:
        cluster_count = min(8, max(2, cluster_source.shape[0]))
        try:
            kmeans = KMeans(n_clusters=cluster_count, random_state=SEED, n_init=10)
            cluster_labels = kmeans.fit_predict(cluster_source)
            df["img_emb_cluster"] = cluster_labels.astype(str)
            df["img_emb_cluster_dist"] = kmeans.transform(cluster_source).min(axis=1)
        except ValueError:
            df["img_emb_cluster"] = "0"
            df["img_emb_cluster_dist"] = 0.0

    df = df.drop(columns=[column])
    return df

def safe_ratio(numerator: pd.Series, denominator: pd.Series) -> pd.Series:
    denominator = denominator.replace(0, np.nan)
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan)

def engineer_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    for numeric_col in ["num_stores", "num_sizes", "weekly_demand"]:
        if numeric_col in df.columns:
            df[numeric_col] = pd.to_numeric(df[numeric_col], errors="coerce")

    for col in ["phase_in", "phase_out"]:
        if col in df.columns:
            df[f"{col}_dt"] = pd.to_datetime(df[col], format="%d/%m/%Y", errors="coerce")

    if "phase_in_dt" in df.columns:
        df["phase_in_year"] = df["phase_in_dt"].dt.year
        df["phase_in_month"] = df["phase_in_dt"].dt.month
        df["phase_in_day"] = df["phase_in_dt"].dt.day
        df["phase_in_dayofyear"] = df["phase_in_dt"].dt.dayofyear
        df["phase_in_week"] = df["phase_in_dt"].dt.isocalendar().week.astype(float)
        df["phase_in_weekday"] = df["phase_in_dt"].dt.weekday
        df["phase_in_weekend"] = (df["phase_in_dt"].dt.weekday >= 5).astype(int)
        df["phase_in_quarter"] = ((df["phase_in_month"] - 1) // 3 + 1).astype(float)
        base_launch = df["phase_in_dt"].min()
        if pd.notna(base_launch):
            df["phase_in_age_days"] = (df["phase_in_dt"] - base_launch).dt.days

    if "phase_out_dt" in df.columns:
        df["phase_out_year"] = df["phase_out_dt"].dt.year
        df["phase_out_month"] = df["phase_out_dt"].dt.month
        df["phase_out_week"] = df["phase_out_dt"].dt.isocalendar().week.astype(float)
        min_phase_out = df["phase_out_dt"].min()
        if pd.notna(min_phase_out):
            df["phase_out_age_days"] = (df["phase_out_dt"] - min_phase_out).dt.days

    if {"phase_in_dt", "phase_out_dt"}.issubset(df.columns):
        df["lifecycle_days"] = (df["phase_out_dt"] - df["phase_in_dt"]).dt.days
        df["lifecycle_days"] = df["lifecycle_days"].clip(lower=0)
        df["lifecycle_weeks"] = df["lifecycle_days"] / 7.0
    else:
        df["lifecycle_days"] = np.nan
        df["lifecycle_weeks"] = np.nan

    df["lifecycle_missing"] = df["lifecycle_days"].isna().astype(int)
    df["lifecycle_days"] = df["lifecycle_days"].fillna(df["lifecycle_days"].median())
    df["lifecycle_weeks"] = df["lifecycle_weeks"].fillna(df["lifecycle_weeks"].median())

    if "phase_in_month" in df.columns:
        season_map = {
            "winter": [12, 1, 2],
            "spring": [3, 4, 5],
            "summer": [6, 7, 8],
            "fall": [9, 10, 11],
        }
        for season_name, months in season_map.items():
            df[f"launch_{season_name}"] = df["phase_in_month"].isin(months).astype(int)
        df["phase_in_year_progress"] = df["phase_in_dayofyear"] / 365.0

    for cyc_col, period in [("phase_in_month", 12), ("phase_in_dayofyear", 365), ("phase_in_week", 52)]:
        if cyc_col in df.columns:
            angle = 2 * np.pi * df[cyc_col].fillna(0) / period
            df[f"{cyc_col}_sin"] = np.sin(angle)
            df[f"{cyc_col}_cos"] = np.cos(angle)

    if "color_rgb" in df.columns:
        def parse_rgb(value):
            if pd.isna(value) or value == "":
                return [128, 128, 128]
            try:
                parts = [int(float(x)) for x in str(value).split(",")]
                return parts if len(parts) == 3 else [128, 128, 128]
            except ValueError:
                return [128, 128, 128]

        rgb_values = np.array(df["color_rgb"].apply(parse_rgb).tolist())
        df["color_r"] = rgb_values[:, 0]
        df["color_g"] = rgb_values[:, 1]
        df["color_b"] = rgb_values[:, 2]
        df["color_mean"] = rgb_values.mean(axis=1)
        df["color_std"] = rgb_values.std(axis=1)
        df["color_range"] = np.ptp(rgb_values, axis=1)
        df["is_dark_color"] = (df["color_mean"] < 90).astype(int)

    ratio_specs = [
        ("weekly_demand", "num_stores", "demand_per_store"),
        ("weekly_demand", "num_sizes", "demand_per_size"),
        ("num_stores", "num_sizes", "stores_per_size"),
        ("weekly_demand", "lifecycle_weeks", "demand_per_week"),
        ("num_stores", "lifecycle_weeks", "stores_per_week"),
        ("num_sizes", "lifecycle_weeks", "sizes_per_week"),
        ("lifecycle_days", "num_stores", "lifecycle_per_store"),
        ("lifecycle_days", "num_sizes", "lifecycle_per_size"),
        ("num_stores", "weekly_demand", "stores_to_demand"),
        ("num_sizes", "weekly_demand", "sizes_to_demand"),
    ]
    for numerator_col, denominator_col, feature_name in ratio_specs:
        if numerator_col in df.columns and denominator_col in df.columns:
            df[feature_name] = safe_ratio(df[numerator_col], df[denominator_col])

    if {"num_stores", "num_sizes"}.issubset(df.columns):
        df["stores_times_sizes"] = df["num_stores"] * df["num_sizes"]
        df["stores_minus_sizes"] = df["num_stores"] - df["num_sizes"]
        df["stores_plus_sizes"] = df["num_stores"] + df["num_sizes"]
        if "weekly_demand" in df.columns:
            df["demand_per_store_size"] = safe_ratio(df["weekly_demand"], df["stores_times_sizes"])

    if {"weekly_demand", "lifecycle_weeks"}.issubset(df.columns):
        df["demand_times_lifecycle"] = df["weekly_demand"] * df["lifecycle_weeks"]

    log_candidates = [
        "weekly_demand",
        "num_stores",
        "num_sizes",
        "lifecycle_days",
        "demand_per_store",
        "demand_per_size",
        "demand_per_week",
        "demand_per_store_size",
    ]
    for col in log_candidates:
        if col in df.columns:
            df[f"log_{col}"] = np.log1p(df[col].clip(lower=0))

    for col in ["product_name", "product_type", "designer", "theme", "description"]:
        if col in df.columns:
            text_series = df[col].fillna("").astype(str)
            df[f"{col}_len"] = text_series.str.len()
            df[f"{col}_words"] = text_series.str.split().str.len()

    categorical_cols = df.select_dtypes(include=["object", "category"]).columns
    for col in categorical_cols:
        freq = df[col].value_counts(dropna=False, normalize=True)
        df[f"{col}_freq"] = df[col].map(freq)

    protected_missing_cols = {"Production", "is_train"}
    missing_cols = [col for col in df.columns if col not in protected_missing_cols and df[col].isna().any()]
    for col in missing_cols:
        df[f"{col}_missing_flag"] = df[col].isna().astype(int)
    df["row_missing_count"] = df.isna().sum(axis=1)
    df["row_missing_ratio"] = df["row_missing_count"] / max(1, df.shape[1])

    df = df.drop(columns=["phase_in", "phase_out", "color_rgb", "phase_in_dt", "phase_out_dt"], errors="ignore")
    return df

In [3]:
# Feature engineering and dataset assembly
train_df = train_df.copy()
test_df = test_df.copy()

train_df["is_train"] = 1
test_df["is_train"] = 0
test_df["Production"] = np.nan

full_df = pd.concat([train_df, test_df], ignore_index=True)
full_df = add_embedding_features(full_df, column="image_embedding", n_components=48)
full_df = engineer_features(full_df)

train_processed = full_df[full_df["is_train"] == 1].drop(columns=["is_train"])
test_processed = full_df[full_df["is_train"] == 0].drop(columns=["is_train"])

test_ids = test_processed["ID"].copy() if "ID" in test_processed.columns else pd.Series(np.arange(len(test_processed)))

if "ID" in train_processed.columns:
    train_processed = train_processed.drop(columns=["ID"])
    test_processed = test_processed.drop(columns=["ID"])

y = train_processed["Production"].astype(float)
X = train_processed.drop(columns=["Production"]).reset_index(drop=True)
test_features = test_processed.drop(columns=["Production"], errors="ignore").reset_index(drop=True)

numeric_cols = X.select_dtypes(include=[np.number]).columns
numeric_medians = X[numeric_cols].median().fillna(0)

X[numeric_cols] = X[numeric_cols].fillna(numeric_medians)
for col in numeric_cols:
    fill_value = numeric_medians[col] if col in numeric_medians else 0
    if col in test_features.columns:
        test_features[col] = test_features[col].fillna(fill_value)
    else:
        test_features[col] = fill_value

missing_test_cols = [col for col in X.columns if col not in test_features.columns]
for col in missing_test_cols:
    test_features[col] = 0

extra_test_cols = [col for col in test_features.columns if col not in X.columns]
if extra_test_cols:
    test_features = test_features.drop(columns=extra_test_cols)

test_features = test_features[X.columns]

X[numeric_cols] = X[numeric_cols].astype(np.float32)
test_features[numeric_cols] = test_features[numeric_cols].astype(np.float32)

categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()
for col in categorical_cols:
    X[col] = X[col].astype(str).fillna("missing")
    if col in test_features.columns:
        test_features[col] = test_features[col].astype(str).fillna("missing")

cat_feature_indices = [X.columns.get_loc(col) for col in categorical_cols]
cat_features_for_pool = cat_feature_indices if categorical_cols else None

print(f"Final training matrix: {X.shape} | Test matrix: {test_features.shape}")
print(f"Categorical features tracked: {len(categorical_cols)} | Numeric features: {len(X.columns) - len(categorical_cols)}")

  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df[f"{col}_missing_flag"] = df[col].isna().astype(int)
  df["row_missing_count"] = df.isna().sum(axis=1)
  df["row_missing_ratio"] = df["row_missing_count"] / max(1, df.shape[1])


Final training matrix: (95339, 192) | Test matrix: (2250, 192)
Categorical features tracked: 16 | Numeric features: 176


In [4]:
# Cross-validated CatBoost training and calibrated inference
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
oof_predictions = np.zeros(len(X))
test_predictions = np.zeros(len(test_features))
feature_importance = np.zeros(len(X.columns), dtype=float)

catboost_base_params = dict(
    iterations=2000,
    depth=8,
    learning_rate=0.025,
    loss_function="RMSE",
    eval_metric="RMSE",
    l2_leaf_reg=6.0,
    min_child_samples=25,
    random_strength=0.4,
    bagging_temperature=0.75,
    od_type="Iter",
    od_wait=200,
    verbose=250,
)

if CATBOOST_TASK_TYPE == "GPU":
    catboost_base_params.update(
        task_type="GPU",
        devices=GPU_DEVICES,
        grow_policy="SymmetricTree",
        bootstrap_type="Bayesian",
        leaf_estimation_backtracking="AnyImprovement",
    )
else:
    catboost_base_params.update(
        grow_policy="Lossguide",
        subsample=0.85,
        colsample_bylevel=0.85,
    )

for fold, (train_idx, valid_idx) in enumerate(kf.split(X), start=1):
    X_train_fold, X_valid_fold = X.iloc[train_idx], X.iloc[valid_idx]
    y_train_fold, y_valid_fold = y.iloc[train_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train_fold, y_train_fold, cat_features=cat_features_for_pool)
    valid_pool = Pool(X_valid_fold, y_valid_fold, cat_features=cat_features_for_pool)

    fold_params = catboost_base_params.copy()
    fold_params["random_seed"] = SEED + fold
    if CATBOOST_TASK_TYPE == "GPU":
        fold_params.pop("subsample", None)
        fold_params.pop("colsample_bylevel", None)
    model = CatBoostRegressor(**fold_params)
    model.fit(train_pool, eval_set=valid_pool, use_best_model=True)

    fold_oof = model.predict(valid_pool)
    oof_predictions[valid_idx] = fold_oof
    fold_rmse = root_mean_squared_error(y_valid_fold, fold_oof)
    print(f"Fold {fold} RMSE: {fold_rmse:.4f}")

    test_predictions += model.predict(Pool(test_features, cat_features=cat_features_for_pool)) / kf.n_splits
    feature_importance += model.get_feature_importance(type="FeatureImportance")

cv_rmse = root_mean_squared_error(y, oof_predictions)
y_mean = y.mean()
oof_mean = oof_predictions.mean()
bias_correction = y_mean / oof_mean if np.isfinite(oof_mean) and abs(oof_mean) > 1e-6 else 1.0
test_predictions = np.clip(test_predictions * bias_correction, 0, None)

submission_df = pd.DataFrame({
    "ID": test_ids.values,
    "Production": np.rint(test_predictions).astype(int)
})
submission_path = SUBMISSION_DIR / f"submission_catboost_cv_{cv_rmse:.3f}.csv"
submission_df.to_csv(submission_path, index=False)

feature_importance_df = (
    pd.DataFrame({"feature": X.columns, "importance": feature_importance / kf.n_splits})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print(f"OOF RMSE: {cv_rmse:.4f}")
print(f"Bias correction factor: {bias_correction:.4f}")
print(f"Submission saved to: {submission_path}")
feature_importance_df.head(25)

0:	learn: 34215.3790280	test: 33569.9394330	best: 33569.9394330 (0)	total: 104ms	remaining: 3m 27s
250:	learn: 6999.2317910	test: 7350.7976290	best: 7350.7976290 (250)	total: 15s	remaining: 1m 44s
250:	learn: 6999.2317910	test: 7350.7976290	best: 7350.7976290 (250)	total: 15s	remaining: 1m 44s
500:	learn: 5354.8896995	test: 5757.2497735	best: 5757.2497735 (500)	total: 30.1s	remaining: 1m 30s
500:	learn: 5354.8896995	test: 5757.2497735	best: 5757.2497735 (500)	total: 30.1s	remaining: 1m 30s
750:	learn: 4382.8612567	test: 4790.7473537	best: 4790.7473537 (750)	total: 45.3s	remaining: 1m 15s
750:	learn: 4382.8612567	test: 4790.7473537	best: 4790.7473537 (750)	total: 45.3s	remaining: 1m 15s
1000:	learn: 3711.7183077	test: 4128.2167038	best: 4128.2167038 (1000)	total: 1m	remaining: 1m
1000:	learn: 3711.7183077	test: 4128.2167038	best: 4128.2167038 (1000)	total: 1m	remaining: 1m
1250:	learn: 3226.4173046	test: 3648.5937416	best: 3648.5937416 (1250)	total: 1m 15s	remaining: 44.9s
1250:	learn: 

Unnamed: 0,feature,importance
0,stores_times_sizes,11.391924
1,lifecycle_per_store,8.974239
2,price,6.928117
3,demand_times_lifecycle,6.279555
4,stores_plus_sizes,5.836264
5,stores_minus_sizes,5.385672
6,life_cycle_length,4.422606
7,stores_per_week,4.092043
8,lifecycle_weeks,3.487684
9,num_stores,1.99434


## GPU runtime notes
- CatBoost now auto-detects CUDA availability via `get_gpu_device_count()` and switches to `task_type="GPU"` on device string `"0"` (edit `GPU_DEVICES` if you want multi-GPU like `"0-1"`).
- Training matrices downcast numeric columns to `float32` before constructing `Pool` objects, which reduces host memory pressure and shortens PCIe transfer time on the RTX 3060.
- When no compatible GPU is present, the notebook falls back to CPU training automatically while keeping identical hyperparameters aside from `grow_policy`.
- Ensure NVIDIA drivers are up to date; CatBoost wheels already ship with the matching CUDA runtime on Windows, so no manual toolkit install is required in most setups.