# Criteo CTR/CVR/CTCVR Experiment
このノートブックは scripts/run_criteo_experiment.py の実行部分をインタラクティブに再現します。

In [7]:
# Environment setup and imports
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.model_selection import train_test_split

# The project root is taken to be the third ancestor of the current working
# directory (cwd -> parents[0] -> parents[1] -> parents[2]); it is pushed to
# the front of sys.path so the `src.*` imports below resolve.
ROOT = Path().resolve().parents[2]
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

from src.data.real_data import CriteoDataset, SyntheticCriteoDataset
from src.models.esmm import ESMM
from src.models.gbdt_proto import MTGBDT
from src.models.stgbdt import STGBDTBaseline
from src.models.utils import add_cvr_labels


def _binary_task_metrics(labels: np.ndarray, scores: np.ndarray) -> tuple:
    """Return ``(auc, logloss)`` for one binary task.

    Both values are NaN when ``labels`` holds fewer than two distinct values
    (including the empty case), so ``roc_auc_score`` and ``log_loss`` are
    never called on a label vector for which they are undefined.
    """
    if np.unique(labels).size < 2:
        return np.nan, np.nan
    return roc_auc_score(labels, scores), log_loss(labels, scores)


def evaluate_predictions(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Compute CTR/CTCVR/CVR AUC and LogLoss.

    Args:
        y_true: 2-D array; column 0 is read as the click label and column 1
            as the conversion label.
        y_pred: 2-D array; columns 0, 1 and 2 are read as the predicted CTR,
            CTCVR and CVR respectively.

    Returns:
        Dict with keys ``auc_ctr``, ``logloss_ctr``, ``auc_ctcvr``,
        ``logloss_ctcvr``, ``auc_cvr`` and ``logloss_cvr`` (inserted in that
        order). CVR metrics are computed on rows with click == 1 only. Any
        task whose labels contain fewer than two classes is reported as NaN
        instead of raising.
    """
    eps = 1e-8
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    y_click = y_true[:, 0]
    y_conv = y_true[:, 1]
    # CTCVR label: the row was clicked AND converted.
    y_ctcvr = y_click * y_conv

    # Clip probabilities away from 0 and 1 before scoring.
    pred_ctr = np.clip(y_pred[:, 0], eps, 1 - eps)
    pred_ctcvr = np.clip(y_pred[:, 1], eps, 1 - eps)
    pred_cvr = np.clip(y_pred[:, 2], eps, 1 - eps)

    click_mask = y_click == 1
    tasks = (
        ("ctr", y_click, pred_ctr),
        ("ctcvr", y_ctcvr, pred_ctcvr),
        # CVR is only evaluated on the clicked subset.
        ("cvr", y_conv[click_mask], pred_cvr[click_mask]),
    )

    metrics = {}
    for name, labels, scores in tasks:
        auc, logloss = _binary_task_metrics(labels, scores)
        metrics[f"auc_{name}"] = auc
        metrics[f"logloss_{name}"] = logloss
    return metrics

In [8]:
# Experiment configuration: RNG seed, sample count, split fractions, outputs
seed = 42
sample_size = 5000
val_size = 0.1
test_size = 0.2
results = []  # later cells append one metrics dict per evaluated model
output_csv = ROOT.joinpath("reports", "tables", "criteo_experiment_notebook.csv")

In [9]:
# Load the Criteo dataset; fall back to the synthetic dataset if loading fails
dataset = CriteoDataset(sample_size=sample_size)
try:
    X, y = dataset.get_data(random_state=seed)
except Exception as exc:
    # The exception types raised by get_data are not visible from this
    # notebook, so the handler stays broad, but the reason for the fallback
    # is reported instead of being silently discarded.
    print(f"CriteoDataset unavailable ({exc!r}); falling back to SyntheticCriteoDataset.")
    synth = SyntheticCriteoDataset(sample_size=sample_size)
    X, y = synth.load_data(random_state=seed)
    source = "SyntheticCriteoDataset"
else:
    source = "CriteoDataset"

# Final expression: shows the array shapes and which dataset was used.
X_shape, y_shape = X.shape, y.shape
X_shape, y_shape, source

Loading Criteo dataset from mt_gbm/data/criteo-research-uplift-v2.1.csv.gz...
Error loading Criteo dataset: [Errno 2] No such file or directory: 'mt_gbm/data/criteo-research-uplift-v2.1.csv.gz'


((5000, 10), (5000, 2), 'SyntheticCriteoDataset')

In [None]:
# Two-stage split, stratified on the click label (column 0 of y):
#   1) carve off a holdout of size test_size + val_size,
#   2) divide that holdout into validation and test parts.
holdout_fraction = test_size + val_size
X_train, X_tmp, y_train, y_tmp = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=seed,
    stratify=y[:, 0],
)

# Share of the holdout that belongs to validation; the remainder is test.
rel_val = val_size / holdout_fraction
X_val, X_test, y_val, y_test = train_test_split(
    X_tmp,
    y_tmp,
    test_size=1 - rel_val,
    random_state=seed,
    stratify=y_tmp[:, 0],
)

X_train.shape, X_val.shape, X_test.shape

In [None]:
# Train and evaluate ESMM
esmm = ESMM(epochs=5, batch_size=256, verbose=0, validation_split=0.0)
esmm.fit(X_train, y_train)
esmm_pred = esmm.predict_proba(X_test)
esmm_metrics = evaluate_predictions(y_test, esmm_pred)
esmm_metrics.update({"model": "ESMM", "n_params": esmm.model_.count_params() if esmm.model_ else 0})

# Drop any row left by a previous run of this cell so re-executing it does not
# duplicate the model in `results` (mutated in place to keep the same list).
results[:] = [r for r in results if r.get("model") != "ESMM"]
results.append(esmm_metrics)

# Must be the final expression of the cell for the notebook to display it.
esmm_metrics

In [None]:
# Train and evaluate MTGBDT
mtgbdt = MTGBDT(
    n_estimators=20, learning_rate=0.1, max_depth=3, n_tasks=3, loss="logloss", weighting_strategy="mtgbm"
)
mtgbdt.fit(X_train, y_train)
mtgbdt_pred = mtgbdt.predict_proba(X_test)
mtgbdt_metrics = evaluate_predictions(y_test, mtgbdt_pred)
mtgbdt_metrics.update({"model": "MTGBDT", "n_estimators": mtgbdt.n_estimators})

# Drop any row left by a previous run of this cell so re-executing it does not
# duplicate the model in `results` (mutated in place to keep the same list).
results[:] = [r for r in results if r.get("model") != "MTGBDT"]
results.append(mtgbdt_metrics)

# Must be the final expression of the cell for the notebook to display it.
mtgbdt_metrics

In [None]:
# Train and evaluate STGBDT baseline
stg = STGBDTBaseline(
    n_estimators=10, learning_rate=0.3, max_depth=2, min_samples_split=20, min_samples_leaf=10
)
stg.fit(X_train, y_train)
stg_pred = stg.predict_proba(X_test)
stg_metrics = evaluate_predictions(y_test, stg_pred)
stg_metrics.update({"model": "STGBDTBaseline", "n_estimators": stg.n_estimators})

# Drop any row left by a previous run of this cell so re-executing it does not
# duplicate the model in `results` (mutated in place to keep the same list).
results[:] = [r for r in results if r.get("model") != "STGBDTBaseline"]
results.append(stg_metrics)

# Must be the final expression of the cell for the notebook to display it.
stg_metrics

In [None]:
# Print one summary dict per model, then build the full results table
headers = [
    "model",
    "auc_ctr",
    "auc_ctcvr",
    "auc_cvr",
    "logloss_ctr",
    "logloss_ctcvr",
    "logloss_cvr",
]
print("\n=== Experiment Results ===")
for res in results:
    # Restrict each record to the summary columns; absent keys become NaN.
    row = dict((h, res.get(h, np.nan)) for h in headers)
    print(row)

# The DataFrame keeps every recorded field, not only the summary columns.
df_results = pd.DataFrame(results)
df_results

In [None]:
# Write the metrics table to disk, creating the target directory if missing
csv_dir = output_csv.parent
csv_dir.mkdir(parents=True, exist_ok=True)
df_results.to_csv(output_csv, index=False)
output_csv