# Quick start example

This script shows a minimal pipeline to train a forest of PMQRTs to predict confidence intervals targeting a marginal coverage of $1-\alpha$:
- either by predicting the quantiles of order $\alpha/2$ and $1-\alpha/2$, i.e. confidence intervals are $$[\hat q_{\alpha/2}, \hat q_{1-\alpha/2}]$$
- or using conformal prediction with CQR nested sets with a nominal quantile level equal to $2\alpha$, meaning that the conformal prediction intervals are of the form:
$$ [\hat q_{2\alpha}-t, \hat q_{1-2\alpha}+t]$$
where $t$ is a real parameter adjusted at the conformalization step. For more information on CQR, see this paper: https://arxiv.org/abs/1905.03222. 

In [None]:
import os
import sys
import numpy as np

# Current working directory of the kernel (normally the folder holding this notebook)
NOTEBOOK_DIR = os.getcwd()

# Repository root: four directory levels above NOTEBOOK_DIR
# (add or remove os.pardir components if the notebook is moved)
REPO_ROOT = os.path.abspath(
    os.path.join(NOTEBOOK_DIR, os.pardir, os.pardir, os.pardir, os.pardir)
)

# Put the repository first on the import path, unless it is already listed
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

from DisTreebution.UQ.UQ import UQ

# Seed NumPy's global RNG so the synthetic data and the split below are
# reproducible under "Restart Kernel -> Run All" (this also fixes the stream
# seen by any later code that draws from NumPy's global RNG).
SEED = 0
np.random.seed(SEED)

# Create synthetic data: 3 standard-normal features; the response is
# 2 * (first feature) plus Gaussian noise with standard deviation 0.5
n = 900
alpha = 0.1  # miscoverage level: intervals target a coverage of 1 - alpha
X = np.random.randn(n, 3)
y = X[:, 0]*2.0 + 0.5*np.random.randn(n)

# Split train / calib / test: shuffle the indices, then cut them into thirds.
# Sizes are derived from n so that changing n keeps all three parts non-empty
# (for n = 900 this is 300 / 300 / 300).
idx = np.arange(n)
np.random.shuffle(idx)
n_train = n_calib = n // 3
train_idx = idx[:n_train]
calib_idx = idx[n_train:n_train + n_calib]
test_idx = idx[n_train + n_calib:]

X_train, y_train = X[train_idx], y[train_idx]
X_calib, y_calib = X[calib_idx], y[calib_idx]
X_test, y_test = X[test_idx], y[test_idx]

Training trees...


100%|██████████| 40/40 [00:11<00:00,  3.61it/s]


Done training. 40 trees
Conformalizing...
Conformalization done.
Predicting conformal sets...
Average width: 1.8486468901945303
Empirical coverage: 0.9133333333333333
Example finished.


# 1. Getting confidence intervals by predicting empirical quantiles without conformalization

In [None]:
nTrees = 40
# Every tree is trained on the two quantile levels alpha/2 and 1 - alpha/2
treeID2quantiles = {treeID: [alpha/2, 1-alpha/2] for treeID in range(nTrees)}
params = {'nTrees': nTrees, 'max_depth': 6, 'min_samples_split': 5, 'treeID2quantiles_train': treeID2quantiles}

# Instantiate UQ: PMQRT trees with no conformalization step (type_conformal=None)
uq = UQ(type_tree='PMQRT', type_conformal=None, params=params)

# Train trees (sample2calib is returned by train_trees but not used in this section)
print('Training trees...')
trees, sample2calib = uq.train_trees(X_train, y_train)
print('Done training.', len(trees), 'trees')

# Predict the interval between the alpha/2 and 1 - alpha/2 quantile estimates on the test set
print('Predicting quantile intervals...')
sample2predset = uq.get_quantile_estimate(trees, X_test, quantiles=[alpha/2, 1-alpha/2])

# Evaluate widths and coverage
widths, coverages = uq.compute_width_coverage(sample2predset, y_test)
print('Average width:', float(np.mean(widths)))
print('Empirical coverage:', float(np.mean(coverages)))

Training trees...


100%|██████████| 40/40 [00:10<00:00,  3.99it/s]


Done training. 40 trees
Predicting conformal sets...
Average width: 1.5073783052203367
Empirical coverage: 0.8333333333333334
Example finished.


# 2. Getting confidence intervals with conformalization

In [13]:
# Forest configuration: every tree is trained on the nominal quantile pair
# (2*alpha, 1 - 2*alpha) that the CQR nested sets are built around.
nTrees = 40
treeID2quantiles = {
    treeID: [2*alpha, 1-2*alpha]
    for treeID in range(nTrees)
}
params = {
    'nTrees': nTrees,
    'max_depth': 6,
    'min_samples_split': 5,
    'treeID2quantiles_train': treeID2quantiles,
}

# UQ wrapper: PMQRT trees, split conformal prediction, CQR nested sets
uq = UQ(
    type_tree='PMQRT',
    type_conformal="split",
    nested_set='CQR',
    params=params,
)

# Fit the forest on the training split
print('Training trees...')
trees, sample2calib = uq.train_trees(X_train, y_train)
print('Done training.', len(trees), 'trees')

# Conformalize on the calibration split at level alpha; the nominal quantile
# passed here is the same 2*alpha used as the lower training quantile above.
print('Conformalizing...')
uq.conformalize(
    trees,
    X_calib,
    y_calib,
    alpha,
    nominal_quantiles=[2*alpha],
)
print('Conformalization done.')

# Conformal prediction sets for the test split
print('Predicting conformal sets...')
sample2predset = uq.predict_conformal_set(trees, X_test)

# Width / coverage on the test split. Only entry 0 of the prediction output is
# evaluated -- presumably the sets for the single nominal quantile given above;
# TODO confirm against the UQ API.
widths, coverages = uq.compute_width_coverage(sample2predset[0], y_test)
print('Average width:', float(np.mean(widths)))
print('Empirical coverage:', float(np.mean(coverages)))

Training trees...


100%|██████████| 40/40 [00:11<00:00,  3.54it/s]


Done training. 40 trees
Conformalizing...
Conformalization done.
Predicting conformal sets...
Average width: 1.8653316403691773
Empirical coverage: 0.9166666666666666
Example finished.
