In [18]:
import numpy as np

def bootstrap_metric(metric_values, n_boot=1000, ci=95):
    """Bootstrap confidence intervals for a given metric."""
    boot_samples = np.random.choice(metric_values, (n_boot, len(metric_values)), replace=True)
    boot_means = np.mean(boot_samples, axis=1)
    lower_bound = np.percentile(boot_means, (100 - ci) / 2)
    upper_bound = np.percentile(boot_means, 100 - (100 - ci) / 2)
    return np.mean(boot_means), np.std(boot_means), (lower_bound, upper_bound)
      

Train optimization scores

In [28]:
logs = [np.float64(-1.1674955426771585), np.float64(-1.2416858621233817), np.float64(-1.1725122542366642), np.float64(-1.2230548363216198), np.float64(-1.181651547798197)]
accs = [np.float64(0.3569003395246655), np.float64(0.2309224945208209), np.float64(0.2751396648044693), np.float64(0.2809190809190809), np.float64(0.28121888070105555)]

In [29]:
np.std(accs)*100

np.float64(4.055172099434916)

In [31]:
final_log, std_log, (log_lb, log_ub) = bootstrap_metric(logs)
final_acc, std_acc, (acc_lb, acc_ub) = bootstrap_metric(accs)

print(f"\nFinal Nested CV Log Score: {final_log:.4f} +- {std_log:.4f} (CI: [{log_lb:.3f}, {log_ub:.3f}])")
print(f"Final Nested CV Accuracy: {final_acc:.4f} +- {std_acc:.4f} (CI: [{acc_lb:.3f}, {acc_ub:.3f}])")


Final Nested CV Log Score: -1.1968 +- 0.0130 (CI: [-1.224, -1.172])
Final Nested CV Accuracy: 0.2840 +- 0.0178 (CI: [0.251, 0.317])


In [3]:
np.mean(accs), np.std(accs)

(np.float64(0.28502009209401846), np.float64(0.04055172099434916))

In [5]:
pairs = [(np.float64(-1.1675239535122772), np.float64(0.6656680647094069)), (np.float64(-1.7300877507687165), np.float64(0.6658696951583981)), (np.float64(-1.260455793553574), np.float64(0.6656025538707103)), (np.float64(-1.0118600494526304), np.float64(0.6657342657342658)), (np.float64(-1.1716473707515658), np.float64(0.6662019518024298))]

In [6]:
logs = [p[0] for p in pairs]
accs = [p[1] for p in pairs]

In [8]:
np.mean(logs), np.std(logs)

(np.float64(-1.2683149836077527), np.float64(0.24436384664484645))

In [11]:
np.mean(accs), np.std(accs)*100

(np.float64(0.6658153062550423), np.float64(0.021258605219092622))

Nested cv bootstrap

In [33]:
accs = np.array([0.6656680647094069,
0.6658696951583981,
0.6656025538707103,
0.6657342657342658,
0.6662019518024298])

In [34]:
logs = ([-1.1675239535122772,
-1.7300877507687165,
-1.260455793553574,
-1.0118600494526304,
-1.1716473707515658])

In [35]:
final_log, std_log, (log_lb, log_ub) = bootstrap_metric(logs)
final_acc, std_acc, (acc_lb, acc_ub) = bootstrap_metric(accs)

print(f"\nFinal Nested CV Log Score: {final_log:.4f} +- {std_log:.4f} (CI: [{log_lb:.3f}, {log_ub:.3f}])")
print(f"Final Nested CV Accuracy: {final_acc:.4f} +- {std_acc:.4f} (CI: [{acc_lb:.3f}, {acc_ub:.3f}])")


Final Nested CV Log Score: -1.2652 +- 0.1089 (CI: [-1.506, -1.105])
Final Nested CV Accuracy: 0.6658 +- 0.0001 (CI: [0.666, 0.666])


In [24]:
import numpy as np
import scipy.stats as stats

def t_distribution_ci(scores, ci=95):
    """
    Compute confidence interval using a t-distribution for small sample sizes.

    Args:
        scores (list or np.array): List of metric values (e.g., log loss from 5 folds).
        ci (float): Confidence level (default is 95%).

    Returns:
        mean_score (float): Mean of the scores.
        std_dev (float): Standard deviation (corrected for small sample size).
        ci_bounds (tuple): Confidence interval bounds.
    """
    scores = np.array(scores)
    n = len(scores)
    mean_score = np.mean(scores)
    std_dev = np.std(scores, ddof=1)  # Use ddof=1 for unbiased estimate (Bessel's correction)
    
    # Compute t critical value for (n-1) degrees of freedom
    t_crit = stats.t.ppf((1 + ci / 100) / 2, df=n - 1)
    
    # Compute margin of error
    margin_of_error = t_crit * (std_dev / np.sqrt(n))
    
    # Confidence interval
    ci_lower = mean_score - margin_of_error
    ci_upper = mean_score + margin_of_error

    return mean_score, std_dev, (ci_lower, ci_upper)

# Example usage
scores = [-1.19, -1.22, -1.21, -1.18, -1.20]  # Example log loss values from 5 folds
mean, std, ci_bounds = t_distribution_ci(scores)

print(f"Mean: {mean:.3f}, Std Dev: {std:.3f}, 95% CI: {ci_bounds}")


Mean: -1.200, Std Dev: 0.016, 95% CI: (np.float64(-1.2196324316147755), np.float64(-1.1803675683852244))


In [27]:
a = [(np.float64(-19.907910660218143), np.float64(0.42360695026962253)), (np.float64(-20.20439280642713), np.float64(0.41502291293086274)), (np.float64(-19.922506495947086), np.float64(0.4231843575418994)), (np.float64(-19.98487441351875), np.float64(0.42137862137862137)), (np.float64(-20.196344850718535), np.float64(0.41525592511451903))]
logs = [b[0] for b in a]
accs = [b[1] for b in a]


mean, std, ci_bounds = t_distribution_ci(logs)

print(f"Mean: {mean:.3f}, Std Dev: {std:.3f}, 95% CI: {ci_bounds}")


mean, std, ci_bounds = t_distribution_ci(accs)

print(f"Mean: {mean:.3f}, Std Dev: {std:.3f}, 95% CI: {ci_bounds}")

Mean: -20.043, Std Dev: 0.146, 95% CI: (np.float64(-20.22496067421807), np.float64(-19.861451016513787))
Mean: 0.420, Std Dev: 0.004, 95% CI: (np.float64(0.4144274121651212), np.float64(0.42495209472908874))
