In [1]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from tqdm import tqdm
from utils import simulate_bagging_and_ijk_var_calculation
import pandas as pd

In [2]:

####### Simulation parameters  #########
n_x = 1000                      # number of values in each simulated data set
n_sim = 2000                    # number of independent simulated data sets
portion_non_zero_weights = 0.5  # fraction of the n_x weights that are non-zero
var_x = 1                       # variance of the simulated normal values
seed = 41                       # seeds the data generator and is forwarded to every worker

B_values = [500, 1000, 2000, 4000, 10000]  # Different B values (bootstrap samples)

####### Simulated data (shared by every B)  #########
# The generator is always created from the same `seed`, so the simulated data
# and the weights do not depend on B. Build them once instead of redrawing the
# identical (n_sim, n_x) array on every pass through the loop.
rng = np.random.default_rng(seed)
m = int(n_x * portion_non_zero_weights)          # number of non-zero weights
x_sim = rng.normal(0, var_x**0.5, (n_sim, n_x))  # simulate n_sim data-sets (one per row)
weights = np.zeros(n_x)
weights[:m] = 1 / m                              # weights for the first m variables = 1/m , else 0

# Reference value reported in the 'True Variance' column.
# NOTE(review): presumably the variance of the equally weighted mean of the
# first m values -- confirm against the estimator implemented in utils.
true_variance = var_x / m

# Prepare to collect results (one summary row per B)
results = []

# Loop over each B value
for B in B_values:
    print(f"Running simulations for B = {B}")

    # Per-simulation outputs for this B, indexed by simulation number.
    theta_bagged = np.zeros(n_sim)
    theta_bagged_var_ijk = np.zeros(n_sim)
    theta_bagged_var_ijk_biased = np.zeros(n_sim)

    # Keep the worker pool open only while tasks are being submitted and collected.
    with ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(
                simulate_bagging_and_ijk_var_calculation,
                x1=x_sim[i],
                B=B,
                sim_i=i,
                seed=seed,
                weights=weights,
                m=m,
            )
            for i in range(n_sim)
        ]

        # Results are consumed in submission order, so index i matches sim_i.
        for i, future in enumerate(tqdm(futures, desc=f"Simulations for B={B}", unit="simulation")):
            # The worker returns a 3-tuple; judging by the names it is unpacked into:
            # (biased variance estimate, bias correction, bagged estimate).
            _biased_var_estimate, _bias_correction, _theta_bagged = future.result()
            _theta_bagged_var_ijk = _biased_var_estimate - _bias_correction

            theta_bagged[i] = _theta_bagged
            theta_bagged_var_ijk[i] = _theta_bagged_var_ijk
            theta_bagged_var_ijk_biased[i] = _biased_var_estimate

    # Summarise across the n_sim simulations (mean and population std, ddof=0).
    mean_ijk_awb_u = theta_bagged_var_ijk.mean()
    std_ijk_awb_u = theta_bagged_var_ijk.std()
    mean_ijk_awb = theta_bagged_var_ijk_biased.mean()
    std_ijk_awb = theta_bagged_var_ijk_biased.std()

    # Collect the results (stored pre-formatted as strings for the report table)
    results.append({
        'Bootstrap Samples (B)': B,
        'True Variance': f"{true_variance:.3f}",
        'IJK-AWB-U Estimate ± Std': f"{mean_ijk_awb_u:.4f} ± {std_ijk_awb_u:.4f}",
        'IJK-AWB Estimate ± Std': f"{mean_ijk_awb:.4f} ± {std_ijk_awb:.4f}",
    })

# Persist the summary so the table can be rebuilt without re-running the simulations.
df_results = pd.DataFrame(results)
df_results.to_csv("results.csv", index=False)


Running simulations for B = 500


Simulations for B=500: 100%|██████████| 2000/2000 [00:09<00:00, 214.82simulation/s]


Running simulations for B = 1000


Simulations for B=1000: 100%|██████████| 2000/2000 [00:18<00:00, 108.39simulation/s]


Running simulations for B = 2000


Simulations for B=2000: 100%|██████████| 2000/2000 [00:39<00:00, 50.44simulation/s]


Running simulations for B = 4000


Simulations for B=4000: 100%|██████████| 2000/2000 [01:16<00:00, 26.00simulation/s]


Running simulations for B = 10000


Simulations for B=10000: 100%|██████████| 2000/2000 [03:08<00:00, 10.64simulation/s]


In [2]:
# Rebuild the LaTeX table from the cached results.csv, so the table can be
# regenerated without re-running the simulations.
import pandas as pd  # repeated here so this cell also runs on its own in a fresh kernel

# 'True Variance' was written to the CSV as a pre-formatted string (3 decimals).
# Read it back as str: letting pandas re-parse it as a float discards that
# formatting and the table then shows the default float rendering (0.002000).
df_results_ = pd.read_csv("results.csv", dtype={"True Variance": str})
latex_table = df_results_.to_latex(index=False)
print(latex_table)

\begin{tabular}{rrll}
\toprule
Bootstrap Samples (B) & True Variance & IJK-AWB-U Estimate ± Std & IJK-AWB Estimate ± Std \\
\midrule
500 & 0.002000 & 0.0020 ± 0.0003 & 0.0040 ± 0.0005 \\
1000 & 0.002000 & 0.0020 ± 0.0002 & 0.0030 ± 0.0003 \\
2000 & 0.002000 & 0.0020 ± 0.0002 & 0.0025 ± 0.0002 \\
4000 & 0.002000 & 0.0020 ± 0.0002 & 0.0022 ± 0.0002 \\
10000 & 0.002000 & 0.0020 ± 0.0001 & 0.0021 ± 0.0001 \\
\bottomrule
\end{tabular}

