In [None]:
import os
import numpy as np
import pandas as pd
# --- ground truth functions ---
def ground_truth(x: np.ndarray) -> np.ndarray:
    """
    Arbitrary synthetic objective (e.g. a stand-in for affinity).

    For every row of ``x`` the score is the sine of five times the row's
    coordinate sum, minus the row's squared distance from the point whose
    coordinates are all 0.5. Both reductions run over axis 1, so one value
    is returned per row.
    """
    # Squared distance of each row from the all-0.5 point.
    offsets_sq = (x - 0.5) ** 2
    penalty = np.sum(offsets_sq, axis=1)

    # Oscillating term driven by the sum of each row's coordinates.
    coord_total = np.sum(x, axis=1)
    return np.sin(5 * coord_total) - penalty

def ground_truth2(x: np.ndarray) -> np.ndarray:
    """
    Second arbitrary objective, meant to model an additional condition
    (e.g. developability).

    Each coordinate contributes ``sqrt(x) * sin(x)``; the contributions are
    summed over axis 1, giving one value per row.
    """
    roots = np.sqrt(x)
    waves = np.sin(x)
    per_coord = roots * waves
    return np.sum(per_coord, axis=1)

In [3]:
# --- data generation ---
def generate_data(embed_dim: int, n_samples: int, ground_truth_fn,
    noise_levels: dict[str, float], output_dir: str, seed=None):
    """
    Generate one synthetic dataset and write it to disk:
      1) Uniform X in [0,1]^embed_dim, shape=(n_samples, embed_dim)
      2) Apply ground_truth_fn to get y_true
      3) For each fidelity in noise_levels, add Gaussian noise
      4) Save a newline-delimited JSON with columns:
           'embedding' (list), plus y_{fidelity} for each fidelity.

    Parameters
    ----------
    embed_dim : number of coordinates per sample.
    n_samples : number of rows to draw.
    ground_truth_fn : callable taking the (n_samples, embed_dim) array and
        returning the noise-free targets.
    noise_levels : maps a fidelity label to the standard deviation of the
        zero-mean Gaussian noise added for that label.
    output_dir : directory for the output file; created if missing.
    seed : optional seed. When given, all draws come from a dedicated
        ``np.random.default_rng(seed)`` so the same arguments reproduce the
        same file. When ``None`` (default), draws use NumPy's global random
        state exactly as before, so an outer ``np.random.seed(...)`` call is
        still honoured.

    The file is named ``synth_D{embed_dim}_N{n_samples}_{fn name}.json`` and
    its path is printed; nothing is returned.
    """
    # ensure output dir
    os.makedirs(output_dir, exist_ok=True)

    # The np.random module and a Generator both expose uniform()/normal()
    # with the same positional arguments, so either can serve as the sampler.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    # 1) sample inputs
    X = sampler.uniform(0, 1, size=(n_samples, embed_dim))
    # 2) compute true values
    y_true = ground_truth_fn(X)

    # 3) build DataFrame: one Python list per row in a single column
    df = pd.DataFrame({'embedding': X.tolist()})
    # 4) add noisy outputs, one column per fidelity label
    for label, sigma in noise_levels.items():
        df[f'y_{label}'] = y_true + sampler.normal(0, sigma, size=y_true.shape)

    # 5) write to file
    # Callables such as functools.partial have no __name__; fall back to the
    # type name instead of raising AttributeError.
    fn_name = getattr(ground_truth_fn, '__name__', type(ground_truth_fn).__name__)
    fname = f'synth_D{embed_dim}_N{n_samples}_{fn_name}.json'
    out_path = os.path.join(output_dir, fname)
    df.to_json(out_path, orient='records', lines=True)
    print(f"Saved: {out_path}")

In [4]:
# Sweep configuration: one dataset is written per (embed_dim, n_samples) pair.
# NOTE(review): 128 is absent from EMBED_DIMS — confirm that is intentional.
EMBED_DIMS = {2, 4, 8, 16, 32, 64, 256, 512, 1024}
N_SAMPLES  = {100, 1_000, 10_000, 100_000, 240_000}
# Keys are fidelity labels (they become the y_<label> columns); values are the
# standard deviation of the Gaussian noise added for that label, so the 'low'
# fidelity carries the largest sigma.
NOISE_LEVELS = {
    'low':    1.0,
    'medium': 0.5,
    'high':   0.1 }
OUTPUT_DIR = '../data/raw/synth_data'
# Fixed seed so re-running this cell regenerates identical datasets.
SEED = 0

# generate_data draws from NumPy's global random state, so seed it once
# before the sweep; the sets are sorted to keep the draw order deterministic.
np.random.seed(SEED)
for d in sorted(EMBED_DIMS):
    for n in sorted(N_SAMPLES):
        generate_data(d, n, ground_truth, NOISE_LEVELS, OUTPUT_DIR)

Saved: ../data/raw/synth_data/synth_D2_N100_ground_truth.json
Saved: ../data/raw/synth_data/synth_D2_N1000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D2_N10000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D2_N100000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D2_N240000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D4_N100_ground_truth.json
Saved: ../data/raw/synth_data/synth_D4_N1000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D4_N10000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D4_N100000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D4_N240000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D8_N100_ground_truth.json
Saved: ../data/raw/synth_data/synth_D8_N1000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D8_N10000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D8_N100000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D8_N240000_ground_truth.json
Saved: ../data/raw/synth_data/synth_D16_N10