In [1]:
import numpy as np
import pandas as pd

def correlated_data_generator(domain, n, correlations=None, total_attributes=None, seed=None):
    """
    Generate a discrete dataset with optional pairwise 'same-value' correlations.
    - domain: iterable of categorical values (e.g., [0,1,2]).
    - correlations: list of tuples [(attr1, attr2, p_same), ...]
      For each pair, ensures X_attr2 == X_attr1 with prob p_same; otherwise X_attr2 is a
      random different value from the domain.
    - total_attributes: if provided, ensures columns X1..X{total_attributes} exist,
      filling any missing ones with i.i.d. draws from domain.
    """
    if seed is not None:
        np.random.seed(seed)

    domain = list(domain)
    # Precompute "all values except v"
    domain_excl = {v: [d for d in domain if d != v] for v in domain}

    df = pd.DataFrame()

    # Honor correlations, reusing existing attr1 columns if already built
    if correlations:
        for attr1, attr2, p_same in correlations:
            # Ensure attr1 exists; if not, create it i.i.d. from domain
            if attr1 not in df.columns:
                X1 = np.random.choice(domain, size=n)
                df[attr1] = X1
            else:
                X1 = df[attr1].to_numpy()

            # Build attr2 conditioned on attr1
            mask_same = np.random.rand(n) < p_same
            X2 = np.empty(n, dtype=object)
            # Same as X1 where mask says so
            X2[mask_same] = X1[mask_same]
            # Different random value where mask says different
            if (~mask_same).any():
                X1_diff = X1[~mask_same]
                X2[~mask_same] = [np.random.choice(domain_excl[v]) for v in X1_diff]
            df[attr2] = X2

    # Ensure requested attributes exist
    if total_attributes is not None:
        all_attrs = [f'X{i+1}' for i in range(total_attributes)]
    else:
        all_attrs = list(df.columns)

    for attr in all_attrs:
        if attr not in df.columns:
            df[attr] = np.random.choice(domain, size=n)

    # Reorder columns to X1..Xd if total_attributes was given
    if total_attributes is not None:
        df = df[[f'X{i+1}' for i in range(total_attributes)]]

    return df


def get_true_frequencies(df, columns=None):
    """
    Return normalized frequency dict per column:
      {col: {value: prob, ...}, ...}
    """
    columns = columns or list(df.columns)
    out = {}
    for col in columns:
        counts = df[col].value_counts(normalize=True).sort_index()
        out[col] = counts.to_dict()
    return out


if __name__ == "__main__":
    # Example usage / quick sanity check
    domain = [0, 1, 2, 3, 4]
    correlations = [('X1', 'X2', 0.57)]
    df = correlated_data_generator(
        domain=domain,
        n=2000,
        correlations=correlations,
        total_attributes=2,
        seed=42,
    )
    freqs = get_true_frequencies(df, ['X1', 'X2'])
    print("Frequencies:", freqs)

    # Empirical P[X1 == X2] should be ~ 0.57
    p_same_emp = (df['X1'] == df['X2']).mean()
    print("Empirical P[X1 == X2]:", round(p_same_emp, 4))


Frequencies: {'X1': {0: 0.2075, 1: 0.2005, 2: 0.1895, 3: 0.198, 4: 0.2045}, 'X2': {np.int64(0): 0.192, 1: 0.1985, 2: 0.197, np.int64(3): 0.216, np.int64(4): 0.1965}}
Empirical P[X1 == X2]: 0.5635
