In [None]:
import copy
import datetime
import hashlib
import itertools
import importlib
import json
import scipy.stats as st
import sklarpy.multivariate
import os
import pathlib
import shutil
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
import warnings

# Suppress warnings from sklarpy
warnings.filterwarnings('ignore', module='sklarpy.multivariate', message='divide by zero encountered in matmul')
warnings.filterwarnings('ignore', module='sklarpy.multivariate', message='overflow encountered in matmul')
warnings.filterwarnings('ignore', module='sklarpy.multivariate', message='invalid value encountered in matmul')

## Synthetic Data Generation

In [None]:
def get_random_state(seed):
    """Derive the per-stage seeds from a single master seed.

    Seeds NumPy's global generator with ``seed`` and then draws one integer
    from ``[0, 1000000)`` per stage, always in the same order.

    Note:
        Reseeding ``np.random`` is a side effect visible to the caller.

    Returns:
        dict mapping ``'HECKMAN_SEED'``, ``'UNIFORM_SEED'``,
        ``'SELECTION_SEED'`` and ``'HECKMAN_COEFS_SEED'`` to the drawn integers.
    """
    np.random.seed(seed)

    # A dict comprehension evaluates its values in iteration order, so the
    # draws happen in exactly the order of this tuple.
    stage_keys = ('HECKMAN_SEED', 'UNIFORM_SEED', 'SELECTION_SEED', 'HECKMAN_COEFS_SEED')
    return {stage: np.random.randint(0, 1000000) for stage in stage_keys}

In [None]:
# Master seeds (written as date-like literals); the per-stage seeds in
# RANDOM_STATE are derived from RANDOM_SEED.
RANDOM_SEED = 2025_09_24
RANDOM_ALT_SEED = 2024_09_24

RANDOM_STATE = get_random_state(seed=RANDOM_SEED)
RANDOM_STATE

In [None]:
# Read every listed column as a pandas categorical
census_dtype = dict.fromkeys(
    ['age_group', 'education', 'gender', 'nationality', 'electoral_district', 'unit'],
    'category',
)
census_df = pd.read_csv('../data/census.csv', dtype=census_dtype)
census_df.info()

In [None]:
# Expand the aggregated census: every row is repeated N times and the count
# column is dropped afterwards (presumably yielding one row per person).
population_df = (
    census_df
    .loc[census_df.index.repeat(census_df['N'])]
    .drop(columns=['N'])
    .reset_index(drop=True)
)
demography_cols = sorted(population_df.columns)
print(population_df.shape)
population_df.head()

In [None]:
def df_margins(df, columns=None, outcome=None):
    """Compute per-group shares of ``df`` relative to its total row count.

    Args:
        df: Input DataFrame.
        columns: Column label(s) to group by. ``None`` groups by every column
            of ``df`` (leaving out ``outcome`` when one is given).
        outcome: Optional column that is summed within each group. When it is
            omitted the group sizes are used instead.

    Returns:
        Series indexed by the grouping columns that holds
        ``group size / len(df)`` or ``sum(outcome) / len(df)``. Unobserved
        combinations of categorical keys are kept (``observed=False``).
    """
    if columns is None:
        # Build a real list of keys: passing the ``df.columns`` Index itself
        # makes groupby interpret it as one array of group values.
        columns = [col for col in df.columns if col != outcome]
    elif isinstance(columns, pd.Index):
        columns = columns.tolist()

    groups = df.groupby(columns, observed=False)
    if outcome is None:
        return groups.size() / len(df)
    return groups[outcome].sum() / len(df)
    

In [None]:
def get_categories(df, column):
    """Return the category labels that belong to ``column``.

    ``column`` is either a plain column name or an interaction written as
    ``'col1:col2'``.

    * Plain categorical column: its categories.
    * Interaction of two categorical columns: every ``'cat1:cat2'`` pair, in
      product order, as a list of strings.
    * Interaction with exactly one categorical side: that side's categories.
    * Anything else (missing or non-categorical columns): an empty list.
    """
    def _categories_or_none(name):
        # None unless ``name`` is a categorical column of ``df``
        if name in df.columns and df[name].dtype == 'category':
            return df[name].cat.categories.values
        return None

    if ':' not in column:
        own_cats = _categories_or_none(column)
        return [] if own_cats is None else own_cats

    left, right = column.split(':')
    left_cats = _categories_or_none(left)
    right_cats = _categories_or_none(right)

    if left_cats is not None and right_cats is not None:
        return [f'{a}:{b}' for a, b in itertools.product(left_cats, right_cats)]
    if left_cats is not None:
        return left_cats
    if right_cats is not None:
        return right_cats
    return []

def set_if_new_and_return(config, path, default):
    """Return the value stored at ``path`` in a nested dict, creating it if absent.

    ``path`` is a ``'/'``-separated list of keys. Missing intermediate levels
    are created as empty dicts. If the final key does not exist yet it is set
    to ``default``; an existing value is left untouched and returned.
    """
    *parent_keys, leaf_key = path.split('/')

    node = config
    for key in parent_keys:
        node = node.setdefault(key, {})

    return node.setdefault(leaf_key, default)

def generate_heckman_coefs(df, selection_columns, outcome_columns, default_sigma, rho, heckman_coefs, multilevel=True, seed=RANDOM_STATE['HECKMAN_COEFS_SEED']):
    """Fill in randomly generated selection/outcome coefficients.

    Starts from a deep copy of ``heckman_coefs`` and only adds entries that are
    missing, so anything supplied by the caller is preserved.

    Args:
        df: DataFrame used to look up the categories of each column.
        selection_columns: Columns (or ``'col1:col2'`` interactions) that get
            selection coefficients.
        outcome_columns: Columns (or interactions) that get outcome coefficients.
        default_sigma: Dict with the keys ``'selection'`` and ``'outcome'``;
            ``'selection_interaction'`` and ``'outcome_interaction'`` are read
            for interaction columns.
        rho: Correlation between the selection and outcome coefficient of the
            same category.
        heckman_coefs: Partially specified coefficient dict (not mutated).
        multilevel: If true, the per-column standard deviations (``tau``) are
            drawn from a half-normal distribution; otherwise the sigmas are
            used directly.
        seed: Seed applied to NumPy's global RNG before any draw.

    Returns:
        The completed coefficient dict.

    Raises:
        AssertionError: If a column's coefficient dict does not end up with
            exactly one entry per category.
    """
    # All draws below use NumPy's global RNG, so their order determines the result
    np.random.seed(seed)

    coefs = copy.deepcopy(heckman_coefs)
    # Sorted union gives a deterministic column order for the random draws
    columns = sorted(list(set(selection_columns) | set(outcome_columns)))

    set_if_new_and_return(coefs, 'selection/intercept', 0)
    set_if_new_and_return(coefs, 'outcome/intercept', 0)

    for column in columns:
        categories = get_categories(df, column)

        s_sigma = default_sigma['selection_interaction'] if ':' in column else default_sigma['selection']
        o_sigma = default_sigma['outcome_interaction'] if ':' in column else default_sigma['outcome']

        # Generate values for tau, even if we don't use them for consistency
        s_tau_val = st.halfnorm.rvs(scale=s_sigma)
        o_tau_val = st.halfnorm.rvs(scale=o_sigma)

        if multilevel:
            # A tau already present in the input takes precedence over the drawn value
            s_tau = set_if_new_and_return(coefs, f'selection/tau/{column}', s_tau_val)
            o_tau = set_if_new_and_return(coefs, f'outcome/tau/{column}', o_tau_val)
            # Covariance = diag(tau) @ correlation matrix @ diag(tau)
            cov_matrix = np.diag([s_tau, o_tau]) @ np.array([[1, rho], [rho, 1]]) @ np.diag([s_tau, o_tau])
        else:
            cov_matrix = np.diag([s_sigma, o_sigma]) @ np.array([[1, rho], [rho, 1]]) @ np.diag([s_sigma, o_sigma])

        if len(categories) > 0:
            # One (selection, outcome) pair per category, centred so that each
            # column of generated coefficients sums to zero
            gen_coefs = np.random.multivariate_normal(np.zeros(2), cov_matrix, size=len(categories))
            gen_coefs = gen_coefs - gen_coefs.mean(axis=0)
            if column in selection_columns:
                col_coefs = set_if_new_and_return(coefs, f'selection/beta/{column}', {})

                for i, cat in enumerate(categories):
                    coef = gen_coefs[i, 0].item()
                    # Keep coefficients that were supplied explicitly
                    if cat not in col_coefs:
                        col_coefs[cat] = coef

                # Fails if the input contained a key that is not a category
                assert len(col_coefs) == len(categories)

                coefs['selection']['beta'][column] = col_coefs
            if column in outcome_columns:
                col_coefs = set_if_new_and_return(coefs, f'outcome/beta/{column}', {})

                for i, cat in enumerate(categories):
                    coef = gen_coefs[i, 1].item()
                    # Keep coefficients that were supplied explicitly
                    if cat not in col_coefs:
                        col_coefs[cat] = coef

                # Fails if the input contained a key that is not a category
                assert len(col_coefs) == len(categories)

                coefs['outcome']['beta'][column] = col_coefs
        else:
            # No categories found: a single scalar coefficient per process
            gen_coefs = np.random.multivariate_normal(np.zeros(2), cov_matrix, size=1)
            if column in selection_columns:
                set_if_new_and_return(coefs, f'selection/beta/{column}', gen_coefs[0, 0].item())
            if column in outcome_columns:
                set_if_new_and_return(coefs, f'outcome/beta/{column}', gen_coefs[0, 1].item())

    return coefs

def validate_column(df, column, ignore_columns=[]):
    """Assert that ``column`` exists in ``df`` and has the category dtype.

    Columns listed in ``ignore_columns`` are skipped without any check.

    Raises:
        AssertionError: If the column is missing or is not categorical.
    """
    if column not in ignore_columns:
        assert column in df.columns, f"{column} not in {df.columns}"
        assert df[column].dtype == 'category', f"{column} is not of type category: {df[column].dtype}"

def validate_coefs(df, column, coefs, ignore_columns=[]):
    """Assert that every coefficient key of ``column`` is one of its categories.

    Nothing is checked when ``column`` is listed in ``ignore_columns`` or has
    no entry in ``coefs``.

    Raises:
        AssertionError: If a key of ``coefs[column]`` is not a category of
            ``df[column]``.
    """
    if column in ignore_columns or column not in coefs:
        return

    for cat in coefs[column]:
        assert cat in df[column].cat.categories, f"{cat} not in {df[column].cat.categories}"

def get_cov_matrix(rho, sigma):
    """Build the 2x2 covariance matrix of a pair with correlation ``rho``.

    The first component has unit variance and the second has standard
    deviation ``sigma``, which gives ``[[1, rho*sigma], [rho*sigma, sigma**2]]``.
    """
    covariance = rho * sigma
    return np.array([
        [1.0, covariance],
        [covariance, sigma**2],
    ])

def heckman_model(df, coefs={}, seed=RANDOM_STATE['HECKMAN_SEED']):
    """Simulate binary selection and outcome columns from a two-equation latent model.

    For both processes a linear predictor (``*_mean``) is built from an
    intercept plus per-category coefficients, a correlated error is added to
    obtain the latent value, and the binary result is ``latent > 0``. Rows
    flagged as noise receive a Bernoulli(``random_noise_prop``) draw instead.

    Args:
        df: Input DataFrame; it is copied, not modified.
        coefs: Dict read with the following optional keys:
            ``'selection'`` / ``'outcome'`` (each with ``'intercept'`` and
            ``'beta'``), ``'selection_noise_prop'``, ``'outcome_noise_prop'``,
            ``'random_noise_prop'`` and ``'error'`` (``'distribution'``,
            ``'mean'``, ``'rho'``, ``'sigma'`` and, for ``'skewed_t'``,
            ``'dof'`` and ``'skew'``).
        seed: Seed applied to NumPy's global RNG before any draw.

    Returns:
        Copy of ``df`` with the added columns ``selection_mean``,
        ``outcome_mean``, ``selection_error``, ``outcome_error``,
        ``use_selection_noise``, ``use_outcome_noise``, ``selection_latent``,
        ``selection``, ``outcome_latent`` and ``outcome``.

    Raises:
        ValueError: If the error distribution is neither ``'normal'`` nor
            ``'skewed_t'``.
        AssertionError: If a referenced column is missing or not categorical,
            or a coefficient key is not one of the column's categories.
    """
    np.random.seed(seed)
    df = df.copy()

    selection_process = coefs.get('selection', {})
    outcome_process = coefs.get('outcome', {})

    selection_beta = selection_process.get('beta', {})
    outcome_beta = outcome_process.get('beta', {})

    selection_noise_prop = coefs.get('selection_noise_prop', 0.0)
    outcome_noise_prop = coefs.get('outcome_noise_prop', 0.0)
    random_noise_prop = coefs.get('random_noise_prop', 0.5)

    error_params = coefs.get('error', {})

    # NOTE(review): these are sets, so the order in which terms are added to the
    # linear predictors below follows set iteration order — confirm that this
    # cannot affect bit-for-bit reproducibility of the float sums.
    selection_columns = set(selection_beta.keys())
    outcome_columns = set(outcome_beta.keys())

    # Validate the selection predictors (interactions are checked per component)
    for column in selection_columns:
        if ':' in column:
            col1, col2 = column.split(':')
            validate_column(df, col1)
            validate_column(df, col2)
            validate_coefs(df, col1, selection_beta)
            validate_coefs(df, col2, selection_beta)
        else:
            validate_column(df, column)
            validate_coefs(df, column, selection_beta)

    # Columns produced by the selection step; outcome predictors may refer to
    # them, and they are exempt from the categorical validation.
    special_selection_columns = ['selection_mean', 'selection_error', 'selection_latent', 'selection']

    for column in outcome_columns:
        if ':' in column:
            col1, col2 = column.split(':')
            validate_column(df, col1, ignore_columns=special_selection_columns)
            validate_column(df, col2, ignore_columns=special_selection_columns)
            validate_coefs(df, col1, outcome_beta, ignore_columns=special_selection_columns)
            validate_coefs(df, col2, outcome_beta, ignore_columns=special_selection_columns)
        else:
            validate_column(df, column, ignore_columns=special_selection_columns)
            validate_coefs(df, column, outcome_beta, ignore_columns=special_selection_columns)

    # Linear predictors start at the intercepts
    df['selection_mean'] = np.full(len(df), selection_process.get('intercept', 0)).astype(float)
    df['outcome_mean'] = np.full(len(df), outcome_process.get('intercept', 0)).astype(float)

    # Draw one (selection, outcome) error pair per row
    match error_dist := error_params.get('distribution', 'normal'):
        case 'normal':
            error_mean = error_params.get('mean', [0., 0.])
            error_rho = error_params.get('rho', 0.5)
            error_sigma = error_params.get('sigma', 1.0)
            errors = np.random.multivariate_normal(error_mean, get_cov_matrix(rho=error_rho, sigma=error_sigma), size=len(df))
        case 'skewed_t':
            error_mean = error_params.get('mean', [0., 0.])
            error_rho = error_params.get('rho', 0.5)
            error_sigma = error_params.get('sigma', 1.0)
            error_dof = error_params.get('dof', 5.0)
            error_skew = error_params.get('skew', [0.0, 0.0])
            # https://sklarpy.readthedocs.io/en/latest/Multivariate.html#multivariate-example
            skewed_t_params = (error_dof, np.array(error_mean), get_cov_matrix(rho=error_rho, sigma=error_sigma), np.array(error_skew))
            errors = sklarpy.multivariate.mvt_skewed_t.rvs(size=len(df), params=skewed_t_params)
        case _:
            raise ValueError(f'Invalid error distribution: {error_dist}')

    df['selection_error'] = errors[:, 0]
    df['outcome_error'] = errors[:, 1]

    # Per-row flags: 1 means the binary value is replaced by a random draw
    df['use_selection_noise'] = np.random.binomial(1, selection_noise_prop, size=len(df))
    df['use_outcome_noise'] = np.random.binomial(1, outcome_noise_prop, size=len(df))

    # Add the selection coefficients; categories without a coefficient contribute 0
    for column in selection_columns:
        if ':' in column:
            col1, col2 = column.split(':')
            # Interaction coefficients are keyed by 'cat1:cat2'
            df['selection_mean'] += df[[col1, col2]].apply(lambda row: selection_beta.get(column, {}).get(f'{row[col1]}:{row[col2]}', 0), axis=1).values.astype(float)
        else:
            df['selection_mean'] += df[column].apply(lambda cat: selection_beta.get(column, {}).get(cat, 0)).values.astype(float)

    df['selection_latent'] = df['selection_mean'] + df['selection_error']
    df['selection'] = np.where(
        df['use_selection_noise'],
        np.random.binomial(1, random_noise_prop, size=len(df)).astype(int),
        (df['selection_latent'] > 0).astype(int)
    )

    # Add the outcome coefficients. A special selection column acts as a
    # numeric multiplier: alone or paired with another special column the
    # coefficient is a scalar, paired with a categorical column it is looked
    # up per category of that column.
    for column in outcome_columns:
        if ':' in column:
            col1, col2 = column.split(':')
            if col1 in special_selection_columns and col2 in special_selection_columns:
                df['outcome_mean'] += df[col1] * df[col2] * outcome_beta.get(column, 0)
            elif col1 in special_selection_columns:
                df['outcome_mean'] += df[col1] * df[col2].apply(lambda cat: outcome_beta.get(column, {}).get(cat, 0)).values.astype(float)
            elif col2 in special_selection_columns:
                df['outcome_mean'] += df[col2] * df[col1].apply(lambda cat: outcome_beta.get(column, {}).get(cat, 0)).values.astype(float)
            else:
                df['outcome_mean'] += df[[col1, col2]].apply(lambda row: outcome_beta.get(column, {}).get(f'{row[col1]}:{row[col2]}', 0), axis=1).values.astype(float)
        else:
            if column in special_selection_columns:
                df['outcome_mean'] += df[column] * outcome_beta.get(column, 0)
            else:
                df['outcome_mean'] += df[column].apply(lambda cat: outcome_beta.get(column, {}).get(cat, 0)).values.astype(float)

    df['outcome_latent'] = df['outcome_mean'] + df['outcome_error']
    df['outcome'] = np.where(
        df['use_outcome_noise'],
        np.random.binomial(1, random_noise_prop, size=len(df)).astype(int),
        (df['outcome_latent'] > 0).astype(int)
    )

    return df

def col_margins(df, columns, outcome):
    """Share of each ``outcome`` value within every group of ``columns``.

    Returns:
        DataFrame with the grouping columns, the ``outcome`` column and a
        ``'proportion'`` column (count of the value divided by the group size).
    """
    grouped = df.groupby(columns, observed=False)
    value_counts = grouped[outcome].value_counts()
    group_sizes = grouped.size()

    shares = (value_counts / group_sizes).rename('proportion')
    return shares.reset_index()

def aggregation_bias(df, agg_cols, coefs={}):
    """Shift the outcome by a bias that depends on the composition of each group.

    For every column in ``coefs`` the share of the row's own category within
    its ``agg_cols`` group is multiplied by that category's coefficient and
    accumulated in ``agg_bias``. The outcome latent value and the binary
    outcome are then recomputed; rows flagged with ``use_outcome_noise`` keep
    their previous outcome.

    Args:
        df: Frame with ``outcome_mean``, ``outcome_error``, ``outcome`` and
            ``use_outcome_noise`` columns; it is copied, not modified.
        agg_cols: List of columns that define the aggregation groups.
        coefs: Mapping ``column -> {category: coefficient}``.

    Returns:
        New DataFrame with ``agg_bias`` and one ``*_margin`` column per entry
        of ``coefs``.
    """
    biased_df = df.copy()
    biased_df['agg_bias'] = 0

    margin_prefix = '_'.join(sorted(agg_cols))
    for coef_col in coefs:
        margin_col = f'{margin_prefix}_{coef_col}_margin'
        category_coefs = coefs.get(coef_col, {})

        # Share of each category of coef_col inside every aggregation group
        margin = col_margins(biased_df, agg_cols, coef_col).rename(columns={'proportion': margin_col})
        biased_df = pd.merge(biased_df, margin, on=agg_cols + [coef_col], how='left')

        row_coefs = biased_df[coef_col].apply(lambda cat: category_coefs.get(cat, 0)).values.astype(float)
        biased_df['agg_bias'] += biased_df[margin_col] * row_coefs

    biased_df['outcome_latent'] = biased_df['outcome_mean'] + biased_df['outcome_error'] + biased_df['agg_bias']
    recomputed_outcome = (biased_df['outcome_latent'] > 0).astype(int)
    biased_df['outcome'] = np.where(biased_df['use_outcome_noise'], biased_df['outcome'], recomputed_outcome)

    return biased_df

In [None]:
def selection_sample(df, sample_size, bias={}, seed=RANDOM_STATE['SELECTION_SEED']):
    """Draw a sample weighted by the ``selection`` column and add a reporting bias.

    Args:
        df: Frame with ``selection``, ``outcome_latent``, ``outcome_mean``,
            ``outcome`` and ``use_outcome_noise`` columns.
        sample_size: Number of rows to draw.
        bias: ``{'type': 'constant', 'offset': x}`` adds ``x`` to the latent
            outcome; ``{'type': 'dynamic', 'coef': c}`` adds ``c`` times the
            non-negative part of ``outcome_mean``. The type defaults to
            ``'constant'`` and both amounts default to ``0.0``.
        seed: Seed applied to NumPy's global RNG before sampling.

    Returns:
        The sampled rows with ``outcome_latent`` shifted and ``outcome``
        recomputed (rows flagged with ``use_outcome_noise`` keep their outcome).

    Raises:
        ValueError: For an unknown bias type.
    """
    np.random.seed(seed)
    sample_df = df.sample(sample_size, weights='selection')

    bias_type = bias.get('type', 'constant')
    if bias_type == 'constant':
        latent_shift = bias.get('offset', 0.0)
    elif bias_type == 'dynamic':
        non_negative_mean = np.where(sample_df['outcome_mean'] >= 0, sample_df['outcome_mean'], 0)
        latent_shift = non_negative_mean * bias.get('coef', 0.0)
    else:
        raise ValueError(f'Invalid bias type: {bias_type}')

    sample_df['outcome_latent'] = sample_df['outcome_latent'] + latent_shift

    recomputed_outcome = (sample_df['outcome_latent'] > 0).astype(int)
    sample_df['outcome'] = np.where(sample_df['use_outcome_noise'], sample_df['outcome'], recomputed_outcome)
    return sample_df

def generate_data(out_prefix, config, population_df, margin_cols, template_prefix='../data', template_files=[]):
    """Generate one synthetic data set and write all of its files to ``out_prefix``.

    Files written: ``data_config.json``, ``population.parquet``,
    ``estonia_selection.csv``, one ``estonia_*margins.csv`` per entry of
    ``margin_cols``, ``heckman_coefs.json`` and a copy of every template file.

    Args:
        out_prefix: Output directory (created if necessary).
        config: Configuration dict with the sections ``'heckman_coef_kwargs'``,
            ``'selection_kwargs'``, ``'heckman_kwargs'`` and optionally
            ``'aggregation_bias_kwargs'`` and ``'seed'``. It is updated in
            place: the master seed and the derived per-stage seeds are filled
            in wherever they are not set, and the result is what gets written
            to ``data_config.json``.
        population_df: Population frame the model is simulated on.
        margin_cols: List of column lists; a margin table is written for each.
        template_prefix: Directory that contains the template files.
        template_files: File names copied from ``template_prefix``.

    Returns:
        Tuple ``(heckman_df, selection_sample_df)``.
    """
    out_path = pathlib.Path(out_prefix)
    out_path.mkdir(parents=True, exist_ok=True)

    config['seed'] = config.get('seed', RANDOM_SEED)
    rng_state = get_random_state(config['seed'])

    # setdefault keeps explicitly configured seeds and also tolerates a section
    # that was left out of the config (the sections are optional further down).
    for section, state_key in (
        ('heckman_coef_kwargs', 'HECKMAN_COEFS_SEED'),
        ('selection_kwargs', 'SELECTION_SEED'),
        ('heckman_kwargs', 'HECKMAN_SEED'),
    ):
        config.setdefault(section, {}).setdefault('seed', rng_state[state_key])

    with open(out_path / 'data_config.json', 'w', encoding='utf-8') as f:
        json.dump(config, f, ensure_ascii=False, indent=2)

    heckman_coef_kwargs = config['heckman_coef_kwargs']
    heckman_kwargs = config['heckman_kwargs']
    selection_kwargs = config['selection_kwargs']
    aggregation_bias_kwargs = config.get('aggregation_bias_kwargs', None)

    for filename in template_files:
        shutil.copyfile(pathlib.Path(template_prefix) / filename, out_path / filename)

    heckman_coefs = generate_heckman_coefs(population_df, **heckman_coef_kwargs)
    heckman_df = heckman_model(population_df, coefs=heckman_coefs, **heckman_kwargs)
    if aggregation_bias_kwargs is not None:
        heckman_df = aggregation_bias(heckman_df, **aggregation_bias_kwargs)
    heckman_df['voting_intent'] = heckman_df['outcome'].map({1: 'Yes', 0: 'No'})

    # Keep population compact: parquet + no latent columns
    keep_columns = population_df.columns.tolist() + [
        'voting_intent', 'selection', 'outcome',
    ]
    drop_columns = [col for col in heckman_df.columns if col not in keep_columns]

    pop_out_df = heckman_df.drop(columns=drop_columns)
    latent_cols = [c for c in pop_out_df.columns if 'latent' in c]
    pop_out_df = pop_out_df.drop(columns=latent_cols, errors='ignore')

    # Ensure categoricals are stored as categoricals in parquet (round-trips on read_parquet)
    for col in population_df.columns.tolist() + ['voting_intent']:
        if col in pop_out_df.columns and str(pop_out_df[col].dtype) != 'category':
            pop_out_df[col] = pop_out_df[col].astype('category')

    pop_out_df.to_parquet(out_path / 'population.parquet', index=False)

    selection_sample_df = selection_sample(heckman_df, **selection_kwargs)
    # The sample's outcome was recomputed, so the label has to be refreshed too
    selection_sample_df['voting_intent'] = selection_sample_df['outcome'].map({1: 'Yes', 0: 'No'})
    selection_sample_df.drop(columns=drop_columns).to_csv(out_path / 'estonia_selection.csv', index=False, float_format='%.4f')

    for cols in margin_cols:
        heckman_margin_df = df_margins(heckman_df, columns=cols + ['voting_intent']).fillna(0).reset_index(name='proportion')
        # Convert the shares back to head counts
        heckman_margin_df['N'] = (heckman_margin_df['proportion'] * len(heckman_df)).round(0).astype(int)
        margin_file_name = f'estonia_{"_".join(sorted(cols))}_margins.csv' if len(cols) > 0 else 'estonia_margins.csv'
        heckman_margin_df.drop(columns=['proportion']).to_csv(out_path / margin_file_name, index=False, float_format='%.4f')

    with open(out_path / 'heckman_coefs.json', 'w', encoding='utf-8') as f:
        json.dump(heckman_coefs, f, ensure_ascii=False, indent=2)

    return heckman_df, selection_sample_df

def translate_config_paths(config):
    """Expand a flat ``{'a/b/c': value}`` mapping into nested dicts.

    Every key is split on ``'/'``; all but the last part become nested
    dictionaries and the last part receives the value.
    """
    nested = {}

    for path_str, value in config.items():
        *parent_keys, leaf_key = path_str.split('/')

        node = nested
        for key in parent_keys:
            node = node.setdefault(key, {})

        node[leaf_key] = value

    return nested

def apply_config(base_config, new_config):
    """Recursively merge ``new_config`` into ``base_config``.

    When both arguments are dicts, ``base_config`` is updated in place: keys
    present in both are merged recursively and keys that exist only in
    ``new_config`` are added. In every other case ``new_config`` simply
    replaces the base value.

    Returns:
        The merged value (``base_config`` itself for two dicts).
    """
    if not (isinstance(base_config, dict) and isinstance(new_config, dict)):
        return new_config

    for key, override in new_config.items():
        if key in base_config:
            base_config[key] = apply_config(base_config[key], override)
        else:
            base_config[key] = override

    return base_config

def generate_data_configs(configs, default_config):
    """Lazily yield ``(description, config)`` pairs built on ``default_config``.

    * ``configs`` as a dict maps ``'/'``-separated config paths to lists of
      values. One config is produced per element of the Cartesian product and
      the description is the ``{path: value}`` combination itself.
    * ``configs`` as a list holds nested override dicts; the description is
      ``''``.

    Every yielded config is a deep copy of ``default_config`` with the
    overrides merged in.

    Raises:
        ValueError: On first iteration, if ``configs`` is neither a dict nor a list.
    """
    if isinstance(configs, dict):
        for combination in itertools.product(*configs.values()):
            overrides = dict(zip(configs, combination))
            merged = apply_config(copy.deepcopy(default_config), translate_config_paths(overrides))
            yield overrides, merged
    elif isinstance(configs, list):
        for overrides in configs:
            yield '', apply_config(copy.deepcopy(default_config), overrides)
    else:
        raise ValueError(f'Invalid configs type: {type(configs)}')

def get_n_seeds(initial_seed, n_seeds):
    """Return ``n_seeds`` consecutive seeds starting at ``initial_seed``."""
    seeds = []
    for offset in range(n_seeds):
        seeds.append(initial_seed + offset)
    return seeds

def create_symlink(target_path, link_path):
    """Create the symlink ``link_path`` -> ``target_path``, replacing an existing link.

    ``link_path`` may be a string or a ``pathlib.Path``. Only an existing
    symlink is removed beforehand; any other kind of entry at ``link_path``
    is left alone.
    """
    link = pathlib.Path(link_path) if isinstance(link_path, str) else link_path

    if link.is_symlink():
        link.unlink()
    link.symlink_to(target_path)

# Margin tables: the overall margin (no columns), every single column and every pair
margin_cols = [
    list(combo)
    for size in (0, 1, 2)
    for combo in itertools.combinations(demography_cols, size)
]
input_cols = list(filter(lambda col: col != 'electoral_district', demography_cols))

heckman_selection_columns = input_cols.copy()
heckman_outcome_columns = input_cols.copy()
heckman_coef_columns = sorted(set(heckman_selection_columns).union(heckman_outcome_columns))

# Pairwise interactions are written as 'col1:col2'
heckman_selection_interactions = ['{}:{}'.format(*pair) for pair in itertools.combinations(heckman_selection_columns, 2)]
heckman_outcome_interactions = ['{}:{}'.format(*pair) for pair in itertools.combinations(heckman_outcome_columns, 2)]

# Keyword arguments passed to generate_heckman_coefs in the default scenario
heckman_coef_kwargs = {
    'selection_columns': heckman_selection_columns,
    'outcome_columns': heckman_outcome_columns,
    'default_sigma': {
        # Standard deviation of the half-normal distribution where the group-level standard deviation is drawn from
        # Average is sigma * sqrt(2) / sqrt(pi) or approximately 0.8 * sigma
        'selection': 0.5,
        'outcome': 0.5,
    },
    # Correlation between the generated selection and outcome coefficients
    'rho': 0.0,
    # Fixed part of the coefficient dict; missing betas/taus are generated
    'heckman_coefs': {
        'selection': {'intercept': -1.0},
        'outcome': {'intercept': 0.0},
        # Parameters of the (selection, outcome) error distribution read by heckman_model
        'error': {
            'distribution': 'normal',
            'mean': [0., 0.],
            'rho': 0.5,
            'sigma': 1.0,
        },
    },
}

# Baseline configuration that every scenario below overrides
default_config = {
    'heckman_coef_kwargs': heckman_coef_kwargs,
    'selection_kwargs': {'sample_size': 1000},
    'heckman_kwargs': {},
    'seed': RANDOM_SEED,
}

# Each scenario is repeated for n_seeds consecutive master seeds (listed in descending order)
n_seeds = 5
seeds_config = {'seed': get_n_seeds(RANDOM_SEED, n_seeds)[::-1]}

# Scenario name -> lazy generator of (description, config) pairs.
# The keys of each grid are '/'-separated paths into default_config; every
# combination of the listed values (times the seeds) yields one configuration.
# The generators are single-use: this dict must be rebuilt before iterating again.
generated_data_configs = {
    # Baseline, repeated over the seeds
    'est-default': generate_data_configs({} | seeds_config, default_config),
    # Different master seed, but the coefficient seed derived from RANDOM_SEED
    'est-default-alt': generate_data_configs(
        {
            'seed': [RANDOM_ALT_SEED],
            'heckman_coef_kwargs/seed': [get_random_state(RANDOM_SEED)['HECKMAN_COEFS_SEED']],
        }, default_config),
    # All demographic columns (including electoral_district) as predictors
    'est-electoral-district': generate_data_configs(
        {
            'heckman_coef_kwargs/selection_columns': [demography_cols],
            'heckman_coef_kwargs/outcome_columns': [demography_cols],
        } | seeds_config, default_config),
    # Aggregation bias of increasing strength for nationality 'Other' within each unit
    'est-agg-bias': generate_data_configs(
        {
            'aggregation_bias_kwargs/agg_cols': [['unit']],
            'aggregation_bias_kwargs/coefs': [{'nationality': {'Estonian': 0.0, 'Other': bias}} for bias in [0.0, 0.5, 1.0, 2.0]],
        } | seeds_config, default_config),
    # No selection predictors.
    # NOTE(review): 'heckman_coefs/rho' is not a key that any code here reads;
    # heckman_model takes the error correlation from 'heckman_coefs/error/rho'
    # (still 0.5 in this scenario). Confirm which setting was intended.
    'est-no-selection': generate_data_configs(
        {
            'heckman_coef_kwargs/selection_columns': [[]],
            'heckman_coef_kwargs/heckman_coefs/rho': [0.0]
        } | seeds_config, default_config),
    # Correlation between selection and outcome coefficients
    'est-hcoef-cor': generate_data_configs(
        {
            'heckman_coef_kwargs/rho': [0.0, 0.25, 0.5, 0.75, 1.0]
        } | seeds_config, default_config),
    # Scale of the coefficients (full grid of selection x outcome sigmas)
    'est-hcoef-sigma': generate_data_configs(
        {
            'heckman_coef_kwargs/default_sigma/selection': [0.1, 0.5, 1.0, 2.0],
            'heckman_coef_kwargs/default_sigma/outcome': [0.1, 0.5, 1.0, 2.0]
        } | seeds_config, default_config),
    # Correlation between the selection and outcome errors
    'est-heck-cor': generate_data_configs(
        {
            'heckman_coef_kwargs/heckman_coefs/error/rho': [0.0, 0.25, 0.5, 0.75, 1.0]
        } | seeds_config, default_config),
    # Size of the selection sample
    'est-sample-size': generate_data_configs(
        {
            'selection_kwargs/sample_size': [100, 250, 500, 1000, 2000]
        } | seeds_config, default_config),
    # Constant offset added to the latent outcome of sampled rows
    'est-overreport-const': generate_data_configs(
        {
            'selection_kwargs/bias/type': ['constant'],
            'selection_kwargs/bias/offset': [0.0, 0.25, 0.5, 1.0],
        } | seeds_config, default_config),
    # Selection intercept
    'est-non-response': generate_data_configs(
        {
            'heckman_coef_kwargs/heckman_coefs/selection/intercept': [-3.0, -2.0, -1.0, 0.0, 1.0, 2.0, 3.0],
        } | seeds_config, default_config),
    # Main effects plus all pairwise interactions
    'est-int': generate_data_configs(
        {
            'heckman_coef_kwargs/selection_columns': [heckman_selection_columns + heckman_selection_interactions],
            'heckman_coef_kwargs/outcome_columns': [heckman_outcome_columns + heckman_outcome_interactions],
            'heckman_coef_kwargs/default_sigma/selection_interaction': [0.5],
            'heckman_coef_kwargs/default_sigma/outcome_interaction': [0.5],
        } | seeds_config, default_config),
    # Share of rows whose selection / outcome is replaced by a random draw
    'est-noise': generate_data_configs(
        {
            'heckman_coef_kwargs/heckman_coefs/selection_noise_prop': [0.00, 0.10, 0.20],
            'heckman_coef_kwargs/heckman_coefs/outcome_noise_prop': [0.00, 0.10, 0.20],
        } | seeds_config, default_config),
    # Generating this configuration is very slow
    'est-non-normal-error': generate_data_configs(
        {
            'heckman_coef_kwargs/heckman_coefs/error/distribution': ['skewed_t'],
            'heckman_coef_kwargs/heckman_coefs/error/dof': [5.0],
            'heckman_coef_kwargs/heckman_coefs/error/skew': [[s_skew, o_skew] for s_skew in [-1.0, 0.0, 1.0] for o_skew in [-1.0, 0.0, 1.0]],
        } | seeds_config, default_config),
}

out_path = pathlib.Path('../tmp/data')
out_path.mkdir(parents=True, exist_ok=True)

# Scenario name -> list of (data set name, description) for the final index file
names = {}

progress = tqdm(generated_data_configs.items())
for name, configs in progress:
    data_names = []

    # Materialise the generator so the inner bar shows the real number of
    # configurations; grids over several values produce more than n_seeds items.
    configs = list(configs)
    inner_progress = tqdm(configs, leave=False)
    for desc, config in inner_progress:
        # The directory name is a hash of the configuration, taken before
        # generate_data fills in the derived seeds
        data_id = hashlib.md5(json.dumps(config, sort_keys=True).encode('utf-8')).hexdigest()
        data_name = f'{name}-{data_id}'
        data_path = out_path / data_id

        progress.set_postfix({'name': name, 'id': data_id, 'desc': desc})
        inner_progress.set_postfix({'desc': desc, 'id': data_id})

        generate_data(str(data_path), config, population_df, margin_cols)
        data_names.append((data_name, desc))

        # '<name>-<hash>' always points at the same directory, so an existing
        # link is already correct
        link_path = out_path / data_name
        if not link_path.exists():
            os.symlink(data_path.name, link_path)

        # '<name>' follows the most recently generated configuration.
        # create_symlink also replaces a dangling link (target directory
        # removed), which a plain exists() check would miss before os.symlink
        # fails with FileExistsError.
        create_symlink(data_path.name, out_path / name)

    names[name] = data_names

# Write a timestamped index and point 'data_list.json' at the newest one
data_names_path = out_path / f'{datetime.datetime.now().strftime("%Y-%m-%d_%H:%M:%S")}_data_list.json'
with open(data_names_path, 'w', encoding='utf-8') as f:
    json.dump(names, f, ensure_ascii=False, indent=2)
create_symlink(data_names_path.name, out_path / 'data_list.json')