In [None]:
import ray
# Initialise Ray with default settings before the @ray.remote task defined
# later in this file is used.
ray.init()

In [None]:
# Generate co-occurrence structure by sampling an Erdos-Renyi graph G
# and adding noise.

import os

import numpy as np

# Seed NumPy's global RNG in this process so repeated runs draw the same values.
np.random.seed(1234)

# Size of the index universe: sampled indices lie in [0, d).
d = 10_000
# Expected number of indices drawn per row (passed to sample_verts as sz).
expected_nnz = 50

def sample_verts(n, d, sz, max_ratio, outf):
    """
    write n random sparse rows over the integers [0, d) to outf

    every integer j is included in a row independently of all others,
    with probability probs[j]

    for max_ratio > 1 the probabilities form the geometric progression
        lo * max_ratio^(0/d), lo * max_ratio^(1/d), ..., lo * max_ratio^((d-1)/d)
    with lo chosen so that they sum to sz (the expected row size); the
    ratio of the largest to the smallest probability is therefore a
    little below max_ratio.  otherwise every integer gets probability
    sz / d

    each row is one line of text: an equiprobable random binary label
    ('1' or '0') followed by the drawn integers in increasing order,
    every token followed by a single space

    returns (probs[0], probs[-1], probs.sum(), probs): the first and last
    probabilities (the min and max when max_ratio > 1), their total,
    and the whole probability vector
    """
    if max_ratio > 1:
        log_ratio = np.log(max_ratio)
        # geometric series: sum_j lo * r^(j/d) = lo * (r - 1) / (r^(1/d) - 1)
        # hence lo = sz * (r^(1/d) - 1) / (r - 1); the r^(1/d) - 1 factor
        # is computed with expm1 and applied after exponentiating
        step_minus_one = np.expm1(log_ratio / d)
        log_scale = np.log(sz) - np.log(max_ratio - 1)
        exponents = log_scale + np.arange(d) * log_ratio / d
        probs = np.exp(exponents) * step_minus_one
    else:
        probs = np.ones(d) * (sz / d)

    # all labels are drawn up front, then one uniform vector per row
    labels = np.random.choice(2, size=n)
    for label in labels:
        outf.write('1 ' if label else '0 ')
        chosen = np.flatnonzero(np.random.rand(d) < probs)
        np.savetxt(outf, chosen, newline=' ', fmt='%d')
        outf.write('\n')

    return probs[0], probs[-1], probs.sum(), probs

# Sweep of max_ratio values: the powers of four 4**0 through 4**10.
ratios = [4 ** power for power in range(11)]
# Rows per generated file. (A larger 100 * 1000 run can be added to this list.)
ns = [10 * 1000]
# Echo the experiment configuration, one setting per line.
print(f'd {d}\nns {ns}\nratios {ratios}')

from subprocess import check_output

@ray.remote(num_cpus=1)
def gen_ratio(n, ratio):
    """
    ray task: write the 'train' and 'test' files for one (n, ratio) pair

    for each split, sample_verts(n, d, expected_nnz, ratio, f) is run on
    data/synth_ratio{ratio}_n{n}.{split} (d and expected_nnz are read from
    module scope), the file is forced to disk, and its lines and words
    are counted with the external wc program

    prints one summary line after both splits are written: the first and
    last probabilities and the average words per line of the last split
    ('test'); the word count includes the leading label token

    returns n, ratio and the probability vector from sample_verts
    """
    # NOTE(review): the module-level np.random.seed call runs where this file
    # is executed; whether the process running this task shares that RNG
    # state is not established here -- confirm if reproducibility matters.

    # open() does not create missing directories
    os.makedirs('data', exist_ok=True)

    for t in ['train', 'test']:
        fn = f'data/synth_ratio{ratio}_n{n}.{t}'
        with open(fn, mode='w', buffering=1024 * 1024) as f:
            m, mm, _, p = sample_verts(n, d, expected_nnz, ratio, f)
            # push the data to disk before wc reads the file
            f.flush()
            os.fsync(f.fileno())
        # argv list: the file name is never interpreted by a shell
        wc = check_output(['wc', fn], encoding='utf-8')
        # wc prints "<lines> <words> <bytes> <name>"
        lines, words = (int(x) for x in wc.split()[:2])
        # an empty file (n == 0) has no lines to average over
        avg = words / lines if lines else 0.0
    print(f'ratio {ratio:5.1f} probs min {m:7.5f} max {mm:7.5f} avg words {avg}')
    return n, ratio, p