# Generate synthetic data

This notebook generates a dataset of synthetic activity traces.
These traces are saved to disk and used in the other notebooks provided in this folder.

In [1]:
import pickle
from pathlib import Path

import numpy as np
from scipy.special import expit, logit

In [2]:
%%time
dim = 59  # Length of traces (one entry per day).
n = 600  # Number of shows.
m = 10_000  # Number of traces per show.

ts = np.arange(dim)
rng = np.random.default_rng(seed=42)

# Basic correlation structure for activity across days: an exponential kernel
# (1.2 on the diagonal, decaying with |day difference| / 8). Its Cholesky
# factor turns i.i.d. normal draws into correlated per-day rates.
kernel = 1.2 * np.exp(-np.abs(ts[:, None] - ts) / 8.0)
chol = np.linalg.cholesky(kernel)
# Weekly seasonality: zero on every 7th day, negative in between.
seasonality = -np.abs(np.sin(np.pi * ts / 7))

# Traces are binary, so store them as booleans. A float64 array of this shape
# needs n * m * dim * 8 bytes (~2.8 GB); the boolean one needs ~354 MB.
data = np.zeros((n, m, dim), dtype=bool)
for i in range(n):
    # NOTE: the order of the `rng` calls below determines the generated
    # dataset for a given seed. Do not reorder them.

    # Central parameter for beta-geometric churn (~ 1 / mean of geometric).
    # It is the mean of the beta distribution sampled further down.
    alpha = expit(0.1 * rng.normal() - 0.4)
    # Concentration parameter for beta-geometric churn.
    k = rng.uniform(0.7, 1.5)
    # Baseline rate (in logit space) for each of the `dim` days.
    rates = -0.5 + 0.1 * chol @ rng.normal(size=dim)
    # Activity probabilities for each day (churn notwithstanding); the
    # strength of the weekly pattern is drawn per show.
    probs = expit(rates + rng.uniform(0.1, 1.0) * seasonality)
    # Sampling traces: one Bernoulli draw per (trace, day).
    traces = rng.binomial(n=1, p=probs, size=(m, dim))
    # Churn date for each trace: each trace gets its own churn probability
    # from the beta distribution, then a geometric waiting time (shifted so
    # that it starts at day 0).
    max_day = rng.geometric(rng.beta(k * alpha, k * (1 - alpha), size=m)) - 1
    # No activity on or after the churn date.
    traces[max_day[:, None] <= ts] = 0
    data[i] = traces

CPU times: user 12 s, sys: 555 ms, total: 12.6 s
Wall time: 12.6 s


In [3]:
# Persist the traces as two pickles, each a dict mapping "show-XXX" to that
# show's boolean (traces x days) array. The first `n_train` shows form the
# training set; the remaining ones form the evaluation set.
n_train = 200
splits = {
    "data/synthetic-data-train.pkl": range(0, n_train),
    "data/synthetic-data-eval.pkl": range(n_train, n),
}

# `open(..., "wb")` does not create missing folders, so make sure the output
# directory exists (e.g. on a fresh checkout).
Path("data").mkdir(parents=True, exist_ok=True)

for path, show_ids in splits.items():
    with open(path, "wb") as f:
        pickle.dump({f"show-{i:03d}": data[i].astype(bool) for i in show_ids}, f)