# Probabilistic program synthesis for tabular data using CrossCat+CGPM

<img src="resources/bayesian-program-synthesis.jpg"/>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Single seeded random source shared by every stochastic step below, so the
# notebook produces the same results on each fresh run.
prng = np.random.RandomState(seed=10)

#### Prepare the population schema.

In [3]:
# One (distribution name, distargs) pair per column, in column order.
# Categorical columns carry a `k` distarg (presumably the number of
# categories -- confirm against the coded data); normal columns take none.
schema = (
    list(('categorical', {'k': k}) for k in (79, 346, 18, 46, 4, 7))
    + [('normal', None)] * 9
    + list(('categorical', {'k': k}) for k in (282, 54, 25, 141, 38))
    + [('normal', None)] * 2
)

#### Sample an AST from the prior.

In [4]:
from cgpm2 import sample_crosscat

In [5]:
# Generate a random AST for the columns described by `schema`, drawing all
# randomness from `prng`.
# NOTE(review): the name `ast` would shadow the stdlib `ast` module if that
# module were ever imported in this notebook.
ast = sample_crosscat.generate_random_ast(schema, prng)

In [6]:
# Display the sampled AST. As shown in the output, it is a list of
# (row-clustering spec, [(column index, distribution, hypers), ...]) pairs.
ast

[((('crp', None), {'alpha': 1.121268}),
  [(0, ('categorical', {'k': 79}), {'alpha': 0.719775}),
   (8,
    ('normal', None),
    {'m': 1.710845, 'nu': 0.433329, 'r': 0.221829, 's': 1.943864}),
   (11,
    ('normal', None),
    {'m': 0.163875, 'nu': 4.392115, 'r': 0.484694, 's': 2.88707})]),
 ((('crp', None), {'alpha': 0.583097}),
  [(1, ('categorical', {'k': 346}), {'alpha': 1.050958}),
   (4, ('categorical', {'k': 4}), {'alpha': 0.737407}),
   (5, ('categorical', {'k': 7}), {'alpha': 2.393045}),
   (7,
    ('normal', None),
    {'m': 1.76423, 'nu': 0.793158, 'r': 0.048032, 's': 0.984268}),
   (12,
    ('normal', None),
    {'m': 0.609366, 'nu': 0.909741, 'r': 1.749406, 's': 0.289516}),
   (16, ('categorical', {'k': 54}), {'alpha': 0.04078}),
   (20,
    ('normal', None),
    {'m': 0.425179, 'nu': 1.440711, 'r': 0.044054, 's': 2.119557}),
   (21,
    ('normal', None),
    {'m': 2.104527, 'nu': 0.72045, 'r': 0.540442, 's': 0.930333})]),
 ((('crp', None), {'alpha': 0.569186}),
  [(2, ('

#### Compile the AST into the "Core DSL".

In [7]:
# Compile the AST into its Core DSL form. The result is a buffer-like object
# whose text is read back with .getvalue() (presumably a StringIO -- confirm).
core_dsl = sample_crosscat.compile_ast_to_core_dsl(ast)

In [8]:
# Show the Core DSL text. The call form of print gives the same output as the
# print statement for a single argument under Python 2, and is also valid
# Python 3 (where the statement form is a SyntaxError).
print(core_dsl.getvalue())

- view:
    row clustering model:
      - crp:
          distargs:
          hypers:
            alpha: 1.1213
    distribution models:
      - categorical{0}:
          distargs:
            k: 79
          hypers:
            alpha: 0.7198
      - normal{8}:
          distargs:
          hypers:
            s: 1.9439
            r: 0.2218
            m: 1.7108
            nu: 0.4333
      - normal{11}:
          distargs:
          hypers:
            s: 2.8871
            r: 0.4847
            m: 0.1639
            nu: 4.3921
- view:
    row clustering model:
      - crp:
          distargs:
          hypers:
            alpha: 0.5831
    distribution models:
      - categorical{1}:
          distargs:
            k: 346
          hypers:
            alpha: 1.0510
      - categorical{4}:
          distargs:
            k: 4
          hypers:
            alpha: 0.7374
      - categorical{5}:
          distargs:
            k: 7
          hypers:
            alpha: 2.3930
      - norm

#### Compile "Core DSL" into the "Embedded DSL".

In [9]:
# Translate the Core DSL text into the Embedded DSL: Python source code that
# constructs the cgpm2 model objects (see the printed output below).
embedded_dsl = sample_crosscat.compile_core_dsl_to_embedded_dsl(core_dsl.getvalue())

In [10]:
# Show the generated Embedded DSL source. The call form of print gives the
# same output as the print statement for a single argument under Python 2,
# and is also valid Python 3 (where the statement form is a SyntaxError).
print(embedded_dsl.getvalue())

from cgpm2.categorical import Categorical
from cgpm2.crp import CRP
from cgpm2.flexible_rowmix import FlexibleRowMixture
from cgpm2.normal import Normal
from cgpm2.poisson import Poisson
from cgpm2.product import Product

view0 = FlexibleRowMixture(
  cgpm_row_divide=CRP(outputs=[100000], inputs=[], hypers={'alpha': 1.1213},),
  cgpm_components_base=Product(cgpms=[
    Categorical(outputs=[0], inputs=[], distargs={'k': 79}, hypers={'alpha': 0.7198},),
    Normal(outputs=[8], inputs=[], hypers={'s': 1.9439, 'r': 0.2218, 'm': 1.7108, 'nu': 0.4333},),
    Normal(outputs=[11], inputs=[], hypers={'s': 2.8871, 'r': 0.4847, 'm': 0.1639, 'nu': 4.3921},),])
)
view1 = FlexibleRowMixture(
  cgpm_row_divide=CRP(outputs=[100001], inputs=[], hypers={'alpha': 0.5831},),
  cgpm_components_base=Product(cgpms=[
    Categorical(outputs=[1], inputs=[], distargs={'k': 346}, hypers={'alpha': 1.051},),
    Categorical(outputs=[4], inputs=[], distargs={'k': 4}, hypers={'alpha': 0.7374},),
    Categorical(outp

#### Execute the Embedded DSL source code to build the model trace.

In [11]:
# Run the generated Python source in this notebook's namespace. This binds the
# names that source assigns (view0, view1, ...). Later cells use a `crosscat`
# name that is presumably bound here as well -- the printed source is
# truncated, so confirm.
# NOTE(review): exec runs arbitrary code; this is acceptable only because the
# source was generated locally by sample_crosscat in the cells above.
exec(embedded_dsl.getvalue())

#### Load observations from the .csv file and incorporate them into the model trace.

In [12]:
# Read the coded table; index_col=False stops pandas from using the first
# column as the row index.
df = pd.read_csv('/tmp/satellites.coded.csv', index_col=False)
# Feed every row into the model trace, keyed by the DataFrame row label.
for row, values in df.iterrows():
    # Map positional column index -> cell value. This assumes the CSV column
    # order matches the order of `schema` -- confirm.
    observation = dict(enumerate(values.values))
    crosscat.incorporate(row, observation)

#### Run Bayesian synthesis.

In [13]:
from cgpm2.transition_crosscat import GibbsCrossCat
# Build the GibbsCrossCat inference object over the `crosscat` model trace,
# sharing the notebook-wide `prng`.
inference = GibbsCrossCat(crosscat, prng)
# Run 100 iterations of structure transitions (the `_cpp` suffix suggests a
# compiled backend -- confirm).
inference.transition_structure_cpp(N=100)
# Run 10 more iterations restricted to the two hyperparameter kernels named here.
inference.transition(N=10, kernels=['hypers_distributions', 'hypers_row_divide'])

Completed: 100 iterations in 23.388717 seconds.
Completed: 10 iterations in 25.595952 seconds.


#### Render the posterior model trace as Embedded DSL plus the sequence of `incorporate` calls that reconstructs it exactly.

In [14]:
# Render the current model trace back to Embedded DSL source and show it. The
# call form of print gives the same output as the print statement for a single
# argument under Python 2, and is also valid Python 3 (where the statement
# form is a SyntaxError).
print(sample_crosscat.render_trace_in_embedded_dsl(crosscat).getvalue())

from cgpm2.categorical import Categorical
from cgpm2.crp import CRP
from cgpm2.flexible_rowmix import FlexibleRowMixture
from cgpm2.normal import Normal
from cgpm2.poisson import Poisson
from cgpm2.product import Product

view0 = FlexibleRowMixture(
  cgpm_row_divide=CRP(outputs=[100004], inputs=[], hypers={'alpha': 1.1213},),
  cgpm_components_base=Product(cgpms=[
    Categorical(outputs=[0], inputs=[], distargs={'k': 79}, hypers={'alpha': 1},),
    Normal(outputs=[8], inputs=[], hypers={'s': 0.1953, 'r': 0.0009, 'm': 0.6179, 'nu': 1166.0},),
    Normal(outputs=[11], inputs=[], hypers={'s': 9711432.8657, 'r': 0.0954, 'm': 2760.7241, 'nu': 24.6309},),])
)
view1 = FlexibleRowMixture(
  cgpm_row_divide=CRP(outputs=[100005], inputs=[], hypers={'alpha': 0.5831},),
  cgpm_components_base=Product(cgpms=[
    Categorical(outputs=[1], inputs=[], distargs={'k': 346}, hypers={'alpha': 1},),
    Categorical(outputs=[4], inputs=[], distargs={'k': 4}, hypers={'alpha': 1},),
    Categorical(outputs=