In [1]:
import numpy as np
import pandas as pd
from treelib import Node, Tree
from collections import defaultdict
import copy
from tqdm.auto import tqdm
tqdm.pandas(leave=False)

In [2]:
dict2obj = lambda d: type("Object", (), d)

In [32]:
tree_finished = lambda t: all([
                (not node.is_leaf() and node.data.type == 'expr') or
                (node.is_leaf() and node.data.type == 'num')
                
                for _, node in t.nodes.items()])


def sample_equation(cache):
    node_id = 0
    tree = Tree()
    if not cache.distr[0,0] == 1:
        raise ValueError('should sample an equation first')
    tree.create_node(node_id, node_id, data = dict2obj({'type' : 'expr', 'val' : cache.sample('functions')}))
    node_id += 1
    
    while not tree_finished(tree):
        
        run_dict = {k:v for k, v in tree.nodes.items()}#copy.deepcopy(tree.nodes)
        
        for node_name, node in run_dict.items():
            if node.is_leaf() and not node.data.type == 'num':
                assert node.data.type == 'expr'
                n_children = function_to_nargs[node.data.val]
                node_depth = tree.depth(node)
                
                for i in range(n_children):
#                     type_ = 'expr' if rand_stream.sample() < distr[node_depth+2,0] else 'num'
#                     value = rand_stream.choice(functions if type_ == 'expr' else vals, distr[node_depth+2])
#                     type_ = np.random.choice(['expr', 'num'], p=distr[node_depth+1])
                    type_ = cache.sample('types', depth=node_depth+1)
#         def sample(self, sample_type, depth=-1):
#                     value = np.random.choice(functions if type_ == 'expr' else vals)
                    value = cache.sample('vals' if type_=='num' else 'functions')
                    tree.create_node(node_id, node_id, data = dict2obj({'type' : type_, 'val' : value}), parent=node.identifier)
                    node_id += 1
                
    
    #tree.show(data_property='val')
    
    
    return tree
    
    

In [33]:
def convert_tree_to_equation(tree, full_parens=False):
    equation = ""

    def _traverse(node):
        nonlocal equation
        
        if node.is_leaf():
            equation += str(node.data.val)
        else:
            n_children = len(tree.children(node.identifier))
            for i, child in enumerate(tree.children(node.identifier)):
                if full_parens:
                    need_parens = child.data.type == 'expr'
                else:
                    need_parens = child.data.type == 'expr' and parent_needs_parens_for_child(node.data.val, child.data.val)
                    need_parens = need_parens and not (i==0 and order_operations[node.data.val] >= order_operations[child.data.val])
                
                if need_parens:
                    equation += "("
                _traverse(child)
                if need_parens:
                    equation += ")"
                if not i == n_children - 1:
                    equation += node.data.val
                    
                

    _traverse(tree.nodes[tree.root])

    return equation

In [34]:
def parent_needs_parens_for_child(f1, f2):
    f1_idx = order_operations[f1]
    f2_idx = order_operations[f2]
    if not f1_idx == f2_idx:
        return f1 < f2
    else:
        return f1 in non_associative_functions
#     else:
#         return False

In [35]:
class ExpressionSampleCache():
    def __init__(self, size, functions, vals, distr):
        self.size = size
        self.functions = functions
        self.vals = vals
        self.distr = distr
        self.data = {
            'functions' : {
                'counter' : 0,
                'values' : np.random.choice(functions, size)
            },
            'vals' : {
                'counter' : 0,
                'values' : np.random.choice(vals, size)
            },
            'types' : {
                    i+1 : {
                        'counter' : 0,
                        'values' : np.random.choice(['expr', 'num'], size=size, p=d)
                    }
                    for i, d in enumerate(distr[1:])
            },
            
        }
        
    def restart(self, key, depth=-1):
        if key == 'functions':
            self.data['functions'] = {
                'counter' : 0,
                'values' : np.random.choice(self.functions, self.size)
            }
        elif key == 'vals' : 
            self.data['vals'] = {
                'counter' : 0,
                'values' : np.random.choice(self.vals, self.size)
            }
        elif key == 'types' :
            assert depth > 0, 'need to specify depth for type'
            self.data['types'][depth] = {
                        'counter' : 0,
                        'values' : np.random.choice(['expr', 'num'], size=self.size, p=self.distr[depth])
                    }
            
    def sample(self, sample_type, depth=-1):
        subdict = self.data[sample_type]
        if sample_type == 'types':
            assert depth > 0, 'need to specify depth for type'
            subdict = subdict[depth]
        if subdict['counter'] >= len(subdict['values']):
            self.restart(sample_type, depth)
            subdict = self.data[sample_type]
            if sample_type == 'types':
                subdict = subdict[depth]
        
        to_return = subdict['values'][subdict['counter']]
        subdict['counter'] += 1
        
        return to_return
            
        
        

In [36]:
order_operations = {
    '^' : 0,
    '*' : 1,
    '/' : 1,
    '—' : 2,
    '+' : 2
}
non_associative_functions = set(['^', '/', '—'])
functions = ['*', '+', '—']

In [37]:
exp_vs_num_prob = np.array([
    [1.,0.],
    [.5, .5],
    [.5,.5],
    [0.,1.],
#     [.5,.5],
#     [.25,.75],
#     [0,1]
])
sample_nums = np.arange(1000)

function_to_nargs = defaultdict(lambda: 2)

In [38]:
sampler_cache = ExpressionSampleCache(50000, functions, sample_nums, exp_vs_num_prob)

In [39]:
tree = sample_equation(sampler_cache)
tree.show(data_property='val')
convert_tree_to_equation(tree)

—
├── *
│   ├── *
│   │   ├── 565
│   │   └── 683
│   └── 935
└── *
    ├── 507
    └── —
        ├── 925
        └── 942



'565*683*935—507*(925—942)'

In [41]:
%%timeit
tree = sample_equation(sampler_cache)
eqn = convert_tree_to_equation(tree)

131 µs ± 655 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [20]:
rows = []
n_samples = 100000
with tqdm(total = n_samples, leave=False) as pbar:
    for i in range(n_samples):
        tree = sample_equation(sampler_cache)
        eqn = convert_tree_to_equation(tree)
        rows.append([tree, eqn])
        pbar.update(1)

  0%|          | 0/100000 [00:00<?, ?it/s]

In [21]:
df = pd.DataFrame.from_records(rows, columns = ['tree', 'eqn'])
df['res'] = df['eqn'].apply(lambda x: eval(x.replace('—', '-')))

In [23]:
df.sample(10)

Unnamed: 0,tree,eqn,res
2162,"(Node(tag=0, identifier=0, data=<class '__main...",310*632—793+167,195294
83214,"(Node(tag=0, identifier=0, data=<class '__main...",289—265*752*864*252,-43388835551
27328,"(Node(tag=0, identifier=0, data=<class '__main...",(142+586*589)*912,314909952
54508,"(Node(tag=0, identifier=0, data=<class '__main...",(609+17)*(286+482)*779,374518272
77213,"(Node(tag=0, identifier=0, data=<class '__main...",935+679*141+635+352+740,98401
46945,"(Node(tag=0, identifier=0, data=<class '__main...",251+234+494—877,102
9256,"(Node(tag=0, identifier=0, data=<class '__main...",198+633,831
29142,"(Node(tag=0, identifier=0, data=<class '__main...",115—830,-715
62291,"(Node(tag=0, identifier=0, data=<class '__main...",790—(967+747)—572*748,-428780
34740,"(Node(tag=0, identifier=0, data=<class '__main...",830*(871+466)*67,74350570


In [24]:
split_tokens = [
            '+',
            '-',
            '*',
            '(',
            ')', ]


In [25]:
tokens = []
char_idx = 0
start = 0
while char_idx < len(expresssion)+1:
    if char_idx == len(expresssion):
        if char_idx < len(expresssion):
            tokens.append(expresssion[start:])
        break
    if expresssion[char_idx] in split_tokens:
        if char_idx > 0 and not expresssion[char_idx-1] in split_tokens:
            tokens.append(expresssion[start:char_idx])
            tokens.append(expresssion[char_idx])
            start = char_idx + 1
        else:
            tokens.append(expresssion[char_idx])
        
    char_idx += 1

NameError: name 'expresssion' is not defined

In [None]:
tokens