In [3]:
import json
import random

#!/usr/bin/env python
# -*- coding: iso-8859-1 -*-

# Grammatical-number markers handed around by pick_tempus / get_noun / swap.
SI = "SINGULAR"
PL = "PLURAL"

# NOTE(review): not referenced by any code visible in this notebook section.
stack = []

# Pre-terminal category labels. Every name is bound to its own spelling so the
# label can be used both as a Python identifier and as a key of `d`.
UND = "UND"
DEREN = "DEREN"
KOMMA = "KOMMA"
D = "D"
N_s = "N_s"
V_s = "V_s"
N_p = "N_p"
V_p = "V_p"
# NOTE(review): the four T* labels below have no entry in `d`, so passing one
# of them to write_terminal would raise KeyError.
TN_s = "TN_s"
TV_s = "TV_s"
TN_p = "TN_p"
TV_p = "TV_p"

# Phrase-level (non-terminal) labels used for the bracket tokens.
S = "S"
T = "T"
P = "P"

# Lexicon: category label -> list of surface words to sample from.
# NOTE(review): "Hünde" is not the standard German plural ("Hunde"); kept
# unchanged because already-generated data contains it -- confirm intent.
d = {
    UND: ["und"],
    DEREN: ["deren"],
    KOMMA: [","],
    N_s: ["Bärin", "Katja", "Tochter", "Mutter"],
    V_s: ["geht", "spielt", "tanzt", "singt", "schwimmt"],
    N_p: ["Bären", "Teller", "Hünde", "Flaschen"],
    V_p: ["fliegen", "fallen", "liegen", "stehen"],
    D: ["die"],
}


def bl(type):
    """Return the opening-bracket token for a node label, e.g. ``bl("S") == "(S"``.

    The parameter shadows the builtin ``type``; the name is kept so that
    keyword callers keep working.
    """
    return "(" + type


# Closing-bracket token shared by every node.
br = ")"

def is_bracket(tok):
    """Return True when ``tok`` begins with ``(`` or ``)``.

    This covers both the opening tokens such as ``"(S"`` and the bare
    closing token ``")"``.
    """
    # str.startswith accepts a tuple of alternative prefixes.
    return tok.startswith(("(", ")"))

def write_terminal(var):
    """Render a single token of a flattened tree.

    Bracket tokens (anything ``is_bracket`` accepts) pass through unchanged.
    Any other token is used as a key of the lexicon ``d`` and rendered as
    ``"(LABEL word )"`` with ``word`` drawn at random from ``d[var]``.
    A label missing from ``d`` raises KeyError.
    """
    if not is_bracket(var):
        word = random.choice(d[var])
        return " ".join((bl(var), word, br))
    return var

def pick_tempus():
    """Randomly return ``SI`` or ``PL``.

    Despite the name, the value selects grammatical number (singular/plural).
    ``SI`` is chosen when ``random.random()`` is strictly above 0.5.
    """
    return SI if random.random() > 0.5 else PL

def swap(tok):
    """Return the partner of ``tok`` within its pair.

    The pairs are SI <-> PL, N_s <-> V_s and N_p <-> V_p. Any token outside
    these pairs yields ``None``.
    """
    partners = ((SI, PL), (N_s, V_s), (N_p, V_p))
    for first, second in partners:
        if tok == first:
            return second
        if tok == second:
            return first
    # Unknown token: fall through without a partner.
    return None

def get_noun(temp):
    """Return the noun category label matching a grammatical-number marker.

    Args:
        temp: ``SI`` or ``PL``.

    Returns:
        ``N_s`` for ``SI``, ``N_p`` for ``PL``.

    Raises:
        ValueError: if ``temp`` is neither marker.
    """
    if temp == SI:
        return N_s
    if temp == PL:
        return N_p
    # Raise instead of returning the exception object: a returned exception
    # would be treated as an ordinary value by callers and silently lost.
    raise ValueError(f"{temp} is no tempus")

def generate_2DD(cap=15):
    """Build a random nested ``T`` subtree as a list-of-lists of tokens.

    Each level picks a noun/verb pair that agrees in number. At ``cap == 1``
    the pair forms a leaf ``(T noun verb)``. Otherwise a ``P`` phrase is
    embedded between noun and verb, holding either one ``deren`` + subtree
    or two such phrases joined by ``und``; every subtree recurses with
    ``cap - 1``.

    Args:
        cap: nesting levels to generate; must be an integer >= 1.

    Returns:
        A nested list of bracket tokens and category labels, suitable for
        ``flatten``.

    Raises:
        ValueError: if ``cap`` is below 1.
    """
    # cap == 1 is the only base case, so anything below 1 would never reach
    # it and would recurse until the interpreter's recursion limit.
    if cap < 1:
        raise ValueError(f"cap must be a positive integer, got {cap!r}")

    tempus = pick_tempus()
    BL = get_noun(tempus)
    BR = swap(BL)  # verb label agreeing in number with the noun

    if cap == 1:
        return [bl(T), BL, BR, br]

    if random.random() > 0.45:
        # Single relative phrase.
        body = [DEREN, generate_2DD(cap - 1)]
    else:
        # Two coordinated relative phrases; the left subtree is generated first.
        body = [bl(P), DEREN, generate_2DD(cap - 1), br,
                UND,
                bl(P), DEREN, generate_2DD(cap - 1), br]

    inner = [bl(P), body, br]
    return [bl(T), BL, KOMMA, inner, KOMMA, BR, br]


def flatten(l):
    """Flatten a nested token tree into a list of rendered strings.

    The tree is walked depth-first, left to right. Nested lists are
    descended into, strings are rendered through ``write_terminal``, and
    items of any other type are skipped.
    """
    rendered = []
    # Stack of iterators: the top one is the list currently being walked.
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the parent iterator keeps its position for later.
                pending.append(iter(item))
                break
            if isinstance(item, str):
                assert len(item) > 0
                rendered.append(write_terminal(item))
        else:
            # Current list exhausted: resume its parent.
            pending.pop()
    return rendered

def parse_tree(tree):
    """Wrap ``tree`` in a sentence frame and return the flat token list.

    A random noun/verb pair agreeing in number frames the tree as
    ``D noun KOMMA DEREN <tree> KOMMA verb``. The frame is flattened (which
    samples the concrete words) and enclosed in an ``S`` bracket.
    """
    tempus = pick_tempus()
    BL = get_noun(tempus)
    BR = swap(BL)

    # Start phrase, embedded tree, closing verb.
    string = [D, BL, KOMMA, DEREN, tree, KOMMA, BR]

    return [bl(S), *flatten(string), br]




# Demo: print one random sentence of nesting depth 5, tokens separated by spaces.
print(*parse_tree(generate_2DD(5)))


(S (D die ) (N_p Hünde ) (KOMMA , ) (DEREN deren ) (T (N_s Katja ) (KOMMA , ) (P (DEREN deren ) (T (N_s Mutter ) (KOMMA , ) (P (DEREN deren ) (T (N_p Flaschen ) (KOMMA , ) (P (P (DEREN deren ) (T (N_s Tochter ) (KOMMA , ) (P (DEREN deren ) (T (N_s Bärin ) (V_s geht ) ) ) (KOMMA , ) (V_s singt ) ) ) (UND und ) (P (DEREN deren ) (T (N_p Bären ) (KOMMA , ) (P (P (DEREN deren ) (T (N_s Bärin ) (V_s singt ) ) ) (UND und ) (P (DEREN deren ) (T (N_p Teller ) (V_p stehen ) ) ) ) (KOMMA , ) (V_p fliegen ) ) ) ) (KOMMA , ) (V_p fallen ) ) ) (KOMMA , ) (V_s singt ) ) ) (KOMMA , ) (V_s singt ) ) (KOMMA , ) (V_p liegen ) )


In [4]:

# Generation parameters.
data_size = 7000
nesting_depth = 7

# NOTE(review): absolute, machine-specific path -- consider a configurable
# location before sharing this notebook.
output_file = r"C:\Users\MKhal\OneDrive\Desktop\UNI\Bachelorarbeit\DEV\server-export\data\2D-Custom-Bracket.json"

# One bracketed sentence per sample, tokens joined by single spaces.
# NOTE(review): the inner index is never used, so every sample is built with
# cap=nesting_depth, and the total is int(7000 / 7) * 6 = 6000 samples rather
# than data_size -- confirm this is intended.
labels = [
    " ".join(parse_tree(generate_2DD(nesting_depth)))
    for _ in range(int(data_size / nesting_depth))
    for _ in range(1, nesting_depth)
]

def remove_brackets(s: str):
    """Return ``s`` with every bracket token removed.

    ``s`` is split on whitespace; tokens for which ``is_bracket`` is true are
    dropped and the remaining tokens are re-joined with single spaces.
    """
    return " ".join(tok for tok in s.split() if not is_bracket(tok))

# Plain-text inputs: each label with its bracket tokens stripped.
inputs = list(map(remove_brackets, labels))

# One record per sample: "i" is the input text, "l" the bracketed label.
entries = [{"i": text, "l": bracketed} for text, bracketed in zip(inputs, labels)]

# NOTE(review): the key "desciption" is misspelled but kept byte-for-byte in
# case a consumer reads it; the -1 values look like placeholders and
# "search tree depth" repeats the literal 7 instead of using nesting_depth.
out = {
    "data": entries,
    "meta": {
        "dataName": "2-Dyck-Custom",
        "desciption": "Custom set to test tree heights",
        "task": "BRACKETING",
        "randomSeed": -1,
        "stats": {
            "phrase count": -1,
            "data count": -1,
            "search tree depth": 7,
        },
    },
}

# Serialise first so a serialisation error cannot leave a truncated file;
# ensure_ascii=False keeps the umlauts readable in the output.
json_object = json.dumps(out, indent=4, ensure_ascii=False)

with open(output_file, "w", encoding="utf-8") as outfile:
    outfile.write(json_object)
