<h1><div align="center"> Penn Treebank Dataset Syntax Tree Parser </div></h1>

## 1. Necessary Package Imports 

In [1]:
# Package imports
from typing import Dict, List
from nltk.tree import Tree
import logging
from tqdm import tqdm

## 2. Sentence String to Syntax Tree Conversion

In [3]:
# Minimal example: a bracketed constituency string for "the dog chased the cat".
sentence = "(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))"
# Convert the bracketed string into an nltk Tree object.
t = Tree.fromstring(sentence)
# Print the tree; the trailing expression makes the tree's height the cell result.
print(t)
t.height()

(S (NP (D the) (N dog)) (VP (V chased) (NP (D the) (N cat))))


5

In [9]:
# Example sentence as a bracketed parse string, wrapped in a TOP root node:
sentence = "(TOP (S (NP (NNP Ms.) (NNP Haag)) (VP (VBZ plays) (NP (NNP Elianti))) (. .)))"
# Use nltk's Tree.fromstring to parse the bracketed string into a Tree:
parse_tree = Tree.fromstring(sentence)
# Print the tree, followed by its height.
print(parse_tree)
print(parse_tree.height())


(TOP
  (S
    (NP (NNP Ms.) (NNP Haag))
    (VP (VBZ plays) (NP (NNP Elianti)))
    (. .)))
6


In [4]:
def build_parse_dict(src_tree: Tree) -> Dict:
    """
    Build a multi-level dict of treebank data, keyed by subtree height.

    Every subtree of ``src_tree`` whose height lies in
    ``range(2, src_tree.height())`` contributes its leaves to ``"tokens"`` and
    its label to ``"tags"`` under the key equal to its height. The root is
    skipped: it is the only subtree at the maximum height and is redundant.
    ``"targets"`` is created as an empty list for every level and is not
    filled in by this function.

    Args:
        src_tree: Parsed syntax tree (for example from ``Tree.fromstring``).

    Returns:
        ``{level: {"tokens": [...], "tags": [...], "targets": []}}`` with one
        entry per ``level`` in ``range(2, src_tree.height())``. The dict is
        empty when the tree has no level below the root.
    """
    max_height = src_tree.height()
    parse_dict = {
        level: {"tokens": [], "tags": [], "targets": []}
        for level in range(2, max_height)
    }

    # Walk the tree once and bucket each subtree by its height. Entries within
    # a level keep the order in which the traversal yields them, the same
    # order a separate height-filtered traversal per level would give.
    for subtree in src_tree.subtrees():
        bucket = parse_dict.get(subtree.height())
        if bucket is None:
            # The root (top-most level, redundant) and any height below 2
            # have no bucket.
            continue
        bucket["tokens"].append(subtree.leaves())
        bucket["tags"].append(subtree.label())

    # TODO: each level might be missing some unary tokens; add them back to
    # 'tokens' for completeness (currently disabled):
    # parse_dict = add_missing_tokens(parse_dict)

    # TODO: fill the targets list of each level based on the tags of the next
    # level (currently disabled):
    # parse_dict = fill_targets(parse_dict, max_level=src_tree.height() - 1)

    return parse_dict

{2: {'tokens': [['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now'], [','], ['would'], ["n't"], ['that'], ['be'], ['a'], ['novelty'], ['.']], 'tags': ['TOP', 'TOP', 'TOP', 'TOP', 'TOP', 'TOP', 'TOP', 'TOP', 'TOP', 'RB', ',', 'MD', 'RB', 'VB', 'VB', 'DT', 'NN', '.'], 'targets': []}, 3: {'tokens': [['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 'that', 'be', 'a', 'novelty', '.'], ['Now', ',', 'would', "n't", 't