In [2]:
class Tree:
    """An internal syntax-tree node: a tag plus a non-empty list of branches."""

    def __init__(self, tag, branches):
        # A tree needs at least one child, and every child must itself be
        # either a Tree or a Leaf.
        assert len(branches) >= 1
        assert all(isinstance(branch, (Tree, Leaf)) for branch in branches)
        self.tag = tag
        self.branches = branches
        
class Leaf:
    """A terminal syntax-tree node: one word together with its tag."""

    def __init__(self, tag, word):
        self.tag, self.word = tag, word
        
# Tag names used for the internal nodes of the example tree.
S = "S"
NP = "NP"
VP = "VP"

# The same word tagged two ways: once as a noun, once as a verb.
beasts = Leaf('N', 'buffalo')
intimidate = Leaf('V', 'buffalo')

# "buffalo buffalo buffalo": a noun-phrase subject followed by a verb phrase
# that holds the verb and a noun-phrase object.
s = Tree(S, [
    Tree(NP, [beasts]),
    Tree(VP, [intimidate, Tree(NP, [beasts])]),
])

In [3]:
"""Pretty-print Trees as indented S-expressions."""

import heapq
import signal
# SIGPIPE is only defined on Unix; guard the lookup so this cell does not raise
# AttributeError on platforms where the signal does not exist.
if hasattr(signal, 'SIGPIPE'):
    # Install the default action for SIGPIPE.
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

from io import StringIO

# Render a Leaf as "(tag word)"; the leaf's instance attributes fill the fields.
Leaf.__str__ = lambda leaf: '({tag} {word})'.format(**vars(leaf))

def print_tree(t, indent=0, end='\n'):
    """Print Tree or Leaf t as an indented S-expression.

    indent -- number of columns already occupied to the left of t; the second
              and later branches of a Tree are printed on their own lines,
              aligned under its first branch.
    end    -- string printed after t (a newline by default).

    >>> np = Tree('NP', [Leaf('N', 'buffalo')])
    >>> t = Tree('S', [np, Tree('VP', [Leaf('V', 'buffalo'), np])])
    >>> print_tree(t)
    (S (NP (N buffalo))
       (VP (V buffalo)
           (NP (N buffalo))))
    >>> print_tree(Leaf('N', 'buffalo'))
    (N buffalo)
    """
    if isinstance(t, Leaf):
        # Honor `end` for a leaf too, so a top-level call on a Leaf finishes
        # its line just as a top-level call on a Tree does. Nested calls
        # always pass end='', so output for trees is unaffected.
        print(t, end=end)
        return

    opening = '(' + t.tag + ' '
    # Everything inside this tree starts after the opening "(TAG ".
    indent += len(opening)
    print(opening, end='')

    first, rest = t.branches[0], t.branches[1:]
    # The first branch continues on the current line.
    print_tree(first, indent, '')
    for b in rest:
        # Each later branch starts a new line at the same column as the first.
        print('\n' + ' ' * indent, end='')
        print_tree(b, indent, '')
    print(')', end=end)



In [7]:
# The known words, each wrapped in a Leaf carrying the tag it can appear under.
lexicon = {
    Leaf(tag, word)
    for tag, word in [
        ('N', 'buffalo'),  # bison
        ('V', 'buffalo'),  # intimidate
        ('J', 'buffalo'),  # New York
        ('R', 'that'),
    ]
}

# Each tag maps to the list of alternative tag sequences it can expand into.
grammar = dict(
    S=[['NP', 'VP']],
    NP=[['N'], ['J', 'N'], ['NP', 'RP']],
    VP=[['V', 'NP']],
    RP=[['R', 'NP', 'V'], ['NP', 'V']],
)

# Parsing

What we have so far is a sentence generator. It generates syntax trees from root symbols. Each tree corresponds to a sentence.

However, we want a parser, which takes a particular sentence and generates its syntactic structure. It turns out parsers and sentence generators are almost the same.

## Exhaustive Parsing

We would like to enumerate all the possible syntax trees corresponding to a sentence. We can do this by expanding all tags recursively, but we'll force the words at the leaves to match the input.

For example, we want to parse the sentence,

`buffalo buffalo buffalo buffalo`

We'll call `expand` on `S`, but on top of that we'll keep track of 2 additional pieces of info: 
1. The beginning of the sentence that we're expanding
2. The end of the sentence that we're expanding

In the code, we keep track of these positions using index numbers.

<img src = 'numbers.png' width = 500/>

* A span from `0` to `4` includes all 4 of these words
* A span from `0` to `2` encloses only the first 2 `buffalo`s

When we expand `S`, we'll obtain `NP` and `VP`. In addition, we'll decide which part of the sentence constitutes the `NP` and `VP`. We'll have to determine a split point where the `NP` ends and `VP` begins. 

Let's analyze the case where the `NP` ends and the `VP` begins at index 1. Here, we obtain a structure where the `NP` subject is just the first word, while the rest of the sentence is the `VP`.

<img src = 'index1.png' width = 500/>

The next step is to recursively expand these tags. The `NP` becomes a `N`oun, which is `buffalo`. 

Notice that there's a constraint: Any time we generate a `Leaf`, we need to ensure that the `Leaf` has the right tag and word. This way, we force the program to only build syntax trees that have the right words at the leaves.

<img src = 'leaf.png' width = 500/>

Now we also need to recursively expand `VP`. The output of the process would be the following subtree,

<img src = 'subtree.png' width = 500/>

Above, we obtain a valid syntactic structure. However, this is not the only valid syntactic structure for the sentence; we could have picked a different split point such as the following, 

<img src = 'alternative.png' width = 500/>

## Demo

The process of parsing is simpler than sentence expansion. We'll still need to expand tags, but we'll do it in the process of parsing an entire line of text. 

In [None]:
def parse(line):
    """Sketch of the parser: split line into words and nest the generator helpers.

    In this version the helpers are only defined, never called, so parse
    returns None.
    """
    words = line.split()  # A line consists of words, which can be obtained by splitting on whitespace

    # expand and expand_all become part of the parse function
    def expand(tag):
        """Yield all trees rooted by tag."""
        for leaf in lexicon:  # Go through all leaves in lexicon
            if tag == leaf.tag:
                yield leaf
        if tag in grammar:
            for tags in grammar[tag]:  # Go through all the different sequences of tags that it can expand to
                for branches in expand_all(tags):
                    yield Tree(tag, branches)

    def expand_all(tags):
        """Yield all sequences of branches for a sequence of tags."""
        if len(tags) == 1:  # With a single tag there is only one branch
            for branch in expand(tags[0]):
                yield [branch]  # We yield a sequence of branches, so the value must be a list
        else:  # Otherwise, turn each of the tags into a branch
            first, rest = tags[0], tags[1:]  # Recursively split into the first tag and the rest
            for first_branch in expand(first):  # Every way to fill the first branch from the first tag
                for rest_branches in expand_all(rest):  # Every way to fill the remaining branches
                    yield [first_branch] + rest_branches

However, there are changes within the `expand` and `expand_all` functions.

For `expand`, instead of just expanding the `tag`, we expand the `tag` over a `start` and `end` point. 

In [None]:
def expand(start, end, tag):
    """Yield every tree rooted by tag that covers the words from start to end."""
    # A span of exactly one word may be a leaf: the word at index `start`
    # must match both the requested tag and the leaf's own word.
    if end - start == 1:
        word = words[start]
        yield from (
            leaf for leaf in lexicon
            if tag == leaf.tag and word == leaf.word
        )
    # A tag that appears in the grammar may also expand into sub-trees; every
    # alternative tag sequence has to fill exactly the same span.
    for tags in grammar.get(tag, ()):
        for branches in expand_all(start, end, tags):
            yield Tree(tag, branches)

In [1]:
# Update expand_all, this time it also takes 'start' and 'end'

def expand_all(start, end, tags):
    """Yield every list of branches that fills the span start..end with tags."""
    if len(tags) != 1:
        # Several tags: the first tag takes start..split and the remaining
        # tags share split..end. Each remaining tag needs at least one word,
        # which bounds how far right the split point may go.
        head, tail = tags[0], tags[1:]
        last_split = end - len(tail)
        for split in range(start + 1, last_split + 1):
            for head_branch in expand(start, split, head):
                for tail_branches in expand_all(split, end, tail):
                    yield [head_branch, *tail_branches]
        return
    # A single tag must cover the whole span by itself.
    for branch in expand(start, end, tags[0]):
        yield [branch]

One last thing we need to handle is the top-level call. We want to expand `S`, starting at the beginning of the sentence and ending after the last word.

In [None]:
# Expand 'S' over the whole sentence: from index 0 to just past the last word.
for tree in expand(start=0, end=len(words), tag='S'):
    print_tree(tree)

Putting it all together, we have the following:

In [10]:
def parse(line):
    """Print every syntax tree rooted at 'S' that spans all the words of line."""
    words = line.split()  # whitespace-separated words of the sentence

    def expand(start, end, tag):
        """Yield every tree rooted by tag that covers words[start:end]."""
        # A one-word span may be a leaf, provided the lexicon has an entry
        # with both the requested tag and the word found at index `start`.
        if end - start == 1:
            word = words[start]
            yield from (
                leaf for leaf in lexicon
                if tag == leaf.tag and word == leaf.word
            )
        # A grammar tag may also expand into sub-trees filling the same span.
        for tags in grammar.get(tag, ()):
            for branches in expand_all(start, end, tags):
                yield Tree(tag, branches)

    def expand_all(start, end, tags):
        """Yield every list of branches that fills words[start:end] with tags."""
        if len(tags) != 1:
            # Several tags: the first takes start..split, the others share
            # split..end, each of them needing at least one word.
            head, tail = tags[0], tags[1:]
            last_split = end - len(tail)
            for split in range(start + 1, last_split + 1):
                for head_branch in expand(start, split, head):
                    for tail_branches in expand_all(split, end, tail):
                        yield [head_branch, *tail_branches]
            return
        # A single tag must cover the whole span by itself.
        for branch in expand(start, end, tags[0]):
            yield [branch]

    # Start at the beginning of the sentence and end after the last word.
    for syntax_tree in expand(0, len(words), 'S'):
        print_tree(syntax_tree)

In [11]:
# Three-word sentence; parse prints each tree it finds rather than returning it.
parse('buffalo buffalo buffalo')

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))


Above is the result of parsing `"buffalo"` repeated 3 times. What about 4 times?

In [12]:
# Four-word sentence; each tree found is printed in turn.
parse('buffalo buffalo buffalo buffalo')

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (J buffalo)
           (N buffalo))))
(S (NP (J buffalo)
       (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))


We obtain 2 trees:

1. Beast intimidate New York beast
2. New York beast intimidate beast

We can parse as many `buffalo`s as we want.

In [13]:
# 'buffalo ' * 10 repeats the word ten times; the trailing space is dropped by
# line.split() inside parse.
parse('buffalo ' * 10)

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (NP (NP (N buffalo))
                   (RP (NP (NP (N buffalo))
                           (RP (NP (J buffalo)
                                   (N buffalo))
                               (V buffalo)))
                       (V buffalo)))
               (V buffalo)))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (NP (NP (N buffalo))
                   (RP (NP (NP (J buffalo)
                               (N buffalo))
                           (RP (NP (N buffalo))
                               (V buffalo)))
                       (V buffalo)))
               (V buffalo)))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (NP (NP (J buffalo)
                       (N buffalo))
                   (RP (NP (NP (N buffalo))
                           (RP (NP (N buffalo))
                               (V buffalo)))
                   