In [1]:
class Tree:
    """An inner node of a syntax tree: a tag plus one or more branches.

    tag      -- the label of this node (e.g. 'S', 'NP', 'VP')
    branches -- a non-empty sequence whose items are Tree or Leaf instances
    """

    def __init__(self, tag, branches):
        # A node without children is not a tree; use a Leaf for single words.
        assert len(branches) >= 1
        # Every child must itself be a node of the syntax tree.
        assert all(isinstance(child, (Tree, Leaf)) for child in branches)
        self.tag, self.branches = tag, branches
        
class Leaf:
    """A terminal node of a syntax tree: one word and its part-of-speech tag.

    tag  -- the part of speech (e.g. 'N' or 'V')
    word -- the word itself
    """

    def __init__(self, tag, word):
        self.tag, self.word = tag, word
        
# The single word of our vocabulary, in two of its parts of speech.
beasts = Leaf('N', 'buffalo')      # "buffalo" as a noun
intimidate = Leaf('V', 'buffalo')  # "buffalo" as a verb

# Tags for the sentence and phrase nodes.
S = "S"
NP = "NP"
VP = "VP"

# "buffalo buffalo buffalo": a noun phrase, then a verb phrase made of a
# verb and another noun phrase.
s = Tree(
    S,
    [
        Tree(NP, [beasts]),
        Tree(VP, [intimidate, Tree(NP, [beasts])]),
    ],
)

In [2]:
"""Pretty-print Trees as indented S-expressions."""

import heapq
import signal
# Restore the default SIGPIPE action so that piping the output into a program
# that closes early (e.g. `head`) ends this process quietly.
# SIGPIPE is only defined on Unix, so skip this where the name is missing.
if hasattr(signal, 'SIGPIPE'):
    signal.signal(signal.SIGPIPE, signal.SIG_DFL)

from io import StringIO

# Render a Leaf as "(tag word)", the innermost form of an S-expression.
Leaf.__str__ = lambda self: '({tag} {word})'.format(tag=self.tag, word=self.word)

def print_tree(t, indent=0, end='\n'):
    """Print Tree or Leaf t as an indented S-expression.

    t      -- the Tree or Leaf to print
    indent -- the column at which t starts; continuation lines of t's
              branches are indented relative to it
    end    -- the string printed after t, as for the built-in print

    >>> np = Tree('NP', [Leaf('N', 'buffalo')])
    >>> t = Tree('S', [np, Tree('VP', [Leaf('V', 'buffalo'), np])])
    >>> print_tree(t)
    (S (NP (N buffalo))
       (VP (V buffalo)
           (NP (N buffalo))))
    >>> print_tree(Leaf('N', 'buffalo'))
    (N buffalo)
    """
    if isinstance(t, Leaf):
        # Honor `end` for leaves as well, so that a Leaf printed at top level
        # finishes its line.  Recursive calls below pass end='' explicitly,
        # so the output for a Tree is unaffected.
        print(t, end=end)
        return
    opener = '(' + t.tag + ' '
    # Later branches line up under the first one, just past "(TAG ".
    indent += len(opener)
    print(opener, end='')
    print_tree(t.branches[0], indent, '')
    for b in t.branches[1:]:
        print('\n' + ' ' * indent, end='')
        print_tree(b, indent, '')
    print(')', end=end)



# Grammars

We don't know exactly how language works, but we can describe various possible ways of combining words into sentences. A formal description of all the different sentences in a language is called a `grammar`.

## Context-Free Grammar Rules

A grammar rule describes how a tag can be expanded as a sequence of tags or words. For example, we could have a context-free rule that looks like the following,

` S -> NP VP `

Which can be interpreted as:

"A `S`entence can be expanded as a `N`oun `P`hrase then a `V`erb `P`hrase."

<img src = 'context.png' width = 700/>

Let's go over the collection of rules! We have the one that we have above,

`S -> NP VP`

This allows us to construct a sentence out of a `NP` and a `VP`.

<img src = 'S.png' width = 500/>

From here, we can expand `NP`. 

`NP -> N`

A `NP` can consist of just a `N`oun. A `N`oun can be the word `buffalo`.

`N -> buffalo`

<img src = 'grammar2.png' width = 500/>

By recursively expanding symbols (e.g. `NP`, `VP`), we are starting to generate sentences. So far, we have the noun `buffalo` as the subject (or `N`). 

More grammar rules allow us to fill out the rest of the sentence. How should we expand the `VP`?

`VP -> V NP`

We can expand `VP` as a `V`erb and a `N`oun `P`hrase. The `V`erb is "buffalo".

What about the `NP`? Previously we expanded it to `N`. The only `N`oun we have is also "buffalo".

<img src = 'grammar3.png' width = 500/>

This recursive expansion process is the core of most natural language parsers. 

<img src = 'grammar4.png' width = 700/>

## Demo

We'll create the different pieces of the grammar. The `Leaf`s are in the set called `lexicon`, and the rules that expand the trees are in the dictionary called `grammar`.

In [8]:
# Every word we know, as a Leaf tagged with its part of speech.
lexicon = {
    Leaf('N', 'buffalo'),  # noun: beasts
    Leaf('V', 'buffalo'),  # verb: intimidate
}

# Maps each tag to the list of tag sequences it may expand to.
grammar = {
    'S': [['NP', 'VP']],   # sentence    -> noun phrase, verb phrase
    'NP': [['N']],         # noun phrase -> noun
    'VP': [['V', 'NP']],   # verb phrase -> verb, noun phrase
}

What does it mean to `expand` a `tag`? 

In [None]:
def expand(tag):
    """Yield all trees rooted by tag."""
    # The body is assembled piece by piece in the cells that follow.

One possibility is to expand it as a `Leaf`.

In [None]:
    for leaf in lexicon: # Go through all leaves in lexicon
        if tag == leaf.tag:
            yield leaf

Another possibility is that the `tag` appears in the `grammar` dictionary. 

In [None]:
    if tag in grammar:
        for tags in grammar[tag]: # Go through all the different sequences of tags that it can expand to 
            for branches in expand_all(tags):
                yield(Tree(tag, branches))

Thus, the `expand` function definition looks like the following,

In [3]:
def expand(tag):
    """Yield all trees rooted by tag.

    A tag is expanded in two ways: as every Leaf of the global `lexicon`
    that carries this tag, and as a Tree for every sequence of branches that
    `expand_all` produces for each tag sequence listed under this tag in the
    global `grammar`.  This is a generator, so results are produced one at a
    time as the caller asks for them.
    """
    # Case 1: the tag names a part of speech, so matching words are results.
    for leaf in lexicon:
        if tag == leaf.tag:
            yield leaf
    # Case 2: the tag has grammar rules, so build a Tree for each way of
    # filling in each of its tag sequences.
    if tag in grammar:
        for tags in grammar[tag]:
            for branches in expand_all(tags):
                yield(Tree(tag, branches))

Now what does it mean to expand all tags? 

In [4]:
def expand_all(tags):
    """Yield every list of branches that fills a sequence of tags.

    Each yielded list holds one branch per tag, in order: its i-th item is
    one of the expansions of tags[i].
    """
    head, tail = tags[0], tags[1:]
    if not tail:
        # A single tag: each of its expansions forms a one-branch sequence.
        for only_branch in expand(head):
            yield [only_branch]
        return
    # Several tags: combine each expansion of the first tag with each way of
    # expanding the remaining tags.
    for head_branch in expand(head):
        for tail_branches in expand_all(tail):
            yield [head_branch, *tail_branches]
  

What to do with these expansions? For starters, let's try printing them!

In [13]:
# Print every tree we get by expanding a whole 'S'entence.
for tree in expand('S'):
    print_tree(tree)

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))


If we add more alternatives to the grammar, then we'll obtain different sentences. 

For example, what if an `NP` could be an ad`J`ective followed by a `N`oun?

In [4]:
# Maps each tag to the list of tag sequences it may expand to.
grammar = {
    'S': [['NP', 'VP']],         # sentence    -> noun phrase, verb phrase
    'NP': [['N'], ['J', 'N']],   # noun phrase -> noun | adjective, noun
    'VP': [['V', 'NP']],         # verb phrase -> verb, noun phrase
}

An ad`J`ective is a word that modifies a noun. For example, `buffalo` can also mean a city in New York state. So the word buffalo can also mean 'from New York'. Buffalo lady can mean the lady from Buffalo, New York.

Let's add this definition to the `lexicon` set,

In [5]:
# Every word we know, as a Leaf tagged with its part of speech.
lexicon = {
    Leaf('N', 'buffalo'),  # noun: beasts
    Leaf('V', 'buffalo'),  # verb: intimidate
    Leaf('J', 'buffalo'),  # adjective: from New York
}

This time, if we try to expand `S` once again,

In [8]:
# Print every tree we get by expanding a whole 'S'entence.
for tree in expand('S'):
    print_tree(tree)

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (J buffalo)
           (N buffalo))))
(S (NP (J buffalo)
       (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))
(S (NP (J buffalo)
       (N buffalo))
   (VP (V buffalo)
       (NP (J buffalo)
           (N buffalo))))


...we obtain 4 different structures of sentences! 

We can expand the grammar further to find more interesting sentences. A `NP` can also have a `NP` followed by a `RP` (relative clause). A relative clause describes the noun. 

A relative clause `RP` starts with a relative pronoun `R`, which in this case is the word `'that'`. For example, "the buffalo **that** other buffalo intimidate is from New York". It also needs to have a subject (`NP`) and a `V`erb.

In [7]:
# Every word we know, as a Leaf tagged with its part of speech.
lexicon = {
    Leaf('N', 'buffalo'),  # noun: beasts
    Leaf('V', 'buffalo'),  # verb: intimidate
    Leaf('J', 'buffalo'),  # adjective: from New York
    Leaf('R', 'that'),     # relative pronoun
}

# Maps each tag to the list of tag sequences it may expand to.
grammar = {
    'S': [['NP', 'VP']],
    # A noun phrase may now contain another noun phrase plus a relative clause.
    'NP': [['N'], ['J', 'N'], ['NP', 'RP']],
    'VP': [['V', 'NP']],
    # Relative clause: relative pronoun, subject, verb.
    'RP': [['R', 'NP', 'V']],
}

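Beware that if we try to expand the sentence this time, it will take a very long processing time: because an `NP` can now contain another `NP`, the grammar is recursive, so the loop below keeps producing ever-deeper trees and has to be interrupted manually.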

In [None]:
# Print every tree we get by expanding a whole 'S'entence.
# NOTE: with the ['NP', 'RP'] rule an 'NP' can contain another 'NP', so this
# keeps yielding ever-deeper trees; interrupt the loop manually to stop it.
for tree in expand('S'):
    print_tree(tree)

(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (N buffalo))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (J buffalo)
           (N buffalo))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (R that)
               (NP (N buffalo))
               (V buffalo)))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (R that)
               (NP (J buffalo)
                   (N buffalo))
               (V buffalo)))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (R that)
               (NP (NP (N buffalo))
                   (RP (R that)
                       (NP (N buffalo))
                       (V buffalo)))
               (V buffalo)))))
(S (NP (N buffalo))
   (VP (V buffalo)
       (NP (NP (N buffalo))
           (RP (R that)
               (NP (NP (N buffalo))
                   (RP (R that)
                       (NP (J buffalo)
                           (N buffalo))
   