# Assignment 6: PCFG

## Part 1: Loading Grammar

In [1]:
# Grammar supplied with the assignment. Each key is one production written as
# "LHS -> RHS" and each value is the probability attached to that production.
# NOTE(review): S has a single expansion and it carries probability 0.5, so the
# S productions do not sum to 1 -- confirm this is what the assignment intends.
predefined_grammar = {
    # Sentence
    "S -> NP VP": 0.5,
    # Verb phrases
    "VP -> V NP": 0.5,
    "VP -> VP PP": 0.5,
    # Prepositional phrases
    "PP -> P NP": 1,
    # Noun phrases
    "NP -> NP PP": 0.25,
    "NP -> John": 0.25,
    "NP -> soccer": 0.25,
    "NP -> school": 0.25,
    # Pre-terminals with a single lexical entry
    "V -> plays": 1,
    "P -> at": 1,
}


In [2]:
from collections import defaultdict

def format_rules(grammar_rules):
    """
    Convert a grammar written as ``{"LHS -> RHS": probability}`` into the
    representation used by the parser.

    :param grammar_rules: mapping from a rule string such as "S -> NP VP" to
        its probability.
    :return: a pair ``(rules, probabilities)``. ``rules`` is a list of tuples
        of symbols, e.g. ``("S", "NP", "VP")``, in the mapping's iteration
        order. ``probabilities`` is a defaultdict keyed by those tuples that
        yields None for a rule it has never seen.
    """
    # Drop the arrow and split on whitespace so each rule becomes a tuple of symbols
    parsed_pairs = [
        (tuple(rule_text.replace("->", "").split()), weight)
        for rule_text, weight in grammar_rules.items()
    ]

    rule_list = [symbols for symbols, _ in parsed_pairs]

    # Unknown rules resolve to None rather than raising KeyError
    rule_probs = defaultdict(lambda: None)
    rule_probs.update(parsed_pairs)

    return rule_list, rule_probs


## Part 2: CKY Parser

In [3]:
class Node:
    """
    Barebone data structure used for storing information about a non-terminal symbol.

    ``symbol`` is the non-terminal label and ``prob`` the probability of the
    subtree rooted at this node. For a lexical node ``child1`` is the token
    string and ``child2`` is None; for a binary node both children are Nodes.
    """

    def __init__(self, symbol, prob, child1, child2=None):
        self.symbol = symbol
        self.prob = prob
        self.child1 = child1
        self.child2 = child2

    def __repr__(self):
        return self.symbol



def cky_parse(text, rules, probabilities):
    """
    Performs probabilistic Constituency Parsing using the CKY algorithm.

    :param text: the sentence to parse; it is tokenized on whitespace.
    :param rules: iterable of rule tuples, ``(LHS, terminal)`` for lexical
        rules and ``(LHS, left, right)`` for binary rules. Rules of any other
        length are ignored.
    :param probabilities: mapping from a rule tuple to its probability.
    :return: the parse triangle. ``parse_triangle[row][col]`` is the list of
        Nodes covering the ``row + 1`` tokens that start at token ``col``.
        Every cell above the first row holds at most one Node per symbol: the
        most probable derivation of that symbol over the span. Empty text
        yields an empty list.
    """
    tokens = text.split()
    length = len(tokens)

    # Separate the two rule shapes once. Only 2-element rules may match a
    # token; otherwise a word that happens to equal a non-terminal name
    # (e.g. the token "NP") would match the binary rule "S -> NP VP".
    lexical_rules, binary_rules = [], []
    for rule in rules:
        if len(rule) == 2:
            lexical_rules.append(rule)
        elif len(rule) == 3:
            binary_rules.append(rule)

    # Data structure for storing the subtrees
    parse_triangle = [[[] for _ in range(length - i)] for i in range(length)]

    # Find out which non terminals can generate the terminals in the input string
    # and put them into the parse table. One terminal could be generated by multiple
    # non terminals, therefore the parse table will contain a list of non terminals.
    for i, tok in enumerate(tokens):
        for rule in lexical_rules:
            if tok == rule[1]:
                parse_triangle[0][i].append(Node(rule[0], probabilities[rule], tok))

    # Starting from the second row (spans of two tokens and longer)
    for row_idx in range(1, length):

        # Number of cells at each row is length - row_idx
        for cell_idx in range(length - row_idx):

            # Best derivation found so far for each symbol over this span.
            # Keeping one entry per symbol (rather than one winner per split
            # point) ensures a less probable constituent is still available
            # to the rules that need it higher up in the tree.
            best_by_symbol = {}

            # Try every way of splitting the span into a left and a right part
            for span_idx in range(row_idx):
                left_cell = parse_triangle[span_idx][cell_idx]
                right_cell = parse_triangle[row_idx - span_idx - 1][cell_idx + span_idx + 1]
                if not left_cell or not right_cell:
                    continue

                for rule in binary_rules:
                    parent, left_symbol, right_symbol = rule
                    rule_prob = probabilities[rule]
                    for left in left_cell:
                        if left.symbol != left_symbol:
                            continue
                        for right in right_cell:
                            if right.symbol != right_symbol:
                                continue
                            prob = rule_prob * left.prob * right.prob
                            incumbent = best_by_symbol.get(parent)
                            best_prob = incumbent.prob if incumbent is not None else 0
                            # Strict comparison: zero-probability derivations are
                            # never stored and ties keep the first one found
                            if prob > best_prob:
                                best_by_symbol[parent] = Node(parent, prob, left, right)

            parse_triangle[row_idx][cell_idx].extend(best_by_symbol.values())

    return parse_triangle
    

## Part 3: Constructing (and Visualizing) Parsed Trees

In [4]:
def generate_tree(node):
    """
    Render the subtree rooted at ``node`` as a bracketed string.

    :param node: the root node of the (sub)tree.
    :return: ``[SYMBOL 'token']`` for a node without a second child, otherwise
        ``[SYMBOL <left subtree> <right subtree>]``.
    """
    label = node.symbol
    if node.child2 is not None:
        # Binary node: both children are rendered recursively
        left_text = generate_tree(node.child1)
        right_text = generate_tree(node.child2)
        return f"[{label} {left_text} {right_text}]"
    # Node with a single child: child1 is printed as the quoted token
    return f"[{label} '{node.child1}']"

def construct_and_print_tree(parse_triangle, start_symbol="S", round_digits=6):
    """
    Print the parse tree starting with the start symbol, using the Node pointers to backtrack.

    :param parse_triangle: the triangle produced by the CKY parser; its last
        row holds the single cell that spans the whole sentence.
    :param start_symbol: label a node in that cell must carry to count as a
        complete parse.
    :param round_digits: number of decimals the reported probability is rounded to.
    :return: None. The report is written to standard output: the probability
        of the most probable parse followed by every complete parse, most
        probable first, or a message saying the sentence is not in the language.
    """
    # An empty triangle (empty input text) has no top cell to look at
    top_cell = parse_triangle[-1][0] if parse_triangle and parse_triangle[-1] else []
    final_nodes = [n for n in top_cell if n.symbol == start_symbol]

    if not final_nodes:
        print("The given sentence is not contained in the language produced by the given grammar!")
        return

    # Report the best parse rather than whichever one happens to come first;
    # sorted() is stable, so equally probable parses keep their original order
    final_nodes = sorted(final_nodes, key=lambda n: n.prob, reverse=True)
    prob = round(final_nodes[0].prob, round_digits)
    print(f"The given sentence is contained in the language produced by the given grammar with probability: {prob}")
    for node in final_nodes:
        print(generate_tree(node))

## Part 4: Your Solution

In [5]:
def parse_text(text, rules):
    """
    Convenience wrapper: parses the given `text` with CKY using the grammar in
    `rules` and prints the outcome.

    You can use this for your solution.

    :param text: the sentence to parse.
    :param rules: grammar as a mapping from "LHS -> RHS" strings to probabilities.
    """
    # Convert the string-keyed grammar into rule tuples plus a probability lookup
    formatted_rules, rule_probs = format_rules(rules)
    # Fill the CKY triangle and hand it straight to the printer
    construct_and_print_tree(cky_parse(text, formatted_rules, rule_probs))

In [6]:
# Sentences used for the solution
text = "John plays soccer at school"
text2 = "John plays"

# Probabilistic grammar computed from the silly corpus
# (this is from my 503 assignment)
computed_grammar = {
    "S -> NP VP":   1.0,
    "VP -> V NP":   10/13,
    "VP -> VP PP":  3/13,
    "PP -> P NP":   1.0,
    "NP -> NP PP":  0.04,
    "NP -> John":   0.4,
    "NP -> soccer": 0.4,
    "NP -> school": 0.16,
    "V -> plays":   1.0,
    "P -> at":      1.0,
}

# Parse each sentence with both grammars, printing a blank line between reports
cases = [
    (text, predefined_grammar),
    (text, computed_grammar),
    (text2, predefined_grammar),
    (text2, computed_grammar),
]
for case_idx, (sentence, grammar) in enumerate(cases):
    if case_idx:
        print("")
    parse_text(sentence, grammar)

The given sentence is contained in the language produced by the given grammar with probability: 0.001953
[S [NP 'John'] [VP [VP [V 'plays'] [NP 'soccer']] [PP [P 'at'] [NP 'school']]]]

The given sentence is contained in the language produced by the given grammar with probability: 0.004544
[S [NP 'John'] [VP [VP [V 'plays'] [NP 'soccer']] [PP [P 'at'] [NP 'school']]]]

The given sentence is not contained in the language produced by the given grammar!

The given sentence is not contained in the language produced by the given grammar!
