In [1]:
import nltk
from collections import Counter, defaultdict
from nltk.tree import *

In [3]:
# optional
nltk.download('large_grammars')


[nltk_data] Downloading package large_grammars to
[nltk_data]     /home/rrichajalota/nltk_data...
[nltk_data]   Unzipping grammars/large_grammars.zip.


True

In [7]:
grammar = nltk.data.load("grammars/large_grammars/atis.cfg") # load the grammar
s = nltk.data.load("grammars/large_grammars/atis_sentences.txt") # load raw sentences
t = nltk.parse.util.extract_test_sentences(s)

In [36]:
mixed = set()
for prod in grammar.productions():
    # replace such rules so that they point directly to terminals
    if prod.is_nonlexical() and len(prod.rhs()) == 1:
        pp = grammar.productions(lhs=prod.rhs()[0])
        print(pp)
        mixed.add(prod)
        break

[only -> 'only']


In [33]:
mixed

{ADJ_ABL -> only}

In [5]:
# initialize the parser
parser = nltk.parse.BottomUpChartParser(grammar)
# parse all test sentences
for sentence in t:
    print(sentence)
    parser.chart_parse(sentence[0])

(['i', 'need', 'a', 'flight', 'from', 'charlotte', 'to', 'las', 'vegas', 'that', 'makes', 'a', 'stop', 'in', 'saint', 'louis', '.'], 2085)
(['what', 'is', 'the', 'cheapest', 'one', 'way', 'flight', 'from', 'phoenix', 'to', 'san', 'diego', 'that', 'arrives', 'in', 'the', 'morning', 'on', 'thursday', 'june', 'second', '.'], 1380)
(['what', 'is', 'the', 'cheapest', 'one', 'way', 'flight', 'from', 'columbus', 'to', 'indianapolis', '.'], 50)
(['is', 'there', 'a', 'flight', 'from', 'memphis', 'to', 'los', 'angeles', '.'], 18)
(['what', 'aircraft', 'is', 'this', '.'], 0)
(['please', 'show', 'me', 'the', 'flights', 'from', 'chicago', 'to', 'detroit', 'that', 'arrive', 'at', 'six', 'p.m.', 'next', 'tuesday', '.'], 20)
(['what', 'flights', 'are', 'available', 'between', 'chicago', 'and', 'indianapolis', 'next', 'wednesday', 'between', 'eleven', 'a.m.', 'and', 'one', 'p.m', '.'], 0)
(['please', 'book', 'a', 'one', 'way', 'coach', 'fare', 'from', 'chicago', 'to', 'indianapolis', 'on', 'united', 'f

KeyboardInterrupt: 

https://www.nltk.org/api/nltk.html#nltk.grammar.CFG
https://www.nltk.org/api/nltk.html#nltk.tree.Tree.pformat_latex_qtree

In [6]:
grammar.is_chomsky_normal_form()

True

In [81]:
for idx, sent in enumerate(test_sents[15:31]):
 print(sent)
 parser.chart_parse(sent[0])


(['what', 'is', 'e', 'w', 'r', '.'], None)
(['oakland', 'to', 'salt', 'lake', 'city', '.'], None)
(['show', 'me', 'northwest', 'flights', 'to', 'detroit', '.'], None)
(['i', 'want', 'to', 'leave', 'before', 'noon', '.'], None)
(['what', 'flights', 'leave', 'boston', 'to', 'pittsburgh', '.'], None)
(['i', "'d", 'like', 'an', 'afternoon', 'flight', '.'], None)
(['are', 'any', 'fares', 'cheaper', 'than', 'these', '.'], None)
(['what', 'is', 'the', 'flying', 'time', 'from', '.'], None)
(['which', 'flights', 'use', 'a', 'large', 'plane', '.'], None)
(['what', 'area', 'does', 'canadian', 'airlines', 'international', 'service', '.'], None)
(['what', 'flights', 'leave', 'las', 'vegas', 'to', 'oakland', '.'], None)


KeyboardInterrupt: 

In [116]:
grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP | P N
NP -> Det N | Det NPP | 'I' | 'John'
NPP -> N PP | N P 
VP -> V NP | VP PP
Det -> 'an' | 'my' | 'a'
N -> 'elephant' | 'pajamas' | 'shot' | 'cake'
V -> 'shot' | 'ate'
P -> 'in'
""")

In [4]:
type(grammar)

nltk.grammar.CFG

In [5]:
grammar.start()

S

In [117]:
grammar.productions()

[S -> NP VP,
 PP -> P NP,
 PP -> P N,
 NP -> Det N,
 NP -> Det NPP,
 NP -> 'I',
 NP -> 'John',
 NPP -> N PP,
 NPP -> N P,
 VP -> V NP,
 VP -> VP PP,
 Det -> 'an',
 Det -> 'my',
 Det -> 'a',
 N -> 'elephant',
 N -> 'pajamas',
 N -> 'shot',
 N -> 'cake',
 V -> 'shot',
 V -> 'ate',
 P -> 'in']

In [6]:
from nltk.grammar import *
grammar.productions(lhs=Nonterminal("NP"))

[NP -> Det N, NP -> Det NPP, NP -> 'I', NP -> 'John']

In [7]:
grammar.productions(rhs=Nonterminal("Det"))

[NP -> Det N, NP -> Det NPP]

In [8]:
pp = grammar.productions(rhs="I")
pp

[NP -> 'I']

In [9]:
def fetch_lhs(grammar, rhs_tuple):
    all_prods = []
    
    if len(rhs_tuple) == 1: # it's a terminal
        pp = grammar.productions(rhs=rhs_tuple[0])
        for prod in pp:
            all_prods.append(prod.lhs()) 
        return all_prods
    
    filtered_productions = grammar.productions(rhs=rhs_tuple[0])
    #print(filtered_productions)
    
    for a_production in filtered_productions:  
        if rhs_tuple[1] == a_production.rhs()[1]:  
              all_prods.append(a_production.lhs())
    return all_prods

In [10]:
print(fetch_lhs(('Det','N')))

TypeError: fetch_lhs() missing 1 required positional argument: 'rhs_tuple'

In [48]:
grammar.productions(rhs='shot')[0].lhs()

N

In [129]:
def parse_tree(chart, row, col, sym):
    cell = chart[row][col][sym]
    
    #base condition - if string, we've reached a terminal. returns [terminal]
    if type(cell[0]) == type('str'):
        return [ImmutableTree(sym, cell)]
        #return cell
    
    #print(f'cell: {cell}')
    trees = []
    for ind, tup in enumerate(cell):
        #print(f'choice of tuple: {ind}')
        left, right = tup[0], tup[1]
        lhs_trees = parse_tree(chart, left[0], left[1], left[2])
        rhs_trees = parse_tree(chart, right[0], right[1], right[2])
        #print(f'lhs: {lhs_trees}')
        #print(f'rhs: {rhs_trees}')
        for t1 in lhs_trees:
            for t2 in rhs_trees:
                trees.append(ImmutableTree(sym, [t1]+[t2]))
        #for t in trees:
        #    print(f' after append: {Tree.fromstring(str(t)).pretty_print()}')
    return trees
    

def cky_recognizer(words, grammar, parser=False, draw_tree=False):
    #words = words.split() # for toy eg. 
    n = len(words)
    print(n)
    chart = [[defaultdict(list) for j in range(n+1)]  for i in range(n)]
    
    for col in range(1, n+1): # left to right
        NT_list = fetch_lhs(grammar, (words[col-1],)) 
        for N in NT_list:
            chart[col-1][col][N].append(words[col-1])
            #print(N, chart[col-1][col][N][0])
        
        for row in range(col-2, -1, -1): # bottom-up
            for k in range(row+1, col):
                for first_N in list(chart[row][k].keys()):
                    for second_N in list(chart[k][col].keys()):
                        #print(f'chart[{row}][{k}] : {chart[row][k]}')
                        #print(f'chart[{k}][{col}] : {chart[k][col]}')
                        NT_list = fetch_lhs(grammar, (first_N, second_N))
                        #print(NT_list)
                        for N in NT_list:
                            chart[row][col][N].append(((row, k, first_N), (k, col, second_N)))
                            #print(N, chart[row][col][N])
                            
          
    if parser: 
        if grammar.start() in chart[0][n].keys():
            all_trees = []
            tree = parse_tree(chart, 0, n, grammar.start())
            if draw_tree:
                for t in tree:
                    Tree.fromstring(str(t)).pretty_print()
            return len(tree)
    
    #for k, v in chart[0][n].items():
    #    print(f'{k} -> {len(v)} {v}')
    #    print()
    return 0 #grammar.start() in chart[0][n].keys()

In [120]:
cky_recognizer('b a a b a', toy_gram, parser=True)

5
chart[0][1] : defaultdict(<class 'list'>, {B: ['b']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[0][1] : defaultdict(<class 'list'>, {B: ['b']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[2][3] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[2][3] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[2][3] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[1][2] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[2][3] : defaultdict(<class 'list'>, {A: ['a'], C: ['a']})
chart[0][1] : defaultdict(<class 'list'>, {B: ['b']})
chart[1][3] : defaultdict(<class 'list'>, {B: [((1, 2, C), (2, 3, C))]})
chart[0][2] : defaultdict(<class 'list'>, {A: [((0, 1, B), (1, 2, A))], S: [((0, 1, B), (1, 2, C))]})
chart[2][3] : default

True

In [100]:
toy_gram = nltk.CFG.fromstring("""
S -> A B
S -> B C
A -> B A
B -> C C
C -> A B
A -> 'a'
B -> 'b'
C -> 'a'
""")

In [104]:
toy_gram.productions()

[S -> A B,
 S -> B C,
 A -> B A,
 B -> C C,
 C -> A B,
 A -> 'a',
 B -> 'b',
 C -> 'a']

In [118]:
cky_recognizer('I shot an elephant in my pajamas', grammar, True)

7
chart[0][1] : defaultdict(<class 'list'>, {NP: ['I']})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[0][1] : defaultdict(<class 'list'>, {NP: ['I']})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[2][3] : defaultdict(<class 'list'>, {Det: ['an']})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[2][3] : defaultdict(<class 'list'>, {Det: ['an']})
chart[2][3] : defaultdict(<class 'list'>, {Det: ['an']})
chart[3][4] : defaultdict(<class 'list'>, {N: ['elephant']})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[2][4] : defaultdict(<class 'list'>, {NP: [((2, 3, Det), (3, 4, N))]})
chart[1][2] : defaultdict(<class 'list'>, {N: ['shot'], V: ['shot']})
chart[2][4] : defaultdict(<class 'list'>, {NP: [((2, 3, Det), (3, 4, N))]})
chart[0][1] : defaultdict(<class 'list'>, {NP: ['I']})
chart[1][4] : defaultdict(<cl

True

In [151]:
fetch_lhs((Nonterminal("NP"), Nonterminal("NP")))

[S -> NP VP]


[]

In [119]:
t = Tree.fromstring('(S (NP I) (VP (V enjoyed) (NP my cookie)))')
print(t)

(S (NP I) (VP (V enjoyed) (NP my cookie)))


In [120]:
t.pretty_print()

             S                
  ___________|___              
 |               VP           
 |      _________|___          
 NP    V             NP       
 |     |          ___|____     
 I  enjoyed      my     cookie



In [37]:
t = ImmutableTree('S', [ImmutableTree('NP', ['I']), ImmutableTree('VP', [ImmutableTree('V', ['ate']), ImmutableTree('NP', ['a cookie'])])])
from nltk.draw.tree import draw_trees
draw_trees(t)                    


In [4]:
## test the parser with atis test sentences 
grammar = nltk.data.load("./atis/atis-grammar-cnf.cfg")
sents = nltk.data.load('./atis/atis-test-sentences.txt')
test_sents = nltk.parse.util.extract_test_sentences(sents)

with open('result.txt', 'w') as f:
    for idx, sent in enumerate(test_sents):
        f.write(f" {' '.join(sent[0])}\t{cky_recognizer(sent[0], grammar, True)}\n")
    #print(parser.chart_parse(sent[0]))


NameError: name 'cky_recognizer' is not defined