### todo
* make search exhaustive inside one sentence

In [74]:
import pandas as pd
import logging
import spacy 
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
from kanren import run, eq, membero, var, conde, Relation, facts

In [19]:
nlp = en_core_web_lg.load()

In [52]:
snlp = StanfordCoreNLP('http://localhost', port=9000)

In [281]:
dataset = [
    #"For the next four years Orwell mixed journalistic work – mainly for Tribune, The Observer and the Manchester Evening News, though he also contributed to many small-circulation political and literary magazines – with writing his best-known work, Nineteen Eighty-Four, which was published in 1949.",
    "Published July 11, 1960, To Kill a Mockingbird was an immediate bestseller and won great critical acclaim, including the Pulitzer Prize for Fiction in 1961.",
    "His sixth book, The Fault in Our Stars, was released in January 2012",
    "In the 1940s, Salinger confided to several people that he was working on a novel featuring Holden Caulfield, the teenage protagonist of his short story \"Slight Rebellion off Madison\", and The Catcher in the Rye was published on July 16, 1951, by Little, Brown and Company.",
    "In 2003, Hosseini released his first novel, The Kite Runner, the story of a young boy, Amir, struggling to establish a closer rapport with his father and coping with memories of a haunting childhood event. ",
    "Animal Farm: A Fairy Story was published in Britain on 17 August 1945, and a year later in the US, on 26 August 1946.",
    "The trilogy's second book, Catching Fire, was released in September 2009, and its third book, Mockingjay, was released on August 24, 2010.",
    "In September 2008, Scholastic Press released The Hunger Games, the first book of a trilogy by Collins.",
    "In 1973, King's first novel Carrie was accepted by publishing house Doubleday.",
    "In 2000, King published online a serialized horror novel, The Plant.",
    "Also in 2000, he wrote a digital novella, Riding the Bullet, and has said he sees e-books becoming 50% of the market \"probably by 2013 and maybe by 2012\".",
    "In 2006, King published an apocalyptic novel, Cell.",
    "In 2009, King published Ur, a novella written exclusively for the launch of the second-generation Amazon Kindle and available only on Amazon.com, and Throttle, a novella co-written with his son Joe Hill and released later as an audiobook titled Road Rage, which included Richard Matheson's short story \"Duel\".",
]

In [256]:
pd.read_csv('books.csv', index_col=0)[['authors', 'original_title', 'original_publication_year']]

Unnamed: 0_level_0,authors,original_title,original_publication_year
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Suzanne Collins,The Hunger Games,2008.0
2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,1997.0
3,Stephenie Meyer,Twilight,2005.0
4,Harper Lee,To Kill a Mockingbird,1960.0
5,F. Scott Fitzgerald,The Great Gatsby,1925.0
6,John Green,The Fault in Our Stars,2012.0
7,J.R.R. Tolkien,The Hobbit or There and Back Again,1937.0
8,J.D. Salinger,The Catcher in the Rye,1951.0
9,Dan Brown,Angels & Demons,2000.0
10,Jane Austen,Pride and Prejudice,1813.0


In [282]:
test = dataset[-1]

In [283]:
displacy.render(nlp(test), jupyter=True)

In [276]:
BOOK_NAME = '__BOOK_NAME__'
REL_DATE = '__REL_DATE__'
variables = set([BOOK_NAME, REL_DATE])
REL_SYN = set(['release', 'publish', 'accept'])
BOOK_SYN = set(['book', 'novel'])
RULES = [
    [((REL_SYN, None, None), (ss.nsubjpass, ss.dobj), (BOOK_NAME, ss.WORK_OF_ART)),
     ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None)),
     ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE))],
    
    [((REL_SYN, None,None), (ss.nsubjpass,ss.dobj), (BOOK_SYN, None, None)),
     ((BOOK_SYN, None,None), (ss.appos,), (BOOK_NAME, None, ss.PROPN)),
     ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
     ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))],
    
    #[((['release'], None), (ss.dobj,), (['novel'], None)),
    # ((['novel'], None), (ss.appos,), (BOOK_NAME, ss.WORK_OF_ART)),
    # ((['release'], None), (ss.prep,), (['in'], None)),
    # ((['in'], None), (ss.pobj,), (REL_DATE, ss.DATE))]
]

In [284]:
def is_variable(rchild_lemmas):
    return isinstance(rchild_lemmas, str) and rchild_lemmas in variables

def child_eq(rdeps, rchild, child):
    (rchild_lemmas, rchild_ent, rchild_pos) = rchild
    (child_lemma, child_ent, child_pos, dep) = child.lemma_, child.ent_type, child.pos, child.dep
    if is_variable(rchild_lemmas):
        if (not rchild_pos or rchild_pos == child_pos) and (not rchild_ent or rchild_ent == child_ent) and (dep in rdeps):
            return {rchild_lemmas: child}
        else:
             return False
    else:
        return  child_lemma in rchild_lemmas and (not rchild_pos or rchild_pos == child_pos) and (not rchild_ent or rchild_end == child_ent) and (dep in rdeps)

def match():
    for ri, rule in enumerate(RULES):
        print("Rule ", ri)
        matches = []
        tree = nlp(test)
        bindings = {}
        for subrule in rule:
            ((rhead_lemmas, rhead_ent, _), rdeps, rchild) = subrule
            #print(subrule)
            matched = False
            for tok in tree:
                same_lemma = tok.lemma_ in rhead_lemmas
                same_ent = (not rhead_ent or (rhead_ent == tok.pos))
                same_child = None
                #print("  ", tok)
                for ch in tok.children:
                    #print("    ", rchild, ch)
                    res =  child_eq(rdeps, rchild, ch)
                    if res:
                        if isinstance(res, dict):
                            bindings.update(res)
                        same_child = res
                #print("  ", same_lemma, same_ent, same_child)
                matched = same_lemma and same_ent and same_child
                if matched:
                    break
            matches.append(matched)
        print("MATCHES", matches)
        if all(matches):
            return bindings
match()

Rule  0
MATCHES [False, False, True, {'__REL_DATE__': 2009}]


In [78]:
prep = Relation('prep') 
pobj = Relation('pobj')
facts(prep, ('released', 'in'))
facts(pobj, ('in', 'January'))
             
x = var()
y = var()
run(1, x, 
    prep('released', y),
    pobj(y, x)
   )

('January',)

In [267]:
[(tok.head.text, tok.dep_, tok.text, tok.ent_type_, tok.pos_) for tok in list(nlp(test))]

[('published', 'prep', 'In', '', 'ADP'),
 ('In', 'pobj', '2006', 'DATE', 'NUM'),
 ('published', 'punct', ',', '', 'PUNCT'),
 ('published', 'nsubj', 'King', '', 'PROPN'),
 ('published', 'ROOT', 'published', '', 'VERB'),
 ('novel', 'det', 'an', '', 'DET'),
 ('novel', 'amod', 'apocalyptic', '', 'ADJ'),
 ('published', 'dobj', 'novel', '', 'NOUN'),
 ('novel', 'punct', ',', '', 'PUNCT'),
 ('novel', 'appos', 'Cell', 'PERSON', 'PROPN'),
 ('published', 'punct', '.', '', 'PUNCT')]

In [271]:
for ent in nlp(test).ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


2006 3 7 DATE
Cell 46 50 PERSON


In [53]:
snlp.annotate(dataset[0])

'{"sentences":[{"index":0,"parse":"(ROOT\\n  (S\\n    (VP\\n      (VP (VBN Published)\\n        (NP-TMP (NNP July) (CD 11) (, ,) (CD 1960))\\n        (, ,)\\n        (S\\n          (VP (TO To)\\n            (VP (VB Kill)\\n              (SBAR\\n                (S\\n                  (NP (DT a) (NNP Mockingbird))\\n                  (VP (VBD was)\\n                    (NP (DT an) (JJ immediate) (NN bestseller)))))))))\\n      (CC and)\\n      (VP (VBD won)\\n        (NP\\n          (NP (JJ great) (JJ critical) (NN acclaim))\\n          (, ,)\\n          (PP (VBG including)\\n            (NP\\n              (NP (DT the) (NNP Pulitzer) (NNP Prize))\\n              (PP (IN for)\\n                (NP\\n                  (NP (NN Fiction))\\n                  (PP (IN in)\\n                    (NP (CD 1961))))))))))\\n    (. .)))","basicDependencies":[{"dep":"ROOT","governor":0,"governorGloss":"ROOT","dependent":14,"dependentGloss":"bestseller"},{"dep":"csubj","governor":14,"governorGloss":"be