### todo
* make search exhaustive inside one sentence
* match years before 1000

In [46]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import logging
import spacy 
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
from kanren import run, eq, membero, var, conde, Relation, facts
import wikipedia as wiki
import re
import numpy as np

In [6]:
nlp = en_core_web_lg.load()

In [11]:
dataset = [
    #"For the next four years Orwell mixed journalistic work – mainly for Tribune, The Observer and the Manchester Evening News, though he also contributed to many small-circulation political and literary magazines – with writing his best-known work, Nineteen Eighty-Four, which was published in 1949.",
    "Published July 11, 1960, To Kill a Mockingbird was an immediate bestseller and won great critical acclaim, including the Pulitzer Prize for Fiction in 1961.",
    "His sixth book, The Fault in Our Stars, was released in January 2012",
    "In the 1940s, Salinger confided to several people that he was working on a novel featuring Holden Caulfield, the teenage protagonist of his short story \"Slight Rebellion off Madison\", and The Catcher in the Rye was published on July 16, 1951, by Little, Brown and Company.",
    "In 2003, Hosseini released his first novel, The Kite Runner, the story of a young boy, Amir, struggling to establish a closer rapport with his father and coping with memories of a haunting childhood event. ",
    "Animal Farm: A Fairy Story was published in Britain on 17 August 1945, and a year later in the US, on 26 August 1946.",
    "The trilogy's second book, Catching Fire, was released in September 2009, and its third book, Mockingjay, was released on August 24, 2010.",
    "In September 2008, Scholastic Press released The Hunger Games, the first book of a trilogy by Collins.",
    "In 1973, King's first novel Carrie was accepted by publishing house Doubleday.",
    "In 2000, King published online a serialized horror novel, The Plant.",
    "Also in 2000, he wrote a digital novella, Riding the Bullet, and has said he sees e-books becoming 50% of the market \"probably by 2013 and maybe by 2012\".",
    "In 2006, King published an apocalyptic novel, Cell.",
    "In 2009, King published Ur, a novella written exclusively for the launch of the second-generation Amazon Kindle and available only on Amazon.com, and Throttle, a novella co-written with his son Joe Hill and released later as an audiobook titled Road Rage, which included Richard Matheson's short story \"Duel\".",
    "King's next novel, 11/22/63, was published November 8, 2011, and was nominated for the 2012 World Fantasy Award Best Novel."
]

In [15]:
db = pd.read_csv('books_all.csv', index_col=0)[['authors', 'original_title', 'original_publication_year']]
db = db.rename(columns={'original_publication_year': 'year'})
db = db.dropna()
db['year'] = db['year'].apply(int)
db

Unnamed: 0_level_0,authors,original_title,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Suzanne Collins,The Hunger Games,2008
2,"J.K. Rowling, Mary GrandPré",Harry Potter and the Philosopher's Stone,1997
3,Stephenie Meyer,Twilight,2005
4,Harper Lee,To Kill a Mockingbird,1960
5,F. Scott Fitzgerald,The Great Gatsby,1925
6,John Green,The Fault in Our Stars,2012
7,J.R.R. Tolkien,The Hobbit or There and Back Again,1937
8,J.D. Salinger,The Catcher in the Rye,1951
9,Dan Brown,Angels & Demons,2000
10,Jane Austen,Pride and Prejudice,1813


In [331]:
test = "Benjamin Anastas (born 1969) is an American novelist, memoirist, journalist and book reviewer born in Gloucester, Massachusetts."

In [60]:
displacy.render(nlp("A collection of Jack London's San Francisco Stories was published in October 2010 by Sydney Samizdat Press."), jupyter=True)

In [76]:
BOOK_NAME = '__BOOK_NAME__'
REL_DATE = '__REL_DATE__'
variables = set([BOOK_NAME, REL_DATE])
REL_SYN = set(['release', 'publish', 'accept', 'write'])
BOOK_SYN = set(['book', 'novel', 'story', 'piece', 'collection'])
RULES = [
    [((REL_SYN, None, None), (ss.nsubjpass, ss.dobj), (BOOK_NAME, None, ss.PROPN)),
     #((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
     #((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ],
    
    [((REL_SYN, None,None), (ss.nsubjpass,ss.dobj), (BOOK_SYN, None, None)),
     ((BOOK_SYN, None,None), (ss.appos,), (BOOK_NAME, None, ss.PROPN)),
    # ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
    # ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ],
    
    [((REL_SYN, None, None), (ss.nsubjpass, ss.dobj), (BOOK_SYN, None, None)),
     ((BOOK_SYN, None, None), (ss.prep,), (['of'], None, None)),
     ((['of'], None, None), (ss.pobj,), (BOOK_NAME, None, ss.PROPN)),
    # ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
    # ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ]
    
    #[(('be', None, None), (ss.nsubj,), (BOOK_SYN, None, None)),
    # ((BOOK_SYN, None, None), (ss.amod,),(REL_SYN, None, None))]
    
    #[((['release'], None), (ss.dobj,), (['novel'], None)),
    # ((['novel'], None), (ss.appos,), (BOOK_NAME, ss.WORK_OF_ART)),
    # ((['release'], None), (ss.prep,), (['in'], None)),
    # ((['in'], None), (ss.pobj,), (REL_DATE, ss.DATE))]
]

AUTHOR = '__AUTHOR__'
AUTHOR_VARS = (AUTHOR,)
WRITER_SYN = set(['novelist','essayist','writer', 'author', 'biographer',
                 'columnist', 'critic', 'dramatist', 'editor',
                 'journalist', 'poet'])

AUTHOR_RULES = [
    [((set(['be']), None, None), (ss.nsubj,), (AUTHOR,ss.PERSON, ss.PROPN)),
     ((set(['be']), None, None), (ss.attr,), (WRITER_SYN, None, None))]
]

In [87]:
def is_variable(rchild_lemmas, variables):
    return isinstance(rchild_lemmas, str) and rchild_lemmas in variables

def child_eq(rdeps, rchild, child, variables):
    (rchild_lemmas, rchild_ent, rchild_pos) = rchild
    (child_lemma, child_ent, child_pos, dep) = child.lemma_, child.ent_type, child.pos, child.dep
    if is_variable(rchild_lemmas, variables):
        if (not rchild_pos or rchild_pos == child_pos) and (not rchild_ent or rchild_ent == child_ent) and (dep in rdeps):
            return {rchild_lemmas: child}
        else:
             return False
    else:
        return  child_lemma in rchild_lemmas and (not rchild_pos or rchild_pos == child_pos) and (not rchild_ent or rchild_end == child_ent) and (dep in rdeps)
    
def find_entity(doc, idx):
    for ent in doc.ents:
        if idx >= ent.start and idx < ent.end:
            return ent.text
        
propn_parts = set([ss.PROPN, ss.DET, ss.PART])
        
def find_book_name(doc, tok):
    if tok.dep == ss.appos:
        left = None
        right = None
        for idx in range(tok.i, 0, -1):
            if doc[idx].pos == ss.PUNCT:
                left = idx
                break
        for idx in range(tok.i, len(doc)):
            if doc[idx].pos == ss.PUNCT:
                right = idx
                break
                
        return doc[left+1:right].text
    elif tok.pos == ss.PROPN:
        left = tok.i
        right = tok.i + 1
        for idx in range(tok.i, 0, -1):
            if doc[idx].pos in propn_parts:
                left = idx
            else:
                break
        for idx in range(tok.i, len(doc)):
            if doc[idx].pos in propn_parts:
                right = idx
            else:
                break
        return doc[left:right+1].text

def match(doc, rules, variables):
    for ri, rule in enumerate(rules):
        #print("Rule ", ri)
        matches = []
        bindings = {}
        for subrule in rule:
            ((rhead_lemmas, rhead_ent, _), rdeps, rchild) = subrule
            #print(subrule)
            matched = False
            for tok in doc:
                same_lemma = tok.lemma_ in rhead_lemmas
                same_ent = (not rhead_ent or (rhead_ent == tok.pos))
                same_child = None
                #print("  ", tok)
                for ch in tok.children:
                    #print("    ", rchild, ch)
                    res =  child_eq(rdeps, rchild, ch, variables)
                    if res:
                        if isinstance(res, dict):
                            bindings.update(res)
                        same_child = res
                #print("  ", same_lemma, same_ent, same_child)
                matched = same_lemma and same_ent and same_child
                if matched:
                    break
            matches.append(matched)
            
       
        #print("MATCHES", matches)
        if all(matches):
            return bindings

In [84]:
def match_book_and_year(doc):
    bindings = match(doc, RULES, variables)
    if bindings:
        book_tok = bindings[BOOK_NAME]
        rel_tok = bindings[REL_DATE]
        return {BOOK_NAME:find_entity(doc, book_tok.i) or find_book_name(doc, book_tok), 
                REL_DATE: find_entity(doc, rel_tok.i)}
    else:
        return {}

In [79]:
def match_author(doc, title):
    bindings = match(doc, AUTHOR_RULES, AUTHOR_VARS)
    if bindings:
        author_tok = bindings[AUTHOR]
        return {AUTHOR: title}
    else:
        return {}

In [88]:
match_book_and_year(nlp("A collection of Jack London's San Francisco Stories was published in October 2010 by Sydney Samizdat Press."))

{}

In [23]:
match_author(nlp("Eric Arthur Blair (25 June 1903 – 21 January 1950),[1] better known by his pen name George Orwell, was an English novelist, essayist, journalist, and critic."), wiki.page('George Orwell').title)

{'__AUTHOR__': 'George Orwell'}

In [26]:
wp = wiki.page('Geourge Orwell')

In [24]:
def extract_year(rel_date):
    if rel_date:
        m = re.findall(r"\d\d\d\d", rel_date)
        if m:
            return int(m[0])
extract_year('June')

In [91]:
FIRST_SENTS = 3
def match_all(wikipage):
    collected = []
    author = None
    for i, sent in enumerate(sent_tokenize(wikipage.content)):
        s = nlp(sent)
        binds = match_book_and_year(s)
        #date = binds.get(REL_DATE, None)
        #year = extract_year(date)
        if i < FIRST_SENTS:
            author = match_author(s, wikipage.title).get(AUTHOR, author)
        elif not author:
            print("No author found, skipping")
            break;
        
        #if not year:
        #    continue
        if binds:
            #print(s)
            #print(binds)
            #collected.append([author, year, binds.get(BOOK_NAME)])
            collected.append([author, binds.get(BOOK_NAME)])
    return collected

facts = match_all(wp)

In [334]:
{'Benjamin Anastas': [('2005', "Versace Enthroned with Saints: Margaret, Jerome, Alex and the Angel Donatella"),
                      ('2012', "His memoir, Too Good To Be True, was published in 2012")],
 'Nicholson Baker': [('2001', "Double Fold"),
                     ('April 10, 2008', 'How I fell in love with Wikipedia')],
'Steven Barnes': [('1979', 'The Locusts')],
'Ray Douglas Bradbury': [('November 1941', 'Pendulum'),
                         ('1947', ' Dark Carnival'),
                         ('October 2001', 'From the Dust Returned'),
                         ('2006', 'Farewell Summer'),
                         ('2007', 'Summer Morning, Summer Night')]}

{'Benjamin Anastas': [('2005',
   'Versace Enthroned with Saints: Margaret, Jerome, Alex and the Angel Donatella'),
  ('2012', 'His memoir, Too Good To Be True, was published in 2012')],
 'Nicholson Baker': [('2001', 'Double Fold'),
  ('April 10, 2008', 'How I fell in love with Wikipedia')],
 'Ray Douglas Bradbury': [('November 1941', 'Pendulum'),
  ('1947', ' Dark Carnival'),
  ('October 2001', 'From the Dust Returned'),
  ('2006', 'Farewell Summer'),
  ('2007', 'Summer Morning, Summer Night')],
 'Steven Barnes': [('1979', 'The Locusts')]}

In [54]:
def ground_thruth_year(db, author, title):
    mask = (db['authors'] == author) & (db['original_title'] == title)
    res = db['year'][mask].values
    if len(res) == 0:
        return 0
    else:
        return res[0]
    

def compare_with_db_precise(db, facts):
    for fact in facts[:2]:
        mask = (db['authors'] == fact[0]) & (db['year'] == fact[1]) & (db['original_title'] == fact[2])
        print(db[mask])
facts = match_all(wiki.page('Suzanne Collins'))
print(facts)
compare_with_db_precise(db, facts)

[['Suzanne Collins', 2008, 'The Hunger Games'], ['Suzanne Collins', 2009, 'Catching Fire']]
            authors    original_title  year
id                                         
1   Suzanne Collins  The Hunger Games  2008
            authors original_title  year
id                                      
17  Suzanne Collins  Catching Fire  2009


In [55]:
def check_fact(db, fact):
    db_year = ground_thruth_year(db, fact[0], fact[2])
    test_year = fact[1]
    return db_year == test_year

def check_facts(db, facts):
    return [check_fact(db, fact) for fact in facts]

def calc_accuracy(db, facts):
    results = check_facts(db, facts)
    return np.sum(results)/len(results)

calc_accuracy(db,facts)

1.0

In [58]:
def evaluate_page(db, page_name):
    wp = wiki.page(page_name)
    facts = match_all(wp)
    print(facts)
    return (calc_accuracy(db, facts), len(facts))

In [92]:
evaluate_page(db, 'Dan Brown')

[['Dan Brown', 'a CD'], ['Dan Brown', 'Digital Fortress'], ['Dan Brown', 'Angels & Demons'], ['Dan Brown', 'The Da Vinci Code']]


IndexError: list index out of range