In [100]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import logging
import spacy 
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
from kanren import run, eq, membero, var, conde, Relation, facts, fact, unifiable
import wikipedia as wiki
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from spacy.matcher import Matcher

# Raise the column display width so whole sentences stay readable when frames are shown.
pd.set_option('display.max_colwidth', 300)

In [2]:
# Load the `en_core_web_lg` spaCy pipeline once; `nlp` is reused for every parse below.
nlp = en_core_web_lg.load()

### Information extractor

Loading previously generated train, test datasets

In [3]:
# Read the labelled sentences and parse each `sent` with spaCy up front, storing the
# result in a `doc` column so the extractor can be applied to it directly.
train = pd.read_csv('train.csv', index_col=0).assign(doc = lambda df: df.sent.apply(nlp))
test = pd.read_csv('test.csv', index_col=0).assign(doc = lambda df: df.sent.apply(nlp))

In [4]:
def predict(X, match_fn):
    """Apply an extractor to every element of `X` and split its output into columns.

    `match_fn` is called once per element and is expected to return a dict of
    bindings. The year is read from the REL_DATE key and the title from the
    BOOK_NAME key; a missing key yields None.

    Returns a dict with the two Series 'predicted_year' and 'predicted_title',
    suitable for `DataFrame.assign(**...)`.
    """
    bindings = X.apply(match_fn)
    columns = {}
    for column, key in (('predicted_year', REL_DATE), ('predicted_title', BOOK_NAME)):
        # `key=key` freezes the current key for this lambda.
        columns[column] = bindings.apply(lambda found, key=key: found[key] if key in found else None)
    return columns

def _precision_recall(truth, predicted):
    """Return (precision, recall) of one predicted column against its ground truth.

    Counting rules:
      * true positive  - both values present and equal;
      * false positive - a prediction where the ground truth is missing;
      * false negative - no prediction where the ground truth is present.

    NOTE(review): a prediction that differs from a *present* ground truth is
    counted in none of the three buckets, which flatters precision. This is
    left as is so previously reported numbers stay comparable.
    """
    has_truth = pd.notna(truth)
    has_prediction = pd.notna(predicted)

    tp = np.sum(has_truth & has_prediction & (truth == predicted))
    fp = np.sum(~has_truth & has_prediction)
    fn = np.sum(has_truth & ~has_prediction)

    # An empty denominator means the metric is undefined: report NaN explicitly
    # instead of relying on a numpy 0/0 division (and its RuntimeWarning).
    precision = tp / (tp + fp) if (tp + fp) else float('nan')
    recall = tp / (tp + fn) if (tp + fn) else float('nan')
    return (precision, recall)


def evaluate(t):
    """Score predicted titles and years stored next to the ground truth.

    `t` must expose the columns `title`, `predicted_title`, `year` and
    `predicted_year`.

    Returns {'title': (precision, recall), 'year': (precision, recall)}; a
    metric whose denominator is zero is NaN.
    """
    return {
        'title': _precision_recall(t.title, t.predicted_title),
        'year': _precision_recall(t.year, t.predicted_year),
    }

Define rule-based extractor based on Kanren logic engine

In [131]:
# Keys of the binding dicts returned by the matchers.
BOOK_NAME = '__BOOK_NAME__'
REL_DATE = '__REL_DATE__'
AUTHOR = '__AUTHOR__'

# Lemma vocabularies the rules test tokens against.
REL_SYN = {
    'accept',
    'finish',
    'publish',
    'release',
    'write',
}
BOOK_SYN2 = {
    'article',
    'book',
    'collection',
    'manuscript',
    'novel',
    'novella',
    'piece',
    'sequel',
    'story',
    'tale',
}
PREP_SYN = {
    'around',
    'in',
    'on',
}
WRITER_ATTRS_SYNS = {
    'author',
    'biographer',
    'columnist',
    'critic',
    'dramatist',
    'editor',
    'essayist',
    'journalist',
    'novelist',
    'poet',
    'writer',
}

# Names of the relations created by gather_facts: dependency labels,
# part-of-speech tags, entity types and the lemma relation.
DEPS = [
    'nsubjpass',
    'dobj',
    'prep',
    'pobj',
    'appos',
    'acl',
    'nsubj',
    'attr',
]
POSS = [
    'PROPN',
    'NUM',
]
ENTS = [
    'DATE',
    'PERSON',
]
COMS = [
    'LEMMA',
]

In [6]:
def gather_facts(doc):
    """Build the fact database for one parsed document.

    One kanren Relation is created per name in DEPS, POSS, ENTS and COMS.
    For every token the following facts are recorded:
      * LEMMA(token index, lemma);
      * <pos tag>(token index) when the tag is listed in POSS;
      * <dependency label>(head index, token index) when the label is in DEPS;
      * <entity type>(token index) when the type is listed in ENTS.

    Returns the dict mapping relation name to Relation.
    """
    relations = {name: Relation(name) for name in DEPS + POSS + ENTS + COMS}
    lemma_rel = relations['LEMMA']

    for token in doc:
        position = token.i
        facts(lemma_rel, (position, token.lemma_))
        if token.pos_ in POSS:
            fact(relations[token.pos_], position)
        if token.dep_ in DEPS:
            facts(relations[token.dep_], (token.head.i, position))
        if token.ent_type_ in ENTS:
            fact(relations[token.ent_type_], position)

    return relations
        
def memberoi(R, idx, lst):
    """Goal: the lemma of the token at `idx` is a member of `lst`.

    The intermediate lemma variable is named after `idx`, so goals built for
    different index variables do not share it.
    """
    lemma = var('l' + str(idx))
    has_lemma = R['LEMMA'](idx, lemma)
    lemma_allowed = membero(lemma, lst)
    return conde((has_lemma, lemma_allowed))

def action(R, idx):
    """Goal: token `idx` has a lemma from REL_SYN (release/publish/write ... verbs)."""
    return memberoi(R, idx, lst=REL_SYN)

def book(R, idx):
    """Goal: token `idx` has a lemma from BOOK_SYN2 (book/novel/story ... nouns)."""
    return memberoi(R, idx, lst=BOOK_SYN2)

def prep(R, idx):
    """Goal: token `idx` has a lemma from PREP_SYN (on/in/around)."""
    return memberoi(R, idx, lst=PREP_SYN)

def action_on_object(R, action_i, object_i):
    """Goal: `object_i` is the passive subject or the direct object of `action_i`."""
    passive_subject = (R['nsubjpass'](action_i, object_i),)
    direct_object = (R['dobj'](action_i, object_i),)
    return conde(passive_subject, direct_object)

def date_ent(R, idx):
    """Goal: token `idx` is in the DATE relation and is tagged PROPN or NUM."""
    proper_noun_or_number = conde(
        (R["PROPN"](idx),),
        (R["NUM"](idx),),
    )
    return conde((R["DATE"](idx), proper_noun_or_number))

def be(R, idx):
    """Goal: the lemma of token `idx` is 'be'."""
    lemma_rel = R["LEMMA"]
    return lemma_rel(idx, 'be')

def author_attrs(R, idx):
    """Goal: token `idx` has a lemma from WRITER_ATTRS_SYNS (novelist, poet, ...).

    This replaces two consecutive definitions of the same name: the first one
    was immediately shadowed (and passed the lemma variable where a token
    index is expected), so only the second one ever took effect.
    """
    return memberoi(R, idx, WRITER_ATTRS_SYNS)

This rule handles the following pattern:
    
    book , <name> , written in <date>

In [7]:
def Book_Date_Rule1(R):
    """Goals for the pattern "book, <name>, written in <date>".

    The answer variables `_book_name_i` and `_date_i` are read from module
    scope; match_book_and_year rebinds them before the rules are built.
    Returns the list of goals in evaluation order.
    """
    verb = var('_action_i')
    noun = var('_book_i')
    preposition = var('_prep_i')

    # Lexical constraints on the three helper tokens.
    lexical_goals = [
        action(R, verb),
        book(R, noun),
        prep(R, preposition),
    ]
    # Dependency structure linking them to the answer variables.
    dependency_goals = [
        action_on_object(R, verb, noun),
        R["appos"](noun, _book_name_i),
        R["prep"](verb, preposition),
        R["pobj"](preposition, _date_i),
    ]
    return lexical_goals + dependency_goals

This rule handles the following pattern:

    <name> written in <date>


In [8]:
def Book_Date_Rule2(R):
    """Goals for the pattern "<name> written in <date>".

    The answer variables `_book_name_i` and `_date_i` are read from module
    scope; match_book_and_year rebinds them before the rules are built.
    Returns the list of goals in evaluation order.
    """
    verb = var('_action_i')
    preposition = var('_prep_i')

    lexical_goals = [
        action(R, verb),
        prep(R, preposition),
    ]
    dependency_goals = [
        action_on_object(R, verb, _book_name_i),
        R['prep'](verb, preposition),
        R['pobj'](preposition, _date_i),
    ]
    return lexical_goals + dependency_goals

In [9]:
# Rules are tried in this order; the first one that yields a match wins.
Book_Date_Rules = [
    Book_Date_Rule1,
    Book_Date_Rule2,
]

This rule handles the following pattern:

     <name> was novelist


In [10]:
def Author_Rule1(R):
    """Goals for the pattern "<name> was novelist".

    The answer variable `_author_name_i` is read from module scope;
    match_author rebinds it before the rule is built.
    Returns the list of goals in evaluation order.
    """
    copula = var('_be_i')
    profession = var('_author_attrs_i')

    lexical_goals = [
        be(R, copula),
        author_attrs(R, profession),
    ]
    dependency_goals = [
        R['nsubj'](copula, _author_name_i),
        R['attr'](copula, profession),
    ]
    return lexical_goals + dependency_goals

In [11]:
def run_rules(doc, vs, rules_fns):
    """Query `doc` with each rule in turn and return the first non-empty answer.

    `vs` is the tuple of logic variables to solve for and `rules_fns` is a
    sequence of callables taking the relation dict and returning goals.
    At most one solution per rule is requested. Returns [] when no rule matches.
    """
    relations = gather_facts(doc)
    for build_goals in rules_fns:
        goals = build_goals(relations)
        answers = run(1, vs, *goals)
        if not answers:
            continue
        return answers
    return []

This function extracts a book name given only one word from it
* if appos:
    * grab tokens between commas or quotes or etc
* if not appos:
    * tries to find leftmost token that is not `PROPN`,`PART`, etc
    * tries to find rightmost token that is not `PROPN`, `PART`, etc
    * all tokens between the leftmost and the rightmost are the desired book name

In [12]:
# Part-of-speech ids that may occur inside a multi-word name.
propn_parts = {ss.PROPN, ss.PART, ss.ADP, ss.DET, ss.CCONJ, ss.NOUN}

def valid_named_part(tok):
    """Return True when `tok` may be part of a name: a hyphen or a POS in `propn_parts`."""
    if tok.text == '-':
        return True
    return tok.pos in propn_parts
        
def find_book_name(doc, tok):
    """Expand a single token of a book title to the text of the whole title.

    * If `tok` is an apposition, the title is everything between the nearest
      non-hyphen punctuation token on each side of it.
    * If `tok` is a proper noun, the span is grown left and right while tokens
      are valid name parts with a proper noun within a three-token window;
      a determiner is always accepted on the left.
    * Otherwise None is returned.

    Fixes over the previous version: the leftward scan of the apposition
    branch now also inspects token 0, and a missing delimiter on either side
    falls back to the document boundary instead of raising TypeError.
    """
    def is_delimiter(candidate):
        # Hyphens are tagged as punctuation but belong to titles.
        return candidate.pos == ss.PUNCT and candidate.text != '-'

    if tok.dep == ss.appos:
        # Defaults describe "no delimiter found": the span then runs to the
        # start / end of the document.
        left = -1
        right = len(doc)
        for idx in range(tok.i, -1, -1):
            if is_delimiter(doc[idx]):
                left = idx
                break
        for idx in range(tok.i, len(doc)):
            if is_delimiter(doc[idx]):
                right = idx
                break
        return doc[left + 1:right].text

    if tok.pos == ss.PROPN:
        left = tok.i
        right = tok.i + 1
        for idx in range(tok.i, -1, -1):
            # A proper noun among this token and the two before it.
            propn_nearby = any(doc[j].pos == ss.PROPN
                               for j in range(idx, max(idx - 3, -1), -1))
            if doc[idx].pos == ss.DET or (valid_named_part(doc[idx]) and propn_nearby):
                left = idx
            else:
                break
        for idx in range(tok.i, len(doc)):
            # A proper noun among this token and the two after it.
            propn_nearby = any(doc[j].pos == ss.PROPN
                               for j in range(idx, min(idx + 3, len(doc))))
            if valid_named_part(doc[idx]) and propn_nearby:
                right = idx
            else:
                break
        return doc[left:right + 1].text

    return None

In [13]:
def find_entity(doc, idx):
    """Return the text of the first entity in `doc.ents` covering token index `idx`, else None."""
    covering = (ent.text for ent in doc.ents if ent.start <= idx < ent.end)
    return next(covering, None)

In [35]:
def extract_year(rel_date):
    """Return the first run of four digits in `rel_date` as an int.

    `rel_date` is converted with str() first, so ints are accepted too.
    Returns None for a falsy input or when no four-digit run is present.
    """
    if not rel_date:
        return None
    found = re.search(r"\d\d\d\d", str(rel_date))
    if found is None:
        return None
    return int(found.group(0))

This function tries to match book name and year of publishing in the spacy `Doc`

In [26]:
def match_book_and_year(doc):
    """Extract a book title and its release year from one parsed sentence.

    Runs Book_Date_Rules against `doc`. For the first solution, the title is
    the longer of two candidates: the named entity covering the matched token
    and the span reconstructed by find_book_name (the latter wins ties).

    Returns {BOOK_NAME: title, REL_DATE: year-or-None}, or {} when no rule
    matches or no title candidate can be built.
    """
    # The rule builders read these logic variables from module scope, so they
    # must be rebound as globals before the rules are instantiated.
    global _book_name_i
    global _date_i
    global _author_name_i
    _book_name_i = var('_book_name_i')
    _date_i = var('_date_i')
    _author_name_i = var('_author_name_i')
    results = run_rules(doc, (_book_name_i, _date_i), Book_Date_Rules)

    for res in results:
        if not res:
            continue
        book_name_idx, date_idx = res
        # Test the type rather than truthiness: token index 0 is a valid
        # binding and used to be silently discarded.
        if not (isinstance(book_name_idx, int) and isinstance(date_idx, int)):
            continue

        book_tok = doc[book_name_idx]
        rel_tok = doc[date_idx]
        # find_book_name's candidate comes first so that max() prefers it on
        # equal length, matching the previous tie-breaking.
        candidates = [cand for cand in (find_book_name(doc, book_tok),
                                        find_entity(doc, book_tok.i)) if cand]
        if not candidates:
            return {}

        return {BOOK_NAME: max(candidates, key=len),
                REL_DATE: extract_year(find_entity(doc, rel_tok.i))}
    return {}

This function tries to match the author in the text, but returns the name passed in through the `title` param.
This works around the fact that the first sentence of a Wikipedia article gives the author's own (full) name, whereas we need the
well-known name

In [93]:
def _normalize_author(author):
    s1 = re.sub(r"(\w)\. ?(\w)\. ", r"\1.\2. ", author)
    s2 = re.sub(r"(\w)\. ?(\w)\. ? (\w)\. ", r"\1.\2.\3. ", s1)
    return s2

def match_author(doc, title):
    """Return {AUTHOR: normalized `title`} when `doc` matches Author_Rule1, else {}.

    The matched token itself is not used: the sentence only serves as
    evidence that the page is about a writer, and the name comes from `title`.
    """
    # Author_Rule1 reads this logic variable from module scope.
    global _author_name_i
    _author_name_i = var('_author_name_i')
    matches = run_rules(doc, (_author_name_i,), [Author_Rule1])
    if not matches:
        return {}
    return {AUTHOR: _normalize_author(title)}

In [28]:
# Attach the extractor's predictions to the training frame and score them.
t1 = train.assign(**predict(train.doc, match_book_and_year))
print(evaluate(t1))

November 18, 1865
June 1997
early 1951
July 1998
16 July 2005
None
1997
1871
Mississippi
April 2013
1865
US
{'title': (0.8181818181818182, 0.5), 'year': (0.8888888888888888, 0.42105263157894735)}


We see that precision is pretty good, but recall is not. Adding new rules would probably help. However, after analyzing the data we saw that some training examples contain errors, and that it is really hard to find the boundaries of book names that are not quoted. There is also a date-extraction problem in the first sentence: in "on <Date>, in the New York weekly", the word "weekly" is also tagged as a date :( The best fix would be to define explicit patterns for dates.

In [37]:
# Show rows that have either a predicted year or a ground-truth title.
t1[pd.notna(t1.predicted_year) | pd.notna(t1.title)].head()

Unnamed: 0,sent,title,year,author,doc,predicted_year,predicted_title
56,"His first success as a writer came when his humorous tall tale ""The Celebrated Jumping Frog of Calaveras County"" was published on November 18, 1865, in the New York weekly The Saturday Press, bringing him national attention.",The Celebrated Jumping Frog of Calaveras County,1865.0,Mark Twain,"(His, first, success, as, a, writer, came, when, his, humorous, tall, tale, "", The, Celebrated, Jumping, Frog, of, Calaveras, County, "", was, published, on, November, 18, ,, 1865, ,, in, the, New, York, weekly, The, Saturday, Press, ,, bringing, him, national, attention, .)",1865.0,The Celebrated Jumping Frog of Calaveras County
307,"The Incident in the Philippines, posthumously published in 1924, was in response to the Moro Crater Massacre, in which six hundred Moros were killed.",The Incident in the Philippines,1924.0,Mark Twain,"(The, Incident, in, the, Philippines, ,, posthumously, published, in, 1924, ,, was, in, response, to, the, Moro, Crater, Massacre, ,, in, which, six, hundred, Moros, were, killed, .)",,
910,"In June 1997, Bloomsbury published Philosopher's Stone with an initial print run of 1,000 copies, 500 of which were distributed to libraries.",Philosopher's Stone,1997.0,J.K. Rowling,"(In, June, 1997, ,, Bloomsbury, published, Philosopher, 's, Stone, with, an, initial, print, run, of, 1,000, copies, ,, 500, of, which, were, distributed, to, libraries, .)",1997.0,Philosopher's Stone
903,"==\n\nIn 1995, Rowling finished her manuscript for Harry Potter and the Philosopher's Stone on an old manual typewriter.",Harry Potter and the Philosopher's Stone,1995.0,J.K. Rowling,"(=, =, \n\n, In, 1995, ,, Rowling, finished, her, manuscript, for, Harry, Potter, and, the, Philosopher, 's, Stone, on, an, old, manual, typewriter, .)",,
52,"His experiences in the American West inspired Roughing It, written during 1870–71 and published in 1872.",Roughing It,1872.0,Mark Twain,"(His, experiences, in, the, American, West, inspired, Roughing, It, ,, written, during, 1870–71, and, published, in, 1872, .)",,


In [31]:
# Number of leading sentences of a page in which match_all looks for the author.
FIRST_SENTS = 3

def match_all(wikipage):
    """Collect [author, year, title] facts from every sentence of a Wikipedia page.

    The author is looked for in the first FIRST_SENTS sentences (with
    parenthesised text removed). If none has been found by then, a message is
    printed and scanning stops. Sentences without an extractable year are
    skipped.

    Facts are stored as (year, title) while scanning and the author is
    attached at the end, so a fact found before the author was identified no
    longer carries `None` as its author.
    """
    pending = []
    author = None
    for i, sent in enumerate(sent_tokenize(wikipage.content)):
        binds = match_book_and_year(nlp(sent))
        year = extract_year(binds.get(REL_DATE, None))
        if i < FIRST_SENTS:
            # Drop parenthesised asides before parsing for the author pattern.
            without_parens = re.sub(r"\([^\(\)]+\)", "", sent)
            author = match_author(nlp(without_parens), wikipage.title).get(AUTHOR, author)
        elif not author:
            print("No author found, skipping")
            break

        if year and binds:
            pending.append((year, binds.get(BOOK_NAME)))
    return [[author, year, title] for year, title in pending]

### Page evaluation

Loading db

In [102]:
def load_db(path='books_all.csv'):
    """Load the reference book database used for fact checking.

    Parameters
    ----------
    path : str
        CSV file to read; its first column is used as the index. Defaults to
        'books_all.csv', the file this function always read before.

    Returns
    -------
    DataFrame with the columns `authors`, `original_title` (lower-cased) and
    `year` (int, renamed from `original_publication_year`). Rows with a
    missing value in any of the three columns are dropped.
    """
    wanted = ['authors', 'original_title', 'original_publication_year']
    return (
        pd.read_csv(path, index_col=0)[wanted]
        .rename(columns={'original_publication_year': 'year'})
        .dropna()
        .assign(
            year=lambda frame: frame['year'].apply(int),
            original_title=lambda frame: frame['original_title'].str.lower(),
        )
    )
# Load the reference database once; the evaluation cells below query it.
db = load_db()

Functions for evaluating page

In [106]:
def ground_truth_year(db, author, title):
    """Look up the publication year of `title` by `author` in the reference db.

    A row matches when its `authors` value contains `author` as a literal
    substring and its `original_title` equals `title` lower-cased.

    Returns the year of the first matching row, or 0 when nothing matches.
    """
    # regex=False: author names are data, not patterns. As a regex, the dots
    # in "J.K." match any character and names with metacharacters can raise.
    by_author = db['authors'].str.contains(author, regex=False)
    by_title = db['original_title'] == title.lower()
    years = db['year'][by_author & by_title].values
    if len(years) == 0:
        return 0
    return years[0]


def check_fact(db, fact):
    """Return whether the year of an [author, year, title] fact equals the db year."""
    author, claimed_year, title = fact[0], fact[1], fact[2]
    return ground_truth_year(db, author, title) == claimed_year


def check_facts(db, facts):
    """Check every fact against `db`; returns one verdict per fact, in order."""
    verdicts = []
    for candidate in facts:
        verdicts.append(check_fact(db, candidate))
    return verdicts


def calc_accuracy(db, facts):
    """Return the share of `facts` whose year agrees with `db`.

    With no facts the accuracy is undefined and NaN is returned explicitly,
    rather than through a numpy 0/0 division and its RuntimeWarning.
    """
    results = check_facts(db, facts)
    if not results:
        return float('nan')
    return np.sum(results) / len(results)


def evaluate_page(db, page_name):
    """Extract facts from the Wikipedia page `page_name` and score them against `db`.

    Returns a tuple (accuracy, extracted facts).
    """
    page = wiki.page(page_name)
    extracted = match_all(page)
    accuracy = calc_accuracy(db, extracted)
    return (accuracy, extracted)

In [136]:
# Run the whole pipeline on a single page and show what was extracted.
(acc, _facts) = evaluate_page(db, 'Walt Whitman')
print('Page accuracy:', acc)
print('Gathered facts:', _facts)

Page accuracy: 0.3333333333333333
Gathered facts: [['Walt Whitman', 1855, 'Leaves of Grass'], ['Walt Whitman', 1865, 'George'], ['Walt Whitman', 1868, 'Poems of Walt Whitman']]


### Error breakdown

* Is author in db?

In [137]:
# Take the author of the first extracted fact and list that author's books in the db.
author = _facts[0][0]
# regex=False: the name is matched as a literal substring, not as a pattern
# (otherwise the dots in e.g. "J.K." would match any character).
author_mask = db['authors'].str.contains(author, regex=False)
authors_books = db[author_mask]
print('Author has {} books'.format(len(authors_books)))
authors_books.head()

Author has 1 books


Unnamed: 0_level_0,authors,original_title,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1679,Walt Whitman,leaves of grass,1855


* If the author is in the db, which books extracted from Wikipedia are missing from the db?

In [138]:
# Lower-case the extracted titles (db titles are stored lower-cased) and keep
# those that do not appear among this author's db titles.
books_not_in_db = {fact[2].lower() for fact in _facts} - set(authors_books.original_title.values)
print("There is {} books from wikipedia and not from db".format(len(books_not_in_db)))
print("Sample:")
print(list(books_not_in_db)[:5])

There is 2 books from wikipedia and not from db
Sample:
['george', 'poems of walt whitman']


### Conclusions
* The Goodreads-10k corpus is not a good corpus for fact checking: it lacks too many books.
* It would be better to write more rules to decrease the rate of false negatives.
* Using `logpy` gives us good readability and a ready-made engine, but it is too slow. It would be better to look at other kanren engines or to improve the one I wrote for the first version of this task.