### todo
* make search exhaustive inside one sentence
* match years before 1000

In [695]:
import nltk
from nltk.tokenize import sent_tokenize
import pandas as pd
import logging
import spacy 
import spacy.symbols as ss
from spacy import displacy
import en_core_web_lg
from stanfordcorenlp import StanfordCoreNLP
from kanren import run, eq, membero, var, conde, Relation, facts, fact, unifiable
import wikipedia as wiki
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
from spacy.matcher import Matcher

# Let DataFrame displays show up to 300 characters per cell, so the sentence
# and title columns inspected below are mostly readable in full.
pd.set_option('display.max_colwidth', 300)

In [280]:
# Load the large English spaCy pipeline once; `nlp` is what the cells below
# call to parse sentences.
nlp = en_core_web_lg.load()

In [281]:
# Ground-truth table: one row per book with its authors, original title and
# integer publication year. Rows missing any of the three fields are dropped.
db = (
    pd.read_csv('books_all.csv', index_col=0)[['authors', 'original_title', 'original_publication_year']]
    .rename(columns={'original_publication_year': 'year'})
    .dropna()
)
db['year'] = db['year'].apply(int)

In [672]:
def is_variable(rchild_lemmas, variables):
    """Tell whether a rule's child slot names a capture variable.

    A slot is a variable only when it is a plain string that appears in
    ``variables``; collections of lemmas are never variables.
    """
    if not isinstance(rchild_lemmas, str):
        return False
    return rchild_lemmas in variables

def child_eq(rdeps, rchild, child, variables):
    """Check one dependency child against the child pattern of a sub-rule.

    Args:
        rdeps: Collection of dependency-label ids the child may be attached with.
        rchild: Triple ``(lemmas_or_variable, ent, pos)``. The first slot is
            either a variable name (see ``is_variable``) or a collection of
            accepted lemmas. A falsy ``ent``, ``pos`` or lemma collection
            means "no constraint".
        child: Token-like object exposing ``lemma_``, ``ent_type``, ``pos``
            and ``dep``.
        variables: Collection of variable names.

    Returns:
        ``{variable_name: child}`` when the slot is a variable and the child
        satisfies the constraints, ``False`` when it is a variable and the
        child does not, otherwise a bool saying whether the child matches.
    """
    rchild_lemmas, rchild_ent, rchild_pos = rchild

    # Constraints shared by both kinds of slot.
    pos_ok = not rchild_pos or rchild_pos == child.pos
    ent_ok = not rchild_ent or rchild_ent == child.ent_type
    structure_ok = pos_ok and ent_ok and (child.dep in rdeps)

    if is_variable(rchild_lemmas, variables):
        return {rchild_lemmas: child} if structure_ok else False

    # The wildcard test is on the rule's lemma slot, mirroring how the head
    # lemmas are treated in `match`.
    lemma_ok = not rchild_lemmas or child.lemma_ in rchild_lemmas
    return bool(lemma_ok and structure_ok)
    
def find_entity(doc, idx):
    """Return the text of the first entity span in ``doc.ents`` covering token ``idx``.

    A span covers ``idx`` when ``span.start <= idx < span.end``. Returns
    ``None`` when no entity contains the token.
    """
    covering = (ent.text for ent in doc.ents if ent.start <= idx < ent.end)
    return next(covering, None)
        
# Part-of-speech ids accepted inside a multi-word name when expanding a title.
propn_parts = {ss.PROPN, ss.PART, ss.ADP, ss.DET, ss.CCONJ, ss.NOUN}


def valid_named_part(tok):
    """Return True if ``tok`` may be part of a name: a hyphen or a POS in ``propn_parts``."""
    if tok.text == '-':
        return True
    return tok.pos in propn_parts
        
def find_book_name(doc, tok):
    """Recover the full title text around ``tok``, the token bound as the book name.

    Two strategies, selected by how ``tok`` sits in the parse:

    * Appositive (``tok.dep == ss.appos``): the title is everything between
      the nearest punctuation token at or before ``tok`` and the nearest one
      at or after it.
    * Proper noun (``tok.pos == ss.PROPN``): the title is the contiguous run
      of name-like tokens (see ``valid_named_part``) around ``tok``.

    Args:
        doc: Parsed sentence; indexable by token position and sliceable.
        tok: Token inside ``doc``.

    Returns:
        The title text, or ``None`` when ``tok`` is neither an appositive nor
        a proper noun.
    """
    if tok.dep == ss.appos:
        # Fall back to the sentence edges when no punctuation fences the title
        # on one side, so the slice below always has valid bounds.
        left = -1
        right = len(doc)
        # Scan down to index 0 inclusive: the opening punctuation may be the
        # very first token.
        for idx in range(tok.i, -1, -1):
            if doc[idx].pos == ss.PUNCT:
                left = idx
                break
        for idx in range(tok.i, len(doc)):
            if doc[idx].pos == ss.PUNCT:
                right = idx
                break
        return doc[left + 1:right].text

    if tok.pos == ss.PROPN:
        left = right = tok.i
        # Grow leftwards while tokens still look like part of the name.
        for idx in range(tok.i, -1, -1):
            if doc[idx].pos == ss.DET or valid_named_part(doc[idx]):
                left = idx
            else:
                break
        # Grow rightwards in the same way.
        for idx in range(tok.i, len(doc)):
            if valid_named_part(doc[idx]):
                right = idx
            else:
                break
        return doc[left:right + 1].text

    return None

def match(doc, rules, variables):
    """Return the variable bindings of the first rule that ``doc`` satisfies.

    Each rule is a sequence of sub-rules of the form
    ``((head_lemmas, head_ent, _), child_deps, child_pattern)``. A sub-rule is
    satisfied by the first token in document order whose lemma and entity type
    meet the head constraints and which has at least one child accepted by
    ``child_eq``. Sub-rules are checked independently of one another: nothing
    forces two sub-rules to refer to the same token.

    Args:
        doc: Iterable of tokens exposing ``lemma_``, ``ent_type`` and ``children``.
        rules: List of rules, tried in order.
        variables: Collection of variable names a child slot may bind.

    Returns:
        A dict mapping variable names to the child tokens they were bound to
        for the first fully satisfied rule (it may be empty if that rule binds
        nothing), or ``None`` when no rule matches.
    """
    for rule in rules:
        bindings = {}
        satisfied = True
        for (rhead_lemmas, rhead_ent, _), rdeps, rchild in rule:
            captured = _match_subrule(doc, rhead_lemmas, rhead_ent, rdeps, rchild, variables)
            if captured is None:
                satisfied = False
                break
            bindings.update(captured)
        if satisfied:
            return bindings
    return None


def _match_subrule(doc, rhead_lemmas, rhead_ent, rdeps, rchild, variables):
    """Return the bindings captured by the first token satisfying one sub-rule, else None."""
    for tok in doc:
        if rhead_lemmas and tok.lemma_ not in rhead_lemmas:
            continue
        # The entity constraint is checked against the token's entity type,
        # the same attribute `child_eq` uses for the child entity constraint.
        if rhead_ent and rhead_ent != tok.ent_type:
            continue

        captured = {}
        child_found = False
        for ch in tok.children:
            res = child_eq(rdeps, rchild, ch, variables)
            if res:
                child_found = True
                if isinstance(res, dict):
                    # When several children qualify, the last one wins.
                    captured.update(res)
        if child_found:
            return captured
    return None

In [660]:
def match_book_and_year(doc, rules, variables):
    """Extract a book title and publication year from one parsed sentence.

    Runs ``match`` with the given rules, then turns the bound tokens into
    values: the title is the longer of the named entity covering the book
    token and the span produced by ``find_book_name``; the year is parsed from
    the text of the entity covering the date token.

    Args:
        doc: Parsed sentence.
        rules: Rules passed through to ``match``; they are expected to bind
            both ``BOOK_NAME`` and ``REL_DATE``.
        variables: Variable names passed through to ``match``.

    Returns:
        ``{BOOK_NAME: title, REL_DATE: year}`` on a match (either value may be
        ``None``), otherwise an empty dict.
    """
    bindings = match(doc, rules, variables)
    if not bindings:
        return {}

    book_tok = bindings[BOOK_NAME]
    rel_tok = bindings[REL_DATE]
    entity_text = find_entity(doc, book_tok.i)
    expanded_text = find_book_name(doc, book_tok)

    # `find_book_name` may return None; treat that as an empty candidate so
    # the length comparison cannot fail.
    if entity_text and len(entity_text) > len(expanded_text or ''):
        title = entity_text
    else:
        title = expanded_text

    return {BOOK_NAME: title,
            REL_DATE: extract_year(find_entity(doc, rel_tok.i))}

In [635]:
def match_author(doc, title):
    """Report ``title`` as the author when ``doc`` matches ``AUTHOR_RULES``.

    The matched token itself is not used: a successful match only confirms
    that the sentence describes a writer, and the caller-supplied ``title``
    is returned as the author name.

    Returns:
        ``{AUTHOR: title}`` when the rules match, otherwise an empty dict.
    """
    if match(doc, AUTHOR_RULES, AUTHOR_VARS):
        return {AUTHOR: title}
    return {}

In [636]:
def extract_year(rel_date):
    """Return the first run of four digits in ``rel_date`` as an int.

    Returns ``None`` when ``rel_date`` is empty/None or contains no such run.
    """
    if not rel_date:
        return None
    found = re.search(r"\d\d\d\d", rel_date)
    return int(found.group(0)) if found else None

In [637]:
# Number of leading sentences of a page that are searched for the author.
FIRST_SENTS = 3


def match_all(wikipage, rules, variables):
    """Collect ``[author, year, title]`` records from every sentence of a page.

    The author is looked for only in the first ``FIRST_SENTS`` sentences. If
    none has been found by the time those are exhausted, processing stops.
    Records collected before the author is known carry ``None`` as author.

    Args:
        wikipage: Object exposing ``content`` (page text) and ``title``.
        rules: Book/date rules forwarded to ``match_book_and_year``.
        variables: Variable names forwarded to ``match_book_and_year``.

    Returns:
        List of ``[author, year, title]`` lists, the layout that
        ``check_fact`` and ``compare_with_db_precise`` index into.
    """
    collected = []
    author = None
    for i, sent in enumerate(sent_tokenize(wikipage.content)):
        s = nlp(sent)
        binds = match_book_and_year(s, rules, variables)
        if i < FIRST_SENTS:
            # Tolerate a matcher that returns None instead of an empty dict.
            found = match_author(s, wikipage.title) or {}
            author = found.get(AUTHOR, author)
        elif not author:
            print("No author found, skipping")
            break

        if binds:
            collected.append([author, binds.get(REL_DATE), binds.get(BOOK_NAME)])
    return collected

In [638]:
def ground_thruth_year(db, author, title):
    """Look up the publication year recorded in ``db`` for an author/title pair.

    Returns the year of the first row whose ``authors`` and ``original_title``
    both equal the arguments, or ``0`` when there is no such row.
    """
    same_author = db['authors'] == author
    same_title = db['original_title'] == title
    years = db['year'][same_author & same_title].values
    return years[0] if len(years) else 0
    

def compare_with_db_precise(db, facts):
    """Print the ``db`` rows that exactly match each of the first two facts.

    Each fact is indexed as ``(author, year, title)``.
    """
    for record in facts[:2]:
        author, year, title = record[0], record[1], record[2]
        exact = db[(db['authors'] == author) & (db['year'] == year) & (db['original_title'] == title)]
        print(exact)

In [639]:
def check_fact(db, fact):
    """Return whether the year in ``fact`` equals the year ``db`` holds for its author and title.

    ``fact`` is indexed as ``(author, year, title)``.
    """
    author, predicted_year, title = fact[0], fact[1], fact[2]
    return ground_thruth_year(db, author, title) == predicted_year

def check_facts(db, facts):
    """Apply ``check_fact`` to every fact and return the results in order."""
    outcomes = []
    for fact in facts:
        outcomes.append(check_fact(db, fact))
    return outcomes

def calc_accuracy(db, facts):
    """Return the fraction of ``facts`` whose year agrees with ``db``."""
    outcomes = check_facts(db, facts)
    hits = np.sum(outcomes)
    return hits / len(outcomes)

In [640]:
def evaluate_page(db, page_name, rules, variables):
    """Fetch a Wikipedia page, extract facts from it, print them and score them.

    Returns:
        Tuple ``(accuracy, number_of_facts)``.
    """
    page = wiki.page(page_name)
    extracted = match_all(page, rules, variables)
    print(extracted)
    accuracy = calc_accuracy(db, extracted)
    return (accuracy, len(extracted))

In [788]:
# Names of the capture variables a rule may bind (recognised by `is_variable`).
BOOK_NAME = '__BOOK_NAME__'
REL_DATE = '__REL_DATE__'
VARIABLES_V1 = set([BOOK_NAME, REL_DATE])
# Lemmas accepted as the "release" verb and as the generic noun for a book.
REL_SYN = set(['release', 'publish', 'accept', 'write'])
BOOK_SYN = set(['book', 'novel', 'story', 'piece', 'collection'])
# Rule format consumed by `match`:
#   rule     = list of sub-rules, all of which must be satisfied
#   sub-rule = ((head_lemmas, head_ent, unused),
#               allowed_child_dependency_ids,
#               (child_lemmas_or_variable, child_ent, child_pos))
# A None slot means "no constraint". A child whose first slot is a variable
# name binds that variable to the matching child token.
RULES_V1 = [
    # Rule 1: the title (a proper noun) is directly the subject/object of the
    # release verb, and the verb has an "in"/"on" preposition whose object is
    # a DATE entity.
    [((REL_SYN, None, None), (ss.nsubjpass,ss.dobj,ss.nsubj), (BOOK_NAME, None, ss.PROPN)),
     ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
     ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ],
    
    # Rule 2: the subject/object of the verb is a generic book noun and the
    # title is a proper-noun appositive of that noun.
    [((REL_SYN, None,None), (ss.nsubjpass,ss.dobj,ss.nsubj), (BOOK_SYN, None, None)),
     ((BOOK_SYN, None,None), (ss.appos,), (BOOK_NAME, None, ss.PROPN)),
     ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
     ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ],
    
    # Rule 3: the subject/object of the verb is a generic book noun and the
    # title is the proper-noun object of an "of" attached to that noun.
    [((REL_SYN, None, None), (ss.nsubjpass, ss.dobj,ss.nsubj), (BOOK_SYN, None, None)),
     ((BOOK_SYN, None, None), (ss.prep,), (['of'], None, None)),
     ((['of'], None, None), (ss.pobj,), (BOOK_NAME, None, ss.PROPN)),
     ((REL_SYN, None, None), (ss.prep,), (['in', 'on'], None, None)),
     ((['in', 'on'], None, None), (ss.pobj,), (REL_DATE, ss.DATE, None))
    ]
]

# Author detection uses the same rule format with a single variable.
AUTHOR = '__AUTHOR__'
AUTHOR_VARS = (AUTHOR,)
# Noun lemmas that identify the subject as a writer.
WRITER_SYN = set(['novelist','essayist','writer', 'author', 'biographer',
                 'columnist', 'critic', 'dramatist', 'editor',
                 'journalist', 'poet'])

# "<PERSON> is a <writer noun>": "be" has a proper-noun PERSON subject (bound
# to AUTHOR) and an attribute whose lemma is in WRITER_SYN.
AUTHOR_RULES = [
    [((set(['be']), None, None), (ss.nsubj,), (AUTHOR,ss.PERSON, ss.PROPN)),
     ((set(['be']), None, None), (ss.attr,), (WRITER_SYN, None, None))]
]

In [959]:
# Load the labelled sentences and parse each one once up front; the added
# `doc` column holds the result of running `nlp` on the `sent` column.
train = pd.read_csv('train.csv', index_col=0).assign(doc = lambda df: df.sent.apply(nlp))
test = pd.read_csv('test.csv', index_col=0).assign(doc = lambda df: df.sent.apply(nlp))

In [834]:
def predict(X, match_fn):
    """Run ``match_fn`` on every element of ``X`` and split the results into columns.

    Args:
        X: pandas Series of parsed sentences.
        match_fn: Callable returning a dict that may contain ``REL_DATE`` and
            ``BOOK_NAME``.

    Returns:
        Dict with Series ``predicted_year`` and ``predicted_title``; an entry
        is ``None`` where the corresponding key was absent.
    """
    predicted = X.apply(lambda doc: match_fn(doc))

    def column(key):
        return predicted.apply(lambda p: p.get(key))

    return {'predicted_year': column(REL_DATE), 'predicted_title': column(BOOK_NAME)}

In [919]:
# Run the V1 dependency-rule matcher over the training docs and attach its
# `predicted_year` / `predicted_title` columns.
t = train.assign(**predict(train.doc, lambda x: match_book_and_year(x, RULES_V1, VARIABLES_V1)))

In [920]:
def evaluate(t):
    """Compute (precision, recall) for the predicted titles and years in ``t``.

    ``t`` must have columns ``title``, ``predicted_title``, ``year`` and
    ``predicted_year``. See ``_precision_recall`` for how rows are counted.

    Returns:
        ``{'title': (precision, recall), 'year': (precision, recall)}``.
    """
    return {
        'title': _precision_recall(t.title, t.predicted_title),
        'year': _precision_recall(t.year, t.predicted_year),
    }


def _precision_recall(truth, predicted):
    """Return (precision, recall) for one pair of label/prediction Series.

    True positive: both present and equal. False positive: prediction present
    but no label. False negative: label present but no prediction. A present
    but wrong prediction for a labelled row falls into none of these counts.
    """
    has_truth = pd.notna(truth)
    has_pred = pd.notna(predicted)
    tp = np.sum(has_truth & has_pred & (truth == predicted))
    fp = np.sum(~has_truth & has_pred)
    fn = np.sum(has_truth & ~has_pred)
    return (tp / (tp + fp), tp / (tp + fn))

In [921]:
# Precision/recall of the V1 rules on the training set.
metrics1 = evaluate(t)
print(metrics1)

{'title': (0.46153846153846156, 0.3157894736842105), 'year': (0.5333333333333333, 0.38095238095238093)}


In [1098]:
# Vocabulary for the kanren-based rules. REL_SYN is rebound here with the
# extra verb 'finish'; BOOK_SYN2 is a wider list of book nouns.
REL_SYN = set(['release', 'publish', 'accept', 'write', 'finish'])
BOOK_SYN2 = set(['book', 'novel', 'story', 'piece', 'collection','tale', 'manuscript', 'sequel', 'article'])
# Prepositions that may introduce the date.
PREP_SYN = {'on', 'in', 'around'}
# Writer nouns used by `author_attrs` (same members as WRITER_SYN).
WRITER_ATTRS_SYNS = set(['novelist','essayist','writer', 'author', 'biographer',
                 'columnist', 'critic', 'dramatist', 'editor',
                 'journalist', 'poet'])

# Rule table in the format consumed by `match`: "be" with a proper-noun PERSON
# subject (bound to AUTHOR) and an attribute whose lemma is in WRITER_SYN.
AUTHOR_RULES = [
    [((set(['be']), None, None), (ss.nsubj,), (AUTHOR,ss.PERSON, ss.PROPN)),
     ((set(['be']), None, None), (ss.attr,), (WRITER_SYN, None, None))]
]

# Names of the relations `gather_facts` creates: dependency labels, POS tags,
# entity types and the lemma relation.
DEPS = ['nsubjpass','dobj', 'prep', 'pobj', 'appos', 'acl', 'nsubj', 'attr']
POSS = ['PROPN']
ENTS = ['DATE', 'PERSON']
COMS = ['LEMMA']

def gather_facts(doc):
    """Encode a parsed sentence as kanren relations keyed by name.

    One relation is created per name in ``DEPS + POSS + ENTS + COMS`` and
    filled from the tokens of ``doc``:

    * ``LEMMA``: ``(token index, lemma)`` for every token;
    * POS relations: the token index, for tokens whose ``pos_`` is in ``POSS``;
    * dependency relations: ``(head index, token index)`` for tokens whose
      ``dep_`` is in ``DEPS``;
    * entity relations: the token index, for tokens whose ``ent_type_`` is in
      ``ENTS``.

    Returns:
        Dict mapping relation name to its ``Relation``.
    """
    R = {name: Relation(name) for name in DEPS + POSS + ENTS + COMS}
    lemma_rel = R['LEMMA']

    for tok in doc:
        position = tok.i
        facts(lemma_rel, (position, tok.lemma_))

        pos_name = tok.pos_
        if pos_name in POSS:
            fact(R[pos_name], position)

        dep_name = tok.dep_
        if dep_name in DEPS:
            facts(R[dep_name], (tok.head.i, position))

        ent_name = tok.ent_type_
        if ent_name in ENTS:
            fact(R[ent_name], position)

    return R
        
def memberoi(R, idx, lst):
    """Goal: the lemma of token ``idx`` is a member of ``lst``.

    A logic variable named after ``idx`` stands for the lemma; it is related
    to ``idx`` through the ``LEMMA`` relation and constrained to ``lst``.
    """
    lemma = var('l' + str(idx))
    lemma_of_token = R['LEMMA'](idx, lemma)
    lemma_allowed = membero(lemma, lst)
    return conde((lemma_of_token, lemma_allowed))

def action(R, idx):
    """Goal: the lemma of token ``idx`` is one of the verbs in ``REL_SYN``."""
    return memberoi(R, idx, REL_SYN)

def book(R, idx):
    """Goal: the lemma of token ``idx`` is one of the book nouns in ``BOOK_SYN2``."""
    return memberoi(R, idx, BOOK_SYN2)

def prep(R, idx):
    """Goal: the lemma of token ``idx`` is one of the prepositions in ``PREP_SYN``."""
    return memberoi(R, idx, PREP_SYN)

def action_on_object(R, action_i, object_i):
    """Goal: token ``object_i`` is the passive subject or direct object of token ``action_i``.

    Args:
        R: Relations produced by ``gather_facts``.
        action_i: Index (or logic variable) of the head verb.
        object_i: Index (or logic variable) of the dependent.
    """
    # Use the `action_i` argument for the head rather than a module-level name.
    return conde(
        (R['nsubjpass'](action_i, object_i),),
        (R['dobj'](action_i, object_i),),
    )

def date_ent(R, idx):
    """Goal: token ``idx`` was recorded in the ``DATE`` entity relation."""
    return R["DATE"](idx) 

def be(R, idx):
    """Goal: the lemma of token ``idx`` is ``'be'``."""
    return R["LEMMA"](idx, 'be')

def author_attrs(R, idx):
    """Goal: the lemma of token ``idx`` is one of the writer nouns in ``WRITER_ATTRS_SYNS``.

    Built with ``memberoi`` like ``action``, ``book`` and ``prep``; the token
    index, not the lemma variable, is what ``memberoi`` expects.
    """
    return memberoi(R, idx, WRITER_ATTRS_SYNS)


def Book_Date_Rule1(R):
    """Goals for "<verb> <book noun>, <Title>, ... <prep> <date>".

    The book noun is the passive subject/direct object of a REL_SYN verb, the
    title token is the appositive of that noun, and the date token is the
    object of a PREP_SYN preposition attached to the verb.

    NOTE: ``_book_name_i`` and ``_date_i`` are not defined in this function.
    They are read from module scope, where ``match_book_and_year2`` assigns
    them before the rules are built.
    """
    _action_i = var('_action_i')
    _book_i = var('_book_i')
    _prep_i = var('_prep_i')
    return [
        # Lemma constraints on the three helper tokens.
        action(R, _action_i),
        book(R, _book_i),
        prep(R, _prep_i),
        
        # Dependency structure linking them to the title and date tokens.
        action_on_object(R, _action_i, _book_i),
        R["appos"](_book_i, _book_name_i),
        R["prep"](_action_i, _prep_i),
        R["pobj"](_prep_i, _date_i)
    ]

def Book_Date_Rule2(R):
    """Goals for "<Title> <verb> ... <prep> <date>".

    The title token is itself the passive subject/direct object of a REL_SYN
    verb, and the date token is the object of a PREP_SYN preposition attached
    to that verb.

    NOTE: ``_book_name_i`` and ``_date_i`` are read from module scope, where
    ``match_book_and_year2`` assigns them before the rules are built.
    """
    _action_i = var('_action_i')
    _prep_i = var('_prep_i')
    return [
        action(R, _action_i),
        prep(R, _prep_i),
        action_on_object(R, _action_i, _book_name_i),
        R['prep'](_action_i, _prep_i),
        R['pobj'](_prep_i, _date_i)
    ]

def Author_Rule1(R):
    """Goals for "<Name> is a <writer noun>".

    A token with lemma 'be' has the author-name token as its ``nsubj`` and a
    writer noun (see ``author_attrs``) as its ``attr``.

    NOTE: ``_author_name_i`` is not defined in this function; it is read from
    module scope and must be assigned before this rule is built.
    """
    _be_i = var('_be_i')
    _author_attrs_i = var('_author_attrs_i')
    return [
        be(R, _be_i),
        author_attrs(R, _author_attrs_i),
        R['nsubj'](_be_i, _author_name_i),
        R['attr'](_be_i, _author_attrs_i)
    ]
    

# Rule builders in the order `run_rules` tries them; the first one that yields
# a result wins.
Book_Date_Rules = [Book_Date_Rule1, Book_Date_Rule2]
        
def run_rules(doc, vs, rules_fns): 
    """Try each rule builder on ``doc`` and return the first non-empty result.

    Args:
        doc: Parsed sentence, converted to relations with ``gather_facts``.
        vs: Logic variable(s) whose values are requested from ``run``.
        rules_fns: Callables taking the relations dict and returning a list
            of goals.

    Returns:
        The result of ``run(1, vs, ...)`` for the first rule that produces
        one, or an empty list. Every attempted result is printed.
    """
    relations = gather_facts(doc)
    for build_goals in rules_fns:
        solutions = run(1, vs, *build_goals(relations))
        print(solutions)
        if not solutions:
            continue
        return solutions
    return []


In [1076]:
def match_book_and_year2(doc):    
    """Extract a book title and publication year from ``doc`` using the kanren rules.

    Args:
        doc: Parsed sentence.

    Returns:
        ``{BOOK_NAME: title, REL_DATE: year}`` when a rule in
        ``Book_Date_Rules`` binds both the title token and the date token and
        a title text can be recovered; otherwise an empty dict. ``year`` may
        be ``None`` if no year can be parsed from the date entity.
    """
    global _book_name_i
    global _date_i
    global _author_name_i
    # The rule builders read these logic variables from module scope, so they
    # are (re)created here before the rules are built.
    _book_name_i = var('_book_name_i')
    _date_i = var('_date_i')
    _author_name_i = var('_author_name_i')
    results = run_rules(doc, (_book_name_i, _date_i), Book_Date_Rules)

    for res in results:
        if not res:
            continue
        book_name_idx, date_idx = res
        # A slot the rule left unbound comes back as a logic variable, not a
        # token index. Test the type rather than truthiness so that token
        # index 0 is accepted.
        if not (isinstance(book_name_idx, int) and isinstance(date_idx, int)):
            continue

        book_tok = doc[book_name_idx]
        rel_tok = doc[date_idx]
        entity_text = find_entity(doc, book_tok.i)
        expanded_text = find_book_name(doc, book_tok)

        # Keep the longer non-empty candidate; on a tie the expanded span
        # (listed first) is preferred over the entity text.
        candidates = [text for text in (expanded_text, entity_text) if text]
        if not candidates:
            return {}
        book_name = max(candidates, key=len)

        return {BOOK_NAME: book_name,
                REL_DATE: extract_year(find_entity(doc, rel_tok.i))}
    return {}

In [1142]:

def author_attrs(R, idx):
    """Goal: the lemma of token ``idx`` is one of the writer nouns in ``WRITER_ATTRS_SYNS``.

    ``memberoi`` already relates ``idx`` to its lemma through the ``LEMMA``
    relation, so no separate lemma goal is needed here.
    """
    return memberoi(R, idx, WRITER_ATTRS_SYNS)

def Author_Rule1(R):
    """Goals for "<Name> is a <writer noun>".

    A token with lemma 'be' has the author-name token as its ``nsubj`` and a
    writer noun (see ``author_attrs``) as its ``attr``.

    NOTE: ``_author_name_i`` is read from module scope and must be assigned
    before this rule is built.
    """
    _be_i = var('_be_i')
    _author_attrs_i = var('_author_attrs_i')
    # Debug output: the (head, child) index pairs recorded for `nsubj`.
    print(R['nsubj'].facts)
    return [
        be(R, _be_i),
        author_attrs(R, _author_attrs_i),
        R['nsubj'](_be_i, _author_name_i),
        R['attr'](_be_i, _author_attrs_i)
    ]

def match_author(doc, title):
    """Report ``title`` as the author when ``doc`` satisfies ``Author_Rule1``.

    The matched token index is not used: a hit only confirms that the sentence
    describes a writer, and the caller-supplied ``title`` is returned as the
    author name.

    Returns:
        ``{AUTHOR: title}`` on a match, otherwise an empty dict.
    """
    global _author_name_i
    # Author_Rule1 reads this logic variable from module scope.
    _author_name_i = var('_author_name_i')
    result = run_rules(doc, (_author_name_i,), [Author_Rule1])
    print(result)
    if result:
        return {AUTHOR: title}
    # Always hand back a dict so callers can use `.get` on the result.
    return {}
    
# Smoke test of the author rule on an "X ... is a British novelist" sentence;
# "s" is a placeholder for the page title.
match_author(nlp("Joanne Rowling,  , writing under the pen names J. K. Rowling and Robert Galbraith, is a British novelist, screenwriter, and producer who is best known for writing the Harry Potter fantasy series."), "s")

{(17, 1)}
((1,),)
((1,),)


{'__AUTHOR__': 's'}

In [1087]:
# NOTE: `doc` is only assigned in a later cell of this file (the debugging
# cell that parses the "He wrote his first published novel..." sentence), so
# this cell depends on that one having been run first.
match_book_and_year2(doc)

()
()


{}

()
[]


{}

In [1066]:
# Part-of-speech ids accepted inside a multi-word name when expanding a title.
propn_parts = {ss.PROPN, ss.PART, ss.ADP, ss.DET, ss.CCONJ, ss.NOUN}


def valid_named_part(tok):
    """Return True if ``tok`` may be part of a name: a hyphen or a POS in ``propn_parts``."""
    if tok.text == '-':
        return True
    return tok.pos in propn_parts
        
def find_book_name(doc, tok):
    """Recover the full title text around ``tok``, the token bound as the book name.

    Two strategies, selected by how ``tok`` sits in the parse:

    * Appositive (``tok.dep == ss.appos``): the title is everything between
      the nearest non-hyphen punctuation token at or before ``tok`` and the
      nearest one at or after it.
    * Proper noun (``tok.pos == ss.PROPN``): the title is the contiguous run
      of name-like tokens around ``tok``, where a token only counts if a
      proper noun lies within the three-token window starting at it and
      extending in the scan direction.

    Args:
        doc: Parsed sentence; indexable by token position and sliceable.
        tok: Token inside ``doc``.

    Returns:
        The title text, or ``None`` when ``tok`` is neither an appositive nor
        a proper noun.
    """
    if tok.dep == ss.appos:
        # Fall back to the sentence edges when no punctuation fences the title
        # on one side, so the slice below always has valid bounds.
        left = -1
        right = len(doc)
        # Scan down to index 0 inclusive: the opening punctuation may be the
        # very first token. Hyphens do not end a title.
        for idx in range(tok.i, -1, -1):
            if doc[idx].pos == ss.PUNCT and doc[idx].text != '-':
                left = idx
                break
        for idx in range(tok.i, len(doc)):
            if doc[idx].pos == ss.PUNCT and doc[idx].text != '-':
                right = idx
                break
        return doc[left + 1:right].text

    if tok.pos == ss.PROPN:
        left = right = tok.i
        # Grow leftwards: determiners always extend the name; other name-like
        # tokens only if a proper noun is at idx, idx-1 or idx-2.
        for idx in range(tok.i, -1, -1):
            propn_nearby = any(doc[j].pos == ss.PROPN
                               for j in range(idx, max(idx - 3, -1), -1))
            if doc[idx].pos == ss.DET or (valid_named_part(doc[idx]) and propn_nearby):
                left = idx
            else:
                break
        # Grow rightwards: name-like tokens with a proper noun at idx, idx+1
        # or idx+2.
        for idx in range(tok.i, len(doc)):
            propn_nearby = any(doc[j].pos == ss.PROPN
                               for j in range(idx, min(idx + 3, len(doc))))
            if valid_named_part(doc[idx]) and propn_nearby:
                right = idx
            else:
                break
        return doc[left:right + 1].text

    return None

In [1067]:
# Score the kanren-based matcher on the training set.
t2 = train.assign(**predict(train.doc, match_book_and_year2))
print(evaluate(t2))

{'title': (0.8181818181818182, 0.5), 'year': (0.875, 0.3684210526315789)}


In [1029]:
# Scratch cell for debugging rule 1 on a single sentence: the rule is rebuilt
# without its two date goals to see which token the appositive goal binds.
doc = nlp("He wrote his first published novel, The Town and the City, and began the famous On the Road around 1949 while living there")
x = var('x')
_book_name_i = var('_book_name_i')
def Book_Date_Rule1__(R):
    """Copy of Book_Date_Rule1 with the preposition/date dependency goals disabled."""
    _action_i = var('_action_i')
    _book_i = var('_book_i')
    _prep_i = var('_prep_i')
    return [
        action(R, _action_i),
        book(R, _book_i),
        prep(R, _prep_i),
        
        action_on_object(R, _action_i, _book_i),
        R["appos"](_book_i, _book_name_i),
        #R["prep"](_action_i, _prep_i),
        #R["pobj"](_prep_i, _date_i)
    ]

# NOTE: `_date_i` is not assigned in this cell; it must already exist at module
# scope (match_book_and_year2 creates it). With the date goals disabled it
# stays unbound in the result.
run(1,(_book_name_i, _date_i), *Book_Date_Rule1__(gather_facts(doc))), doc[11],gather_facts(doc)['appos'].facts

(((8, ~_date_i),), City, {(5, 8)})

In [943]:
# Score the kanren-based matcher on the training set.
t2 = train.assign(**predict(train.doc, match_book_and_year2))
print(evaluate(t2))

{'title': (0.8181818181818182, 0.47368421052631576), 'year': (0.875, 0.35)}


In [949]:
# Show the training rows that either received a predicted year or have a
# labelled title, i.e. the hits and the misses.
t2[pd.notna(t2.predicted_year) | pd.notna(t2.title)]

Unnamed: 0,sent,title,year,author,doc,predicted_year,predicted_title
56,"His first success as a writer came when his humorous tall tale ""The Celebrated Jumping Frog of Calaveras County"" was published on November 18, 1865, in the New York weekly The Saturday Press, bringing him national attention.",The Celebrated Jumping Frog of Calaveras County,1865.0,Mark Twain,"(His, first, success, as, a, writer, came, when, his, humorous, tall, tale, "", The, Celebrated, Jumping, Frog, of, Calaveras, County, "", was, published, on, November, 18, ,, 1865, ,, in, the, New, York, weekly, The, Saturday, Press, ,, bringing, him, national, attention, .)",,The Celebrated Jumping Frog of Calaveras County
307,"The Incident in the Philippines, posthumously published in 1924, was in response to the Moro Crater Massacre, in which six hundred Moros were killed.",The Incident in the Philippines,1924.0,Mark Twain,"(The, Incident, in, the, Philippines, ,, posthumously, published, in, 1924, ,, was, in, response, to, the, Moro, Crater, Massacre, ,, in, which, six, hundred, Moros, were, killed, .)",,
910,"In June 1997, Bloomsbury published Philosopher's Stone with an initial print run of 1,000 copies, 500 of which were distributed to libraries.",Philosopher's Stone,1997.0,J.K. Rowling,"(In, June, 1997, ,, Bloomsbury, published, Philosopher, 's, Stone, with, an, initial, print, run, of, 1,000, copies, ,, 500, of, which, were, distributed, to, libraries, .)",1997.0,Philosopher's Stone
903,"==\n\nIn 1995, Rowling finished her manuscript for Harry Potter and the Philosopher's Stone on an old manual typewriter.",Harry Potter and the Philosopher's Stone,1995.0,J.K. Rowling,"(=, =, \n\n, In, 1995, ,, Rowling, finished, her, manuscript, for, Harry, Potter, and, the, Philosopher, 's, Stone, on, an, old, manual, typewriter, .)",,
52,"His experiences in the American West inspired Roughing It, written during 1870–71 and published in 1872.",Roughing It,1872.0,Mark Twain,"(His, experiences, in, the, American, West, inspired, Roughing, It, ,, written, during, 1870–71, and, published, in, 1872, .)",,
710,La nuit est ma femme was written in early 1951 and completed a few days or weeks before he began the original English version of,La nuit est ma femme,1951.0,Jack Kerouac,"(La, nuit, est, ma, femme, was, written, in, early, 1951, and, completed, a, few, days, or, weeks, before, he, began, the, original, English, version, of)",1951.0,La nuit est ma femme
561,"He wrote his first published novel, The Town and the City, and began the famous On the Road around 1949 while living there.",The Town and the City,1949.0,Jack Kerouac,"(He, wrote, his, first, published, novel, ,, The, Town, and, the, City, ,, and, began, the, famous, On, the, Road, around, 1949, while, living, there, .)",,
918,"Its sequel, Harry Potter and the Chamber of Secrets, was published in July 1998 and again Rowling won the Smarties Prize.",Harry Potter and the Chamber of Secrets,1998.0,J.K. Rowling,"(Its, sequel, ,, Harry, Potter, and, the, Chamber, of, Secrets, ,, was, published, in, July, 1998, and, again, Rowling, won, the, Smarties, Prize, .)",1998.0,Harry Potter and the Chamber of Secrets
930,"The sixth book, Harry Potter and the Half-Blood Prince, was released on 16 July 2005.",Harry Potter and the Half-Blood Prince,2005.0,J.K. Rowling,"(The, sixth, book, ,, Harry, Potter, and, the, Half, -, Blood, Prince, ,, was, released, on, 16, July, 2005, .)",2005.0,Harry Potter and the Half-Blood Prince
367,"A subsequent article, ""To My Missionary Critics"" published in The North American Review in April 1901, unapologetically continues his attack, but with the focus shifted from Ament to his missionary superiors, the American Board of Commissioners for Foreign Missions.\n",To My Missionary Critics,1901.0,Mark Twain,"(A, subsequent, article, ,, "", To, My, Missionary, Critics, "", published, in, The, North, American, Review, in, April, 1901, ,, unapologetically, continues, his, attack, ,, but, with, the, focus, shifted, from, Ament, to, his, missionary, superiors, ,, the, American, Board, of, Commissioners, fo...",,


In [960]:
# Apply the kanren-based matcher to the held-out test sentences.
tt = test.assign(**predict(test.doc, match_book_and_year2))

In [961]:
# Precision/recall of the kanren-based matcher on the test set.
evaluate(tt)

{'title': (0.75, 0.42857142857142855),
 'year': (0.6666666666666666, 0.2857142857142857)}

In [962]:
# Show the test rows that either received a predicted year or have a labelled
# title.
tt[pd.notna(tt.predicted_year) | pd.notna(tt.title)]

Unnamed: 0,sent,title,year,author,doc,predicted_year,predicted_title
787,"Both Sur le chemin and La nuit est ma femme have also been translated to English by Jean-Christophe Cloutier, in collaboration with Kerouac, and were published in 2016 by the Library of America in The Unknown Kerouac.\n\n\n",Both Sur le chemin,2016.0,Jack Kerouac,"(Both, Sur, le, chemin, and, La, nuit, est, ma, femme, have, also, been, translated, to, English, by, Jean, -, Christophe, Cloutier, ,, in, collaboration, with, Kerouac, ,, and, were, published, in, 2016, by, the, Library, of, America, in, The, Unknown, Kerouac, ., \n\n\n)",,
935,Harry Potter and the Deathly Hallows was released on 21 July 2007 (0:01 BST) and broke its predecessor's record as the fastest-selling book of all time.,Harry Potter and the Deathly Hallows,2007.0,J.K. Rowling,"(Harry, Potter, and, the, Deathly, Hallows, was, released, on, 21, July, 2007, (, 0:01, BST, ), and, broke, its, predecessor, 's, record, as, the, fastest, -, selling, book, of, all, time, .)",2007.0,Harry Potter and the Deathly Hallows
370,"Little Bessie, a story ridiculing Christianity, was first published in the 1972 collection Mark Twain's Fables of Man.\n",Little Bessie,1972.0,Mark Twain,"(Little, Bessie, ,, a, story, ridiculing, Christianity, ,, was, first, published, in, the, 1972, collection, Mark, Twain, 's, Fables, of, Man, ., \n)",,Little Bessie
561,"He wrote his first published novel, The Town and the City, and began the famous On the Road around 1949 while living there.",The Town and the City,1949.0,Jack Kerouac,"(He, wrote, his, first, published, novel, ,, The, Town, and, the, City, ,, and, began, the, famous, On, the, Road, around, 1949, while, living, there, .)",,
931,"It too broke all sales records, selling nine million copies in its first 24 hours of release.",Harry Potter and the Half-Blood Prince,2005.0,J.K. Rowling,"(It, too, broke, all, sales, records, ,, selling, nine, million, copies, in, its, first, 24, hours, of, release, .)",,
6,"His humorous story, ""The Celebrated Jumping Frog of Calaveras County"", was published in 1865, based on a story that he heard at Angels Hotel in Angels Camp, California where he had spent some time as a miner.",The Celebrated Jumping Frog of Calaveras County,1865.0,Mark Twain,"(His, humorous, story, ,, "", The, Celebrated, Jumping, Frog, of, Calaveras, County, "", ,, was, published, in, 1865, ,, based, on, a, story, that, he, heard, at, Angels, Hotel, in, Angels, Camp, ,, California, where, he, had, spent, some, time, as, a, miner, .)",1865.0,The Celebrated Jumping Frog of Calaveras County
815,"The seven-year period that followed saw the death of her mother, birth of her first child, divorce from her first husband and relative poverty until the first novel in the series, Harry Potter and the Philosopher's Stone, was published in 1997.",,,J.K. Rowling,"(The, seven, -, year, period, that, followed, saw, the, death, of, her, mother, ,, birth, of, her, first, child, ,, divorce, from, her, first, husband, and, relative, poverty, until, the, first, novel, in, the, series, ,, Harry, Potter, and, the, Philosopher, 's, Stone, ,, was, published, in, 19...",1997.0,Harry Potter and the Philosopher's Stone
367,"A subsequent article, ""To My Missionary Critics"" published in The North American Review in April 1901, unapologetically continues his attack, but with the focus shifted from Ament to his missionary superiors, the American Board of Commissioners for Foreign Missions.\n",To My Missionary Critics,1901.0,Mark Twain,"(A, subsequent, article, ,, "", To, My, Missionary, Critics, "", published, in, The, North, American, Review, in, April, 1901, ,, unapologetically, continues, his, attack, ,, but, with, the, focus, shifted, from, Ament, to, his, missionary, superiors, ,, the, American, Board, of, Commissioners, fo...",,


In [1141]:
# Visualise the parse of the sentence used to try out match_author.
displacy.render(nlp("Joanne Rowling,  , writing under the pen names J. K. Rowling and Robert Galbraith, is a British novelist, screenwriter, and producer who is best known for writing the Harry Potter fantasy series."), jupyter=True, )

In [None]:
# Display the numeric ids of the POS symbols collected in `propn_parts`.
ss.PROPN, ss.PART, ss.ADP, ss.DET, ss.CCONJ, ss.NOUN