In [2]:
import json
from pprint import pprint
from collections import namedtuple
data = json.load(open('./extractedFactsWithYear.json'), object_hook=lambda d: namedtuple('X', d.keys())(*d.values()))

In [7]:
from random import shuffle
articles = data.articles
shuffle(articles)
divider = int(round(0.5 * len(data.articles)))
learn_data = data.articles[:divider]
test_data = data.articles[divider:]

In [13]:
import spacy
nlp = spacy.load('en_core_web_md')

In [105]:
import re
entity_with_year_re = re.compile('''([A-Z][a-zA-Z'--(0-9)]*(?:\s[A-Z-0-9][a-zA-Z'--(0-9)]*)*((:|\sdel|\sfor|\sin|\sthe|\sof|\sat|\sand|\sis|\sor|\s&|\.|\sto)*(?:\s[A-Z-0-9][a-zA-Z'--(0-9)]*)+)*) \(([1-3][0-9]{3})\)''')
entity_re = re.compile('''([A-Z][a-zA-Z'--(0-9)]*(?:\s[A-Z-0-9][a-zA-Z-(0-9)]*)*((:|\sdel|\sfor|\sin|\sthe|\sof|\sat|\sto|\sde)*(?:\s[A-Z-0-9][a-zA-Z'--(0-9)]*)+)*)''')

In [17]:
def score(fact_dic, fact):    
    if fact['film'] not in fact_dic:
        return 0
    if fact_dic[fact['film']]['director'] == fact['director']: 
        if fact_dic[fact['film']]['year'] == fact['year']:
            return 1
        else:            
            return 0.75
    return 0.5

def check(articles):
    facts_amount = 0
    final_score = 0
    for article in articles:
        facts_amount = facts_amount + len(article.facts)
        extracted = extract(article)
        facts_dic = {f.film:{'director':f.director, 'year':f.year} for f in article.facts}
        for f in extracted:
            final_score = final_score + score(facts_dic, f)
    return final_score/facts_amount * 100

In [155]:
def director_from_article_name(article_name):
    return next((ent.text for ent in nlp(article_name).ents if ent.label_ == 'PERSON'), '')

def parse_director_name(article_name, doc):
    director = next((ent.text for ent in doc.ents if ent.label_ == 'PERSON'), '')
    director_from_name = director = director_from_article_name(article_name)
    if director == '': return director_from_name
    if len(director.split(' ')) < 2 : return director_from_name
    return director

def find_ind(doc, value):
    i = 0 
    for word in doc:
        if word.text == value:
            return i
        i= i + 1
    
    
def firstRule(article_name,sentence):
    results = []
    doc = nlp(sentence)
    director = parse_director_name(article_name, doc)
    for s in entity_with_year_re.finditer(sentence):
        fact = {
                'director':director,
                'film' : s.group()[:-7],
                'year' : s.group()[-5:-1]
        }
        results.append(fact)
    return results
def film_rule(article):
    result = {'film':'', 'year':'', 'director':''}
    doc = nlp(article.sentence)
    entities = [s.group() for s in entity_re.finditer(article.sentence)]    
    for ent in entities:    
        ent_doc = nlp(ent)
        start = find_ind(doc, ent_doc[0].text)
        if start == None: continue
        span = doc[start:start + len(ent_doc)]
        span.merge()
    #for word in doc:
    #    print("Word:", word.text, " Tag:", word.tag_, " Head:", word.head.text, "Dependency relation:", word.dep_)
    #    print("Children:", list(word.children))
    #    print("")
    years = [s.group() for s in re.finditer("([1-3][0-9]{3})",article.sentence)]
    if len(years) ==1:
        result['year'] = years[0]    
    for w in doc:        
        if w.text == "film":
            if w.head.text in entities and w.dep_ == 'pobj':
                result['film'] = w.head.text
                continue
            for child in w.children:
                if child.text in entities and child.dep_ == 'appos':
                    result['film'] = child.text
                    break
        if result['year'] != '': continue
        if w.dep_ == "ROOT":
            preps = [child for child in w.children if child.dep_ == "prep"]
            for prep in preps:
                match = re.search("([1-3][0-9]{3})", prep.text)
                if match:
                    result['year'] = match.group()
                    break
    result['director'] = director_from_article_name(article.name)
    return [result]

def extract(article):
    if re.search("\(([1-3][0-9]{3})\)",article.sentence):
        return firstRule(article.name, article.sentence)
    entities = [s.group() for s in entity_re.finditer(article.sentence)]
    years = [s.group() for s in re.finditer("([1-3][0-9]{3})",article.sentence)]
    if len(entities) == 1 and len(years) ==1:
        return [{
                'director':director_from_article_name(article.name),
                'film' : entities[0],
                'year' : years[0]
        }]
    if "film" in article.sentence:
        return film_rule(article)
    if "directed" in article.sentence:
        return []
    else :
        return []

In [158]:
check(learn_data)

55.44554455445545

In [157]:
check(test_data)

60.396039603960396