In [None]:
import spacy
import pandas as pd
from nltk.corpus import wordnet as wn
import nltk
import json
from nltk.tag import StanfordNERTagger
from pycorenlp import StanfordCoreNLP
from autocorrect import spell

In [None]:
nlp = %time spacy.load('en_coref_lg')
core_nlp = StanfordCoreNLP('http://localhost:9002')
core_nlp_pos_props = {"annotators":"tokenize,ssplit,pos","outputFormat": "json"}
core_nlp_ie_props = {"annotators":"openie",\
                     "outputFormat": "json","openie.triple.strict":"true","splitter.disable" : "true"}
path_data_index = '../data/bedtime_stories_data/index.csv'
ner_tagger = StanfordNERTagger('../resources/stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',\
                               '../resources/stanford-ner-2018-10-16/stanford-ner.jar',\
                              encoding='utf8')

In [None]:
def findPersons(story_text):
    """Collect the lower-cased tokens that the NER tagger labels as PERSON.

    The text is split into sentences, each sentence is word-tokenized and
    tagged on its own, and every token tagged 'PERSON' is added to the result.

    Args:
        story_text: The story as a single string.

    Returns:
        A set of lower-cased token strings (empty if none were tagged PERSON).
    """
    return {
        tagged[0].lower()
        for sentence in nltk.sent_tokenize(story_text)
        for tagged in ner_tagger.tag(nltk.tokenize.word_tokenize(sentence))
        if tagged[1] == 'PERSON'
    }

In [None]:
def getCharacters(story_text):
    """Build the initial character set for a story.

    Starts from the names returned by `findPersons`, then adds every
    singular common noun (CoreNLP POS tag 'NN') whose spell-corrected form
    is judged living by `isLiving`.

    Args:
        story_text: The story as a single string.

    Returns:
        A set of lower-cased character strings.
    """
    characters = findPersons(story_text)
    annotation = core_nlp.annotate(story_text, properties=core_nlp_pos_props)
    for sentence in annotation['sentences']:
        for token in sentence['tokens']:
            if token['pos'] != 'NN':
                continue
            # Spell-correct only the tokens that can actually become
            # characters; correcting every token of the story is wasted work.
            word = spell(token['word'])
            if isLiving(word):
                characters.add(word.lower())
    return characters

In [None]:
def return_story_text(path_story):
    """Read a story file and return its text on a single line.

    Args:
        path_story: Path of the text file to read.

    Returns:
        The file contents with every newline replaced by a space.
    """
    # The context manager closes the handle even if read() raises.
    with open(path_story, 'r') as file:
        text = file.read()
    return text.replace('\n', ' ')

In [None]:
def isLiving(word):
    """Tell whether the first noun sense of `word` is a kind of person or animal.

    Looks up the WordNet synset '<word>.n.01' and walks its hypernym closure,
    checking for the synsets 'person.n.01' or 'animal.n.01'. Only ancestors
    are inspected, not the looked-up synset itself.

    Args:
        word: The word to look up.

    Returns:
        True if one of the hypernyms is 'person.n.01' or 'animal.n.01';
        False otherwise, including when the synset lookup fails.
    """
    try:
        synset = wn.synset(word + '.n.01')
    except Exception:
        # Lookup failed (e.g. no such noun sense). Unlike a bare `except`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        return False
    living_roots = {'person.n.01', 'animal.n.01'}
    # Compare synset names exactly; a substring test on the list's repr would
    # also accept any synset whose name merely ends in one of the roots.
    return any(
        hypernym.name() in living_roots
        for hypernym in synset.closure(lambda s: s.hypernyms())
    )

In [None]:
def resolvePronouns(story_text):
    """Run the spaCy pipeline on the text and return its `coref_resolved` value.

    Args:
        story_text: The story as a single string.

    Returns:
        The `_.coref_resolved` extension attribute of the parsed document.
    """
    return nlp(story_text)._.coref_resolved

In [None]:
def extractRelationships(story_text):
    """Extract OpenIE triples from every sentence of the text.

    The text is split into sentences with NLTK and each one is sent to the
    CoreNLP server with the OpenIE properties.

    Args:
        story_text: The text to analyse.

    Returns:
        A list of `(relation, subject, object)` tuples in the order the
        server returned them.
    """
    triples = []
    for sent in nltk.sent_tokenize(story_text):
        output = core_nlp.annotate(sent, properties=core_nlp_ie_props)
        # Iterate over the sentences in the response, not over the response's
        # top-level keys: the latter repeats the first sentence's triples once
        # per key, skips any further sentences, and fails on an empty list.
        for annotated in output["sentences"]:
            for relation in annotated.get("openie", ()):
                triples.append(
                    (relation['relation'], relation['subject'], relation['object'])
                )
    return triples

In [None]:
def findMoreCharacters(old_char_set, phrase):
    """Add `phrase` to the character set if it is a single living-thing word.

    The phrase is word-tokenized; only when it yields exactly one token and
    `isLiving` accepts that token is its lower-cased form added.

    Args:
        old_char_set: Set of known characters; modified in place.
        phrase: Candidate text, e.g. the subject or object of a relation.

    Returns:
        The same `old_char_set` object.
    """
    tokens = nltk.word_tokenize(phrase)
    if len(tokens) != 1:
        return old_char_set
    candidate = tokens[0]
    if isLiving(candidate):
        old_char_set.add(candidate.lower())
    return old_char_set

In [None]:
def updateCharacters(old_char_set, relations):
    """Extend the character set using the elements at index 1 and 2 of each relation.

    Each relation's element 1 and then element 2 is passed through
    `findMoreCharacters`, threading the returned set into the next call.

    Args:
        old_char_set: Set of known characters.
        relations: Iterable of indexable relations.

    Returns:
        The set returned by the last `findMoreCharacters` call, or
        `old_char_set` itself when there are no relations.
    """
    characters = old_char_set
    for triple in relations:
        for position in (1, 2):
            characters = findMoreCharacters(characters, triple[position])
    return characters

In [None]:
# Load the story index; its `path` column locates each story's text file.
df = pd.read_csv(path_data_index)

# Read every story and collect its initial character set.
df['story_text'] = df['path'].apply(return_story_text)
df['characters'] = df['story_text'].apply(getCharacters)

# Resolve coreferences first, then extract relation triples from the resolved text.
df['pronouns_resolved_text'] = df['story_text'].apply(resolvePronouns)
df['relations'] = df['pronouns_resolved_text'].apply(extractRelationships)

# Extend each story's characters with living subjects/objects found in its relations.
df['characters'] = df.apply(
    lambda story: updateCharacters(story['characters'], story['relations']),
    axis=1,
)
df.head()