In [1]:
import spacy
import pandas as pd
from nltk.corpus import wordnet as wn
import nltk
import json
from nltk.tag import StanfordNERTagger
from pycorenlp import StanfordCoreNLP
from autocorrect import spell

In [2]:
# Load the spaCy pipeline named 'en_coref_lg'; `%time` is an IPython magic that
# reports how long the load takes (see the timing output under this cell).
# resolvePronouns() later reads `doc._.coref_resolved` from this pipeline.
nlp = %time spacy.load('en_coref_lg')
# Client for a Stanford CoreNLP server at this URL.
# NOTE(review): assumes the server is already running on localhost:9002 - confirm before Run All.
core_nlp = StanfordCoreNLP('http://localhost:9002')
# Request properties for POS tagging (used by getCharacters): tokenize, sentence-split, POS; JSON output.
core_nlp_pos_props = {"annotators":"tokenize,ssplit,pos","outputFormat": "json"}
# Request properties for OpenIE triple extraction (used by extractRelationships); JSON output.
core_nlp_ie_props = {"annotators":"openie",\
                     "outputFormat": "json","openie.triple.strict":"true","splitter.disable" : "true"}
# CSV index of the stories; the pipeline cell reads a `path` column from it.
path_data_index = 'bedtime_stories_data/index.csv'
# Stanford NER tagger (3-class model) used by findPersons; both paths are relative
# to the notebook's working directory.
ner_tagger = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',\
                               'stanford-ner-2018-10-16/stanford-ner.jar',\
                              encoding='utf8')

CPU times: user 11.6 s, sys: 2.86 s, total: 14.5 s
Wall time: 16 s


In [3]:
def findPersons(story_text):
    """Return the lower-cased tokens that the Stanford NER tagger labels PERSON.

    The story is split into sentences with NLTK, each sentence is
    word-tokenized, and the tokens are tagged one sentence at a time.

    Args:
        story_text: Full story text as a single string.

    Returns:
        set of lower-cased token strings tagged 'PERSON'.
    """
    persons = set()
    for sentence in nltk.sent_tokenize(story_text):
        tagged = ner_tagger.tag(nltk.tokenize.word_tokenize(sentence))
        # Each tagged item is a (token, label) pair.
        persons.update(token.lower() for token, label in tagged if label == 'PERSON')
    return persons

In [4]:
def getCharacters(story_text):
    """Collect candidate character names from a story.

    Starts from the PERSON tokens returned by ``findPersons`` and adds every
    token that CoreNLP tags as ``NN`` and that ``isLiving`` accepts after
    spell-correction.

    Args:
        story_text: Full story text as a single string.

    Returns:
        set of lower-cased character names.
    """
    characters = findPersons(story_text)
    annotation = core_nlp.annotate(story_text, properties=core_nlp_pos_props)
    for sentence in annotation['sentences']:
        for token in sentence['tokens']:
            # Check the cheap POS tag first so the spell-corrector only runs
            # on candidate nouns instead of on every token in the story.
            if token['pos'] != 'NN':
                continue
            word = spell(token['word'])
            if isLiving(word):
                characters.add(word.lower())
    return characters

In [5]:
def return_story_text(path_story):
    """Read a story file and return its text on a single line.

    Args:
        path_story: Path of the text file to read.

    Returns:
        The file contents with every newline replaced by a single space.
    """
    # The context manager closes the handle even if read() raises.
    with open(path_story, 'r') as story_file:
        return story_file.read().replace('\n', ' ')

In [6]:
def isLiving(word):
    """Tell whether the first noun sense of ``word`` is a person or an animal.

    Looks up the WordNet synset ``<word>.n.01`` and walks its hypernym
    closure looking for ``person.n.01`` or ``animal.n.01``.

    Args:
        word: A single word (string).

    Returns:
        True if one of the hypernyms is ``person.n.01`` or ``animal.n.01``;
        False otherwise, including when the synset lookup fails.
    """
    wn_word = word + '.n.01'
    try:
        synset = wn.synset(wn_word)
    except Exception:
        # Best-effort lookup: a failed lookup means "not living".
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return False
    living_roots = {'person.n.01', 'animal.n.01'}
    # Compare exact synset names rather than searching the printed list, so a
    # name that merely contains one of the roots as a substring cannot match.
    hypernyms = synset.closure(lambda s: s.hypernyms())
    return any(hyper.name() in living_roots for hyper in hypernyms)

In [7]:
def resolvePronouns(story_text):
    """Run the spaCy pipeline on the story and return its coref-resolved text.

    Args:
        story_text: Full story text as a single string.

    Returns:
        The value of the document's ``_.coref_resolved`` extension attribute.
    """
    return nlp(story_text)._.coref_resolved

In [8]:
def extractRelationships(story_text):
    """Extract OpenIE triples from a story, one NLTK sentence at a time.

    Args:
        story_text: Story text (the pipeline passes the pronoun-resolved text).

    Returns:
        list of ``(relation, subject, object)`` string tuples in the order
        CoreNLP reported them.
    """
    triples = []
    for sent in nltk.sent_tokenize(story_text):
        annotation = core_nlp.annotate(sent, properties=core_nlp_ie_props)
        # Walk every sentence CoreNLP returns exactly once. Iterating over the
        # response's top-level keys and reading only sentences[0] would repeat
        # triples and drop any further sentences CoreNLP split out.
        for parsed in annotation['sentences']:
            for relation in parsed.get('openie', []):
                triples.append((relation['relation'], relation['subject'], relation['object']))
    return triples

In [9]:
def findMoreCharacters(old_char_set, phrase):
    """Add ``phrase`` to the character set when it is a single living-thing word.

    The phrase is word-tokenized; only a phrase of exactly one token is
    considered, and it is added (lower-cased) when ``isLiving`` accepts it.

    Args:
        old_char_set: Set of character names; updated in place.
        phrase: Subject or object phrase from a relation triple.

    Returns:
        The same set object that was passed in.
    """
    tokens = nltk.word_tokenize(phrase)
    if len(tokens) != 1:
        return old_char_set
    candidate = tokens[0]
    if isLiving(candidate):
        old_char_set.add(candidate.lower())
    return old_char_set

In [10]:
def updateCharacters(old_char_set, relations):
    """Extend the character set with subjects and objects from relation triples.

    Args:
        old_char_set: Set of character names to extend.
        relations: Iterable of triples indexed as (relation, subject, object).

    Returns:
        The character set after offering every subject and object to
        ``findMoreCharacters``.
    """
    characters = old_char_set
    for triple in relations:
        # Index 1 is the subject, index 2 the object; subject goes first.
        for position in (1, 2):
            characters = findMoreCharacters(characters, triple[position])
    return characters

In [11]:
# Load the story index; each row's `path` column points at one story file.
df = pd.read_csv(path_data_index)

# Read the raw text, then derive the initial character set from it.
df['story_text'] = df['path'].apply(return_story_text)
df['characters'] = df['story_text'].apply(getCharacters)

# Resolve pronouns first so the extracted triples use the resolved text.
df['pronouns_resolved_text'] = df['story_text'].apply(resolvePronouns)
df['relations'] = df['pronouns_resolved_text'].apply(extractRelationships)

# Second pass: extend each row's character set with subjects/objects of its relations.
df['characters'] = df.apply(
    lambda story_row: updateCharacters(story_row['characters'], story_row['relations']),
    axis=1,
)
df.head()

Unnamed: 0,essay_id,topic,path,story_text,characters,pronouns_resolved_text,relations
0,st_1,The Forest Princess,bedtime_stories_data/stories/st_1.txt,"A long time ago, in a big beautiful forest the...","{rabbit, deer, grandpa, marina, skunk, princess}","A long time ago, in a big beautiful forest the...","[(lived, many animals, life), (lived, many ani..."
1,st_2,Always Listen,bedtime_stories_data/stories/st_2.txt,Many moons ago there was a folk tale passed do...,"{mother, boy, man, friend, joey, morgan, bo, d...",Many moons ago there was a folk tale passed do...,"[(is about, story, Cho-Cho Man), (began with, ..."
2,st_3,The Lonely Dinosaur,bedtime_stories_data/stories/st_3.txt,Many years ago dinosaurs were the largest crea...,{dinosaur},Many years ago dinosaurs were the largest crea...,"[(were creatures at_time, dinosaurs, years), (..."
3,st_4,The Five Little Stars,bedtime_stories_data/stories/st_4.txt,She closed her eyes and made a wish. She wishe...,"{andrew, boy, gabriel, lady, woman, javier, ja...",She closed She eyes and made a wish. She wishe...,"[(closed, She, She eyes), (made, She, wish), (..."
4,st_5,John Goes to the City,bedtime_stories_data/stories/st_5.txt,Once up on a time there was mischievous raccoo...,"{john, policeman, man, jim}",Once up on a time there was mischievous raccoo...,"[(named, mischievous raccoon, John), (named, r..."
