# NLP

In [1]:
# Names of the functions this notebook makes available to slingshot
STONES = [
    'ner_spacy',
    'ner_spacy_places',
]

## NER

In [9]:
def ner_spacy_string(string,ner_types={}):
    """
    Using spacy, this function takes any string, identifies the named entities in it,
    and returns a list of dictionaries, with one dictionary per named entity.

    The string is first cleaned (curly single quotes become "'", em dashes become
    ' -- ', and '\\r\\n' / '\\r' line endings become '\\n'), then split into paragraphs
    on blank lines. Each paragraph is parsed separately with spacy's
    "en_core_web_sm" model.

    Each dictionary looks like this (values are illustrative):

    {
        '_para_num': 1,
        '_sent_num': 1,
        '_sent': 'Ryan Heuser cannot wait until he graduates from Stanford University.',
        'type': 'PERSON',
        'entity': 'Ryan',
        'start_char': 0,
        'end_char': 4
    }

    Parameters:
        string:    the text to analyse.
        ner_types: optional collection of entity labels to keep (e.g. {'GPE'}).
                   If empty, every entity is returned. It is only read, never
                   modified.

    Returns:
        A list of dictionaries in order of appearance, or None (after printing a
        message) if spacy cannot be imported.

    Notes:
        * '_para_num' is the 1-based paragraph number; '_sent_num' is a 1-based
          sentence counter that keeps running across paragraphs.
        * '_sent' holds the sentence text only for the first *returned* entity of
          each sentence; later entities from the same sentence get ''.
        * 'start_char' / 'end_char' are the offsets spacy reports for the entity.
          Because each paragraph is parsed on its own, they are presumably
          relative to the (stripped) paragraph rather than to the whole string
          -- verify before using them to index into the original text.
        * A paragraph for which spacy raises ValueError (e.g. too long) is skipped.
    """

    try:
        import spacy
    except ImportError:
        print("spacy not installed. Please follow directions above.")
        return

    # clean string
    string = string.strip().replace(u"’",u"'").replace(u"‘",u"'").replace(u'—',u' -- ').replace(u'\r\n',u'\n').replace(u'\r',u'\n')

    # load its default English model
    nlp = spacy.load("en_core_web_sm")

    # make an output list
    output_list = []

    # running sentence counter across all paragraphs
    sent_num = 0

    # split at paragraphs:
    paragraphs = string.split('\n\n')
    for para_i,para in enumerate(paragraphs):
        para = para.strip()

        # progress report every 1000 paragraphs
        if para_i and not para_i%1000: print(para_i,'/',len(paragraphs),'paras')

        # create a spacy text object
        try:
            doc = nlp(para,disable=['tagger'])
        except ValueError:
            # in case too big for spacy to handle
            continue

        # loop over sentences
        for sent_doc in doc.sents:
            sent_num += 1
            added_sent_already = False

            # loop over sentence's entities
            for ent in sent_doc.ents:

                # Filter on type FIRST. Previously the sentence text was attached
                # (and marked as "already added") before this check, so when the
                # first entity of a sentence was filtered out the sentence text
                # was lost for every entity that was actually returned.
                ent_type = ent.label_
                if ner_types and ent_type not in ner_types:
                    continue

                # store the sentence text only once per sentence
                if added_sent_already:
                    sent_text = ''
                else:
                    sent_text = sent_doc.text
                    added_sent_already = True

                # add result dict to output_list
                output_list.append({
                    '_para_num': para_i+1,
                    '_sent_num': sent_num,
                    '_sent': sent_text,
                    'type': ent_type,
                    'entity': ent.text,
                    'start_char': ent.start_char,
                    'end_char': ent.end_char,
                })

    # return output
    return output_list
            


In [13]:
# Quick check on a two-sentence sample, keeping only entities labelled 'GPE'
ner_spacy_string(
    u"I'm on my way to the Grand Hotel Abyss. It's in California.",
    ner_types={'GPE'},
)

[{'_sent': u"It's in California.",
  '_sent_num': 2,
  'end_char': 58,
  'entity': u'California',
  'start_char': 48,
  'type': u'GPE'}]

In [2]:
def ner_spacy(path_to_txt_file,ner_types={}):
    """
    Read a text file and return the named entities found in it.

    Prints the path, reads the whole file, and hands its contents to
    ner_spacy_string together with ner_types.

    Parameters:
        path_to_txt_file: path of the text file to read.
        ner_types:        optional collection of entity labels to keep; passed
                          through unchanged to ner_spacy_string. It is only read,
                          never modified.

    Returns:
        Whatever ner_spacy_string returns for the file's contents, or an empty
        list if a FileNotFoundError is raised (e.g. the file does not exist).
    """
    print(path_to_txt_file)
    try:
        with open(path_to_txt_file) as file:
            txt=file.read()
        # forward the filter; it used to be dropped here, so callers asking for
        # specific entity types got every type back
        return ner_spacy_string(txt,ner_types=ner_types)
    except FileNotFoundError:
        return []
    

In [3]:
def ner_spacy_places(path_to_txt_file):
    """
    Run ner_spacy on a text file, asking only for entities labelled 'GPE'
    (spacy's label for geopolitical entities such as countries, cities, states).

    Parameters:
        path_to_txt_file: path of the text file to read.

    Returns:
        The value returned by ner_spacy(path_to_txt_file, ner_types={'GPE'}).
    """
    # the result used to be computed and then discarded (missing `return`),
    # so this function always returned None
    return ner_spacy(path_to_txt_file,ner_types={'GPE'})
    

In [5]:
# results = ner_spacy('/Users/ryan/literarytextmining/corpora/fiction_since_1990/texts/Brown,_Dan.The_Da_Vinci_Code.txt')

In [6]:
# import pandas as pd
# pd.DataFrame(results[0:1000])