# NLP

In [48]:
# Registry of the function names this notebook makes available to slingshot.
STONES = [
    'ner_spacy',
]

## NER

In [39]:
# Loaded spaCy pipelines, keyed by model name. Loading a model is expensive, and
# this function is typically called once per text, so the pipeline is kept here
# and reused instead of being reloaded on every call.
_SPACY_MODEL_CACHE = {}


def _get_spacy_model(spacy, model_name="en_core_web_sm"):
    """Return the spaCy pipeline for `model_name`, loading it only on first use."""
    if model_name not in _SPACY_MODEL_CACHE:
        _SPACY_MODEL_CACHE[model_name] = spacy.load(model_name)
    return _SPACY_MODEL_CACHE[model_name]


def ner_spacy_string(string):
    """
    Using spacy, this function takes any string, identifies the named entities in it,
    and returns a list of dictionaries, with one dictionary per named entity,
    where each dictionary looks like this:

    {
        '_sent_num': 1,
        '_sent': 'Ryan Heuser cannot wait until he graduates from Stanford University.',
        'type': 'PERSON',
        'entity': 'Ryan',
        'start_char': 0,
        'end_char': 4
    }

    Notes on the fields:
      * '_sent_num' counts sentences from 1, as split by nltk.sent_tokenize.
      * '_sent' holds the sentence text only on the first entity of each sentence;
        further entities from the same sentence carry an empty string.
      * 'start_char' / 'end_char' are offsets within the sentence, not within the
        whole input, because each sentence is run through spacy on its own.

    Newlines are flattened to spaces and curly single quotes are converted to
    straight ones before sentence splitting.

    Returns an empty list when the cleaned string is empty. If spacy or nltk
    cannot be imported, a message is printed and None is returned.
    """
    try:
        import spacy
        import nltk
    except ImportError as error:
        # Either import can fail, so report the module that is actually missing.
        missing = getattr(error, 'name', None) or 'spacy'
        print("{} not installed. Please follow directions above.".format(missing))
        return

    # clean string
    string = string.strip().replace('\n',' ').replace("’","'").replace("‘","'")

    # nothing to analyse: skip loading the model altogether
    if not string:
        return []

    # load the default English model (cached across calls)
    nlp = _get_spacy_model(spacy)

    # make an output list
    output_list = []

    # loop over sentences, numbering them from 1
    sents = nltk.sent_tokenize(string)
    for sent_num, sent in enumerate(sents, start=1):
        # only the entity recogniser is needed here
        sent_doc = nlp(sent, disable=['parser','tagger'])

        # progress report every 1000 sentences (zero-based count, then total)
        if not (sent_num - 1) % 1000:
            print(sent_num - 1, len(sents))

        # loop over sentence's entities
        for ent_index, ent in enumerate(sent_doc.ents):
            output_list.append({
                '_sent_num': sent_num,
                # store the sentence text once per sentence to keep the output small
                '_sent': sent if ent_index == 0 else '',
                'type': ent.label_,
                'entity': ent.text,
                'start_char': ent.start_char,
                'end_char': ent.end_char,
            })

    # return output
    return output_list
            


In [42]:
# Quick manual check on a two-sentence sample; the blank line between the
# sentences is flattened to spaces by ner_spacy_string before sentence splitting.
ner_spacy_string("Hello Cambridge University.\n\nHow are you, Ryan?")

0 2


[{'_sent': 'Hello Cambridge University.',
  '_sent_num': 1,
  'end_char': 26,
  'entity': 'Hello Cambridge University',
  'start_char': 0,
  'type': 'ORG'},
 {'_sent': 'How are you, Ryan?',
  '_sent_num': 2,
  'end_char': 17,
  'entity': 'Ryan',
  'start_char': 13,
  'type': 'PERSON'}]

In [43]:
def ner_spacy(path_to_txt_file):
    """
    Read a plain-text file and return the named entities found in it.

    The path is printed first. The file is decoded as UTF-8 explicitly, rather
    than with the platform's default encoding, so the same file gives the same
    text on every machine.

    Returns whatever ner_spacy_string returns for the file's contents, or an
    empty list if the file does not exist. Only the file read is guarded: a
    FileNotFoundError raised while analysing the text is not turned into an
    empty result.
    """
    print(path_to_txt_file)
    try:
        with open(path_to_txt_file, encoding='utf-8') as file:
            txt = file.read()
    except FileNotFoundError:
        # a missing text is treated as having no entities
        return []
    return ner_spacy_string(txt)
    

In [45]:
#results = ner_spacy('/Users/ryan/literarytextmining/corpora/fiction_since_1990/texts/Brown,_Dan.The_Da_Vinci_Code.txt')

In [47]:
#results[0]