### IEMS 308 - Data Science, Named Entity Recognition Engine

1. The data can be found at the following [link](https://www.google.com).
2. **Build a text pre-processing engine**
3. Build a simple AI computation engine.

In [None]:
import os
from tqdm import tqdm

directory = 'data/' #directory to search for data
filenames = [] #new list to hold filenames (paths of the .txt files)
dates = [] #new list to hold date keys (file names minus the '.txt' suffix)
suffix = ".txt"

#traverse directory and import all relevant data
#os.listdir returns names in arbitrary order, so sort them here: the listing becomes
#deterministic and dates[k] keeps describing filenames[k] even once filenames is sorted
for filename in sorted(os.listdir(directory)):
    if filename.endswith(suffix): #just double check correct filetype
        url = os.path.join(directory,filename)
        #slice off exactly the suffix; str.strip('.txt') removes any leading/trailing
        #'.', 't' or 'x' characters and would mangle a name such as 'text1.txt'
        dates.append(filename[:-len(suffix)])
        filenames.append(url)

In [None]:
# In-place lexicographic sort of the collected paths.
# Presumably the file names are dates, which would make this chronological order -- TODO confirm.
filenames.sort() #organise into date order
# Show the first three paths as a quick check.
filenames[:3]

In [None]:
%%time
import string
printable = set(string.printable)

# content[k] is the cleaned text of filenames[k]
content = []
for filename in tqdm(filenames):
    # errors='ignore' silently drops bytes that are not valid UTF-8
    with open(filename,encoding='utf8',errors='ignore') as f:
        document = f.read()
    # the with-block has already closed the file; no explicit close is needed
    # keep only characters from string.printable, then flatten newlines to spaces
    document = ''.join(ch for ch in document if ch in printable)
    document = document.replace('\n', ' ')
    content.append(document)

In [None]:
import nltk, string, re

In [None]:
# Display the first cleaned document as a sanity check.
content[0]

### Preprocess `.csv` data

Extract training set from `csv` files.

In [None]:
import os
import pandas as pd

#extract all in one step

# ceo_labels = ['','2','full']
cents_labels = ['cent']
corps_labels = ['first']

#folder holding the labelled training CSVs; set the IEMS308_TRAINED_DIR environment
#variable to point somewhere else instead of editing the notebook on another machine
trained_dir = os.environ.get(
    'IEMS308_TRAINED_DIR',
    '/Users/saifbhatti/Desktop/Northwestern/sy1920/w20/iems308/iems308-saifbhatti/homew3/trained',
)

#header=None: the first row of each file is read as data, not as column names
training_ceo = pd.read_csv(os.path.join(trained_dir,'ceo.csv'),header=None,encoding='utf-8')
training_cents = pd.read_csv(os.path.join(trained_dir,'percentage.csv'),names=cents_labels,header=None,encoding='utf-8')
training_corp = pd.read_csv(os.path.join(trained_dir,'companies.csv'),header=None,names=corps_labels,encoding='utf-8')

In [None]:
# Collapse every run of whitespace inside the column-2 strings to a single space
# (column 2 presumably holds the full name -- TODO confirm against ceo.csv).
training_ceo[2] = training_ceo[2].map(lambda name: ' '.join(name.split()))
#convert all to list
unique_ceo_names = set(training_ceo[2].tolist())  # drops duplicate names
train_ceo = list(unique_ceo_names)
train_corp = training_corp['first'].tolist()
train_cent = training_cents['cent'].tolist()

### Constructing the Named-Entity-Recognition Engine

In [None]:
import spacy

# Load the small English pipeline with the 'tagger' and 'parser' components disabled.
nlp = spacy.load("en_core_web_sm", disable=['tagger','parser'])
# Add a sentencizer component -- presumably so sentence boundaries are still
# available without the parser (filter_sentences iterates over `.sents`).
# NOTE(review): add_pipe(create_pipe(...)) looks like the spaCy 2.x call style -- confirm the installed version.
nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Parse the first document as a smoke test of the pipeline.
doc = nlp(content[0])
# for token in doc.ents:
#     print(token.text, token.start_char, token.end_char,token.label_)
#     print(token.text, token.pos_, token.dep_)

`processed` holds the text files after they have been parsed by `spaCy`'s NER pipeline, which converts each one from a `str` to a `spacy.tokens.Doc`.

##### Warning: `processed` runtime is approx 20 minutes.

In [None]:
%time processed = [nlp(article) for article in tqdm(content[0:len(content)])]

In order to not waste time, `processed` can be pickled and saved to disk. 

In [None]:
# stuff=[]
# for i in range(50):
#     temp = matching_engine_3000(nlp(people_found[i]),
#                                 'PERSON')
#     if len(temp)>1:
#         for j in range(len(temp)):
#             print(temp[j])
#             people_found[i]
# #     stuff.extend(matching_engine_3000(nlp(filter_sentences(processed[0],"PERSON")[i]),'PERSON'))
    
# # for i in hello:
# # #     print(i)
# #     if len(i)>1:
# #         for j in range(len(i)):
# #             print(i[j])

In [None]:
# stuff

In [None]:
# filter_sentences(processed[0],'PERSON') #list
# '''
# within the list of sentences with entities,
# look through all sentences, and check if there are more than 1 entity
# '''

In [None]:
# 

### Helper Functions

1. `pickle_me_hearties` pickles files.
2. `return_me_hearties` unpickles files.
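3. `matching_engine_3000` takes a `spaCy` `Doc` (an element of `processed`) and returns the text of every entity whose label matches the given entity type.
4. `filter_sentences` takes a `spaCy` `Doc` and an entity type, and returns the sentences that contain the text of at least one entity of that type.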
5. `match` Iterates through list of substrings to see if any are in a sentence.
6. `company_word` runs `re.search` on the sentence to detect presence of common Company words.
7. `num_capitals` returns number of capitals in the sentence.
8. `num_words` returns number of words in the sentence.
9. `split_entity` replicates the sentence if multiple entities (of the same type) are found in the sentence.

In [None]:
import pickle
def pickle_me_hearties(pval,pstart,pend,docs=None):
    '''
    pickle_me_hearties pickles the slice docs[pstart:pend] into the file
    'processed<pval>.pkl' in the current working directory.

    pval is the file iterator (used as the suffix of the file name)
    pstart is start slice (inclusive)
    pend is end slice (exclusive)
    docs is the sequence to slice; when omitted the notebook-level
    `processed` list is used, which is what existing calls rely on
    '''
    if docs is None:
        # backward-compatible default: fall back to the global list
        docs = processed
    with open('processed'+str(pval)+'.pkl','wb') as pf:
        pickle.dump(docs[pstart:pend],pf,protocol=pickle.HIGHEST_PROTOCOL)
    

In [None]:
def return_me_hearties(plist):
    '''
    return_me_hearties unpickles the files 'processed<i>.pkl' for every i in
    plist and concatenates their contents.

    plist is the list of file iterators
    returns a single flat list holding the unpickled items in plist order
    '''
    # NOTE: pickle.load can run arbitrary code; only point this at files
    # this notebook wrote itself.
    combined = []
    for suffix in tqdm(plist):
        path = 'processed'+str(suffix)+'.pkl'
        with open(path,'rb') as handle:
            combined.extend(pickle.load(handle))
    return combined
    

In [None]:
# %time pickle_me_hearties(1,0,1)
# %time pickle_me_hearties(2,151,300)
# %time pickle_me_hearties(3,301,450)
# %time pickle_me_hearties(4,451,600)
# %time pickle_me_hearties(5,600,729)
# %time new = return_me_hearties([1,2,3,4,5])

In [None]:
def matching_engine_3000(values,match):
    '''
    Collect the text of every entity in values.ents whose label_ equals match.

    values is the nlp processed doc
    match is the entity type to match e.g. PERSON
    returns a list of entity strings, in the order they appear in values.ents
    '''
    matched_texts = []
    for entity in values.ents:
        if entity.label_ == match:
            matched_texts.append(entity.text)
    return matched_texts

In [None]:
def filter_sentences(article, entity):
    '''
    Return the sentences of article that mention an entity of the given type.

    article is an nlp processed doc (its .sents and .ents are read)
    entity is the entity label to look for e.g. PERSON

    A sentence is kept when the text of any matching entity occurs in it as a
    substring. The result is a list of sentence strings in document order.
    '''
    # A set removes repeated mentions, so each distinct entity string is
    # scanned for once per sentence instead of once per mention.
    entities = {ent.text for ent in article.ents if ent.label_ == entity}
    sent_w_ent = []
    for sent in article.sents:
        s = sent.text
        if any(ent in s for ent in entities):
            sent_w_ent.append(s)

    return sent_w_ent

In [None]:
def match(sentence, label_lst):
    '''
    Iterates through list of substrings to see if any are in a sentence.
    Used for labeling the data as positive and negative samples generally.

    sentence is the string to search in
    label_lst is an iterable of candidate substrings
    returns 1 if at least one usable label occurs in sentence, else 0
    '''
    for lbl in label_lst:
        # Skip entries that cannot be meaningful labels: non-strings (e.g. the
        # NaN pandas produces for an empty CSV field) make `in` raise TypeError,
        # and '' is a substring of every sentence, which would label everything 1.
        if isinstance(lbl, str) and lbl and lbl in sentence:
            return 1
    return 0

In [None]:
def company_word(sentence):
    '''
    Flag whether sentence contains one of the common company words.

    The pattern is searched anywhere in the string, case-sensitively and
    without word boundaries, so 'Co' also hits inside longer words.
    returns 1 on a hit, else 0
    '''
    hit = re.search(r'(Advisors|Partner|LP|Associate|Co|Group|LTD|AirLL|Management|Capital)',sentence)
    return 1 if hit is not None else 0

In [None]:
def num_capitals(phrase):
    '''Count the characters of phrase for which str.isupper() is true.'''
    return sum(1 for character in phrase if character.isupper())

In [None]:
def num_words(phrase):
    '''Return how many whitespace-separated tokens phrase contains.'''
    tokens = phrase.split()
    return len(tokens)

### Processing Percentages

Due to `spaCy`'s phenomenal NER capabilities, it's possible to directly extract all percentages super easily.

In [None]:
%%time
# Gather the text of every PERCENT entity across all parsed documents.
takehome_percentages = []
for parsed_doc in processed:
    takehome_percentages += matching_engine_3000(parsed_doc,"PERCENT")

In [None]:
# De-duplicate, then sort: a set of strings iterates in an order that changes
# between kernel sessions, so list(set(...)) would reorder the CSV rows on every
# run. sorted() makes the exported file reproducible.
takehome_percentages = sorted(set(takehome_percentages))
takehome_percent = pd.DataFrame(takehome_percentages)
# index=False leaves the row index out of the file
takehome_percent.to_csv('takehome_percent.csv',index=False)

### Processing CEOs

Using `spaCy`'s NER capabilities, we have the ability to extract all `PERSON` entities, and run Machine Learning steps on this subset.

In [None]:
# One inner list of PERSON entity strings per parsed document.
full_person_list = []
for parsed_doc in tqdm(processed):
    full_person_list.append(matching_engine_3000(parsed_doc,"PERSON"))
    


In [None]:
# Number of sentences in the first document that contain the text of a PERSON entity.
len(filter_sentences(processed[0],"PERSON"))

In [None]:
#people_found is a flat list, across all documents, of the sentences in which a PERSON entity is present
#we want to append new sentences if there are more than 1 entity in a sentence
people_found = []
for parsed_doc in tqdm(processed):
    people_found += filter_sentences(parsed_doc,"PERSON")

In [None]:
# find_in = people_found
# len(people_found)
# len(find_in)

In [None]:
# for i in range(275,277):
#     print('sentence {}: {}'.format(i,people_found[i]))
#     temp = matching_engine_3000(nlp(people_found[i]),'PERSON')
#     if len(temp)>0:
#         print(temp)

In [None]:
# finder=[]
# for i in range(1):
#     index_holder=[]
#     print('sentence {}: {}'.format(i,people_found[i]))
#     temp = matching_engine_3000(nlp(people_found[i]),'PERSON')
#     if len(temp)>1:
#         j = len(temp)
# #         print(temp)
# #         print(type(temp))
#         for j in range(len(temp)):
#             print('REPLACE LOOP')
# #             print(temp[j])
# #             print(re.sub(temp[j], '', people_found[i]))
#             index_holder.append(people_found[i].index(temp[j]))
#             print(index_holder)
#             people_found[i] = (re.sub(temp[j], '', people_found[i]))
#             print(people_found[i])
#         for j in range(len(temp)):
#             print('ADD LOOP')
# #             print(temp[j])
# #             print(re.sub(temp[j], '', people_found[i]))
# #             people_found[i] = people_found[i][:index_holder[j]] + temp[j] + people_found[i][index_holder[j]:]
#             people_found[i] = insert_str(people_found[i],temp[j],index_holder[j])
#             print(people_found[i])
# #             s[:4] + '-' + s[4:]
# #     print(people_found[i])


# #             print(new_temp)
# #         print(people_found[i])
# #     print(temp)

In [None]:
# def insert_str(string, str_to_insert, index):
# #     return string[:index] + str_to_insert + string[index:]

In [None]:
# %%time
# finder=[]
# for i in tqdm(range(len(find_in))):
#     index_holder=[]
# #     print('----------')
# #     print('sentence {}: {}'.format(i,find_in[i]))
#     temp = matching_engine_3000(nlp(find_in[i]),'PERSON')
#     if len(temp)>1:
#         j = len(temp)
# #         print(temp)
# #         print(type(temp))
#         for j in range(len(temp)):
# #             print('REPLACE LOOP')
#             index_holder.append(find_in[i].index(temp[j]))
# #             print(index_holder)
#             find_in[i] = (re.sub(temp[j], '', find_in[i],1))
# #             print(find_in[i])
#         for j in range(len(temp)):
# #             print('ADD LOOP')
#             find_in[i] = insert_str(find_in[i],temp[j],index_holder[j])
#             finder.extend(find_in[i])
# #             print(find_in[i])
#             find_in[i] = (re.sub(temp[j], '', find_in[i]))
#     elif len(temp)<1:
#         finder.append(find_in[i])


In [None]:
# len(finder)

In [None]:
# One row per sentence in people_found, stored in the 'text_persons' column.
ceo_df = pd.DataFrame({'text_persons':people_found})

In [None]:
%time ceo_df['num_capitals'] = ceo_df['text_persons'].apply(lambda x: (num_capitals(x)))

In [None]:
%time ceo_df['ceo_label'] = ceo_df['text_persons'].apply(lambda x: (match(x,train_ceo)))

In [None]:
%time ceo_df['num_words'] = ceo_df['text_persons'].apply(lambda x: (num_words(x)))

In [None]:
# Write the CEO sentence frame to disk; index=False leaves the row index out of the file.
ceo_df.to_csv('ceo_df.csv',index=False)

### Processing Companies

Using `spaCy`'s NER capabilities, we have the ability to extract all `ORG` entities, and run Machine Learning steps on this subset.

In [None]:
# Flat list of every ORG entity string found across all parsed documents.
full_company_list = []
for parsed_doc in tqdm(processed):
    full_company_list += matching_engine_3000(parsed_doc,"ORG")

In [None]:
# Flat list, across all documents, of the sentences containing the text of an ORG entity.
organisations_found = []
for parsed_doc in tqdm(processed):
    organisations_found += filter_sentences(parsed_doc,"ORG")

In [None]:
# One row per sentence in organisations_found, stored in the 'text_organisations' column.
company_df = pd.DataFrame({'text_organisations':organisations_found})

In [None]:
%time company_df['num_words'] = company_df['text_organisations'].apply(lambda x: (num_words(x)))

In [None]:
%time company_df['num_capitals'] = company_df['text_organisations'].apply(lambda x: (num_capitals(x)))

In [None]:
%time company_df['company_nearby'] = company_df['text_organisations'].apply(lambda x: company_word(x))

In [None]:
%time company_df['company_label'] = company_df['text_organisations'].apply(lambda x: (match(x,train_corp)))

In [None]:
company_df[['company_nearby','company_label','num_capitals','num_words']].describe()
#NOTE(review): the remarks below talk about persons and matched ceos although this
#frame holds ORG sentences -- presumably carried over from the CEO analysis; confirm the figures.
#the results from this indicate that roughly 7% of the sentences contain matches
#given almost 350k sentences containing persons, 7% would be 24,500 sentences containing matched ceos
#this seems reasonable

In [None]:
# Write the company sentence frame to disk; index=False leaves the row index out of the file.
company_df.to_csv('company_df.csv',index=False)