# QA Text Processing

In [40]:
# load libraries
# one-time setup (uncomment to download the large English spaCy model):
#!python -m spacy download en_core_web_lg
import pandas as pd
import numpy as np
import spacy
from spacy import displacy
import en_core_web_lg
# shared spaCy pipeline; later cells call it for entity recognition and vector similarity
nlp = spacy.load("en_core_web_lg")

In [113]:
# Read the merger & acquisition articles and the DUNS entity lookup table.
madata = pd.read_csv('merger_acquisition.csv')
duns = pd.read_csv('mna_duns.csv')

# Normalise the article text in one chained pass:
#   1. drop apostrophes, 2. turn colons into spaces.
madata['text'] = (
    madata['text']
    .replace("'", '', regex=True)
    .replace(':', ' ', regex=True)
)

In [124]:
# display the DUNS lookup table loaded from 'mna_duns.csv'
duns

Unnamed: 0,DUNS,short_entity_name,alt_name_list
0,0,RALEIGH NC,
1,1,Rifiniti,
2,2,Premium Transportation Logistics,
3,3,Senior Management Team,
4,4,PTL,
...,...,...,...
607,607,Sierra MCC,"['Sierra Income Corporation', 'Sierra MCC']"
608,608,Spark Power Corp,"['Spark Power Group Inc', 'Spark Power Corp']"
609,609,VEON Holdings,"['VEON Holdings BV', 'VEON Holdings']"
610,610,Wacoal America Inc,"['Wacoal International Corporation', 'Wacoal A..."


In [156]:
# Flatten the DUNS lookup table into parallel lists (one entry per table row)
# so the entity-replacement step can zip over them.
duns_list = duns['DUNS'].tolist()
short_entities = duns['short_entity_name'].tolist()

# spaCy doc for every shortened entity name, computed once up front
short_entities_nlp = duns['short_entity_name'].apply(nlp).tolist()

# alternate-name cells, with missing values swapped for an empty string
alt_name_lists = np.where(
    duns['alt_name_list'].isna(), [''], duns['alt_name_list']
).tolist()

# entity standardization function
def _parse_alt_names(raw):
    """Turn one `alt_name_list` cell into a real list of name strings.

    The cells come from a CSV, so a list is stored as its string repr
    (e.g. "['VEON Holdings BV', 'VEON Holdings']"). Testing `org in cell`
    directly on that string is a substring test, not list membership.
    """
    import ast  # local import: only this helper needs it

    if isinstance(raw, (list, tuple, set)):
        return [str(name) for name in raw]
    if not isinstance(raw, str) or not raw.strip():
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # not a Python literal: treat the whole cell as one plain name
        return [raw]
    if isinstance(parsed, (list, tuple, set)):
        return [str(name) for name in parsed]
    return [str(parsed)]


def duns_process(text):
    """Replace organisation and person mentions in `text` with standard tokens.

    Organisations recognised by spaCy (label 'ORG') are replaced by
    'Org<DUNS>' when they equal a short entity name, appear in that entity's
    alternate names, or have a vector similarity above 0.7 with the short
    entity name. Unmatched organisations get a fresh id counted up from the
    largest id in `duns_list`. Persons (label 'PERSON') are replaced by
    'person'.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The lower-cased text with organisation and person mentions replaced.

    Notes
    -----
    Reads the notebook-level names `nlp`, `short_entities`, `alt_name_lists`,
    `duns_list` and `short_entities_nlp`, and prints one progress line per
    organisation.
    NOTE(review): the first matching row in table order wins, even when a
    later row would be an exact match. New ids restart from the table maximum
    on every call, so ids for unmatched organisations are only unique within
    one document until `duns_new` is persisted.
    """
    import re  # local import: used only for the person replacement below

    THRESH = 0.7

    # extract identified organizational entities & persons
    news_nlp = nlp(text)
    recognized_org, recognized_people = [], []
    for entity in news_nlp.ents:
        if entity.label_ == 'ORG':
            recognized_org.append(entity.text)
        elif entity.label_ == 'PERSON':
            recognized_people.append(entity.text)

    # initialize replacement loop
    text = text.lower()
    # de-duplicate while keeping first-appearance order
    recognized_org = list(dict.fromkeys(recognized_org))
    parsed_alt_names = [_parse_alt_names(raw) for raw in alt_name_lists]
    duns_new = []
    # next free id; the default covers an empty lookup table
    d = max(duns_list, default=-1)

    # replace entities to shortened form
    for i, org in enumerate(recognized_org, start=1):
        print(i, org)
        # restore the original casing so the entity can be found in the lowered text
        text = text.replace(org.lower(), org)
        nlp_org = nlp(org)
        paired = False
        pack = zip(short_entities, parsed_alt_names, duns_list, short_entities_nlp)
        for short_entity, alt_names, dun, entity_nlp in pack:
            if (
                org == short_entity
                or org in alt_names
                or nlp_org.similarity(entity_nlp) > THRESH
            ):
                print(i, 'Match', org, short_entity)
                entity = 'Org' + str(dun)
                text = text.replace(org, entity)
                paired = True
                # once replaced, `org` no longer occurs in the text, so later
                # rows could not change the result; stop scanning
                break
        if not paired:
            print(i, 'No Match', org)
            d += 1
            # list.append returns None; the original rebound duns_new to that
            # result and crashed on the second unmatched organisation
            duns_new.append([d, org, np.nan])
            entity = 'Org' + str(d)
            text = text.replace(org, entity)

    # replace people entities to 'person'
    for prsn in dict.fromkeys(recognized_people):
        lowered = prsn.lower()
        if not lowered:
            continue
        # the text was lower-cased above, so match the lower-cased name; the
        # lookarounds stop short names from matching inside longer words
        pattern = r'(?<!\w)' + re.escape(lowered) + r'(?!\w)'
        text = re.sub(pattern, 'person', text)

    # TODO: `duns_new` holds the newly assigned [id, name, alt_names] rows but
    # is not yet written back to the `duns` table.

    return text

# process text for DUNS entities
madata2 = madata.copy()
end = 1  # number of leading rows to process
# Write through a single .iloc call. The chained form
# `madata2['text'][:end] = ...` assigns into an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to update madata2.
text_col = madata2.columns.get_loc('text')
madata2.iloc[:end, text_col] = madata2['text'].iloc[:end].map(duns_process).values

1 Rifiniti Acquisition
1 Match Rifiniti Acquisition Enterprises Acquisition Corp
2 FM Systems
2 No Match FM Systems
3 Rifiniti
3 Match Rifiniti Rifiniti


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [157]:
# inspect the first processed row; column position 1 is presumably 'text' — TODO confirm
madata2.iloc[0,1]

'Org612 gains advanced workspace analytics through Org182. raleigh, n.c., july 10, 2019 /prnewswire/ -- Org612, a leader in facility management technology that enables facility and real estate professionals to identify, plan and deliver the ideal workplace for every employee, announced today its acquisition of  Org1, a provider of advanced analytics for workspace optimization. the companys sophisticated data science and machine learning algorithms propel Org612 towards its goal of providing clients with key insights that empower them to make critical business decisions based on highly accurate space utilization trends. with as much as 50% underutilized space in workplaces today, the actionable insights from this advanced technology helps visualize trends, significantly reducing real estate costs by rebalancing and consolidating workspaces while enabling better workspace design and workforce collaboration.\n'

In [104]:
# save the first `end` rows of madata2 to CSV
madata2[:end].to_csv('madata-clean-duns.csv')

### Evaluate Accuracy

In [3]:
# Load the ground truth and the model answers, keeping the first 200 rows of each.
truth = pd.read_csv('groundTruth.csv').head(200)
clean_answers = pd.read_csv('clean_answers.csv').head(200)

In [8]:
# Blank out (NaN) every answer whose confidence is below the cut-off.
answ = clean_answers.copy()
TRESH = 1
for _field, _confidence in (
    ('buyer', 'buyer_confidence'),
    ('seller', 'seller_confidence'),
    ('price', 'price_confidence'),
    ('target', 'target_confidence'),
):
    answ[_field] = np.where(answ[_confidence] >= TRESH, answ[_field], np.nan)

# Keep only the id and the four answer fields, with matching column names
# in both the answers frame and the ground-truth frame.
cols = ['id','buyer','seller','price','target']
answ = answ[cols]
ground = truth.rename(
    columns={
        'true_buyer': 'buyer',
        'true_seller': 'seller',
        'true_price': 'price',
        'true_target': 'target',
    }
)[cols]

In [39]:
# define text to nlp function
def get_accuracy(col, truth=None, threshold=0.7, nlp_fn=None):
    """Flag, row by row, whether an answer is close enough to the ground truth.

    Parameters
    ----------
    col : pandas.Series
        Answers for one field; `col.name` selects the matching truth column.
    truth : pandas.DataFrame or pandas.Series, optional
        Ground truth. A DataFrame is indexed by `col.name`. Defaults to the
        notebook-level `ground` frame.
    threshold : float, optional
        Similarity above which an answer counts as correct (default 0.7).
    nlp_fn : callable, optional
        Text -> object with a `.similarity(other)` method. Defaults to the
        notebook-level spaCy pipeline `nlp`.

    Returns
    -------
    pandas.Series
        Same index and name as `col`; True/False per row, or NaN where the
        pair could not be scored (e.g. a missing value). `col` itself is
        left untouched.
    """
    if truth is None:
        truth = ground
    if isinstance(truth, pd.DataFrame):
        truth = truth[col.name]
    if nlp_fn is None:
        nlp_fn = nlp

    results = []
    for pos in range(col.shape[0]):
        # positional access, so a non-default index cannot misalign the pairs
        try:
            score = nlp_fn(col.iloc[pos]).similarity(nlp_fn(truth.iloc[pos]))
            results.append(bool(score > threshold))
        except Exception:
            # best effort: unscorable rows become NaN. `Exception` (not a bare
            # except) lets KeyboardInterrupt still stop a long run.
            results.append(np.nan)
    return pd.Series(results, index=col.index, name=col.name, dtype=object)

# get non-nan accuracy: score the answers while missing cells are still NaN
# (get_accuracy yields NaN for any row it cannot score)
acc1 = answ.copy()
cols = ['buyer','seller','price','target']
acc1[cols] = acc1[cols].apply(get_accuracy)

# replace all nans with a single-space placeholder, in the answers and the ground truth
answ[cols] = np.where(answ[cols].isna(), ' ', answ[cols])
ground[cols] = np.where(ground[cols].isna(), ' ', ground[cols])

# get overall accuracy: score again now that no cell is NaN
# (get_accuracy reads `ground` when called, so this pass sees the filled frame)
acc = answ.copy()
acc[cols] = acc[cols].apply(get_accuracy)
print(answ.shape, ground.shape)

  


(200, 5) (200, 5)


In [33]:
# organize performance: one row per field, one column per metric
out = pd.DataFrame(acc[cols].apply(np.mean),columns=['ovr_accuracy'])
out['n_correct'] = acc[cols].apply(sum)
# take the denominator from the scored frame rather than hard-coding 200, so
# the table stays right if the evaluation slice changes
out['n_total'] = len(acc)
# a cell counts as answered / as having a truth when it is not the ' ' placeholder
out['n_answered'] = answ[cols].apply(lambda x: (x != ' ').sum())
out['n_truth'] = ground[cols].apply(lambda x: (x != ' ').sum())
out['answered_truth_ratio'] = out['n_answered']/out['n_truth']
#out['n_answered_correct'] = acc1[cols].apply(sum)
out['accuracy'] = out['n_correct']/out['n_total']
#out['truth_accuracy'] = out['n_correct']/out['n_truth']
#out['answer_accuracy'] = out['n_correct']/out['n_answered']
out

Unnamed: 0,ovr_accuracy,n_correct,n_total,n_answered,n_truth,answered_truth_ratio,accuracy
buyer,,,200,97,176,0.551136,
seller,0.005,53.0,200,120,172,0.697674,0.265
price,0.815,163.0,200,50,21,2.380952,0.815
target,0.005,109.0,200,174,171,1.017544,0.545
