# Capstone QA Notebook

Shan Ali

*Team 2*

### Initialize/Load Libraries and Data

In [None]:
# Install the DeepPavlov package into the notebook runtime.
!pip install deeppavlov

# With DeepPavlov installed, run its `install` command for the squad_bert config
# (the config that build_model loads further down).
!python -m deeppavlov install squad_bert

In [None]:
import pandas as pd
import numpy as np
import pickle
from deeppavlov import build_model, configs

In [None]:
#!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# After this cell `drive` is a PyDrive GoogleDrive object; the download cells
# below call drive.CreateFile(...) on it.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# download M&A Article Corpus dataset
link = 'https://drive.google.com/file/d/18-7kputUOwb-c8na63w-pry1JqDsPiah/view?usp=sharing'
# The Drive file id is the second-to-last "/"-separated segment of the share link.
# NOTE(review): the name `id` shadows Python's builtin id() for the rest of the notebook.
id = link.split("/")[-2]

# Fetch the file into the working directory, then load it with pandas.
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('merger_acquisition.csv')
madata = pd.read_csv('merger_acquisition.csv')
print(madata.shape)

(819, 2)


In [None]:
# download cleaned, first 200 M&A Article Corpus dataset
link = 'https://drive.google.com/file/d/1ouk-z5Eg0QWWebcrZmmCWHpYAW9ulicO/view?usp=sharing'
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id = link.split("/")[-2]

# Fetch the file into the working directory, then load it with pandas.
# `maclean` is the frame passed to mna_qa further down.
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('madata-clean.csv')
maclean = pd.read_csv('madata-clean.csv')
print(maclean.shape)

(200, 3)


In [None]:
# download Ground Truth dataset
link = 'https://drive.google.com/file/d/1gulJyA5Df5fhZcKY-w0IJsUN_UxhMw5H/view?usp=sharing'
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id = link.split("/")[-2]

# Fetch the file, load it, and keep only the first 200 rows.
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('groundTruth.csv')
truth = pd.read_csv('groundTruth.csv').iloc[:200,]
print(truth.shape)

(200, 7)


### Initialize QA Model & Functions

In [None]:
# Build the DeepPavlov SQuAD-BERT question-answering model.
#model = build_model(configs.squad.squad_bert, download=True)   #only run this command once. after initial run, set download to False
# NOTE(review): download=False presumably requires the model files to be present
# already (see the commented-out line above) - confirm on a fresh runtime.
model = build_model(configs.squad.squad_bert, download=False)

# The four M&A questions. Each is a single-element list because mna_qa passes it
# to the model as the list of questions for one article.
Q1 = ['Who was the buyer?']
Q2 = ['Who was the seller?']
Q3 = ['What was the price?']
Q4 = ['What was acquired?']

In [None]:
# define M&A question & answer function (MNA QA)
def mna_qa(newscollection_data, qa_model=None, questions=None):
  """Ask the four M&A questions of every article and tabulate the answers.

  Args:
    newscollection_data: DataFrame with an 'id' column and a 'text' column,
      one article per row.
    qa_model: callable invoked as qa_model([article], question_list). The
      answer is read from result[0][0] and its confidence from result[2][0].
      Defaults to the notebook-level `model`.
    questions: sequence of four single-question lists, in buyer, seller,
      price, target order. Defaults to the notebook-level Q1, Q2, Q3, Q4.

  Returns:
    DataFrame with one row per article and the columns id, buyer,
    buyer_confidence, seller, seller_confidence, price, price_confidence,
    target, target_confidence, news. An empty input yields an empty frame
    that still carries these columns.

  Raises:
    KeyError: if the 'id' or 'text' column is missing.
    ValueError: if `questions` does not contain exactly four entries.
  """
  # fall back to the notebook globals so existing calls keep working unchanged
  if qa_model is None:
    qa_model = model
  if questions is None:
    questions = (Q1, Q2, Q3, Q4)
  if len(questions) != 4:
    raise ValueError('questions must hold exactly four entries (buyer, seller, price, target)')

  columns = ['id','buyer','buyer_confidence','seller','seller_confidence',
             'price','price_confidence','target','target_confidence','news']
  article_ids = newscollection_data['id'].tolist()
  articles = newscollection_data['text'].tolist()

  # Collect plain rows and build the frame once at the end: growing a DataFrame
  # row by row is quadratic, and DataFrame.append no longer exists in pandas 2.x.
  rows = []
  for c, (article_id, news) in enumerate(zip(article_ids, articles), start=1):
    row = [article_id]
    for question in questions:
      result = qa_model([news], question)
      row.extend([result[0][0], result[2][0]])   # answer text, confidence
    row.append(news)
    rows.append(row)
    print(c)   # progress counter: number of articles processed so far

  return pd.DataFrame(rows, columns=columns)

### Run QA for first 200 Articles & Save

In [None]:
# Loop through news articles to answer M&A questions.
# Only the cleaned corpus is processed; the run on the raw corpus is left disabled.
#answers = mna_qa(madata[:200])
clean_answers = mna_qa(maclean)

In [None]:
# save answer data
#!pip install pandas --upgrade
# Import the Colab Drive helper under an alias. The bare name `drive` is already
# the PyDrive client created in the authentication cell, and the download cells
# further down still call drive.CreateFile(...); rebinding `drive` to the Colab
# module would break them.
from google.colab import drive as colab_drive
import pandas as pd
colab_drive.mount('drive')

In [None]:
answers.to_csv('answers.csv', index=False)
!cp answers.csv "drive/My Drive/Colab Notebooks"

In [None]:
# Write the cleaned-corpus answers to CSV (without the index column) and copy the
# file into the "Colab Notebooks" folder under the mounted drive directory.
clean_answers.to_csv('clean_answers.csv', index=False)
!cp clean_answers.csv "drive/My Drive/Colab Notebooks"

### Evaluate Model Performance

In [None]:
# download answers dataset
# Uses drive.CreateFile, i.e. expects `drive` to be the PyDrive GoogleDrive client.
link = 'https://drive.google.com/file/d/1-APf_Kfhy5uOPLAvj6P16sqGp42giMHS/view?usp=sharing'
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id = link.split("/")[-2]

# Fetch the file, load it, and keep only the first 200 rows.
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('answers.csv')
answers = pd.read_csv('answers.csv').iloc[:200,]
print(answers.shape)

(200, 10)


In [None]:
# download clean answers dataset
# Uses drive.CreateFile, i.e. expects `drive` to be the PyDrive GoogleDrive client.
link = 'https://drive.google.com/file/d/1-Lt2A0hpMeLIDZPWY0IsfZq1AwmlGYcW/view?usp=sharing'
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id = link.split("/")[-2]

# Fetch the file, load it, and keep only the first 200 rows.
# This rebinds `clean_answers`, which the evaluation cell below reads.
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('clean_answers.csv')
clean_answers = pd.read_csv('clean_answers.csv').iloc[:200,]
print(clean_answers.shape)

(200, 10)


In [None]:
# clean for confidence level
TRESH = 1   # answers with a confidence below this value are treated as unanswered
cols = ['buyer','seller','price','target']
id_cols = ['id'] + cols

# blank out (NaN) every answer whose confidence is below the threshold
answ = clean_answers.copy()
for field in cols:
  answ[field] = np.where(answ[field + '_confidence'] >= TRESH, answ[field], np.nan)

# isolate features
# .copy() makes both frames independent of their parents, so the column
# assignments further down do not raise SettingWithCopyWarning
ground = truth.rename(columns={'true_buyer':'buyer','true_seller':'seller','true_price':'price','true_target':'target'})
answ = answ[id_cols].copy()
ground = ground[id_cols].copy()

# get non-nan accuracy
# NaN never equals NaN, so only answers that were actually given can score 1 here.
# NOTE(review): rows are compared on the shared index, not on 'id' - this assumes
# both frames list the same articles in the same order; confirm.
acc1 = answ.copy()
for field in cols:
  acc1[field] = np.where(acc1[field] == ground[field],1,0)

# replace all nans
answ[cols] = np.where(answ[cols].isna(), ' ', answ[cols])
ground[cols] = np.where(ground[cols].isna(), ' ', ground[cols])
print(answ.shape, ground.shape)

In [None]:
# get overall accuracy
# Missing values are ' ' in both frames at this point, so a question that was
# left unanswered and has no ground truth also counts as a match here.
acc = answ.copy()
for field in cols:
  acc[field] = np.where(acc[field] == ground[field],1,0)

# organize performance (one row per field)
out = acc[cols].mean().to_frame('ovr_accuracy')
out['n_correct'] = acc[cols].sum()
out['n_total'] = len(acc)
out['n_answered'] = answ[cols].apply(lambda x: (x != ' ').sum())
out['n_truth'] = ground[cols].apply(lambda x: (x != ' ').sum())
out['answered_truth_ratio'] = out['n_answered']/out['n_truth']
out['n_answered_correct'] = acc1[cols].sum()
# Both ratios use n_answered_correct: n_correct also counts blank-vs-blank
# matches, which pushed these "accuracies" above 1 (e.g. 152/21 for price).
out['truth_accuracy'] = out['n_answered_correct']/out['n_truth']
out['answer_accuracy'] = out['n_answered_correct']/out['n_answered']
out

Unnamed: 0,ovr_accuracy,n_correct,n_total,n_answered,n_truth,answered_truth_ratio,n_answered_correct,truth_accuracy,answer_accuracy
buyer,0.15,30,200,97,176,0.551136,14,0.170455,0.309278
seller,0.17,34,200,120,172,0.697674,17,0.197674,0.283333
price,0.76,152,200,54,21,2.571429,8,7.238095,2.814815
target,0.255,51,200,197,171,1.152047,51,0.298246,0.258883


In [None]:
# Display the performance summary table again.
out

NameError: ignored

In [None]:
# Display the ground-truth frame used in the comparison (id plus the four fields).
ground

In [None]:
# Display the confidence-filtered model answers used in the comparison.
answ