In [1]:
# General libraries
import re, os, string, random, requests
import time
import pandas as pd
from subprocess import Popen, PIPE, STDOUT, DEVNULL

# Haystack imports
from haystack import Finder
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.document_store.elasticsearch import ElasticsearchDocumentStore
from haystack.retriever.sparse import ElasticsearchRetriever

from sklearn.feature_extraction.text import TfidfVectorizer

11/27/2020 22:44:59 - INFO - faiss -   Loading faiss.


In [2]:
# Starting ElasticSearch server as daemon.
# Its output is discarded (DEVNULL) instead of being sent to a PIPE that nothing
# ever reads: an undrained pipe fills up and eventually blocks the child process.
es_server = Popen(['elasticsearch'],
                  stdout=DEVNULL, stderr=STDOUT  # as daemon
                  )

# Wait until ElasticSearch answers on its HTTP port (at most ~60 s) rather than
# sleeping for a fixed time that may be too short or needlessly long.
for _ in range(60):
    try:
        # any HTTP response at all means the server is accepting requests
        requests.get("http://localhost:9200", timeout=2)
        break
    except requests.exceptions.RequestException:
        time.sleep(1)
else:
    # the loop finished without a single successful request
    raise RuntimeError("ElasticSearch did not become reachable on http://localhost:9200 within 60 seconds")

In [3]:
def get_index(n):
    """Build a random string of n lowercase ASCII letters."""
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(n):
        # one independent draw per position
        picked.append(random.choice(alphabet))
    return ''.join(picked)

def get_stop_words(stop_file_path):
    """Read a UTF-8 text file and return its whitespace-stripped lines as a frozenset."""
    with open(stop_file_path, 'r', encoding="utf-8") as handle:
        # every line becomes one entry; surrounding whitespace/newlines are removed
        return frozenset(line.strip() for line in handle.readlines())

In [4]:
def trim_doc(doc, threshold=None):
    """Trim doc with respect to the boundary of a sentence.

    Sentences (split on '.') are kept whole, in order, until the running count
    of stripped sentence characters reaches the limit. The sentence that
    crosses the limit is kept in full, so the result may exceed the limit.

    Parameters
    ----------
    doc : str
        Text to trim.
    threshold : int, optional
        Character limit. When omitted, the module-level DOC_THRESHOLD is used.

    Returns
    -------
    str
        The kept sentences re-joined with '.'.
    """
    if threshold is None:
        threshold = DOC_THRESHOLD

    kept = []
    char_count = 0
    for sentence in doc.split('.'):
        # once the limit is reached no later sentence can be kept, so stop scanning
        if char_count >= threshold:
            break
        char_count += len(sentence.strip())
        kept.append(sentence)

    return ".".join(kept)


def clean_text(text):
    """Normalize a document: lowercase, strip punctuation, squeeze whitespace, trim.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lowercased text without any PUNCTUATION character, with every run of
        whitespace replaced by a single space, then shortened by trim_doc.
    """

    # Lowering text
    text = text.lower()

    # Removing punctuation: str.translate deletes every PUNCTUATION character in one pass
    text = text.translate(str.maketrans('', '', PUNCTUATION))

    # Removing whitespace and newlines (raw string: '\s' is not a valid escape in a plain literal)
    text = re.sub(r'\s+', ' ', text)

    # Trimming doc
    return trim_doc(text)

In [5]:
def sort_coo(coo_matrix):
    """Return the (column, value) pairs of a COO matrix, largest value first.

    Pairs with equal values are ordered by descending column index.
    """
    pairs = list(zip(coo_matrix.col, coo_matrix.data))
    pairs.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
    return pairs

def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    """Map the first topn (index, score) pairs to {feature name: score rounded to 3 decimals}.

    sorted_items is expected to be ordered already; only its first topn
    entries are used, and the returned dict preserves that order.
    """
    return {
        feature_names[col]: round(weight, 3)
        for col, weight in sorted_items[:topn]
    }

In [6]:
# Constants
ES_INDEX = get_index(10) # Elastic Search DB index name (random 10-letter string, regenerated on every run)
# All ASCII punctuation except . (full-stop), which is kept so sentences can still be split on it.
# Derived from string.punctuation instead of a hand-typed literal, whose "\]" was an invalid escape sequence.
PUNCTUATION = string.punctuation.replace(".", "")
DOC_THRESHOLD = 10000 # character limit for a doc
TOP_K_RETRIEVER = 10 # top k documents to analyze further for a given query
TOP_K_READER = 5 # top k number of answers to return
TOP_K_KEYWORDS = 10 # top k number of keywords to retrieve in a ranked document
BASE_URL = "http://localhost:9200/"+ES_INDEX+"/_doc/" # document endpoint of the index above
STOPWORD_PATH = "data/stopwords.txt"
question = "What is ROC curve?"

In [7]:
# Loading the papers dataset from CSV and previewing its first rows
data = pd.read_csv("data/papers.csv")
data.head()

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and Its Applications,,1-self-organization-of-associative-database-and-its-applications.pdf,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABASE\nAND ITS APPLICATIONS\nHisa...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cortex and Its Application to Arti...,,10-a-mean-field-theory-of-layer-iv-of-visual-cortex-and-its-application-to-a...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISUAL CORTEX\nAND ITS APPLICATION...
2,100,1988,Storing Covariance by the Associative Long-Term Potentiation and Depression ...,,100-storing-covariance-by-the-associative-long-term-potentiation-and-depress...,Abstract Missing,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\nLONG?TERM POTENTIATION AND DEP...
3,1000,1994,Bayesian Query Construction for Neural Network Models,,1000-bayesian-query-construction-for-neural-network-models.pdf,Abstract Missing,Bayesian Query Construction for Neural\nNetwork Models\nGerhard Paass\nJorg ...
4,1001,1994,"Neural Network Ensembles, Cross Validation, and Active Learning",,1001-neural-network-ensembles-cross-validation-and-active-learning.pdf,Abstract Missing,"Neural Network Ensembles, Cross\nValidation, and Active Learning\n\nAnders K..."


In [8]:
# (rows, columns) of the loaded dataset
data.shape

(7241, 7)

In [9]:
# Structuring data to haystack required format
# Format: [{'text': 'paper_content', 'meta':{'name':'title'}}]
docs = []      # one haystack-style dict per paper
corpora = []   # cleaned paper texts only
doc_len = []   # character length of each cleaned paper text

# Iterate over just the two needed columns; DataFrame.iterrows() would build a
# full Series object for every row only to read these two values.
for paper_text, title in zip(data['paper_text'], data['title']):
    cleaned_text = clean_text(paper_text)
    doc_len.append(len(cleaned_text))
    corpora.append(cleaned_text)
    docs.append({'text': cleaned_text, 'meta': {'name': clean_text(title)}})

In [10]:
# Average characters in a document after trimming
# (doc_len holds one cleaned-text length per entry of docs)
sum(doc_len)/len(docs)

10245.576577820742

In [11]:
# Document store backed by the ElasticSearch server on localhost, bound to the ES_INDEX index.
# Be careful while overwriting data on the same ES index
document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index=ES_INDEX)

11/27/2020 22:46:03 - INFO - elasticsearch -   PUT http://localhost:9200/oczbuiemph [status:200 request:0.880s]
11/27/2020 22:46:03 - INFO - elasticsearch -   HEAD http://localhost:9200/label [status:200 request:0.003s]


In [12]:
# Now, let's write the dicts containing documents to our DB.
# `docs` is the list of {'text': ..., 'meta': {'name': ...}} dicts built from the dataset.
document_store.write_documents(docs)

11/27/2020 22:46:04 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.327s]
11/27/2020 22:46:05 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.162s]
11/27/2020 22:46:07 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.106s]
11/27/2020 22:46:08 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.119s]
11/27/2020 22:46:09 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.098s]
11/27/2020 22:46:10 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.134s]
11/27/2020 22:46:11 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.109s]
11/27/2020 22:46:13 - INFO - elasticsearch -   POST http://localhost:9200/_bulk?refresh=wait_for [status:200 request:1.124s]


In [13]:
# Instantiating the ES retriever on top of the document store
retriever = ElasticsearchRetriever(document_store=document_store)

In [14]:
# Initializing the reader on top of the roberta-base-squad2 pre-trained model, which will be downloaded on the first run.
# Here, we can set the size of the context window for our answers and use the GPU if available
# (this run keeps inference on the CPU: use_gpu=False).

reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2",use_gpu=False, context_window_size=500)

11/27/2020 22:46:21 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
11/27/2020 22:46:21 - INFO - farm.infer -   Could not find `deepset/roberta-base-squad2` locally. Try to download from model hub ...
Some weights of RobertaModel were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
	 We guess it's an *ENGLISH* model ... 
	 If not: Init the language model by supplying the 'language' param.
11/27/2020 22:46:47 - INFO - farm.utils -   device: cpu n_gpu: 0, distributed training: False, automatic mixed precision training: None
11/27/2020 22:46:47 - INFO - farm.infer -   Got ya 7 parallel workers to do inference ...
11/27/2020 22:46:47 - INFO - farm.infer -    0    0    0    0    0    0    0 
11/27/2020 2

In [15]:
# Combining reader and retriever into a Finder, the object that is queried for answers
finder = Finder(reader, retriever)

In [16]:
# Question prediction with TOP_K_RETRIEVER and TOP_K_READER:
# TOP_K_RETRIEVER documents are analyzed for the query, TOP_K_READER answers are returned
prediction = finder.get_answers(question=question, top_k_retriever=TOP_K_RETRIEVER, top_k_reader=TOP_K_READER)


11/27/2020 22:46:47 - INFO - elasticsearch -   POST http://localhost:9200/oczbuiemph/_search [status:200 request:0.071s]
11/27/2020 22:46:47 - INFO - haystack.finder -   Got 10 candidates from retriever
11/27/2020 22:46:47 - INFO - haystack.finder -   Reader is looking for detailed answer in 102477 chars ...
Inferencing Samples: 100%|██████████| 1/1 [00:06<00:00,  6.62s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:06<00:00,  6.52s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:07<00:00,  7.78s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:08<00:00,  8.98s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:08<00:00,  8.21s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:09<00:00,  9.83s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:10<00:00, 10.49s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:07<00:00,  7.88s/ Batches]
Inferencing Samples: 100%|██████████| 1/1 [00:08<00:00,  8.26s/ Batches]
Inferencing Samples: 100%|███████

In [17]:
# Printing answers with minimal detail
# Accepted values for details: minimal | medium | all

print_answers(prediction, details="minimal")

[   {   'answer': 'a straight line connecting the origin to 1 1',
        'context': 'tion of the false positive rate. the points of the curve '
                   'are obtained by sweeping the classification threshold from '
                   'the most positive classification value to the most '
                   'negative. for a fully random classification the roc curve '
                   'is a straight line connecting the origin to 1 1. any '
                   'improvement over random classification results in an roc '
                   'curve at least partially above this straight line. fig. 1 '
                   'shows an example of roc curve. the auc is defined as the '
                   'area under the roc curve and is closely related to'},
    {   'answer': 'a piecewise linear function',
        'context': 'not assume the classifiers are independent or related in '
                   'any way. before introducing our method we analyze the '
                   'oneclassif

In [18]:
# Fetching the full stored document behind every answer directly from ElasticSearch
top_5_docs = []

for doc in prediction['answers']:
    DOC_URL = BASE_URL + doc['document_id']
    # timeout so an unresponsive server cannot hang the notebook indefinitely
    response = requests.get(DOC_URL, timeout=10)
    # documents that cannot be fetched (non-200 response) are skipped
    if response.status_code == 200:
        # decode the JSON body once and reuse it for both fields
        source = response.json()['_source']
        top_5_docs.append({
            'title': source['name'],
            'text': source['text'],
            'answer': doc['answer'],
        })

### Getting Top K keywords using TF-IDF Method

In [19]:
# Load the set of stop words
stopwords = get_stop_words(STOPWORD_PATH)

# Initializing TF-IDF Vectorizer with stopwords.
# They are passed as a (sorted) list because recent scikit-learn releases validate
# `stop_words` and accept only 'english', a list or None, not a frozenset.
vectorizer = TfidfVectorizer(stop_words=sorted(stopwords), smooth_idf=True, use_idf=True)

# Creating vocab with our corpora (fit only: the transformed matrix is not used here)
vectorizer.fit(corpora)

# Storing vocab. get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()



In [20]:
def get_keywords(vectorizer, feature_names, doc):
    """Pick the TOP_K_KEYWORDS highest-scoring TF-IDF terms of one document."""
    # score the single document with the vectorizer, then rank its (column, score) pairs
    ranked = sort_coo(vectorizer.transform([doc]).tocoo())

    # feature -> score mapping of the best TOP_K_KEYWORDS entries; only the terms are returned
    top_terms = extract_topn_from_vector(feature_names, ranked, TOP_K_KEYWORDS)
    return [term for term in top_terms]


In [21]:
# Adding a 'keywords' entry (computed from the document's full text) to every dict in top_5_docs, in place
for doc in top_5_docs:
    doc['keywords'] = get_keywords(vectorizer, feature_names, doc['text'])

In [22]:
# Tabulating the results: one row per entry of top_5_docs
final = pd.DataFrame(top_5_docs)

In [23]:
# Showing the question followed by the results table
# NOTE(review): the heading always says "Top 5", independent of TOP_K_READER
print(question)
print("Top 5 articles with keywords\n")
final

What is ROC curve?
Top 5 articles with keywords



Unnamed: 0,title,text,answer,keywords
0,auc optimization vs. error rate minimization,auc optimization vs. error rate minimization corinna cortes and mehryar mohr...,a straight line connecting the origin to 1 1,"[auc, roc, examples, rate, positive, negative, classification, threshold, cu..."
1,optimal roc curve for a combination of classifiers,optimal roc curve for a combination of classifiers marco barreno alvaro a. c...,a piecewise linear function,"[roc, pf, classifiers, curve, pd1, neymanpearson, pd, yh0, pry, h0]"
2,familiarity discrimination of radar pulses,familiarity discrimination of radar pulses eric grangerl stephen grossberg 2...,a plot of h vs. f parameterized by the thresholdy,"[familiarity, artmapfd, artmap, discrimination, fuzzy, radar, training, fami..."
3,confidence intervals for the area under the roc curve,confidence intervals for the area under the roc curve corinna cortes google ...,a straight line connecting the origin to 1 1,"[auc, confidence, intervals, roc, variance, negative, positive, examples, ra..."
4,overlaying classifiers a practical approach for optimal ranking,overlaying classifiers a practical approach for optimal ranking stephan clem...,the area under a roc curve,"[roc, scoring, curve, ranking, optimal, bipartite, curves, risk, sx, positive]"
