In [1]:
%%capture

import json
import tqdm
import nltk
import numpy as np

nltk.download('punkt')

In [2]:
dataset_id = 1
datasets = ['peerread', 'acl', 'refseer', 'arxiv']
dataset = datasets[dataset_id]

In [3]:
with open('./' + dataset + '/' + 'papers.json', 'r+') as f:
    papers = json.load(f)
    
with open('./' + dataset + '/' + 'contexts.json', 'r+') as f:
    contexts = json.load(f)

mapping_types = ['train', 'val', 'test']

with open('./' + dataset + '/' + mapping_types[2] + '.json', 'r+') as f:
    raw_test = json.load(f)

In [4]:
def process(text):
    tokens = [token for token in nltk.word_tokenize(text) if token != ' '
                                                          and token != 'OTHERCIT'
                                                          and token != 'mcOTHERCIT']
    return tokens

def isolate_sentence(text):
    sents = nltk.sent_tokenize(text)
    main_sent = ''
    for sent in sents:
        if 'TARGETCIT' in sent:
            main_sent = sent
            break
            
    tokens = [token for token in nltk.word_tokenize(main_sent) if token != ''
                                                               and token != 'TARGETCIT'
                                                               and token != 'OTHERCIT']
    return tokens

def tokens_to_string(tokens):
    return ' '.join(tokens).strip()

In [5]:
test = []

def fill_test(datapoints, papers, contexts):
    global test
    for unit in tqdm.tqdm(datapoints):
        context_id = unit['context_id']
        paper_ids = unit['positive_ids']

        paper_node = papers[paper_ids[0]]
        context_text = contexts[context_id]['masked_text']

        blob = {}
        blob['cite_span'] = tokens_to_string(isolate_sentence(context_text))
        blob['cite_context'] = tokens_to_string(process(context_text))
        blob['raw_context_id'] = context_id
        blob['paper_id'] = paper_ids[0]

        blob['paper'] = paper_node

        test.append(blob)
    
fill_test(raw_test, papers, contexts)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9585/9585 [00:03<00:00, 3035.22it/s]


In [6]:
def get_candidates(index):
    # Create Candidate Citation Spans from constructed index
    candidate_texts = []
    
    for blob in tqdm.tqdm(index):
        candidate_texts.append(blob['cite_context'])
    
    return candidate_texts

with open('./' + dataset + '/' + 'index.json', 'r+') as f:
    index = json.load(f)
    
candidates = get_candidates(index)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 39771/39771 [00:00<00:00, 2940345.21it/s]


In [7]:
candidates

['e and target languages are annotated with a coarse parts-of-speech tagset which is shared across languages . Such tagsets are commonly used in multilingual parsing ( ; McOTHERCIT ; Søgaard , 2011 ; TARGETCIT . The key feature of our model is a two-tier approach that separates the selection of dependents from their ordering : 1 . Selection Component : Determines the dependent tags given the parent tag . 2 . Ord',
 'ms with a brevity penalty ) is computed with human abstracts as reference . BLEU has a fairly good agreement with human judgement and has been used to evaluate a variety of language generation systems ( TARGETCIT ; ) . 4We use SVMlight ( ) with RBF kernel by default parameters for SVM-based classifiers and regressor . 5The four types of meetings in AMI are : project kick-off ( 35 meetings ) , function',
 'ng the sets of elementary dependency triples ( Oepen and Lønning , 2006 ) extracted from the returned and gold MRS . These annotations are similar in spirit to those used 

In [8]:
%%capture

!pip install rank_bm25
from rank_bm25 import BM25Okapi

In [9]:
# Creating BM25 fetcher
bm25 = BM25Okapi(candidates)
fetch_count = 10
outputs = []

for datapoint in tqdm.tqdm(test[0:10]):
    query = datapoint['cite_context']
    doc_scores = bm25.get_scores(query)
    best_docs = np.ndarray.tolist(np.argsort(doc_scores)[:: -1][0: fetch_count])
    outputs.append(best_docs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:32<00:00,  3.24s/it]


In [10]:
# Viewing Extractions

def view(point):
    global test, outputs, index
    print('Query cite span: ' + str(test[point]['cite_span']))
    print('Query cite context: ' + str(test[point]['cite_context']))
    print('True citation: ' + str(test[point]['paper']['title']))
    
    for serial, item in enumerate(outputs[point]):
        print('Fetched span ' + str(serial + 1) + ':')
        print('Span: ' + index[item]['cite_span'])
        print('Suggested Citation: ' + index[item]['paper']['title'])

In [11]:
view(0)

Query cite span: These measures have been shown to correlate best with human judgments in general , but among the automatic measures , ROUGE-1 and ROUGE-2 also correlate best with the Pyramid ( ; ) and Responsiveness manual metrics ( ) .
Query cite context: retaining all stopwords . These measures have been shown to correlate best with human judgments in general , but among the automatic measures , ROUGE-1 and ROUGE-2 also correlate best with the Pyramid ( TARGETCIT ; ) and Responsiveness manual metrics ( ) . Moreover , ROUGE-1 has been shown to best reflect human-automatic summary comparisons ( ) . For single concept systems , the results are s
True citation: Evaluating Content Selection in Summarization: The Pyramid Method
Fetched span 1:
Span: ROUGE variations ( ROUGE1 , ROUGE-2 , ROUGE-3 , ROUGE-4 ) ( and the AutoSummENG-MeMoG ( ) and NPowER ( ) methods were used to automatically evaluate the summarization systems .
Suggested Citation: ROUGE: A Package for Automatic Evaluation of S

In [12]:
# That's it