In [5]:
import json
import os
import random
import re
import tqdm
from tqdm import tqdm 

import numpy as np
import pandas as pd

In [16]:
d = {}
with open("../data/citation_generation/result_comps.jsonl") as f:
    for i, line in enumerate(f):
        d[i] = json.loads(line)
        
result_comps = pd.DataFrame.from_dict(d).T
display(result_comps.head())

Unnamed: 0,citation_sentence,manuscript_id,cited_id,background_prob,method_prob,result_prob,intent
0,"In addition, we here report comparable changes...",8281087,15228934,0.046504,0.010625,0.942871,result
1,"In addition, we here report comparable changes...",8281087,20547173,0.046504,0.010625,0.942871,result
2,"In addition, we here report comparable changes...",8281087,15671550,0.046504,0.010625,0.942871,result
3,Such a discrepancy may likely be due to dose-o...,8281923,6704669,0.45166,0.020438,0.527902,result
4,"Similar to earlier observations (42) , the inf...",11155963,4009171,0.223237,0.105094,0.671669,result


In [32]:
# how many citation sentences only cite one paper?
unique = result_comps.drop_duplicates(subset=["citation_sentence"], keep=False).reset_index(drop=True)
display(unique.head())

Unnamed: 0,citation_sentence,manuscript_id,cited_id,background_prob,method_prob,result_prob,intent
0,Such a discrepancy may likely be due to dose-o...,8281923,6704669,0.45166,0.020438,0.527902,result
1,"Similar to earlier observations (42) , the inf...",11155963,4009171,0.223237,0.105094,0.671669,result
2,"In a cohort study in Korea (20) , the highest ...",11158470,15153392,0.417643,0.049526,0.532832,result
3,This finding can be explained by the fact that...,11159272,14840523,0.01713,0.005576,0.977294,result
4,The distribution of flow velocity and WSS was ...,11159272,13786003,0.016776,0.019489,0.963735,result


In [37]:
print(unique.loc[0,"citation_sentence"])
print()

with open("../data/s2orc/s2orc_result_subset.jsonl", "r") as f:
    for line in f:
        p = json.loads(line)
        if p["paper_id"] != unique.loc[0,"manuscript_id"]:
            continue
        for para in p["body_text"]:
            print(para)
            print()
        break

Such a discrepancy may likely be due to dose-or species-specific differences since an acute administration of 3.3 mg/kg MDMA did not stimulate locomotion in mice (Scearce-Levie et al, 1999) , but did in rats ( Bankson and Cunningham, 2002) .

{'section': 'INTRODUCTION', 'text': 'It is well known that serotonin (5-HT) regulates feeding behavior, as shown by the potent anorectic properties of 5-HT releasers, such as fenfluramine. Pharmacological studies combined with knockout strategies have clearly demonstrated that among the 15 5-HT receptor subtypes, the 5-HT receptors are key elements regulating food intake in mammals. The 5-HT 1B , 5-HT 2A , and 5-HT 2C receptor agonists produce hypophagia (Bendotti and Samanin, 1987 ; Kennett and Curzon, 1988 ; Schechter and Simansky, 1988; Macor et al, 1990; Aulakh et al, 1994; Halford and Blundell, 1996 ; Lee and Simansky, 1997; Lucas et al, 1998; Vickers et al, 2001; Lee et al, 2004) . Conversely, the inactivation of 5-HT 2C receptors alters fen

In [42]:
unique.to_json("./unique_result_comps.jsonl", orient="records", lines=True)

### 2.1. Extracting citation sentencesduplicated

In [12]:
from nltk.tokenize.punkt import PunktParameters, PunktSentenceTokenizer

# tokenizer should not split at abbrieviations
punkt_params = PunktParameters()
punkt_params.abbrev_types = set(["i.e", "e.g", "etc", "al", "fig", "figs", 
                                 "ref", "refs", "p", "c", "s"]) 

# initialise sentence tokenizer
punkt_sent_tokenizer = PunktSentenceTokenizer(punkt_params)

In [87]:
# extract correct citation sentences
citation_sentences = []

## for each paper, 
for i in tqdm(range(len(s2orc))):
    manuscript_id = s2orc.loc[i, "paper_id"]
    full_text = s2orc.loc[i, "body_text"]

    for paragraph in full_text:
        ## skip paragraphs that are not in "discussion" section 
        section_name = paragraph["section"].lower()
        if "discuss" not in section_name and "conclu" not in section_name:
            continue 
            
        ## also skip paragraphs with no citation sentences
        if not paragraph["cite_spans"]: 
            continue

        ## tokenize paragraph into sentences 
        paragraph_text = paragraph["text"]
        endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))

        j = 0

        ## for each citation marker, 
        for cite_span in paragraph["cite_spans"]:
            cite_id = cite_span["cite_id"]
            cite_text = cite_span["text"]
            start, end = cite_span["start"], cite_span["end"]

            ## extract the sentence containing the citation marker
            a, b = endpoints[j]
            while start >= b:
                j += 1
                a, b = endpoints[j]

            ## if citation marker is textual or sentence begins with words, 
            ## assume extracted sentence is true citation sentence
            ## else, take previous sentence instead
            textual = re.search('[a-zA-Z]', cite_text) 
            if not (textual or re.search("[a-zA-Z]", paragraph_text[a:start])): 
                a, b = endpoints[j-1]

            citation_sentence = paragraph_text[a:b]
            citation_sentences.append((citation_sentence, manuscript_id, cite_id))

# convert to pd.DataFrame
citation_sentences = pd.DataFrame(
    citation_sentences, 
    columns=["citation_sentence", "manuscript_id", "cited_id"]).drop_duplicates()

100%|██████████| 100000/100000 [00:27<00:00, 3655.61it/s]


In [89]:
outpath = "../misc/citation_sentences.jsonl"
if os.path.exists(outpath):
    d = load_data(outpath)
    citation_sentences = pd.DataFrame.from_dict(d).T
else: 
    # save as json
    citation_sentences.to_json(outpath, orient="records", lines=True)
    
citation_sentences

Unnamed: 0,citation_sentence,manuscript_id,cited_id
0,"Specifically, we generalized the distributiona...",18980380,7229756
1,Further studies are needed to understand wheth...,18981358,21704892
2,The absence of changes in the perceived positi...,18981358,11263174
3,"In addition to enzymes, some ligands, such as ...",18982781,25252408
4,"Furthermore, the number of estimated deaths (5...",18985891,205233560
...,...,...,...
189865,"In addition, abnormal inflammatory input may i...",11685058,3548631
189866,Current and proposed clinical guidelines allow...,11685058,18494690
189867,Many investigators are dedicated to unveil the...,11685696,8854921
189868,Obesity is an independent risk factor for OSA ...,11685696,26741728


### 2.2. Filter out citation sentences with wrong intent

In [82]:
# preprocess citation sentences
dataloader = DataLoader(citation_sentences["citation_sentence"], 
                        sampler=SequentialSampler(citation_sentences["citation_sentence"]), 
                        batch_size=BATCH_SIZE)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_WEIGHTS)

# get input_ids, attention_masks encodings
input_ids = []
attention_masks = []
for batch in tqdm(dataloader):
    b_input_ids, b_attn_mask = tokenizer(batch, 
                                         max_length=MAX_LEN, 
                                         padding="max_length",
                                         truncation=True,  
                                         add_special_tokens=True, 
                                         return_token_type_ids=False).values()

    input_ids.extend(b_input_ids)
    attention_masks.extend(b_attn_mask)
    
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

100%|██████████| 5934/5934 [00:15<00:00, 386.97it/s]


In [85]:
# create dataloader for model input
encodings = TensorDataset(input_ids, attention_masks)

# process 1000 sentences each time
model_input = DataLoader(encodings[:100], 
                         sampler=SequentialSampler(encodings[:100]), 
                         batch_size=BATCH_SIZE)

In [86]:
# pass citation sentences through intent classifier 
# output index_to_prob (dict)
# - index (int): index of a row whose citation sentence has intent "result-comparison"
# - prob (float): computed probability of "result-comparison" intent

model.to(device)
model.eval()

i, m = 0, 0
index_to_prob = {}
for batch in tqdm(model_input): 
        
    b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)
    
    with torch.no_grad():
        # forward pass through model
        logits = model(b_input_ids, b_attn_mask)
        
    
    # compute probabilities of each label
    probs = F.softmax(logits, dim=1).cpu().numpy()
        
    # get indices where "result-comparison" has highest probability
    ind = np.where(np.argmax(probs, axis=1) == 2)[0]

    # create dict of (index, prob) pairs and update index_to_prob
    # index = ind + batch_size * i = row number in original input data
    d = dict(zip(ind + batch_size * i, probs[ind,2]))
    index_to_prob.update(d)
        
    i += 1
    m += len(ind)

  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 150.00 MiB (GPU 0; 7.77 GiB total capacity; 3.71 GiB already allocated; 17.50 MiB free; 3.86 GiB reserved in total by PyTorch)

In [12]:
d = load_data("./citation_sentences_subset.jsonl")
citation_sentences_0_2 = pd.DataFrame.from_dict(d).T
citation_sentences_0_2

Unnamed: 0,citation_sentence,manuscript_id,cited_id
0,These ␤␣␤␤␣ folds have also been detected in o...,25056663,28215109
1,KH domains consist of a three-stranded ␤ sheet...,25056663,28215109
2,"However, the typical GXXG motif required for D...",25056663,28215109
3,(24 -26) We also confirm the welldocumented re...,25593307,11387986
4,The N-terminal part of S. cerevisiae contains ...,25593541,26887717
...,...,...,...
240484,"In this regard, it was also noticeable that th...",8359616,17485379
240485,"In this regard, it was also noticeable that th...",8359616,14591
240486,The other eight SP-proteins identified in Sus ...,8359616,24241995
240487,"Noticeable, deoxyribonuclease -2-alpha, an aci...",8359616,39723204


In [15]:
d = load_data("./citation_sentences_3_84.jsonl")
citation_sentences_3_84 = pd.DataFrame.from_dict(d).T
citation_sentences_3_84

Unnamed: 0,citation_sentence,manuscript_id,cited_id
0,A study in 5 government and 12 private hospita...,25214135,44347243
1,"Moreover, these health disparities are not exp...",25217127,41231576
2,Another important area of future study is the ...,25217127,19474685
3,"However, due to limitations in the computation...",21610508,18523563
4,According to published anecdotal descriptions ...,21617327,67777532
...,...,...,...
10145782,"However, the mesh, as a foreign body itself, u...",22603883,25101686
10145783,"Interestingly, previous research with postinst...",7530033,30892
10145784,We and others have previously demonstrated tha...,7530965,39845408
10145785,We and others have previously demonstrated tha...,7530965,10791690
