## 1. Set-up

In [1]:
import json
import os
import random
import re
import time
import tqdm
from tqdm import tqdm 

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

import sys
sys.path.append('..')
from citation_intent_classification import CiteBERT

In [2]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print("No GPU available, using the CPU instead.")
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # report how many GPUs are visible and the name of the first one
    print(f"There is {torch.cuda.device_count()} GPU(s) available.")
    print("Device name:", torch.cuda.get_device_name(0))

There is 1 GPU(s) available.
Device name: GeForce RTX 2070


## 2. S2ORC dataset

In [3]:
def load_data(filepath):
    """Read a JSON-lines file into a dict of parsed records.

    Args:
        filepath: path to a UTF-8 text file holding one JSON document per line.

    Returns:
        dict mapping a 0-based record number to the parsed JSON value of
        that record. Blank lines (e.g. a trailing newline at the end of the
        file) are skipped, and keys stay contiguous so a DataFrame built
        from the result has a 0..n-1 index.

    Raises:
        FileNotFoundError: if ``filepath`` does not exist.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    d = {}
    # explicit encoding: JSON is UTF-8; do not depend on the platform default
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate empty lines instead of crashing in json.loads
            d[len(d)] = json.loads(line)
    return d

# Load the S2ORC full-body subset into a DataFrame (one row per paper).
# The default location is machine-specific; set the S2ORC_PATH environment
# variable to point at the .jsonl file when running elsewhere.
s = time.time()
filepath = os.environ.get(
    "S2ORC_PATH", "/home/jessica/data/s2orc/s2orc_fullbody_subset_100K.jsonl")
if not os.path.isfile(filepath):
    # fail early with an actionable message instead of a bare open() error
    raise FileNotFoundError(
        f"S2ORC subset not found at '{filepath}'. "
        "Set the S2ORC_PATH environment variable to the .jsonl file.")
d = load_data(filepath)
# load_data returns {row_number: record}; transpose so records become rows
s2orc = pd.DataFrame.from_dict(d).T
print(f"Time taken: {round(time.time()-s,4)} seconds")

display(s2orc.head())

# drop unnecessary columns
s2orc = s2orc[["paper_id", "body_text"]]

FileNotFoundError: [Errno 2] No such file or directory: '/home/jessica/data/s2orc/s2orc_fullbody_subset_100K.jsonl'

### 2.1. Extracting citation sentences

In [8]:
# Sentence tokenizer that should not split at these abbreviations
# (listed lower-case, without the trailing period).
punkt_params = PunktParameters()
punkt_params.abbrev_types = {
    "i.e", "e.g", "etc", "al",
    "fig", "figs", "ref", "refs",
    "p", "c", "s",
}

# build the Punkt sentence tokenizer from the customised parameters
punkt_sent_tokenizer = PunktSentenceTokenizer(punkt_params)

In [11]:
# Naive attempt on the first paper: for every citation marker, print the
# tokenized sentence whose span contains the marker's start offset.
example_paragraphs = s2orc.loc[0, "body_text"]
for para in example_paragraphs:
    markers = para["cite_spans"]

    # paragraphs without citation markers are not interesting here
    if not markers:
        continue

    text = para["text"]
    print(text)

    # (start, end) character offsets of each sentence in the paragraph
    sentence_spans = list(punkt_sent_tokenizer.span_tokenize(text))

    # sentence pointer; it only ever moves forward across the markers
    sent_idx = 0
    for marker in markers:
        marker_text = marker["text"]
        marker_start, marker_end = marker["start"], marker["end"]

        # advance until the current sentence ends after the marker starts
        while sentence_spans[sent_idx][1] <= marker_start:
            sent_idx += 1
        sent_from, sent_to = sentence_spans[sent_idx]

        print(f"\n{marker_text}: {text[sent_from:sent_to]}")

Recently, we found that ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, could be used as effective gene carriers. 15, 16 They possess good gene transfection properties. To further improve the performance of PGMA-based gene carriers, several strategies have been applied such as polysaccharide introduction and target molecule binding. 16, 17 Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field. 18, 19 The construction of supramolecular polycations via host-guest interaction is a popular strategy for high-efficiency gene delivery systems. 20 In particular, cyclodextrins (CDs) and their derivatives have been widely utilized for constructing supramolecular gene delivery systems, mainly because of their superior biocompatibility. [21] [22] [23] With the host-guest interaction strategy, we successfu

In [27]:
# extract based on citation marker type
# if marker is textual, e.g. "Devlin et al., 2018", "CS93", "(Ehrman, 2020)"
#     then extract sentence containing the marker
#
# else marker is non-textual, e.g. "21", "[1]"
#     if marker is at the start of the sentence containing it, e.g. "[21] [22] With the host-guest ..."
#         then marker actually belongs to the previous sentence
#         extract previous sentence (if there is one)
#
#     else there are words before the marker, e.g. "It has been shown in [21] that ..."
#         then marker belongs to this sentence
#         extract sentence containing the marker

for paragraph in s2orc.loc[0, "body_text"]:
    # skip paragraphs with no citations
    cite_spans = paragraph["cite_spans"]
    if not cite_spans:
        continue

    paragraph_text = paragraph["text"]
    print(paragraph_text)

    # tokenize the paragraph into sentences: list of (start, end) offsets
    endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))
    if not endpoints:
        # nothing to attribute the markers to (e.g. whitespace-only text)
        continue
    last = len(endpoints) - 1

    # sentence pointer; only moves forward, so this assumes the markers are
    # ordered by their start offset -- TODO confirm for the dataset
    j = 0
    # for each citation marker,
    for cite_span in cite_spans:
        cite_text = cite_span["text"]
        start, end = cite_span["start"], cite_span["end"]

        # get the sentence that contains cite_text; never walk past the last
        # sentence even if the marker offset lies beyond every sentence span
        a, b = endpoints[j]
        while start >= b and j < last:
            j += 1
            a, b = endpoints[j]

        # get the correct citation sentence
        textual = re.search("[a-zA-Z]", cite_text)
        words_before = re.search("[a-zA-Z]", paragraph_text[a:start])
        # Only step back when a previous sentence exists: with j == 0 the
        # index j-1 would be -1 and silently select the LAST sentence.
        if not (textual or words_before) and j > 0:
            a, b = endpoints[j-1]

        print(f"\n{cite_text} - {paragraph_text[a:b]}")

Recently, we found that ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, could be used as effective gene carriers. 15, 16 They possess good gene transfection properties. To further improve the performance of PGMA-based gene carriers, several strategies have been applied such as polysaccharide introduction and target molecule binding. 16, 17 Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field. 18, 19 The construction of supramolecular polycations via host-guest interaction is a popular strategy for high-efficiency gene delivery systems. 20 In particular, cyclodextrins (CDs) and their derivatives have been widely utilized for constructing supramolecular gene delivery systems, mainly because of their superior biocompatibility. [21] [22] [23] With the host-guest interaction strategy, we successfu

In [87]:
# Extract the citation sentence for every citation marker found in the
# discussion / conclusion paragraphs of every paper, then collect the unique
# (citation_sentence, manuscript_id, cited_id) rows in a DataFrame.
citation_sentences = []

## for each paper,
# iterate over the column values directly so the loop does not depend on the
# DataFrame having a default 0..n-1 index
papers = zip(s2orc["paper_id"], s2orc["body_text"])
for manuscript_id, full_text in tqdm(papers, total=len(s2orc)):

    for paragraph in full_text:
        ## skip paragraphs that are not in a discussion or conclusion section
        ## (a missing / null section name is treated as "no match")
        section_name = (paragraph.get("section") or "").lower()
        if "discuss" not in section_name and "conclu" not in section_name:
            continue

        ## also skip paragraphs with no citation sentences
        if not paragraph["cite_spans"]:
            continue

        ## tokenize paragraph into sentences: list of (start, end) offsets
        paragraph_text = paragraph["text"]
        endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))
        if not endpoints:
            ## no sentence to attribute the markers to
            continue
        last = len(endpoints) - 1

        ## sentence pointer; only moves forward, so this assumes the markers
        ## are ordered by their start offset -- TODO confirm for the dataset
        j = 0

        ## for each citation marker,
        for cite_span in paragraph["cite_spans"]:
            cite_id = cite_span["cite_id"]
            cite_text = cite_span["text"]
            start, end = cite_span["start"], cite_span["end"]

            ## extract the sentence containing the citation marker, without
            ## ever walking past the last sentence
            a, b = endpoints[j]
            while start >= b and j < last:
                j += 1
                a, b = endpoints[j]

            ## if citation marker is textual or sentence begins with words,
            ## assume extracted sentence is true citation sentence
            ## else, take previous sentence instead -- but only when one
            ## exists: endpoints[-1] would silently pick the LAST sentence
            textual = re.search('[a-zA-Z]', cite_text)
            words_before = re.search("[a-zA-Z]", paragraph_text[a:start])
            if not (textual or words_before) and j > 0:
                a, b = endpoints[j-1]

            citation_sentence = paragraph_text[a:b]
            citation_sentences.append((citation_sentence, manuscript_id, cite_id))

# convert to pd.DataFrame
citation_sentences = pd.DataFrame(
    citation_sentences,
    columns=["citation_sentence", "manuscript_id", "cited_id"]).drop_duplicates()

100%|██████████| 100000/100000 [00:27<00:00, 3655.61it/s]


In [89]:
# Cache of the extracted citation sentences (JSON lines, one record per row).
# First run: write the freshly extracted frame to disk.
# Later runs: the file on disk replaces whatever was computed above.
outpath = "../misc/citation_sentences.jsonl"
if not os.path.exists(outpath):
    # save as json
    citation_sentences.to_json(outpath, orient="records", lines=True)
else:
    cached_records = load_data(outpath)
    citation_sentences = pd.DataFrame.from_dict(cached_records).T

citation_sentences

Unnamed: 0,citation_sentence,manuscript_id,cited_id
0,"Specifically, we generalized the distributiona...",18980380,7229756
1,Further studies are needed to understand wheth...,18981358,21704892
2,The absence of changes in the perceived positi...,18981358,11263174
3,"In addition to enzymes, some ligands, such as ...",18982781,25252408
4,"Furthermore, the number of estimated deaths (5...",18985891,205233560
...,...,...,...
189865,"In addition, abnormal inflammatory input may i...",11685058,3548631
189866,Current and proposed clinical guidelines allow...,11685058,18494690
189867,Many investigators are dedicated to unveil the...,11685696,8854921
189868,Obesity is an independent risk factor for OSA ...,11685696,26741728


### 2.2. Filter out citation sentences with wrong intent

In [81]:
# prepare citation intent classifier
# MAX_LEN: token length every sentence is padded / truncated to
# BATCH_SIZE: number of sentences per batch
MAX_LEN, BATCH_SIZE = 512, 32
PRETRAINED_WEIGHTS="allenai/scibert_scivocab_uncased"

model = CiteBERT(PRETRAINED_WEIGHTS, D_out=3)
# Deserialize the weights onto the CPU: this works on machines without a GPU
# even if the checkpoint was saved from one, and avoids keeping a second copy
# of the weights in GPU memory. The model is moved to the device later.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
state_dict = torch.load("../models/CiteSciBERT", map_location="cpu")
model.load_state_dict(state_dict)

<All keys matched successfully>

In [82]:
# preprocess citation sentences
# Batch over a plain list rather than the pandas Series: a DataLoader fetches
# items as dataset[i], and on a Series with an integer index that is a LABEL
# lookup, which breaks (or misaligns rows) once drop_duplicates() has left
# gaps in the index.
sentences = citation_sentences["citation_sentence"].tolist()
dataloader = DataLoader(sentences,
                        sampler=SequentialSampler(sentences),
                        batch_size=BATCH_SIZE)
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_WEIGHTS)

# get input_ids, attention_masks encodings
input_ids = []
attention_masks = []
for batch in tqdm(dataloader):
    # every sentence is padded / truncated to exactly MAX_LEN tokens
    encoded = tokenizer(batch,
                        max_length=MAX_LEN,
                        padding="max_length",
                        truncation=True,
                        add_special_tokens=True,
                        return_token_type_ids=False)

    # read the fields by name instead of relying on the order of .values()
    input_ids.extend(encoded["input_ids"])
    attention_masks.extend(encoded["attention_mask"])

# row k of both tensors corresponds to the k-th row (by position) of
# citation_sentences
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)

100%|██████████| 5934/5934 [00:15<00:00, 386.97it/s]


In [85]:
# create dataloader for model input
encodings = TensorDataset(input_ids, attention_masks)

# process 1000 sentences each time
model_input = DataLoader(encodings[:100], 
                         sampler=SequentialSampler(encodings[:100]), 
                         batch_size=BATCH_SIZE)

In [86]:
# pass citation sentences through intent classifier
# output index_to_prob (dict)
# - index (int): position, within the model input, of a row whose citation
#                sentence has intent "result-comparison" (label 2)
# - prob (float): computed probability of "result-comparison" intent
# also tracked: i = number of batches processed, m = number of matching rows

model.to(device)
model.eval()

i, m = 0, 0
offset = 0  # number of rows seen before the current batch
index_to_prob = {}
for batch in tqdm(model_input):

    b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        # forward pass through model
        logits = model(b_input_ids, b_attn_mask)

    # compute probabilities of each label
    probs = F.softmax(logits, dim=1).cpu().numpy()

    # get indices where "result-comparison" has highest probability
    ind = np.where(np.argmax(probs, axis=1) == 2)[0]

    # create dict of (index, prob) pairs and update index_to_prob
    # index = ind + offset = row number in original input data
    # (offset is accumulated from the real batch lengths, so a shorter final
    # batch cannot shift the numbering); .tolist() gives plain int / float
    d = dict(zip((ind + offset).tolist(), probs[ind, 2].tolist()))
    index_to_prob.update(d)

    offset += len(probs)
    i += 1
    m += len(ind)

  0%|          | 0/1 [00:00<?, ?it/s]


RuntimeError: CUDA out of memory. Tried to allocate 150.00 MiB (GPU 0; 7.77 GiB total capacity; 3.71 GiB already allocated; 17.50 MiB free; 3.86 GiB reserved in total by PyTorch)