In [10]:
import json
import os
import random
import re
import time

import numpy as np
import pandas as pd
# keep `import tqdm` before `from tqdm import tqdm` so the name `tqdm` ends up bound to the progress-bar class
import tqdm
from tqdm import tqdm 

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, ConcatDataset, DataLoader
from torch.utils.data import RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel

In [2]:
def set_seed(seed_value=42):
    """Seed every random number generator this notebook imports.

    Args:
        seed_value: Integer seed applied to Python's ``random`` module,
            NumPy's global RNG and PyTorch.
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    # torch is imported by this notebook, so it has to be seeded too;
    # otherwise any torch-side randomness differs from run to run.
    torch.manual_seed(seed_value)


set_seed()

In [3]:
def load_data(filepath):
    """Read a JSON Lines file into a DataFrame with one row per record.

    Args:
        filepath: Path to a UTF-8 encoded file holding one JSON object per line.

    Returns:
        pandas.DataFrame whose rows follow file order (index 0..n-1) and whose
        columns are the JSON keys. Columns keep ``object`` dtype so the parsed
        values (lists, dicts, numbers, ...) are stored untouched.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines, e.g. an extra newline at the end of the file.
            if line.strip():
                records.append(json.loads(line))
    # Build the frame straight from the records instead of creating a wide
    # frame (one column per record) and transposing it.
    return pd.DataFrame(records, dtype=object)

# load the S2ORC full-body subset and report how long the load took
s = time.time()
fullbody = load_data("/home/jessica/data/s2orc/s2orc_fullbody_subset_100K.jsonl")
elapsed = round(time.time() - s, 4)
print(f"Time taken: {elapsed}")

Time taken: 27.3759


In [4]:
# preview the first rows of the loaded papers
fullbody.head()

Unnamed: 0,paper_id,title,authors,abstract,year,arxiv_id,acl_id,pmc_id,pubmed_id,doi,...,inbound_citations,has_outbound_citations,has_inbound_citations,has_pdf_parse,has_pdf_parsed_abstract,has_pdf_parsed_body_text,has_pdf_parsed_bib_entries,has_pdf_parsed_ref_entries,s2_url,body_text
0,94551546,Gd(III) ion-chelated supramolecular assemblies...,"[{'first': 'Yu', 'middle': [], 'last': 'Zhao',...",An intricate polymer complex can carry genes t...,2015,,,,,10.1038/am.2015.67,...,[543091],True,True,True,True,True,True,True,https://api.semanticscholar.org/CorpusID:94551546,"[{'section': 'INTRODUCTION', 'text': 'Gene the..."
1,18980380,Distributionally Robust Counterpart in Markov ...,"[{'first': 'Pengqian', 'middle': [], 'last': '...",This technical note studies Markov decision pr...,2016,1501.07418,,,,10.1109/TAC.2015.2495174,...,"[18980380, 202130625, 12287826, 195316976, 206...",True,True,True,True,True,True,True,https://api.semanticscholar.org/CorpusID:18980380,"[{'section': '', 'text': '. Illustration of th..."
2,18980463,Adult and larval photoreceptors use different ...,"[{'first': 'S. G.', 'middle': [], 'last': 'Spr...",Although development of the adult Drosophila c...,2007,,,PMC1950857,17785526.0,10.1101/gad.1565407,...,"[201966468, 16579577, 14667576, 20024434, 1344...",True,True,True,True,True,True,True,https://api.semanticscholar.org/CorpusID:18980463,"[{'section': '', 'text': 'In spite of the morp..."
3,18981111,Exploration of Unknown Spaces by People Who Ar...,"[{'first': 'Orly', 'middle': [], 'last': 'Laha...",Exploration of unknown spaces is essential for...,2004,,,,,10.1177/016264340401900302,...,"[6869367, 24873259, 69173440, 12309066, 357763...",True,True,True,False,True,True,True,https://api.semanticscholar.org/CorpusID:18981111,"[{'section': '', 'text': 'The ability to explo..."
4,18981358,Rubber hand illusion induced by touching the f...,"[{'first': 'Michele', 'middle': [], 'last': 'S...",Background: Studies in animals and humans indi...,2014,,,PMC4050649,24959128.0,10.3389/fnhum.2014.00404,...,"[209325151, 206095004, 55701872, 5015858, 5802...",True,True,True,True,True,True,True,https://api.semanticscholar.org/CorpusID:18981358,"[{'section': 'INTRODUCTION', 'text': 'Spinal c..."


In [5]:
# list the fields available for each paper
fullbody.columns

Index(['paper_id', 'title', 'authors', 'abstract', 'year', 'arxiv_id',
       'acl_id', 'pmc_id', 'pubmed_id', 'doi', 'venue', 'journal',
       'has_pdf_body_text', 'mag_id', 'mag_field_of_study',
       'outbound_citations', 'inbound_citations', 'has_outbound_citations',
       'has_inbound_citations', 'has_pdf_parse', 'has_pdf_parsed_abstract',
       'has_pdf_parsed_body_text', 'has_pdf_parsed_bib_entries',
       'has_pdf_parsed_ref_entries', 's2_url', 'body_text'],
      dtype='object')

In [6]:
# keep only the paper identifier and the parsed body text
KEPT_COLUMNS = ["paper_id", "body_text"]
fullbody = fullbody.loc[:, KEPT_COLUMNS]
fullbody.head()

Unnamed: 0,paper_id,body_text
0,94551546,"[{'section': 'INTRODUCTION', 'text': 'Gene the..."
1,18980380,"[{'section': '', 'text': '. Illustration of th..."
2,18980463,"[{'section': '', 'text': 'In spite of the morp..."
3,18981111,"[{'section': '', 'text': 'The ability to explo..."
4,18981358,"[{'section': 'INTRODUCTION', 'text': 'Spinal c..."


In [7]:
# work with one paper for now
paper = fullbody.loc[0]
print(paper["paper_id"])

# show the body as a single string, paragraphs separated by one space
paragraph_texts = (paragraph["text"] for paragraph in paper["body_text"])
print(" ".join(paragraph_texts))

94551546
Gene therapy holds potential for treating many severe diseases, such as cancer and genetic diseases. 1 Successful gene therapy depends on highefficiency gene delivery processes, in which the gene carriers have an essential role. The application of traditional viral vectors has been a challenge because of their toxicity, immunogenicity and low capability for scaling up. 2 There has long been a scientific demand for developing non-viral gene delivery systems that can overcome the drawbacks of viral vectors. 3 Non-viral gene delivery has been advanced by the rapid development of materials science and technology. Numerous novel gene delivery systems have been proposed based on functional cationic polymers, such as polyethylenimine (PEI), [4] [5] [6] [7] [8] [9] poly(2-(dimethylamino) ethyl methacrylate), 4,10 poly(L-lysine), 11 poly(aspartic acid) 12, 13 and polyamidoamine. 14 However, these non-viral gene carriers still have shortcomings, including cytotoxicity, low transfection 

In [8]:
# initialise sentence tokenizer
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters

# abbreviations at which the tokenizer should not split a sentence
NON_BREAKING_ABBREVIATIONS = (
    "i.e", "etc", "al",
    "fig", "figs",
    "ref", "refs",
    "p", "c", "s",
)

punkt_param = PunktParameters()
punkt_param.abbrev_types = set(NON_BREAKING_ABBREVIATIONS)
punkt_sent_tokenizer = PunktSentenceTokenizer(punkt_param)

In [9]:
# (try to) extract citation sentence
# naive version: report the tokenized sentence whose character span encloses each marker
citation_sentences = []
for para in paper["body_text"]:
    markers = para["cite_spans"]

    # only paragraphs that actually cite something are of interest
    if markers:
        text = para["text"]
        print(text)
        print()

        # (start, end) character offsets of every sentence in the paragraph
        sentence_spans = list(punkt_sent_tokenizer.span_tokenize(text))

        for marker in markers:
            marker_start = marker["start"]
            marker_end = marker["end"]

            # print the sentence that fully encloses the citation marker
            for sent_start, sent_end in sentence_spans:
                if sent_start <= marker_start and marker_end <= sent_end:
                    print(f"{marker['text']}: {text[sent_start:sent_end]}")

Recently, we found that ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, could be used as effective gene carriers. 15, 16 They possess good gene transfection properties. To further improve the performance of PGMA-based gene carriers, several strategies have been applied such as polysaccharide introduction and target molecule binding. 16, 17 Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field. 18, 19 The construction of supramolecular polycations via host-guest interaction is a popular strategy for high-efficiency gene delivery systems. 20 In particular, cyclodextrins (CDs) and their derivatives have been widely utilized for constructing supramolecular gene delivery systems, mainly because of their superior biocompatibility. [21] [22] [23] With the host-guest interaction strategy, we successfu

In [12]:
# use regex to extract citation sentences correctly
#
# Numeric markers ("15, 16", "[21]") appear after the full stop of the sentence
# they belong to, so the sentence tokenizer places them at the start of the
# following sentence. When nothing but the marker(s) precedes the citation in
# its tokenized sentence, the previous sentence is taken instead.
citation_sentences = []
for paragraph in paper["body_text"]:
    cite_spans = paragraph["cite_spans"]

    # ignore paragraphs without citations
    if not cite_spans:
        continue

    paragraph_text = paragraph["text"]
    print(paragraph_text)
    print()

    # tokenize the paragraph into (start, end) character spans, one per sentence
    endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))

    # for each citation marker,
    for cite_span in cite_spans:
        cite_text = cite_span["text"]
        start, end = cite_span["start"], cite_span["end"]

        # determine if citation marker is textual (contains at least one letter)
        textual = re.search('[a-zA-Z]', cite_text)

        # find the sentence containing the citation marker
        for i, (a, b) in enumerate(endpoints):
            if not (start >= a and end <= b):
                continue

            # The candidate is accepted when the marker is textual or when
            # words precede it inside the sentence ('[a-zA-Z]' covers both
            # cases, so a prefix written in capitals counts as words).
            # i == 0 means there is no previous sentence to fall back on;
            # indexing endpoints[-1] would silently pick the last sentence.
            if textual or i == 0 or re.search('[a-zA-Z]', paragraph_text[a:start]):
                citation_sentences.append((cite_text, paragraph_text[a:b]))
            else:
                # previous sentence is true citation sentence; extend it up to
                # the end of the marker so the marker stays attached
                previous_start = endpoints[i - 1][0]
                citation_sentences.append((cite_text, paragraph_text[previous_start:end]))

            # sentence spans do not overlap, so the search is over
            break

for pair in citation_sentences:
    print(f"{pair[0]}: {pair[1]}")

Recently, we found that ethanolamine (EA) or ethylenediamine (ED)-functionalized poly(glycidyl methacrylate) (PGMA), namely PGEA or PGED, could be used as effective gene carriers. 15, 16 They possess good gene transfection properties. To further improve the performance of PGMA-based gene carriers, several strategies have been applied such as polysaccharide introduction and target molecule binding. 16, 17 Owing to the dynamically unable ability of supramolecular polymers, the application of supramolecular chemistry for gene delivery has been a hot research topic in the biomedical field. 18, 19 The construction of supramolecular polycations via host-guest interaction is a popular strategy for high-efficiency gene delivery systems. 20 In particular, cyclodextrins (CDs) and their derivatives have been widely utilized for constructing supramolecular gene delivery systems, mainly because of their superior biocompatibility. [21] [22] [23] With the host-guest interaction strategy, we successfu

In [13]:
# try the above on papers with different citation marker types
paper = fullbody.loc[2]

citation_sentences = []
for paragraph in paper["body_text"]:
    cite_spans = paragraph["cite_spans"]

    # ignore paragraphs without citations
    if not cite_spans:
        continue

    paragraph_text = paragraph["text"]
    print(paragraph_text)
    print()

    # (start, end) character spans, one per sentence
    endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))

    for cite_span in cite_spans:
        cite_text = cite_span["text"]
        start, end = cite_span["start"], cite_span["end"]

        # a marker is textual when it contains at least one letter
        textual = re.search('[a-zA-Z]', cite_text)

        # reset per marker so a marker that matches no sentence can neither
        # reuse the previous marker's sentence nor hit an undefined name
        citation_sentence = None

        for i, (a, b) in enumerate(endpoints):
            if not (start >= a and end <= b):
                continue

            # '[a-zA-Z]' accepts capital letters as words too. i == 0 has no
            # previous sentence; endpoints[-1] would wrap to the last one.
            if textual or i == 0 or re.search('[a-zA-Z]', paragraph_text[a:start]):
                citation_sentence = paragraph_text[a:b]
            else:
                # nothing but markers precede the citation: the previous
                # sentence, extended up to the marker's end, is the real one
                previous_start = endpoints[i - 1][0]
                citation_sentence = paragraph_text[previous_start:end]

            # sentence spans do not overlap, so the search is over
            break

        # markers not enclosed by a single sentence are skipped
        if citation_sentence is not None:
            print(f"{cite_text}: {citation_sentence}")
    print("\n")

The generation of the two types of ommatidia, yellow and pale, includes several steps. First, the stochastic expression of the transcription factor Spineless (Ss) in a subset of R7 cells specifies yellow ommatidia. Ss is required cell autonomously in yR7 for rh4 expression and, further, cell nonautonomously for the underlying R8 cell to acquire y fate and turn on rh6 expression (Wernet et al. 2006) . The coordination between R7 and R8 rhodopsins requires a signal from pR7 that induces the pR8 fate. In sevenless mutants that lack R7, rh5 expression is lost while rh6 is expanded to almost all R8 (Papatsenko et al. 1997; Chou et al. 1999) . The y versus p choice in R8 is then reinforced by a bistable loop of regulation between the tumor suppressor gene warts (wts) and the growth regulator melted (melt) (Mikeladze-Dvali et al. 2005b) : wts is required for rh6 expression, whereas melt is essential for rh5 expression. wts and melt repress each other transcriptionally, thereby ensuring that a

In [None]:
# deploy on sample set
# extracted maps paper_id -> list of (citation marker text, citation sentence);
# papers for which no citation sentence was found are left out.
extracted = {}

s = time.time()
# Iterate over the two columns directly: one pass instead of two row lookups
# per paper, and no dependence on the frame's index labels.
papers = zip(fullbody["paper_id"], fullbody["body_text"])
for paper_id, bodytext in tqdm(papers, total=len(fullbody)):
    citation_sentences = []
    for paragraph in bodytext:
        cite_spans = paragraph["cite_spans"]

        # ignore paragraphs without citations
        if not cite_spans:
            continue

        paragraph_text = paragraph["text"]

        # (start, end) character spans, one per sentence
        endpoints = list(punkt_sent_tokenizer.span_tokenize(paragraph_text))

        for cite_span in cite_spans:
            cite_text = cite_span["text"]
            start, end = cite_span["start"], cite_span["end"]

            # a marker is textual when it contains at least one letter
            textual = re.search('[a-zA-Z]', cite_text)

            # sent_idx (not i) so the sentence index cannot be confused with
            # a paper index
            for sent_idx, (a, b) in enumerate(endpoints):
                if not (start >= a and end <= b):
                    continue

                # '[a-zA-Z]' accepts capital letters as words too. The first
                # sentence has no predecessor; endpoints[-1] would wrap to
                # the last sentence of the paragraph.
                if textual or sent_idx == 0 or re.search('[a-zA-Z]', paragraph_text[a:start]):
                    citation_sentences.append((cite_text, paragraph_text[a:b]))
                else:
                    # nothing but markers precede the citation: use the
                    # previous sentence, extended up to the marker's end
                    previous_start = endpoints[sent_idx - 1][0]
                    citation_sentences.append((cite_text, paragraph_text[previous_start:end]))

                # sentence spans do not overlap, so the search is over
                break

    if citation_sentences:
        extracted[paper_id] = citation_sentences
print(f"Time taken: {round(time.time()-s,4)}")

In [None]:
# number of papers with at least one extracted citation sentence
print(len(extracted))

# spot-check the extraction on one randomly chosen paper
sample_sentences = random.choice(list(extracted.values()))
print(sample_sentences)