In [10]:
import table_utils
import process_paper_utils as utils
import os
from bs4 import BeautifulSoup
import html
from datetime import datetime
import pandas as pd

In [None]:
# Enter the PMC ID of the paper to process here. Every cell below reads this value.
# NOTE(review): this cell is shown as not executed (In [None]) while the saved output of
# the next cell is for PMC7767363 -- re-run all cells after changing this ID.
PMC = "PMC5514907"

In [12]:
# Load the paper's XML from the local cache, fetching and caching it on a miss.
PREVIEW_CHARS = 1000  # number of leading characters echoed as a sanity check

os.makedirs("papers", exist_ok=True)
paper_path = os.path.join("papers", f"{PMC}.xml")
if os.path.isfile(paper_path):
    with open(paper_path, "r", encoding="utf-8") as f:
        paper_xml = f.read()
else:
    # "|" is replaced before caching, so cached copies are already sanitised.
    # NOTE(review): presumably because "|" is used as a field separator downstream -- confirm.
    paper_xml = utils.fetch_xml(PMC).replace("|", "/")
    with open(paper_path, "w", encoding="utf-8") as f:
        f.write(paper_xml)

# Show only the head of the document; printing the full XML floods the notebook output.
print(paper_xml[:PREVIEW_CHARS])

<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250708</date><key>collection.key</key><document><id>PMC7767363</id><infon key="license">CC BY</infon><passage><infon key="article-id_doi">10.3390/ijms21249764</infon><infon key="article-id_pmc">PMC7767363</infon><infon key="article-id_pmid">33371333</infon><infon key="article-id_publisher-id">ijms-21-09764</infon><infon key="elocation-id">9764</infon><infon key="issue">24</infon><infon key="kwd">HER2-positive cancer RNA aptamer mertansine aptamer delivery</infon><infon key="license">Licensee MDPI, Basel, Switzerland. This article is an open access article distributed under the terms and conditions of the Creative Commons Attribution (CC BY) license (http://creativecommons.org/licenses/by/4.0/).</infon><infon key="name_0">surname:Jeong;given-names:Hwa Yeon</infon><infon key="name_1">surname:Kim;given-names:Hyeri</infon><infon key="name_2">surname:Lee;given-names:Myun

In [13]:
soup = BeautifulSoup(paper_xml, "lxml-xml")


def _infon_value(passage, key):
    """Return the text of the passage's <infon key=...>, stripped and upper-cased, or None if absent."""
    infon = passage.find("infon", {"key": key})
    return infon.text.strip().upper() if infon is not None else None


def _section_passages(passages, section):
    """Return, in document order, the passages whose section_type infon equals `section`."""
    return [passage for passage in passages if _infon_value(passage, "section_type") == section]


def _passage_texts(passages):
    """Return the <text> content of each passage, skipping passages that have no <text> child."""
    text_nodes = (passage.find("text") for passage in passages)
    return [node.get_text() for node in text_nodes if node is not None]


# Walk the document once; every section below filters this same list.
all_passages = soup.find_all("passage")

#abstract
abstract_passages = _section_passages(all_passages, "ABSTRACT")
abstract_text_list = _passage_texts(abstract_passages)
abstract = "\n".join(abstract_text_list)

#methods
methods_passages = _section_passages(all_passages, "METHODS")
methods_text_list = _passage_texts(methods_passages)
methods = "\n".join(methods_text_list)

#results
results_passages = _section_passages(all_passages, "RESULTS")
# Title passages are the RESULTS passages whose "type" infon contains "TITLE"
# (case-insensitive); a passage without a "type" infon is never a title.
results_title_passages = [passage for passage in results_passages
                          if "TITLE" in (_infon_value(passage, "type") or "")]
results_text_list = _passage_texts(results_passages)
results_title_list = _passage_texts(results_title_passages)
results = "\n".join(results_text_list)

#discussion
discussion_passages = _section_passages(all_passages, "DISCUSSION")
discussion_text_list = _passage_texts(discussion_passages)
discussion = "\n".join(discussion_text_list)

In [14]:
# Build one record per experimental group returned by utils.groups_from_text and
# collect them into a DataFrame indexed by the normalised (stripped, lower-cased) group name.
# Row layout, as indexed below:
#   row[0] group, row[1] animal model, row[2] sample size,
#   row[3:6] treatment1/dose1/units1, row[6:9] treatment2/dose2/units2.
TREATMENT_FIELDS = ["treatment1", "dose1", "units1", "treatment2", "dose2", "units2"]

text_groups = []
group_names = []
for row in utils.groups_from_text(PMC, abstract, methods, results):
    if len(row) not in [4, 6, 7, 9] and len(row) < 10:
        print("ERROR: Invalid text_groups label")
        continue
    text_group = {
        "group": row[0],
        "animal_model": row[1] if row[1] else None,
        "sample_size": row[2] if row[2] else None,
    }
    if len(row) == 4:
        # Four-column rows carry no treatment information (row[3] is ignored).
        treatment_values = []
    elif len(row) == 6:
        treatment_values = row[3:6]
    else:
        # Slicing (rather than indexing row[6]..row[8]) keeps 7-column rows, which pass
        # the length check above, from raising IndexError; absent fields become None.
        treatment_values = row[3:9]
    for position, field in enumerate(TREATMENT_FIELDS):
        text_group[field] = treatment_values[position] if position < len(treatment_values) else None
    text_groups.append(text_group)
    group_names.append(row[0].strip().lower())
text_groups_df = pd.DataFrame(data=text_groups, index=group_names)

control|mouse|4||||||| 
PBS|mouse|4||||||| 
free DM1|mouse|4|DM1|60|μg/kg||| 
ApDC|mouse|4|ApDC|2.7|mg/kg||| 
DM1 low|mouse| |DM1|12|μg/kg||| 
DM1 medium|mouse| |DM1|60|μg/kg||| 
DM1 high|mouse| |DM1|300|μg/kg|||
[['control', 'mouse', '4', '', '', '', '', '', '', ''], ['PBS', 'mouse', '4', '', '', '', '', '', '', ''], ['free DM1', 'mouse', '4', 'DM1', '60', 'μg/kg', '', '', ''], ['ApDC', 'mouse', '4', 'ApDC', '2.7', 'mg/kg', '', '', ''], ['DM1 low', 'mouse', ' ', 'DM1', '12', 'μg/kg', '', '', ''], ['DM1 medium', 'mouse', ' ', 'DM1', '60', 'μg/kg', '', '', ''], ['DM1 high', 'mouse', ' ', 'DM1', '300', 'μg/kg', '', '', '']]


In [15]:
BATCH_SIZE = 5  # number of paragraphs sent to utils.label_batch per call

# Pair every non-title results paragraph with the most recent title seen before it;
# paragraphs that precede the first title fall under the default "Results".
title_texts = set(results_title_list)
paragraph_list = []
last_title = "Results"
for paragraph in results_text_list:
    if paragraph in title_texts:
        last_title = paragraph
    else:
        paragraph_list.append({"title": last_title, "text": paragraph})

# Split the paragraphs into consecutive chunks of BATCH_SIZE (the last one may be shorter).
# The slice length must track BATCH_SIZE: a hard-coded chunk length would drop or
# duplicate paragraphs as soon as the constant is changed.
batch_list = []
for index in range(0, len(paragraph_list), BATCH_SIZE):
    batch = paragraph_list[index:index + BATCH_SIZE]
    # NOTE(review): label_batch is expected to add a "label" key to each paragraph dict
    # in place, since the following cells read paragraph["label"] -- confirm in utils.
    utils.label_batch(PMC, methods, text_groups, batch)
    batch_list.append(batch)
        

In [16]:
# Print each paragraph's label followed by its text, for manual inspection.
for batch in batch_list:
    for paragraph in batch:
        # Single quotes inside the f-string: reusing the enclosing double quote is a
        # SyntaxError on Python versions before 3.12.
        print(f"Label: {paragraph['label']}")
        print("---")
        print(paragraph["text"])
        print("------------------------------------------------")

Label: False
---
It is known that RNA molecules are generally unstable and easily degraded by RNases in various environments. However, the stability of RNA can be secured by various conventional modification methods, and the RNA aptamer used in the present study (Figure 1A) was also modified at the pyrimidine by 2'-fluorinated pyrimidine, thereby achieving much higher stability than the unmodified RNA aptamer. The gel retardation analysis of aptamer molecules (Figure 1B) shows the relative stabilities of unmodified RNA aptamer and 2'-pyrimidine-modified RNA aptamer in 10% FBS-containing medium. The results of gel retardation analysis of the unmodified RNA aptamer revealed that the unmodified molecules degraded more than 50% in 1 h. On the other hand, 2'-pyrimidine-modified RNA aptamers were able to maintain their structure over 50% even 48 h later.
------------------------------------------------
Label: False
---
The 2'-pyrimidine-modified RNA aptamers utilized in this study exhibited 

In [17]:
# Run data extraction on every paragraph with a truthy label and pool the results
# into three flat lists, one per key ("num", "freq", "desc") of the returned mapping.
num_output = []
freq_output = []
desc_output = []
for batch in batch_list:
    for paragraph in batch:
        if paragraph["label"]:
            paragraph_data = utils.data_from_text(PMC, methods, text_groups_df, paragraph)
            # extend() appends in place instead of re-copying the accumulated list
            # on every paragraph.
            num_output.extend(paragraph_data["num"])
            freq_output.extend(paragraph_data["freq"])
            desc_output.extend(paragraph_data["desc"])