In [1]:
import table_utils
import process_paper_utils as utils
import os
from bs4 import BeautifulSoup
import html
from datetime import datetime
import pandas as pd

In [2]:
#Enter PMC here:
PMC = "PMC9322224"

Imports paper from PMC

In [3]:
os.makedirs("papers", exist_ok=True)
if os.path.isfile(f"papers/{PMC}.xml"):
    with open(f"papers/{PMC}.xml", 'r', encoding='utf-8') as f:
        paper_xml = f.read()
else:
    paper_xml = utils.fetch_xml(PMC)
    with open(f"papers/{PMC}.xml", "w", encoding="utf-8") as f:
        f.write(paper_xml)
print(paper_xml)

<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250617</date><key>collection.key</key><document><id>PMC9322224</id><infon key="license">CC BY-NC-ND</infon><passage><infon key="article-id_doi">10.1016/j.jfda.2017.11.001</infon><infon key="article-id_pmc">PMC9322224</infon><infon key="article-id_pmid">29567262</infon><infon key="article-id_publisher-id">jfda-26-02-903</infon><infon key="fpage">903</infon><infon key="issue">2</infon><infon key="kwd">Aspartame Memory Folate deficient rat model Oxidative stress Free radical</infon><infon key="license">This is an open access article under the CC-BY-NC-ND license (http://creativecommons.org/licenses/by-nc-nd/4.0/).</infon><infon key="lpage">916</infon><infon key="name_0">surname:Iyaswamy;given-names:Ashok</infon><infon key="name_1">surname:Kammella;given-names:Ananth Kumar</infon><infon key="name_2">surname:Thavasimuthu;given-names:Citarasu</infon><infon key="name_3">su

Extracts abstract and tables from paper

In [4]:
from collections import defaultdict

soup = BeautifulSoup(paper_xml, "lxml-xml")

#abstract
abstract_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "ABSTRACT"]
abstract_text_list = [p.find("text").get_text() for p in abstract_passages if p.find("text")]
abstract = "\n".join(abstract_text_list)

#methods
methods_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "METHODS"]
methods_text_list = [p.find("text").get_text() for p in methods_passages if p.find("text")]
methods = "\n".join(methods_text_list)

#results
results_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "RESULTS"]
results_text_list = [p.find("text").get_text() for p in results_passages if p.find("text")]
results = "\n".join(results_text_list)

#tables
table_passages = [p for p in soup.find_all("passage") if (sect := p.find("infon", {"key": "section_type"})) and sect.text.strip() == "TABLE"]
tables = defaultdict(lambda: {"label": "o", "caption": None, "markdown": None, "footnotes": [], "col_labels": None, "group_cols": [], "biomarker_cols": []})
for p in table_passages:
    table_id = p.find("infon", {"key": "id"}).text.strip()
    passage_type = p.find("infon", {"key": "type"}).text.strip()
    if passage_type == "table_caption":
        tables[table_id]["caption"] = p.find("text").text.strip()
    elif passage_type == "table":
        table_xml = p.find("infon", {"key": "xml"}).text
        if "<table" not in table_xml:
            continue
        table = table_utils.single_html_table_to_markdown(html.unescape(table_xml))
        tables[table_id]["markdown"] = table_utils.transpose_markdown_table(table_utils.transpose_markdown_table(table))
    elif passage_type in ("table_foot", "table_footnote"):
        tables[table_id]["footnotes"].append(p.find("text").text.strip())


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, "html.parser")

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "html.parser")


Generate prompt for identifying each table

In [5]:
id_tables_prompt = utils.id_tables_prompt(abstract, tables)
print(id_tables_prompt)

# Instructions
You will be given the abstract of a research paper which includes at least one animal toxicity study then a list of all the tables included in that paper, each one labeled with its title. For each table identify it as one of three categories: treatment group table (G), biomarker table (B), or other (O). Treatment group tables should contain information on the specific treatment groups, such as the medications, the dosages, the sample size, etc. Tables that only give information on the chemicals used (such as the sourcing) should not be labeled G and should instead be labeled O. Even if the table meets these criteria, only label the table G if it describes the groups of specifically an animal toxicity study within the paper. Otherwise, label it O. Biomarker tables may contain treatment group dosage information, but the main difference between is that biomarker tables should contain biomarker data observed from the treatment groups. This can frequency data (such as surviva

API call to identify tables

In [6]:
id_tables_output = utils.chatgpt_request(id_tables_prompt, 100, 0)

time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
with open(f"responses/table_id/{time}.txt", "w", encoding="utf-8") as file:
    file.write(f"{PMC}\n---\n{id_tables_prompt}\n---\n{id_tables_output}")

https://pharmacoinfo-openai-5.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21
ChatGPT Request Status: 200


Assigning labels to table dict

In [7]:
for label in id_tables_output.split("\n"):
    label = label.strip()
    if not label: continue

    content = label[1:-1]
    tid, label = content.split("|")
    tables[tid.strip()]["label"] = label.strip().lower()

print(tables)

defaultdict(<function <lambda> at 0x000001F826675440>, {'t1-jfda-26-02-903': {'label': 'b', 'caption': 'Effect of Aspartame (40 mg/kg b.wt) on Na+/K+ ATPase, Ca+ ATPase and Mg2+ ATPase (mumoles of phosphorous liberated/min/mg protein) in brain regions.', 'markdown': '| Parameter | Control | MTX treated | Asp + MTX treated |\n| --- | --- | --- | --- |\n| Cerebralcortex | Cerebralcortex | Cerebralcortex | Cerebralcortex |\n| Na/KATPase | 0.73 +- 0.04 | 0.71 +- 0.04 | 0.29 +- 0.04*# |\n| CaATPase | 0.44 +- 0.04 | 0.45 +- 0.04 | 0.28 +- 0.03*# |\n| Mg | 0.70 +- 0.03 | 0.69 +- 0.02 | 0.38 +- 0.02*# |\n| Cerebellum | Cerebellum | Cerebellum | Cerebellum |\n| Na/KATPase | 0.53 +- 0.04 | 0.52 +- 0.03 | 0.35 +- 0.03*# |\n| CaATPase | 0.34 +- 0.04 | 0.34 +- 0.03 | 0.20 +- 0.04*# |\n| Mg | 0.49 +- 0.03 | 0.51 +- 0.03 | 0.22 +- 0.04*# |\n| Midbrain | Midbrain | Midbrain | Midbrain |\n| Na/KATPase | 0.70 +- 0.05 | 0.69 +- 0.03 | 0.28 +- 0.03*# |\n| CaATPase | 0.45 +- 0.04 | 0.47 +- 0.04 | 0.25 +- 0

Formatting Tables and converting to df

In [8]:
#Note: 1 LLM API call per table
formatted_tables = utils.format_tables(PMC, abstract, methods, tables)

for tid, info in formatted_tables.items():
    print(f"###{tid}###")
    if info["markdown"]: print(info["markdown"])
    info["df"] = table_utils.markdown_to_dataframe(info["markdown"])

['Cerebralcortex', 'Cerebralcortex', 'Cerebralcortex', 'Cerebellum', 'Cerebellum', 'Cerebellum', 'Midbrain', 'Midbrain', 'Midbrain', 'Pons medulla', 'Pons medulla', 'Pons medulla', 'Hippocampus', 'Hippocampus', 'Hippocampus', 'Hypothalamus', 'Hypothalamus', 'Hypothalamus']
https://pharmacoinfo-openai-5.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21
ChatGPT Request Status: 200
###t1-jfda-26-02-903###
| Header.Parameter | Cerebralcortex.Na/KATPase.mean | Cerebralcortex.Na/KATPase.SD | Cerebralcortex.CaATPase.mean | Cerebralcortex.CaATPase.SD | Cerebralcortex.Mg.mean | Cerebralcortex.Mg.SD | Cerebellum.Na/KATPase.mean | Cerebellum.Na/KATPase.SD | Cerebellum.CaATPase.mean | Cerebellum.CaATPase.SD | Cerebellum.Mg.mean | Cerebellum.Mg.SD | Midbrain.Na/KATPase.mean | Midbrain.Na/KATPase.SD | Midbrain.CaATPase.mean | Midbrain.CaATPase.SD | Midbrain.Mg.mean | Midbrain.Mg.SD | Pons medulla.Na/KATPase.mean | Pons medulla.Na/KATPase.SD | Pons medulla.CaATPase.me

IDing columns via LLM

In [9]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    info_combined = ""
    if not info["markdown"]: continue
    if info["caption"]:
        info_combined += f"Caption: {info["caption"]}\n"
    info_combined += info["markdown"] + "\n"
    if len(info["footnotes"]) > 0:
        info_combined += "Footnotes:\n"
        for fn in info["footnotes"]:
            info_combined += fn + "\n"
    prompt = utils.id_columns_prompt(info_combined)
    answer = utils.chatgpt_request(prompt, 5000, 0)
    info["col_labels"] = [[elt.strip() for elt in label.strip().lstrip('<').rstrip('>').split("|")] for label in answer.split("\n")]
    time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"responses/column_id/{time}.txt", "w", encoding="utf-8") as file:
        file.write(f"{PMC}\n---\n{tid}\n---\n{prompt}\n---\n{answer}")

https://pharmacoinfo-openai-5.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21
ChatGPT Request Status: 200


Processing column IDs

In [10]:
for tid, info in formatted_tables.items():
    if info["label"] != "b":
        continue
    info["biomarker_cols"] = []
    info["group_cols"] = []
    column_info = {}
    for label in info["col_labels"]:
        if label[1].lower() not in ["biomarker", "group", "dose", "size", "animal", "time"]: continue
        print(label[1].lower())
        if label[1].lower() == "biomarker":
            if label[3].lower() not in ["mean", "variation", "frequency", "severity"]: continue
            biomarker_label = {"column": label[0], "label": "biomarker", "type": label[3].lower(), "units": None, "name": label[2]}
            if label[3].lower() == "mean" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "variation" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "frequency":
                if len(label) > 4 and label[4].lower() in ["percent", "percents", "percentage", "percentages", "%", "count", "counts", "decimal", "decimals"]:
                    biomarker_label["units"] = label[4]
                else:
                    print("ERROR: Invalid type for count label -> skipping")
                    continue
            info["biomarker_cols"].append(biomarker_label)
        else:
            group_label = {"column": label[0], "label": label[1].strip().lower(), "units": [], "link": None, "name": None}
            if label[1].lower() == "dose":
                for i in range(2,len(label)):
                    if label[i].strip()[:5].lower() == "name:":
                        group_label["name"] = label[i].strip()[5:].strip()
                    elif label[i].strip()[:6].lower() == "units:":
                        group_label["units"].append(label[i].strip()[6:].strip())
                    elif label[i].strip()[:5].lower() == "link:":
                        group_label["link"] = label[i].strip()[5:].strip()
            elif label[1].lower() == "animal":
                if len(label) > 2:
                    group_label["name"] = label[2]
            info["group_cols"].append(group_label)

group
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker


In [11]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["col_labels"])
    print(f"GROUP COLUMNS:\n{info["group_cols"]}")
    print(f"BIOMARKER COLUMNS:\n{info["biomarker_cols"]}")

[['Header.Parameter', 'group'], ['Cerebralcortex.Na/KATPase.mean', 'biomarker', 'Cerebral cortex Na+/K+ ATPase', 'mean', 'mumoles/min/mg protein'], ['Cerebralcortex.Na/KATPase.SD', 'biomarker', 'Cerebral cortex Na+/K+ ATPase', 'variation', 'mumoles/min/mg protein'], ['Cerebralcortex.CaATPase.mean', 'biomarker', 'Cerebral cortex Ca+ ATPase', 'mean', 'mumoles/min/mg protein'], ['Cerebralcortex.CaATPase.SD', 'biomarker', 'Cerebral cortex Ca+ ATPase', 'variation', 'mumoles/min/mg protein'], ['Cerebralcortex.Mg.mean', 'biomarker', 'Cerebral cortex Mg2+ ATPase', 'mean', 'mumoles/min/mg protein'], ['Cerebralcortex.Mg.SD', 'biomarker', 'Cerebral cortex Mg2+ ATPase', 'variation', 'mumoles/min/mg protein'], ['Cerebellum.Na/KATPase.mean', 'biomarker', 'Cerebellum Na+/K+ ATPase', 'mean', 'mumoles/min/mg protein'], ['Cerebellum.Na/KATPase.SD', 'biomarker', 'Cerebellum Na+/K+ ATPase', 'variation', 'mumoles/min/mg protein'], ['Cerebellum.CaATPase.mean', 'biomarker', 'Cerebellum Ca+ ATPase', 'mean', '

Deriving Groups via Tables

In [12]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_groups = []
    group_names = []
    info["df"].reset_index(drop=True, inplace=True)
    for idx, row in info["df"].iterrows():
        group_row = {"group": None, "animal_model": None, "sample_size": None, "terminal_time": None,"treatment1": None, "dose1": None, "units1": None,"treatment2": None, "dose2": None, "units2": None}
        group_name = ""
        for label in info["group_cols"]:
            print(label)
            col_val = info["df"].loc[idx, label["column"]]
            if label["label"] == "group":
                group_name += f".{col_val}"
            elif label["label"] == "dose":
                if not group_row["dose1"]:
                    group_row["dose1"] = col_val
                    if label["name"]:
                        group_row["treatment1"] = label["name"]
                    elif label["link"]:
                        group_row["treatment1"] = info["df"].loc[idx, label["link"]]
                elif not group_row["dose2"]:
                    group_row["dose2"] = col_val
                    if label["name"]:
                        group_row["treatment2"] = label["name"]
                    elif label["link"]:
                        group_row["treatment2"] = info["df"].loc[idx, label["link"]]
                else:
                    print("ERROR: >2 treatments")
            elif label["label"] == "size":
                group_row["sample_size"] = col_val
            elif label["label"] == "animal":
                group_row["animal_model"] = col_val
            elif label["label"] == "time":
                group_row["terminal_time"] = col_val
        table_groups.append(group_row)
        group_name = group_name.lstrip(".") if group_name else f"Group {idx}"
        group_names.append(group_name.lower() if group_name else group_name)
        group_row["group"] = group_name
        print(group_names)
    info["df"].index = [name.lower().strip() for name in group_names]
    info["table_groups"] = pd.DataFrame(data=table_groups, index=group_names)
    print(info["table_groups"])

# group_label = {"column": label[0], "label": label[1].strip().lower(), "type": None, "units": [], "link": None, "name": None}

{'column': 'Header.Parameter', 'label': 'group', 'units': [], 'link': None, 'name': None}
['control']
{'column': 'Header.Parameter', 'label': 'group', 'units': [], 'link': None, 'name': None}
['control', 'mtx treated']
{'column': 'Header.Parameter', 'label': 'group', 'units': [], 'link': None, 'name': None}
['control', 'mtx treated', 'asp + mtx treated']
                               group animal_model sample_size terminal_time  \
control                      Control         None        None          None   
mtx treated              MTX treated         None        None          None   
asp + mtx treated  Asp + MTX treated         None        None          None   

                  treatment1 dose1 units1 treatment2 dose2 units2  
control                 None  None   None       None  None   None  
mtx treated             None  None   None       None  None   None  
asp + mtx treated       None  None   None       None  None   None  


Deriving Groups form Text via LLM

In [13]:
text_groups = []
group_names = []
for row in utils.groups_from_text(PMC, abstract, methods, results):
    if len(row) not in [4, 7, 10] and len(row) < 10:
        print("ERROR: Invalid text_groups label")
        continue
    text_group = {}
    text_group["group"] = row[0]
    text_group["animal_model"] = row[1] if row[1] else None
    text_group["sample_size"] = row[2] if row[2] else None
    text_group["terminal_time"] = row[3] if row[3] else None
    if len(row) == 4:
        text_group["treatment1"] = None
        text_group["dose1"] = None
        text_group["units1"] = None
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    elif len(row) == 7:
        text_group["treatment1"] = row[4]
        text_group["dose1"] = row[5]
        text_group["units1"] = row[6]
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    else:
        text_group["treatment1"] = row[4]
        text_group["dose1"] = row[5]
        text_group["units1"] = row[6]
        text_group["treatment2"] = row[7]
        text_group["dose2"] = row[8]
        text_group["units2"] = row[9]
    text_groups.append(text_group)
    group_names.append(row[0].strip().lower())
text_groups_df = pd.DataFrame(data=text_groups, index=group_names)
    

https://pharmacoinfo-openai-5.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21
ChatGPT Request Status: 200
<control|rat|6|90 days||||||>  
<MTXcontrol|rat|6|90 days|methotrexate|0.2|mg/kg||||>  
<MTXASP|rat|6|90 days|methotrexate|0.2|mg/kg|aspartame|40|mg/kg>  
[['control', 'rat', '6', '90 days', '', '', '', '', '', ''], ['MTXcontrol', 'rat', '6', '90 days', 'methotrexate', '0.2', 'mg/kg', '', '', '', ''], ['MTXASP', 'rat', '6', '90 days', 'methotrexate', '0.2', 'mg/kg', 'aspartame', '40', 'mg/kg']]


Linking table and text treatment groups

In [None]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    utils.link_groups(PMC, tid, methods, info["table_groups"], text_groups_df)
    print(table_utils.dataframe_to_markdown(info["table_groups"]))

https://pharmacoinfo-openai-5.openai.azure.com/openai/deployments/gpt-4o/chat/completions?api-version=2024-10-21
ChatGPT Request Status: 200


Process Biomarkers

In [None]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    mean_columns = defaultdict(lambda: {"mean_col": None, "variance_col": None, "units": None})
    freq_columns = defaultdict(lambda: {"freq_col": None, "severity_col": None, "units": None})
    for label in info["biomarker_cols"]:
        if label["type"] == "mean":
            mean_columns[label["name"]]["mean_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "variance" or label["type"] == "variation":
            mean_columns[label["name"]]["variance_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "frequency":
            freq_columns[label["name"]]["freq_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "severity":
            freq_columns[label["name"]]["severity_col"] = label["column"]
    info["mean_cols"] = mean_columns
    info["freq_cols"] = freq_columns

In [29]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_df = table_utils.markdown_to_dataframe(info["markdown"])
    table_df.index = info["table_groups"].index
    output_mean = []
    output_freq = []
    for row_index in table_df.index:
        for col_index, col_params in info["mean_cols"].items():
            if not table_df.at[row_index, col_params["mean_col"]]: continue
            output_mean.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Terminal Time": info["table_groups"].at[row_index, "terminal_time"],
                "Biomarker": col_index,
                "Value": table_df.at[row_index, col_params["mean_col"]],
                "Units": col_params["units"],
                "Variance": table_df.at[row_index, col_params["variance_col"]] if col_params["variance_col"] else None,
            })
        for col_index, col_params in info["freq_cols"].items():
            if not table_df.at[row_index, col_params["freq_col"]] and not table_df.at[row_index, col_params["severity_col"]]: continue
            if table_df.at[row_index, col_params["freq_col"]]:
                if col_params["units"] in ["percent", "percents", "percentage", "percentages", "%"]:
                    frequency = table_df.at[row_index, col_params["frequency"]]/info["table_groups"].at[row_index, "sample_size"] * 100
                elif col_params["units"] in ["count", "counts"]:
                    frequency = table_df.at[row_index, col_params["frequency"]]/info["table_groups"].at[row_index, "sample_size"]
                else:
                    frequency = table_df.at[row_index, col_params["frequency"]]
            output_mean.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Terminal Time": info["table_groups"].at[row_index, "terminal_time"],
                "Biomarker": col_index,
                "Frequency": frequency,
                "Severity": table_df.at[row_index, col_params["variance_col"]],
            })
    info["output_mean"] = pd.DataFrame(output_mean)
    info["output_freq"] = pd.DataFrame(output_freq)

In [30]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["output_mean"])
    print(info["output_freq"])

   Animal Model Sample Size   Treatment 1 Dose 1 Units 1 Treatment 2 Dose 2  \
0           rat           6                                                   
1           rat           6                                                   
2           rat           6                                                   
3           rat           6                                                   
4           rat           6                                                   
5           rat           6                                                   
6           rat           6                                                   
7           rat           6                                                   
8           rat           6                                                   
9           rat           6                                                   
10          rat           6                                                   
11          rat           6                         