In [1]:
import table_utils
import process_paper_utils as utils
import os
from bs4 import BeautifulSoup
import html
from datetime import datetime
import pandas as pd

In [2]:
#Enter PMC here:
PMC = "PMC5373957"

Imports paper from PMC

In [3]:
os.makedirs("papers", exist_ok=True)
if os.path.isfile(f"papers/{PMC}.xml"):
    with open(f"papers/{PMC}.xml", 'r', encoding='utf-8') as f:
        paper_xml = f.read()
else:
    paper_xml = utils.fetch_xml(PMC)
    with open(f"papers/{PMC}.xml", "w", encoding="utf-8") as f:
        f.write(paper_xml)
print(paper_xml)

<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE collection SYSTEM "BioC.dtd"><collection><source>BioC-API</source><date>20250617</date><key>collection.key</key><document><id>5373957</id><infon key="license">author_manuscript</infon><passage><infon key="article-id_doi">10.1002/jat.3387</infon><infon key="article-id_manuscript">NIHMS817236</infon><infon key="article-id_pmc">5373957</infon><infon key="article-id_pmid">27696470</infon><infon key="fpage">530</infon><infon key="issue">5</infon><infon key="kwd">Silver nanoparticles in vivo distribution pregnant rats maternal-fetal transfer metabolomics analysis biomarkers oral gavage intravenous injection cytokines oxidative stress</infon><infon key="license">
          This file is available for text mining. It may also be used consistent with the principles of fair use under the copyright law.
        </infon><infon key="lpage">544</infon><infon key="name_0">surname:Fennell;given-names:Timothy R.</infon><infon key="name_1">surname:Mortensen

Extracts abstract and tables from paper

In [4]:
from collections import defaultdict

soup = BeautifulSoup(paper_xml, "lxml-xml")

#abstract
abstract_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "ABSTRACT"]
abstract_text_list = [p.find("text").get_text() for p in abstract_passages if p.find("text")]
abstract = "\n".join(abstract_text_list)

#methods
methods_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "METHODS"]
methods_text_list = [p.find("text").get_text() for p in methods_passages if p.find("text")]
methods = "\n".join(methods_text_list)

#results
results_passages = [p for p in soup.find_all("passage") if p.find("infon", {"key": "section_type"}) and 
        p.find("infon", {"key": "section_type"}).text.strip().upper() == "RESULTS"]
results_text_list = [p.find("text").get_text() for p in results_passages if p.find("text")]
results = "\n".join(results_text_list)

#tables
table_passages = [p for p in soup.find_all("passage") if (sect := p.find("infon", {"key": "section_type"})) and sect.text.strip() == "TABLE"]
tables = defaultdict(lambda: {"label": "o", "caption": None, "markdown": None, "footnotes": [], "col_labels": None, "group_cols": [], "biomarker_cols": []})
for p in table_passages:
    table_id = p.find("infon", {"key": "id"}).text.strip()
    passage_type = p.find("infon", {"key": "type"}).text.strip()
    if passage_type == "table_caption":
        tables[table_id]["caption"] = p.find("text").text.strip()
    elif passage_type == "table":
        table_xml = p.find("infon", {"key": "xml"}).text
        if "<table" not in table_xml:
            continue
        table = table_utils.single_html_table_to_markdown(html.unescape(table_xml))
        tables[table_id]["markdown"] = table_utils.transpose_markdown_table(table_utils.transpose_markdown_table(table))
    elif passage_type in ("table_foot", "table_footnote"):
        tables[table_id]["footnotes"].append(p.find("text").text.strip())


Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html_content, "html.parser")

Assuming this really is an XML document, what you're doing might work, but you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the Python package 'lxml' installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor.




  soup = BeautifulSoup(html, "html.parser")


Generate prompt for identifying each table

In [5]:
id_tables_prompt = utils.id_tables_prompt(abstract, tables)
print(id_tables_prompt)

# Instructions
You will be given the abstract of a research paper which includes at least one animal toxicity study then a list of all the tables included in that paper, each one labeled with its title. For each table identify it as one of three categories: treatment group table (G), biomarker table (B), or other (O). Treatment group tables should contain information on the specific treatment groups, such as the medications, the dosages, the sample size, etc. Tables that only give information on the chemicals used (such as the sourcing) should not be labeled G and should instead be labeled O. Even if the table meets these criteria, only label the table G if it describes the groups of specifically an animal toxicity study within the paper. Otherwise, label it O. Biomarker tables may contain treatment group dosage information, but the main difference between is that biomarker tables should contain biomarker data observed from the treatment groups. This can frequency data (such as surviva

API call to identify tables

In [6]:
id_tables_output = utils.chatgpt_request(id_tables_prompt, 100, 0)

time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
with open(f"responses/table_id/{time}.txt", "w", encoding="utf-8") as file:
    file.write(f"{PMC}\n---\n{id_tables_prompt}\n---\n{id_tables_output}")

ChatGPT Request Status: 200


Assigning labels to table dict

In [7]:
for label in id_tables_output.split("\n"):
    label = label.strip()
    if not label: continue

    content = label[1:-1]
    tid, label = content.split("|")
    tables[tid.strip()]["label"] = label.strip().lower()

print(tables)

defaultdict(<function <lambda> at 0x00000199B350E520>, {'T1': {'label': 'o', 'caption': 'Methods for analysis of AgNP stock standard solutions and characteristics.', 'markdown': '| Characterization Assay | Method ID | 20 nm AgNP (NIEHS-2) | 110 nm AgNP (NIEHS-4) |\n| --- | --- | --- | --- |\n| Endotoxin Quantification - Kinetic Turbidity (EU/mL) | STE-1.2 | 1.1 | <0.5 |\n| Hydrodynamic Size/Size Distribution by DLS (Z-Avg [nm]) | PCC-1 | 26 | 112.3 |\n| Size by TEM (nm) | PCC-7 | 20.5 | 111.3 |\n| Surface Charge by Zeta Potential (mV) | PCC-2 | -37.1 | -25.9 |\n| Silver concentration by ICP-MS (mg/g) | PCC-8 | 1.09 | 1.10 |', 'footnotes': ['Two populations observed; <9% were smaller than 10 nm, the remainder averaged 20 nm', 'Two populations observed; <13% were smaller than 60 nm, the remainder averaged 111 nm'], 'col_labels': None, 'group_cols': [], 'biomarker_cols': []}, 'T2': {'label': 'b', 'caption': 'Concentration of silver (mug Ag/g) in tissues following i.v. administration (1 mg

Formatting Tables and converting to df

In [8]:
#Note: 1 LLM API call per table
formatted_tables = utils.format_tables(PMC, abstract, methods, tables)

for tid, info in formatted_tables.items():
    print(f"###{tid}###")
    if info["markdown"]: print(info["markdown"])
    info["df"] = table_utils.markdown_to_dataframe(info["markdown"])

ChatGPT Request Status: 200
Tissue
0            AgAc.24 h
1            AgAc.48 h
2      20 nm AgNP.24 h
3      20 nm AgNP.48 h
4     110 nm AgNP.24 h
5    110 nm AgNP .48 h
Name: Tissue, dtype: object
Liver
0     1.03+-0.399
1    0.511+-0.534
2    0.312+-0.044
3    0.228+-0.130
4      2.28+-1.69
5      2.71+-1.63
Name: Liver, dtype: object
Blood
0     0.663+-0.195
1     0.180+-0.033
2    0.375+-0.0485
3    0.214+-0.0916
4    0.594+-0.0566
5    0.348+-0.0720
Name: Blood, dtype: object
Spleen
0     2.95+-0.927
1    0.524+-0.493
2     1.42+-0.499
3     1.14+-0.636
4      6.79+-2.16
5     3.60+-0.467
Name: Spleen, dtype: object
Lungs
0      1.38+-0.958
1      1.31+-0.906
2     0.519+-0.103
3    0.397+-0.0950
4    0.436+-0.0839
5    0.305+-0.0831
Name: Lungs, dtype: object
Heart
0       0.162+-0.0306
1    0.0584+-0.000479
2       0.127+-0.0168
3              0.0863
4       0.119+-0.0204
5       0.104+-0.0144
Name: Heart, dtype: object
Kidney
0    0.405+-0.0944
1    0.233+-0.0364
2    0.431+

IDing columns via LLM

In [9]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    info_combined = ""
    if not info["markdown"]: continue
    if info["caption"]:
        info_combined += f"Caption: {info["caption"]}\n"
    info_combined += info["markdown"] + "\n"
    if len(info["footnotes"]) > 0:
        info_combined += "Footnotes:\n"
        for fn in info["footnotes"]:
            info_combined += fn + "\n"
    prompt = utils.id_columns_prompt(info_combined)
    answer = utils.chatgpt_request(prompt, 5000, 0)
    info["col_labels"] = [[elt.strip() for elt in label.strip().lstrip('<').rstrip('>').split("|")] for label in answer.split("\n")]
    time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    with open(f"responses/column_id/{time}.txt", "w", encoding="utf-8") as file:
        file.write(f"{PMC}\n---\n{tid}\n---\n{prompt}\n---\n{answer}")

ChatGPT Request Status: 200
ChatGPT Request Status: 200
ChatGPT Request Status: 200
ChatGPT Request Status: 200
ChatGPT Request Status: 200
ChatGPT Request Status: 200


Processing column IDs

In [10]:
for tid, info in formatted_tables.items():
    if info["label"] != "b":
        continue
    info["biomarker_cols"] = []
    info["group_cols"] = []
    column_info = {}
    for label in info["col_labels"]:
        if label[1].lower() not in ["biomarker", "group", "dose", "size", "animal", "time"]: continue
        print(label[1].lower())
        if label[1].lower() == "biomarker":
            if label[3].lower() not in ["mean", "variation", "frequency", "severity"]: continue
            biomarker_label = {"column": label[0], "label": "biomarker", "type": label[3].lower(), "units": None, "name": label[2]}
            if label[3].lower() == "mean" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "variation" and len(label) > 4:
                biomarker_label["units"] = label[4]
            elif label[3].lower() == "frequency":
                if len(label) > 4 and label[4].lower() in ["percent", "percents", "percentage", "percentages", "%", "count", "counts", "decimal", "decimals"]:
                    biomarker_label["units"] = label[4]
                else:
                    print("ERROR: Invalid type for count label -> skipping")
                    continue
            info["biomarker_cols"].append(biomarker_label)
        else:
            group_label = {"column": label[0], "label": label[1].strip().lower(), "units": [], "link": None, "name": None}
            if label[1].lower() == "dose":
                for i in range(2,len(label)):
                    if label[i].strip()[:5].lower() == "name:":
                        group_label["name"] = label[i].strip()[5:].strip()
                    elif label[i].strip()[:6].lower() == "units:":
                        group_label["units"].append(label[i].strip()[6:].strip())
                    elif label[i].strip()[:5].lower() == "link:":
                        group_label["link"] = label[i].strip()[5:].strip()
            elif label[1].lower() == "animal":
                if len(label) > 2:
                    group_label["name"] = label[2]
            info["group_cols"].append(group_label)

biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
group
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
group
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarker
biomarke

In [11]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["col_labels"])
    print(f"GROUP COLUMNS:\n{info["group_cols"]}")
    print(f"BIOMARKER COLUMNS:\n{info["biomarker_cols"]}")

[['Liver.mean', 'biomarker', 'Liver', 'mean', 'mug/g'], ['Liver.SD', 'biomarker', 'Liver', 'variation', 'mug/g'], ['Blood.mean', 'biomarker', 'Blood', 'mean', 'mug/g'], ['Blood.SD', 'biomarker', 'Blood', 'variation', 'mug/g'], ['Spleen.mean', 'biomarker', 'Spleen', 'mean', 'mug/g'], ['Spleen.SD', 'biomarker', 'Spleen', 'variation', 'mug/g'], ['Lungs.mean', 'biomarker', 'Lungs', 'mean', 'mug/g'], ['Lungs.SD', 'biomarker', 'Lungs', 'variation', 'mug/g'], ['Heart.mean', 'biomarker', 'Heart', 'mean', 'mug/g'], ['Heart.SD', 'biomarker', 'Heart', 'variation', 'mug/g'], ['Kidney.mean', 'biomarker', 'Kidney', 'mean', 'mug/g'], ['Kidney.SD', 'biomarker', 'Kidney', 'variation', 'mug/g'], ['Brain.mean', 'biomarker', 'Brain', 'mean', 'mug/g'], ['Brain.SD', 'biomarker', 'Brain', 'variation', 'mug/g'], ['Skin.mean', 'biomarker', 'Skin', 'mean', 'mug/g'], ['Skin.SD', 'biomarker', 'Skin', 'variation', 'mug/g'], ['Muscle.mean', 'biomarker', 'Muscle', 'mean', 'mug/g'], ['Muscle.SD', 'biomarker', 'Muscle

Deriving Groups via Tables

In [12]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_groups = []
    group_names = []
    info["df"].reset_index(drop=True, inplace=True)
    for idx, row in info["df"].iterrows():
        group_row = {"group": None, "animal_model": None, "sample_size": None, "terminal_time": None,"treatment1": None, "dose1": None, "units1": None,"treatment2": None, "dose2": None, "units2": None}
        group_name = ""
        for label in info["group_cols"]:
            print(label)
            col_val = info["df"].loc[idx, label["column"]]
            if label["label"] == "group":
                group_name += f".{col_val}"
            elif label["label"] == "dose":
                if not group_row["dose1"]:
                    group_row["dose1"] = col_val
                    if label["name"]:
                        group_row["treatment1"] = label["name"]
                    elif label["link"]:
                        group_row["treatment1"] = info["df"].loc[idx, label["link"]]
                elif not group_row["dose2"]:
                    group_row["dose2"] = col_val
                    if label["name"]:
                        group_row["treatment2"] = label["name"]
                    elif label["link"]:
                        group_row["treatment2"] = info["df"].loc[idx, label["link"]]
                else:
                    print("ERROR: >2 treatments")
            elif label["label"] == "size":
                group_row["sample_size"] = col_val
            elif label["label"] == "animal":
                group_row["animal_model"] = col_val
            elif label["label"] == "time":
                group_row["terminal_time"] = col_val
        table_groups.append(group_row)
        group_name = group_name.lstrip(".") if group_name else f"Group {idx}"
        group_names.append(group_name.lower() if group_name else group_name)
        group_row["group"] = group_name
        print(group_names)
    info["df"].index = [name.lower().strip() for name in group_names]
    info["table_groups"] = pd.DataFrame(data=table_groups, index=group_names)
    print(tid)
    print(info["table_groups"])

# group_label = {"column": label[0], "label": label[1].strip().lower(), "type": None, "units": [], "link": None, "name": None}

['group 0']
['group 0', 'group 1']
['group 0', 'group 1', 'group 2']
['group 0', 'group 1', 'group 2', 'group 3']
['group 0', 'group 1', 'group 2', 'group 3', 'group 4']
['group 0', 'group 1', 'group 2', 'group 3', 'group 4', 'group 5']
T2
           group animal_model sample_size terminal_time treatment1 dose1  \
group 0  Group 0         None        None          None       None  None   
group 1  Group 1         None        None          None       None  None   
group 2  Group 2         None        None          None       None  None   
group 3  Group 3         None        None          None       None  None   
group 4  Group 4         None        None          None       None  None   
group 5  Group 5         None        None          None       None  None   

        units1 treatment2 dose2 units2  
group 0   None       None  None   None  
group 1   None       None  None   None  
group 2   None       None  None   None  
group 3   None       None  None   None  
group 4   None       N

Deriving Groups form Text via LLM

In [13]:
text_groups = []
group_names = []
for row in utils.groups_from_text(PMC, abstract, methods, results):
    if len(row) not in [4, 7, 10] and len(row) < 10:
        print("ERROR: Invalid text_groups label")
        continue
    text_group = {}
    text_group["group"] = row[0]
    text_group["animal_model"] = row[1] if row[1] else None
    text_group["sample_size"] = row[2] if row[2] else None
    text_group["terminal_time"] = row[3] if row[3] else None
    if len(row) == 4:
        text_group["treatment1"] = None
        text_group["dose1"] = None
        text_group["units1"] = None
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    elif len(row) == 7:
        text_group["treatment1"] = row[4]
        text_group["dose1"] = row[5]
        text_group["units1"] = row[6]
        text_group["treatment2"] = None
        text_group["dose2"] = None
        text_group["units2"] = None
    else:
        text_group["treatment1"] = row[4]
        text_group["dose1"] = row[5]
        text_group["units1"] = row[6]
        text_group["treatment2"] = row[7]
        text_group["dose2"] = row[8]
        text_group["units2"] = row[9]
    text_groups.append(text_group)
    group_names.append(row[0].strip().lower())
text_groups_df = pd.DataFrame(data=text_groups, index=group_names)
    

ChatGPT Request Status: 200
<control_iv|rat|6|24 hours||||||>  
<control_iv|rat|6|48 hours||||||>  
<control_po|rat|6|24 hours||||||>  
<control_po|rat|6|48 hours||||||>  
<AgAc_iv|rat|6|24 hours|silver acetate|1|mg/kg|||>  
<AgAc_iv|rat|6|48 hours|silver acetate|1|mg/kg|||>  
<AgAc_po|rat|6|24 hours|silver acetate|10|mg/kg|||>  
<AgAc_po|rat|6|48 hours|silver acetate|10|mg/kg|||>  
<AgNP20_iv|rat|6|24 hours|silver nanoparticles (20 nm)|1|mg/kg|||>  
<AgNP20_iv|rat|6|48 hours|silver nanoparticles (20 nm)|1|mg/kg|||>  
<AgNP20_po|rat|6|24 hours|silver nanoparticles (20 nm)|10|mg/kg|||>  
<AgNP20_po|rat|6|48 hours|silver nanoparticles (20 nm)|10|mg/kg|||>  
<AgNP110_iv|rat|6|24 hours|silver nanoparticles (110 nm)|1|mg/kg|||>  
<AgNP110_iv|rat|6|48 hours|silver nanoparticles (110 nm)|1|mg/kg|||>  
<AgNP110_po|rat|6|24 hours|silver nanoparticles (110 nm)|10|mg/kg|||>  
<AgNP110_po|rat|6|48 hours|silver nanoparticles (110 nm)|10|mg/kg|||>  
[['control_iv', 'rat', '6', '24 hours', '', '', ''

Linking table and text treatment groups

In [14]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    utils.link_groups(PMC, tid, methods, info["table_groups"], text_groups_df)
    print(table_utils.dataframe_to_markdown(info["table_groups"]))

ChatGPT Request Status: 200
| group | animal_model | sample_size | terminal_time | treatment1 | dose1 | units1 | treatment2 | dose2 | units2 |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| Group 0 | control_iv    rat
control_iv    rat
Name: animal_model, dtype: object | control_iv    6
control_iv    6
Name: sample_size, dtype: object | control_iv    24 hours
control_iv    48 hours
Name: terminal_time, dtype: object | control_iv    
control_iv    
Name: treatment1, dtype: object | control_iv    
control_iv    
Name: dose1, dtype: object | control_iv    
control_iv    
Name: units1, dtype: object | control_iv    
control_iv    
Name: treatment2, dtype: object | control_iv    
control_iv    
Name: dose2, dtype: object | control_iv    
control_iv    
Name: units2, dtype: object |
| Group 1 | control_po    rat
control_po    rat
Name: animal_model, dtype: object | control_po    6
control_po    6
Name: sample_size, dtype: object | control_po    24 hours
control_po    48 hour

Process Biomarkers

In [15]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    mean_columns = defaultdict(lambda: {"mean_col": None, "variation_col": None, "units": None})
    freq_columns = defaultdict(lambda: {"freq_col": None, "severity_col": None, "units": None})
    for label in info["biomarker_cols"]:
        if label["type"] == "mean":
            mean_columns[label["name"]]["mean_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "variation":
            mean_columns[label["name"]]["variation_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "frequency":
            freq_columns[label["name"]]["freq_col"] = label["column"]
            if label["units"]: mean_columns[label["name"]]["units"] = label["units"]
        elif label["type"] == "severity":
            freq_columns[label["name"]]["severity_col"] = label["column"]
    info["mean_cols"] = mean_columns
    info["freq_cols"] = freq_columns

In [16]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    table_df = table_utils.markdown_to_dataframe(info["markdown"])
    print(info)
    table_df.index = info["table_groups"].index
    output_mean = []
    output_freq = []
    for row_index in table_df.index:
        for col_index, col_params in info["mean_cols"].items():
            if not table_df.at[row_index, col_params["mean_col"]]: continue
            output_mean.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Terminal Time": info["table_groups"].at[row_index, "terminal_time"],
                "Biomarker": col_index,
                "Value": table_df.at[row_index, col_params["mean_col"]],
                "Units": col_params["units"],
                "Variation": table_df.at[row_index, col_params["variation_col"]] if col_params["variation_col"] else None,
            })
        for col_index, col_params in info["freq_cols"].items():
            if not table_df.at[row_index, col_params["freq_col"]] and not table_df.at[row_index, col_params["severity_col"]]: continue
            if table_df.at[row_index, col_params["freq_col"]]:
                if col_params["units"] in ["percent", "percents", "percentage", "percentages", "%"]:
                    frequency = table_df.at[row_index, col_params["frequency"]]/info["table_groups"].at[row_index, "sample_size"] * 100
                elif col_params["units"] in ["count", "counts"]:
                    frequency = table_df.at[row_index, col_params["frequency"]]/info["table_groups"].at[row_index, "sample_size"]
                else:
                    frequency = table_df.at[row_index, col_params["frequency"]]
            output_mean.append({
                "Animal Model": info["table_groups"].at[row_index, "animal_model"],
                "Sample Size": info["table_groups"].at[row_index, "sample_size"],
                "Treatment 1": info["table_groups"].at[row_index, "treatment1"],
                "Dose 1": info["table_groups"].at[row_index, "dose1"],
                "Units 1": info["table_groups"].at[row_index, "units1"],
                "Treatment 2": info["table_groups"].at[row_index, "treatment2"],
                "Dose 2": info["table_groups"].at[row_index, "dose2"],
                "Units 2": info["table_groups"].at[row_index, "units2"],
                "Terminal Time": info["table_groups"].at[row_index, "terminal_time"],
                "Biomarker": col_index,
                "Frequency": frequency,
                "Severity": table_df.at[row_index, col_params["severity_col"]],
            })
    info["output_mean"] = pd.DataFrame(output_mean)
    info["output_freq"] = pd.DataFrame(output_freq)

{'label': 'b', 'caption': 'Concentration of silver (mug Ag/g) in tissues following i.v. administration (1 mg/kg) of AgAc, 20 nm AgNP or 110 nm AgNP to pregnant rats.', 'markdown': '| Tissue | Liver.mean | Liver.SD | Blood.mean | Blood.SD | Spleen.mean | Spleen.SD | Lungs.mean | Lungs.SD | Heart.mean | Heart.SD | Kidney.mean | Kidney.SD | Brain.mean | Brain.SD | Skin.mean | Skin.SD | Muscle.mean | Muscle.SD | Adipose.mean | Adipose.SD | Bone.mean | Bone.SD | Stomach and Small Intestine.mean | Stomach and Small Intestine.SD | Cecum and Large Intestine.mean | Cecum and Large Intestine.SD | Pancreas.mean | Pancreas.SD | Placenta.mean | Placenta.SD |\n| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n| AgAc.24 h | 1.03 | 0.399 | 0.663 | 0.195 | 2.95 | 0.927 | 1.38 | 0.958 | 0.162 | 0.0306 | 0.405 | 0.0944 | 0.0358 | 0.000661 | 0.224 | 0.0488 | 0.0833 | 0

In [17]:
for tid, info in formatted_tables.items():
    if info["label"] != "b": continue
    print(info["output_mean"])
    print(info["output_freq"])

                                         Animal Model  \
0   control_iv    rat
control_iv    rat
Name: anim...   
1   control_iv    rat
control_iv    rat
Name: anim...   
2   control_iv    rat
control_iv    rat
Name: anim...   
3   control_iv    rat
control_iv    rat
Name: anim...   
4   control_iv    rat
control_iv    rat
Name: anim...   
..                                                ...   
77  agnp20_po    rat
agnp20_po    rat
Name: animal...   
78  agnp20_po    rat
agnp20_po    rat
Name: animal...   
79  agnp20_po    rat
agnp20_po    rat
Name: animal...   
80  agnp20_po    rat
agnp20_po    rat
Name: animal...   
81  agnp20_po    rat
agnp20_po    rat
Name: animal...   

                                          Sample Size  \
0   control_iv    6
control_iv    6
Name: sample_s...   
1   control_iv    6
control_iv    6
Name: sample_s...   
2   control_iv    6
control_iv    6
Name: sample_s...   
3   control_iv    6
control_iv    6
Name: sample_s...   
4   control_iv    6
control_iv