In [None]:
# fetch_spl_list
import requests
from typing import Any
import os 

def fetch_spl_list(base_uri: str, limit: int = 30, timeout: float = 30.0) -> list[dict[str, Any]]:
    """Fetch a list of Structured Product Label (SPL) entries.

    Args:
        base_uri: Root URL of the web service; ``spls.json`` is appended to it.
        limit: Value sent as the ``pagesize`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The list stored under the ``"data"`` key of the JSON response, or an
        empty list when the response has no such key.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # URLs always use "/" as separator; os.path.join would insert "\" on
    # Windows, so the endpoint is joined by hand.
    spl_url = f"{base_uri.rstrip('/')}/spls.json"
    print(spl_url)

    response = requests.get(
        url=spl_url,
        params={"pagesize": limit},
        timeout=timeout,
    )
    response.raise_for_status()

    return response.json().get("data", [])

# Smoke-test the helper against the DailyMed v2 service, asking for one entry.
spls = fetch_spl_list(
    base_uri="https://dailymed.nlm.nih.gov/dailymed/services/v2",
    limit=1,
)
print(spls)

In [None]:
# download_spl_xml
from pathlib import Path 

def download_spl_xml(
        set_id: str,
        download_url: str,
        output_dir: str,
        timeout: float = 30.0,
        ) -> Path:
    """Download the SPL XML file for a given set_id and store it on disk.

    Args:
        set_id: Identifier of the label; used both in the request URL
            (``spls/<set_id>.xml``) and as the output file name.
        download_url: Root URL of the web service.
        output_dir: Directory the file is written to; created (including
            parents) when it does not exist yet.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        ``Path`` of the written ``<set_id>.xml`` file, as the signature
        promises (previously a plain string was returned).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    # URLs always use "/" as separator; os.path.join would insert "\" on
    # Windows, so the endpoint is joined by hand.
    xml_url = f"{download_url.rstrip('/')}/spls/{set_id}.xml"
    response = requests.get(xml_url, timeout=timeout)
    response.raise_for_status()
    print(xml_url)

    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    file_path = target_dir / f"{set_id}.xml"
    # Raw bytes are written so the XML keeps whatever encoding it declares.
    file_path.write_bytes(response.content)

    return file_path

# Where the raw XML is stored and which service root it is pulled from.
output_dir = "./data/raw"
download_url = "https://dailymed.nlm.nih.gov/dailymed/services/v2"

# Use the first entry of the listing fetched earlier as the sample label.
spl_1 = spls[0]
file_path = download_spl_xml(
    set_id=spl_1['setid'],
    download_url=download_url,
    output_dir=output_dir,
)

In [None]:
from lxml import etree 
import json 
from typing import Optional
from pathlib import Path 
import os 

# Upper-cased section titles to keep; a set, so element order is irrelevant
# and membership tests are constant-time.
TARGET_SECTIONS_SET = {
    'ACTIVE INGREDIENTS',
    'ADVERSE REACTIONS',
    'CONTRAINDICATIONS',
    'DIRECTIONS',
    'INACTIVE INGREDIENTS',
    'INDICATIONS & USAGE',
    'OTHER INFORMATION',
    'PRODUCT LABEL',
    'PURPOSE',
    'USES',
    'WARNINGS',
}

def parse_drug_label(input_path: str, output_path: str) -> dict[str, str]:
    """Extract the text of selected sections from an XML label and save it as JSON.

    Every ``section`` element of the document is visited. When the first
    ``title`` found beneath it (upper-cased, surrounding whitespace stripped)
    is a member of ``TARGET_SECTIONS_SET``, the text of all ``text`` elements
    beneath that section is joined with single spaces and stored under the
    title. A later section with the same title overwrites an earlier one.

    Args:
        input_path: Path of the XML file to read.
        output_path: Path of the JSON file to write; its parent directory is
            created when missing.

    Returns:
        Mapping of matched section title to the collected section text (the
        same mapping that is written to ``output_path``).
    """
    with open(input_path, mode="rb") as xml_file:
        tree = etree.parse(xml_file)
    root = tree.getroot()

    # The default namespace is stored under the key None in nsmap. None (rather
    # than 0) marks "no namespace", matching the Optional[str] parameter of
    # find_elements_by_tag.
    namespace = root.nsmap.get(None)
    if namespace is None:
        print("No default namespace")

    # Clark notation "{uri}tag" when a namespace exists, bare tag otherwise.
    # Both variants search all descendants (".//") so that documents with and
    # without a default namespace are treated the same way; previously the
    # namespace-less variant only looked at direct children.
    prefix = f"{{{namespace}}}" if namespace else ""
    title_path = f".//{prefix}title"
    text_path = f".//{prefix}text"

    output: dict[str, str] = {}

    for section in find_elements_by_tag(element=root, tag="section", namespace=namespace):
        title_element = section.find(title_path)
        if title_element is None:
            continue

        # itertext() also yields text of nested markup inside the title.
        title = " ".join(title_element.itertext()).strip().upper()
        if title not in TARGET_SECTIONS_SET:
            continue

        text_elements = section.findall(text_path)
        if not text_elements:
            print("findall returned and empty list")

        output[title] = " ".join(
            " ".join(text_element.itertext()).strip()
            for text_element in text_elements
        )

    # exist_ok avoids the check-then-create race of a separate exists() test.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(obj=output, fp=f)

    return output


def find_elements_by_tag(element, tag: str, namespace: Optional[str]) -> "list[etree._Element]":
    """Return every descendant of ``element`` whose tag is ``tag``.

    Args:
        element: Element to search beneath (the element itself is not matched).
        tag: Local tag name to look for.
        namespace: Namespace URI the tag lives in, or a falsy value (``None``,
            ``""``, ``0``) when the document has no namespace.

    Returns:
        List of matching elements in document order; empty when nothing matches.
    """
    if namespace:
        # Clark notation: "{namespace-uri}localname". The requested tag is
        # used here; it was previously hard-coded to "section".
        return element.findall(f".//{{{namespace}}}{tag}")
    return element.findall(f".//{tag}")
    
# Parse one previously downloaded label and write the result to ./testing.json.
input_path = '/home/spunion/Documents/homework/ds-toolkit/data/raw/1c82929f-934b-4df4-b24f-1e5da090fd8a.xml'
output = parse_drug_label(
    input_path=input_path,
    output_path="./testing.json",
)


In [None]:
from lxml import etree 
xml_path = '/home/spunion/Documents/homework/ds-toolkit/data/raw/20538bc0-10c3-492c-952a-6797e46003f2.xml'

# Parse straight from the path and keep both the tree and its root around.
tree = etree.parse(xml_path)
root = tree.getroot()

# The default namespace is stored under the key None; show what type it has.
type(root.nsmap.get(None))

In [None]:
import pprint
# Default (un-prefixed) namespace URI of the document root, if there is one.
namespace = root.nsmap.get(None)
if namespace:
    # Clark notation "{uri}tag". The tag is written directly into the f-string:
    # nesting double quotes inside a double-quoted f-string only parses on
    # Python 3.12+ and is a SyntaxError on older interpreters.
    pprint.pprint(root.findall(f".//{{{namespace}}}section"))

In [None]:

import json

# Scratch check that json.dump writes a plain dict to disk as expected.
sample_dict = {'sample_key': 'sample_value'}

with open("./trying-out-json-dumps.json", "w") as f:
    json.dump(obj=sample_dict, fp=f)

In [None]:
from pathlib import Path 
# Scratch check: .parent yields the directory part of an output file path.
output_path = Path(
    "./data/processed/2025-05-06_10:11:50/1c82929f-934b-4df4-b24f-1e5da090fd8a.json"
)
output_path.parent

In [None]:
from toolkit.ingestion.clean import get_namespace, create_etree, gather_titles, parse_drug_label, find_elements_by_tag

# tree = create_etree(input_path = '/home/spunion/Documents/homework/ds-toolkit/data/raw/20538bc0-10c3-492c-952a-6797e46003f2.xml')
# titles = gather_titles(tree = tree)
tree = create_etree(
    input_path='/home/spunion/Documents/homework/ds-toolkit/data/raw/20538bc0-10c3-492c-952a-6797e46003f2.xml',
)

# Packaged variant of the parser: takes the pre-built tree plus both paths.
parse_drug_label(
    tree=tree,
    input_path='/home/spunion/Documents/homework/ds-toolkit/data/raw/20538bc0-10c3-492c-952a-6797e46003f2.xml',
    output_path='/home/spunion/Documents/homework/ds-toolkit/data/processed/20538bc0-10c3-492c-952a-6797e46003f2.json',
)

# Spacy 


In [None]:
import json
import spacy 
from pathlib import Path 
from typing import Dict 

def extract_entities_from_dict(input_dict: Dict[str, str], nlp=None) -> Dict[str, list[Dict[str, str]]]:
    """Run named-entity recognition over every non-empty section text.

    Args:
        input_dict: Mapping of section name to section text.
        nlp: Optional callable that turns a string into an object exposing
            ``.ents``, whose items have ``.text`` and ``.label_``. When omitted
            the ``en_core_web_sm`` spaCy pipeline is loaded; pass an already
            loaded pipeline to avoid reloading the model on every call.

    Returns:
        Mapping of section name to a list of ``{'text': ..., 'label': ...}``
        dicts, one per recognised entity. Sections whose text is empty or
        otherwise falsy are left out of the result.
    """
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")

    entities = {}

    for section, text in input_dict.items():
        if not text:
            continue

        # Collapse runs of whitespace (including newlines) to single spaces
        # before handing the text to the pipeline.
        doc = nlp(" ".join(text.split()))
        entities[section] = [
            {"text": ent.text, "label": ent.label_} for ent in doc.ents
        ]

    return entities

In [59]:
import json 

input_file_path = '/home/spunion/Documents/homework/ds-toolkit/data/processed/2025-05-06_13:06:12/1c82929f-934b-4df4-b24f-1e5da090fd8a.json'

# Load the previously parsed sections; the file is read in binary mode.
with open(input_file_path, 'rb') as f:
    json_file_contents = json.load(f)


In [60]:
# Extract entities per section; the bare name on the last line displays them.
entities = extract_entities_from_dict(input_dict=json_file_contents)
entities


{'CONTRAINDICATIONS': [{'text': 'QT', 'label': 'ORG'},
  {'text': 'Drug Interaction Studies', 'label': 'ORG'},
  {'text': 'PRECAUTIONS', 'label': 'ORG'}],
 'ADVERSE REACTIONS': [{'text': 'the United States', 'label': 'GPE'},
  {'text': '448', 'label': 'CARDINAL'},
  {'text': '150', 'label': 'CARDINAL'},
  {'text': '26%', 'label': 'PERCENT'},
  {'text': '422', 'label': 'CARDINAL'},
  {'text': '16%', 'label': 'PERCENT'},
  {'text': '150', 'label': 'CARDINAL'},
  {'text': '13%', 'label': 'PERCENT'},
  {'text': '7%', 'label': 'PERCENT'},
  {'text': '6%', 'label': 'PERCENT'},
  {'text': 'greater than', 'label': 'PERCENT'},
  {'text': '1%', 'label': 'PERCENT'},
  {'text': '3%', 'label': 'PERCENT'},
  {'text': '1%', 'label': 'PERCENT'},
  {'text': '1%', 'label': 'PERCENT'},
  {'text': '1%', 'label': 'PERCENT'},
  {'text': 'Sixteen percent', 'label': 'PERCENT'},
  {'text': '4000', 'label': 'CARDINAL'},
  {'text': '7 days', 'label': 'DATE'},
  {'text': '1.5%', 'label': 'PERCENT'},
  {'text': '1

In [70]:
# Scratch check: inspect the type of an entity label produced by the pipeline.
nlp = spacy.load("en_core_web_sm")
doc = nlp("string input this gregory we know drug is percent 89% ")

entity = doc.ents
type(entity[0].label_)

str