In [6]:
import requests
import json

# Define the API endpoint and query parameters
API_URL = "https://api.fda.gov/drug/label.json"
PARAMS = {
    'limit': 50  # Limit the number of results for testing
}

# Function to fetch drug labels from the openFDA API
def fetch_drug_labels(api_url, params):
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# Function to extract interactions and contraindications
def extract_ddis_and_contraindications(drug_label_data):
    interactions = []
    contraindications = []

    for result in drug_label_data.get('results', []):
        drug_name = result.get('openfda', {}).get('brand_name', 'Unknown Drug')
        print(f"Extracting information for {drug_name}...\n")

        # Print the raw JSON for each drug to see what fields are available
        # print(json.dumps(result, indent=4))  # Pretty-print the raw data to inspect its structure

        # Extract the drug interactions section
        drug_interactions_section = result.get('drug_interactions', [])
        if drug_interactions_section:
            for interaction in drug_interactions_section:
                interaction_text = interaction
                if interaction_text:
                    interactions.append({
                        'drug': drug_name,
                        'interaction': interaction_text
                    })
        else:
            print(f"No interactions found for {drug_name}")

        # Extract the contraindications section
        contraindications_section = result.get('contraindications', [])
        if contraindications_section:
            for contraindication in contraindications_section:
                contraindication_text = contraindication
                if contraindication_text:
                    contraindications.append({
                        'drug': drug_name,
                        'contraindication': contraindication_text
                    })
        else:
            print(f"No contraindications found for {drug_name}")

    return interactions, contraindications

# Function to display the results
def display_results(interactions, contraindications):
    print("\nDrug-Drug Interactions:")
    for interaction in interactions:
        print(f"Drug: {interaction['drug']}, Interaction: {interaction['interaction']}")

    print("\nContraindications:")
    for contraindication in contraindications:
        print(f"Drug: {contraindication['drug']}, Contraindication: {contraindication['contraindication']}")

# Main script to fetch, extract, and display the information
def main():
    # Fetch drug labels
    drug_label_data = fetch_drug_labels(API_URL, PARAMS)

    if drug_label_data:
        # Extract DDI and contraindication information
        interactions, contraindications = extract_ddis_and_contraindications(drug_label_data)

        # Display the results
        display_results(interactions, contraindications)

if __name__ == "__main__":
    main()



Extracting information for ['SILICEA']...

No interactions found for ['SILICEA']
No contraindications found for ['SILICEA']
Extracting information for Unknown Drug...

No interactions found for Unknown Drug
No contraindications found for Unknown Drug
Extracting information for ['Betadine']...

No interactions found for ['Betadine']
No contraindications found for ['Betadine']
Extracting information for Unknown Drug...

No interactions found for Unknown Drug
No contraindications found for Unknown Drug
Extracting information for Unknown Drug...

Extracting information for ['Naproxen']...

Extracting information for Unknown Drug...

No interactions found for Unknown Drug
No contraindications found for Unknown Drug
Extracting information for ['Moisturizing Antibacterial']...

No interactions found for ['Moisturizing Antibacterial']
No contraindications found for ['Moisturizing Antibacterial']
Extracting information for Unknown Drug...

No interactions found for Unknown Drug
No contraindicat

In [4]:
# !pip install --upgrade scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz
# !pip install request
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
# !pip install safetensors
# !pip install bitsandbytes
# !pip install --upgrade numpy
# !pip install --upgrade spacy

Collecting spacy
  Using cached spacy-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Using cached thinc-8.3.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<1.4.0,>=1.3.0 (from thinc<8.4.0,>=8.3.4->spacy)
  Using cached blis-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4 (from spacy)
  Downloading pydantic-2.11.2-py3-none-any.whl.metadata (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.7/64.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic-core==2.33.1 (from pydantic!=1.8,!=1.8.1,<3.0.0,>=1.7.4->spacy)
  Downloading pydantic_core-2.33.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Using cached spacy-3.8.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30.6 MB)
Using cached thinc-8.3.6-cp311-

In [8]:
import requests
import json
import scispacy
import spacy
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch
import json


# Define the API endpoint and query parameters
API_URL = "https://api.fda.gov/drug/label.json"
PARAMS = {
    'limit': 50  # Limit the number of results for testing
}


# Load LLM (Falcon-1B is good for Colab)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")
# model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")

def build_prompt(drug_name, section_type, section_text):
    return f"""
Given the following text, extract the {section_type} in one clear sentence. Do not repeat the input.

Text: "{section_text}"
"""

def extract_with_llm(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=150)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("🧾 LLM Output:\n", decoded)
    return decoded


# Function to fetch drug labels from the openFDA API
def fetch_drug_labels(api_url, params):
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace
    text = text.replace('•', '\n•')   # reformat bullets for clarity
    return text.strip()

nlp = spacy.load("en_core_sci_sm")

def extract_drug_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]

interaction_phrases = [
    r'(?P<drug1>[A-Z][a-zA-Z]{2,})\b.*?\b(?:and|with)\b.*?\b(?P<drug2>[A-Z][a-zA-Z]{2,})\b.*?\b(?:increase|reduce|inhibit|enhance)\b.*?(?P<effect>[^\.]+)\.',
    r'concomitant use of (?P<drug1>[A-Z][a-zA-Z\s]{2,}?) and (?P<drug2>[A-Z][a-zA-Z\s]{2,}?).*?\b(?:may|can|will)\b (?P<effect>[^\.]+)\.',
]


def extract_interactions(text):
    results = []
    for pattern in interaction_phrases:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            results.append(match.groupdict())
    return results


def resolve_drug_name(entry):
    openfda = entry.get('openfda', {})
    fallback_name = entry.get('id') or entry.get('set_id') or "Unknown Drug"

    # First: Try to use known, useful fields from openFDA
    for field in ['brand_name', 'generic_name', 'substance_name']:
        names = openfda.get(field)
        if names:
            return names[0] if isinstance(names, list) else names

    # Second: Try parsing from 'id' or filename if it has a meaningful string
    if isinstance(fallback_name, str):
        fallback_name_clean = re.sub(r'[^a-zA-Z0-9 ]', '', fallback_name)
        if len(fallback_name_clean.split()) > 1:
            return fallback_name_clean

    # Third: Try grabbing the drug name from structured label sections
    for section in ['description', 'indications_and_usage']:
        texts = entry.get(section, [])
        if texts:
            # Look for a "This product contains <DrugName>..." style pattern
            match = re.search(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+)?)\b', texts[0])
            if match and match.group(1).lower() not in {"uses", "directions", "indications", "purpose", "the"}:
                return match.group(1)

    return fallback_name


# Function to extract interactions and contraindications
def extract_ddis_and_contraindications(drug_label_data):
    interactions = []
    contraindications = []
    llm_ddis = []
    llm_contras = []
    unknowns = []

    for result in drug_label_data.get('results', []):
        drug_name = resolve_drug_name(result)
        print(f"\n🔍 Extracting for: {drug_name}")

        if drug_name == "Unknown Drug":
            unknowns.append(result)
            continue

        # --- DRUG INTERACTIONS ---
        for section_text in result.get('drug_interactions', []):
            cleaned = clean_text(section_text)

            # Baseline methods
            entities = extract_drug_entities(cleaned)
            regex_matches = extract_interactions(cleaned)

            if entities or regex_matches:
                interactions.append({
                    'drug': drug_name,
                    'interaction_text': cleaned,
                    'interacting_drugs': list(set(entities)),
                    'regex_matches': regex_matches
                })

            # LLM-based method
            prompt = build_prompt(drug_name, "drug-drug interaction", cleaned)
            llm_result = extract_with_llm(model, tokenizer, prompt)
            for entry in llm_result:
                entry["source_text"] = cleaned
                llm_ddis.append(entry)

        # --- CONTRAINDICATIONS ---
        for section_text in result.get('contraindications', []):
            cleaned = clean_text(section_text)

            # Baseline methods
            entities = extract_drug_entities(cleaned)
            contraindications.append({
                'drug': drug_name,
                'contraindication_text': cleaned,
                'interacting_drugs': list(set(entities))
            })

            # LLM-based method
            prompt = build_prompt(drug_name, "contraindication", cleaned)
            llm_result = extract_with_llm(model, tokenizer, prompt)
            for entry in llm_result:
                entry["source_text"] = cleaned
                llm_contras.append(entry)

    return interactions, contraindications, llm_ddis, llm_contras, unknowns


# Function to display the results
def display_results(interactions, contraindications, llm_ddis, llm_contras, unknowns):
    df_interactions = pd.DataFrame(interactions)
    df_contras = pd.DataFrame(contraindications)
    df_llm_ddi = pd.DataFrame(llm_ddis)
    df_llm_contra = pd.DataFrame(llm_contras)
    df_unknow = pd.DataFrame(unknowns)

    print("\n🧪 Regex + NER Interactions:")
    print(df_interactions.head())

    print("\n🤖 LLM Interactions:")
    print(df_llm_ddi.head())

    print("\n⛔ Regex Contraindications:")
    print(df_contras.head())

    print("\n🤖 LLM Contraindications:")
    print(df_llm_contra.head())

    return df_interactions, df_contras, df_llm_ddi, df_llm_contra, df_unknow


# Main script to fetch, extract, and display the information
def main():
    drug_label_data = fetch_drug_labels(API_URL, PARAMS)

    if drug_label_data:
        interactions, contraindications, llm_ddis, llm_contras, unknowns = extract_ddis_and_contraindications(drug_label_data)

        # View results
        df_interactions, df_contras, df_llm_ddi, df_llm_contra, df_unknow = display_results(
            interactions, contraindications, llm_ddis, llm_contras, unknowns
        )


# if __name__ == "__main__":
#     main()


In [10]:
naproxen_text = """
Naproxen tablets and naproxen sodium tablets are contraindicated in the following patients:
• Known hypersensitivity (e.g., anaphylactic reactions and serious skin reactions) to naproxen or any components of the drug product
• History of asthma, urticaria, or other allergic-type reactions after taking aspirin or other NSAIDs.
Severe, sometimes fatal, anaphylactic reactions to NSAIDs have been reported in such patients
• In the setting of coronary artery bypass graft (CABG) surgery
"""

glimepiride_text = """
Glimepiride tablets are contraindicated in patients with a history of a hypersensitivity reaction to:
Glimepiride or any of the product’s ingredients.
Sulfonamide derivatives: Patients who have developed an allergic reaction to sulfonamide derivatives may develop an allergic reaction to glimepiride.
Do not use glimepiride in patients who have a history of an allergic reaction to sulfonamide derivatives.
"""

ofloxacin_text = """
Ofloxacin ophthalmic solution is contraindicated in patients with a history of hypersensitivity to ofloxacin,
to other quinolones, or to any of the components in this medication (see WARNINGS).
"""


In [11]:
sample_inputs = [
    {
        "type": "contraindication",
        "drug": "Naproxen",
        "text": naproxen_text
    },
    {
        "type": "contraindication",
        "drug": "Glimepiride",
        "text": glimepiride_text
    },
    {
        "type": "contraindication",
        "drug": "Ofloxacin Ophthalmic",
        "text": ofloxacin_text
    }
]

for i, sample in enumerate(sample_inputs):
    print(f"\n--- [{i+1}] {sample['drug']} / {sample['type']} ---")
    prompt = build_prompt(sample['drug'], sample['type'], sample['text'])
    output = extract_with_llm(model, tokenizer, prompt)
    print("🧾 LLM Output:\n", output)



--- [1] Naproxen / contraindication ---
🧾 LLM Output:
 Naproxen tablets and naproxen sodium tablets are contraindicated in the following patients:
🧾 LLM Output:
 Naproxen tablets and naproxen sodium tablets are contraindicated in the following patients:

--- [2] Glimepiride / contraindication ---
🧾 LLM Output:
 Sulfonamide derivatives may develop an allergic reaction to glimepiride. Do not use glimepiride in patients who have a history of an allergic reaction to sulfonamide derivatives.
🧾 LLM Output:
 Sulfonamide derivatives may develop an allergic reaction to glimepiride. Do not use glimepiride in patients who have a history of an allergic reaction to sulfonamide derivatives.

--- [3] Ofloxacin Ophthalmic / contraindication ---
🧾 LLM Output:
🧾 LLM Output:


In [2]:
# !pip install --upgrade langchain langchain_community
# !pip install --upgrade scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_lg-0.5.1.tar.gz
# !pip install request
# !pip install --upgrade https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
# !pip install --upgrade safetensors
# !pip install --upgrade bitsandbytes
# !pip install --upgrade numpy
# !pip install --upgrade spacy


Collecting langchain_community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting langchain-core<1.0.0,>=0.3.49 (from langchain)
  Downloading langchain_core-0.3.51-py3-none-any.whl.metadata (5.9 kB)
Collecting langchain
  Downloading langchain-0.3.23-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain_community)
  Downloading pydantic_settings-2.8.1-py3-none-any.whl.metadata (3.5 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain_community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain_community)
  Downloading marshmallow-3

In [1]:
# %env HUGGINGFACEHUB_API_TOKEN=<Your-Token-Here>
# !pip install --upgrade nltk
# !pip install --upgrade transformers

In [31]:
import requests
import json
import scispacy
import spacy
import re
import pandas as pd
import torch
from huggingface_hub import InferenceClient
from spacy.lang.en.stop_words import STOP_WORDS
# from nltk.stem import WordNetLemmatizer

# --- CONFIG ---
API_URL = "https://api.fda.gov/drug/label.json"
PARAMS = {'limit': 50}  # Limit for testing

# --- HUGGINGFACE INFERENCE CLIENT ---
client = InferenceClient("google/flan-t5-base")  # Stable public model

# --- TEXT CLEANING ---
# lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('•', '\n•')
    text = text.lower()
    words = re.findall(r'\b\w+\b', text)
    filtered = [word for word in words if word not in STOP_WORDS]
    return ' '.join(filtered)

# --- FDA API ---
def fetch_drug_labels(api_url, params):
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# --- NLP ---
nlp = spacy.load("en_core_sci_sm")

def extract_drug_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]

interaction_phrases = [
    r'(?P<drug1>[A-Z][a-zA-Z]{2,})\b.*?\b(?:and|with)\b.*?\b(?P<drug2>[A-Z][a-zA-Z]{2,})\b.*?\b(?:increase|reduce|inhibit|enhance)\b.*?(?P<effect>[^\.]+)\.',
    r'concomitant use of (?P<drug1>[A-Z][a-zA-Z\s]{2,}?) and (?P<drug2>[A-Z][a-zA-Z\s]{2,}?).*?\b(?:may|can|will)\b (?P<effect>[^\.]+)\.',
]

def extract_interactions(text):
    results = []
    for pattern in interaction_phrases:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            results.append(match.groupdict())
    return results

# --- DRUG NAME RESOLUTION ---
def resolve_drug_name(entry):
    openfda = entry.get('openfda', {})
    fallback_name = entry.get('id') or entry.get('set_id') or "Unknown Drug"
    for field in ['brand_name', 'generic_name', 'substance_name']:
        names = openfda.get(field)
        if names:
            return names[0] if isinstance(names, list) else names
    for section in ['description', 'indications_and_usage']:
        texts = entry.get(section, [])
        if texts:
            match = re.search(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+)?)\b', texts[0])
            if match and match.group(1).lower() not in {"uses", "directions", "indications", "purpose", "the"}:
                return match.group(1)
    return fallback_name

# --- RECORD STRUCTURE ---
def build_extraction_record(drug, type, effect, method, raw_text):
    return {
        "drug": drug,
        "type": type,
        "effect_or_reason": effect,
        "source_method": method,
        "raw_text": raw_text
    }

# --- HUGGINGFACE MODEL WRAPPER ---
def call_hf_model(prompt):
    try:
        response = client.text_generation(prompt, max_new_tokens=200)
        print("\n🧾 HF Output:", response)
        return response.strip()
    except Exception as e:
        print("HF inference error:", e)
        return ""

# --- EXTRACTION WRAPPER ---
def aggregate_all_extractions(result):
    drug_name = resolve_drug_name(result)
    records = []
    if drug_name == "Unknown Drug":
        return records

    for section_type in ["drug_interactions", "contraindications"]:
        for text in result.get(section_type, []):
            cleaned = clean_text(text)
            section_label = "interaction" if section_type == "drug_interactions" else "contraindication"

            # Regex
            if section_type == "drug_interactions":
                regex_matches = extract_interactions(cleaned)
                for match in regex_matches:
                    records.append(build_extraction_record(drug_name, section_label, match['effect'], "regex", cleaned))

            # NER
            entities = extract_drug_entities(cleaned)
            for ent in entities:
                records.append(build_extraction_record(drug_name, section_label, ent, "NER", cleaned))

            # HuggingFace LLM - structured plain text format
            prompt = f"""
            You are a biomedical extraction expert.

            Given this text, extract structured data in plain text format. Return:
            Drugs: DrugA, DrugB
            Type: DDI or contraindication
            Reason: Short sentence explaining the interaction or contraindication.

            Text: {text}
            """
            hf_output = call_hf_model(prompt)
            try:
                drugs_match = re.search(r"Drugs:\s*(.+)", hf_output)
                type_match = re.search(r"Type:\s*(DDI|contraindication)", hf_output, re.IGNORECASE)
                reason_match = re.search(r"Reason:\s*(.+)", hf_output)
                if drugs_match and type_match and reason_match:
                    records.append(build_extraction_record(
                        drug_name,
                        type_match.group(1).lower(),
                        reason_match.group(1),
                        "LLM",
                        cleaned
                    ))
                else:
                    raise ValueError("Missing one or more fields")
            except Exception as e:
                print("⚠️ Could not parse structured output from LLM:", hf_output)
                records.append(build_extraction_record(drug_name, section_label, hf_output, "LLM", cleaned))

    return records

# --- MAIN ---
def main():
    data = fetch_drug_labels(API_URL, PARAMS)
    if not data:
        return

    all_records = []
    for entry in data.get("results", []):
        extracted = aggregate_all_extractions(entry)
        all_records.extend(extracted)

    df = pd.DataFrame(all_records)
    print("\n📊 Final Extracted Data Preview:")
    print(df.head())
    df.to_csv("fda_ddi_contra_output.csv", index=False)
    print("\n✅ Saved output to fda_ddi_contra_output.csv")

# --- RUN ---
if __name__ == "__main__":
    main()



🧾 HF Output: quinolones, drugA, drugB, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, system
⚠️ Could not parse structured output from LLM: quinolones, drugA, drugB, systemic administration of some quinolones, systemic administration of some quinolones, systemic administration of some quinolones, 

Works fine BELOW

In [32]:
import requests
import json
import scispacy
import spacy
import re
import pandas as pd
import torch
from huggingface_hub import InferenceClient

# --- CONFIG ---
API_URL = "https://api.fda.gov/drug/label.json"
PARAMS = {'limit': 50}  # Limit for testing

# --- HUGGINGFACE INFERENCE CLIENT (Preferred over LangChain for now) ---
client = InferenceClient("google/flan-t5-base")  # Smaller model, reliable

def call_hf_model(prompt):
    try:
        response = client.text_generation(prompt, max_new_tokens=200)
        print("\n🧾 HF Output:", response)
        return response.strip()
    except Exception as e:
        print("HF inference error:", e)
        return ""

# --- TEXT CLEANING ---
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('•', '\n•')
    return text.strip()

# --- FDA API ---
def fetch_drug_labels(api_url, params):
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# --- NLP ---
nlp = spacy.load("en_core_sci_sm")

def extract_drug_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]

interaction_phrases = [
    r'(?P<drug1>[A-Z][a-zA-Z]{2,})\b.*?\b(?:and|with)\b.*?\b(?P<drug2>[A-Z][a-zA-Z]{2,})\b.*?\b(?:increase|reduce|inhibit|enhance)\b.*?(?P<effect>[^\.]+)\.',
    r'concomitant use of (?P<drug1>[A-Z][a-zA-Z\s]{2,}?) and (?P<drug2>[A-Z][a-zA-Z\s]{2,}?).*?\b(?:may|can|will)\b (?P<effect>[^\.]+)\.',
]

def extract_interactions(text):
    results = []
    for pattern in interaction_phrases:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            results.append(match.groupdict())
    return results

# --- DRUG NAME RESOLUTION ---
def resolve_drug_name(entry):
    openfda = entry.get('openfda', {})
    fallback_name = entry.get('id') or entry.get('set_id') or "Unknown Drug"
    for field in ['brand_name', 'generic_name', 'substance_name']:
        names = openfda.get(field)
        if names:
            return names[0] if isinstance(names, list) else names
    for section in ['description', 'indications_and_usage']:
        texts = entry.get(section, [])
        if texts:
            match = re.search(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+)?)\b', texts[0])
            if match and match.group(1).lower() not in {"uses", "directions", "indications", "purpose", "the"}:
                return match.group(1)
    return fallback_name

# --- RECORD STRUCTURE ---
def build_extraction_record(drug, type, effect, method, raw_text):
    return {
        "drug": drug,
        "type": type,
        "effect_or_reason": effect,
        "source_method": method,
        "raw_text": raw_text
    }

# --- EXTRACTION WRAPPER ---
def aggregate_all_extractions(result):
    drug_name = resolve_drug_name(result)
    records = []
    if drug_name == "Unknown Drug":
        return records

    for section_type in ["drug_interactions", "contraindications"]:
        for text in result.get(section_type, []):
            cleaned = clean_text(text)
            section_label = "interaction" if section_type == "drug_interactions" else "contraindication"

            # Regex
            if section_type == "drug_interactions":
                regex_matches = extract_interactions(cleaned)
                for match in regex_matches:
                    records.append(build_extraction_record(drug_name, section_label, match['effect'], "regex", cleaned))

            # NER
            entities = extract_drug_entities(cleaned)
            for ent in entities:
                records.append(build_extraction_record(drug_name, section_label, ent, "NER", cleaned))

            # HuggingFace LLM
            prompt = f"""
            You are a pharmacology expert. Given the following section of an FDA drug label, extract a drug interaction or contraindication clearly.

            Text: "{cleaned}"

            Return one clean sentence describing the interaction or contraindication.
            """
            hf_output = call_hf_model(prompt)
            records.append(build_extraction_record(drug_name, section_label, hf_output, "LLM", cleaned))

    return records

# --- MAIN ---
def main():
    data = fetch_drug_labels(API_URL, PARAMS)
    if not data:
        return

    all_records = []
    for entry in data.get("results", []):
        extracted = aggregate_all_extractions(entry)
        all_records.extend(extracted)

    df = pd.DataFrame(all_records)
    print("\n📊 Final Extracted Data Preview:")
    print(df.head())
    df.to_csv("fda_output.csv", index=False)
    print("\n✅ Saved output to fda_output.csv")

# --- RUN ---
if __name__ == "__main__":
    main()





🧾 HF Output: drug interaction

🧾 HF Output: Ofloxacin ophthalmic solution is contraindicated in patients with a history of hypersensitivity to ofloxacin, to other quinolones, or to any of the components in this medication

🧾 HF Output: Naproxen may interfere with hemostasis.

🧾 HF Output: Known hypersensitivity to naproxen or any components of the drug product

🧾 HF Output: MEKINIST

🧾 HF Output: None

🧾 HF Output: Gabapentin inhibits the major cytochrome P450 enzymes

🧾 HF Output: Gabapentin tablets USP are contraindicated in patients who have demonstrated hypersensitivity to the drug or its ingredients

🧾 HF Output: Colesevelam


🧾 HF Output: Clarithromycin may be associated with an increase of serum theophylline concentrations. Clarithromycin 500 mg every 8 hours was given in combination with omeprazole 40 mg daily to healthy adult subjects. The steady-state levels of C max, C min, and AUC of Clarithromycin were increased by the concomitant administration of clarithromycin. Clarith

In [None]:
import requests
import json
import scispacy
import spacy
import re
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig
import torch

# --- CONFIG ---
API_URL = "https://api.fda.gov/drug/label.json"
PARAMS = {'limit': 50}  # Limit for testing

# --- LOAD LLM ---
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, quantization_config=bnb_config, device_map="auto")

# --- PROMPT + LLM ---
def build_prompt(drug_name, section_type, section_text):
    return f"""
Given the following text, extract the {section_type} in one clear sentence. Do not repeat the input.

Text: \"{section_text}\"
"""

def extract_with_llm(model, tokenizer, prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=150)
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("\n🧾 LLM Output:\n", decoded)
    return decoded.strip()

# --- FDA API ---
def fetch_drug_labels(api_url, params):
    try:
        response = requests.get(api_url, params=params)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data: {e}")
        return None

# --- TEXT CLEANING ---
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = text.replace('•', '\n•')
    return text.strip()

def chunk_text(text, max_sentences=3):
    sentences = re.split(r'(?<=[.!?]) +', text)
    return [' '.join(sentences[i:i+max_sentences]) for i in range(0, len(sentences), max_sentences)]

# --- NLP ---
nlp = spacy.load("en_core_sci_sm")

def extract_drug_entities(text):
    doc = nlp(text)
    return [ent.text for ent in doc.ents if ent.label_ == "CHEMICAL"]

interaction_phrases = [
    r'(?P<drug1>[A-Z][a-zA-Z]{2,})\b.*?\b(?:and|with)\b.*?\b(?P<drug2>[A-Z][a-zA-Z]{2,})\b.*?\b(?:increase|reduce|inhibit|enhance)\b.*?(?P<effect>[^\.]+)\.',
    r'concomitant use of (?P<drug1>[A-Z][a-zA-Z\s]{2,}?) and (?P<drug2>[A-Z][a-zA-Z\s]{2,}?).*?\b(?:may|can|will)\b (?P<effect>[^\.]+)\.',
]

def extract_interactions(text):
    results = []
    for pattern in interaction_phrases:
        for match in re.finditer(pattern, text, flags=re.IGNORECASE):
            results.append(match.groupdict())
    return results

# --- DRUG NAME EXTRACTION ---
def resolve_drug_name(entry):
    openfda = entry.get('openfda', {})
    fallback_name = entry.get('id') or entry.get('set_id') or "Unknown Drug"
    for field in ['brand_name', 'generic_name', 'substance_name']:
        names = openfda.get(field)
        if names:
            return names[0] if isinstance(names, list) else names
    for section in ['description', 'indications_and_usage']:
        texts = entry.get(section, [])
        if texts:
            match = re.search(r'\b([A-Z][a-z]+(?: [A-Z][a-z]+)?)\b', texts[0])
            if match and match.group(1).lower() not in {"uses", "directions", "indications", "purpose", "the"}:
                return match.group(1)
    return fallback_name

# --- RECORD STRUCTURE ---
def build_extraction_record(drug, scope, related_drug, effect, method, raw_text):
    return {
        "drug": drug,
        "type": scope,
        "related_drug": related_drug,
        "effect_or_reason": effect,
        "source_method": method,
        "raw_text": raw_text
    }

# --- AGGREGATE ALL EXTRACTIONS ---
def aggregate_all_extractions(result):
    drug_name = resolve_drug_name(result)
    records = []
    if drug_name == "Unknown Drug":
        return records

    for section_type in ["drug_interactions", "contraindications"]:
        for text in result.get(section_type, []):
            cleaned = clean_text(text)
            section_label = "interaction" if section_type == "drug_interactions" else "contraindication"

            # Regex
            if section_type == "drug_interactions":
                regex_matches = extract_interactions(cleaned)
                for match in regex_matches:
                    records.append(build_extraction_record(drug_name, section_label, f"{match['drug1']} + {match['drug2']}", match['effect'], "regex", cleaned))

            # NER
            entities = extract_drug_entities(cleaned)
            for ent in entities:
                records.append(build_extraction_record(drug_name, section_label, ent, "", "NER", cleaned))

            # LLM
            for chunk in chunk_text(cleaned):
                prompt = build_prompt(drug_name, section_label, chunk)
                llm_output = extract_with_llm(model, tokenizer, prompt)
                records.append(build_extraction_record(drug_name, section_label, "", llm_output, "LLM", chunk))

    print(f"\n✅ Processed: {drug_name}")
    return records

# --- MAIN LOGIC ---
def main():
    data = fetch_drug_labels(API_URL, PARAMS)
    if not data:
        return

    all_records = []
    for entry in data.get("results", []):
        extracted = aggregate_all_extractions(entry)
        all_records.extend(extracted)

    df = pd.DataFrame(all_records)
    print("\n📊 Final Extracted Data Preview:")
    print(df.head())
    df.to_csv("fda_i_output.csv", index=False)
    print("\n✅ Saved output to fda_i_output.csv")

# --- RUN ---
if __name__ == "__main__":
    main()

In [11]:
contras.head()

Unnamed: 0,drug,contraindication_text,interacting_drugs
0,Ofloxacin Ophthalmic,CONTRAINDICATIONS Ofloxacin ophthalmic solutio...,[]
1,Naproxen,4 CONTRAINDICATIONS Naproxen tablets and napro...,[]
2,Mekinist,4 CONTRAINDICATIONS None. None. ( 4 ),[]
3,Gabapentin,CONTRAINDICATIONS Gabapentin tablets USP are c...,[]
4,Glimepiride,4 CONTRAINDICATIONS Glimepiride tablets are co...,[]


In [9]:
interact.head()

Unnamed: 0,drug,interaction_text,interacting_drugs,regex_matches
0,Ofloxacin Ophthalmic,Drug Interactions: Specific drug interaction s...,[],"[{'drug1': 'Drug', 'drug2': 'ofloxacin', 'effe..."
1,Naproxen,7 DRUG INTERACTIONS See Table 1 for clinically...,[],"[{'drug1': 'When', 'drug2': 'periodically', 'e..."
2,Glimepiride,7 DRUG INTERACTIONS Certain medications may af...,[],"[{'drug1': 'DRUG', 'drug2': 'close', 'effect':..."
3,Clarithromycin,Drug Interactions Clarithromycin use in patien...,[],"[{'drug1': 'Drug', 'drug2': 'increase', 'effec..."
4,Methocarbamol,Drug interactions See Warnings and Precautions...,[],"[{'drug1': 'Drug', 'drug2': 'Precautions', 'ef..."
