In [None]:
import os
import re
import pdfplumber
import pandas as pd
from nltk.tokenize import sent_tokenize
import numpy as np
import openai
import faiss
import time
from tqdm import tqdm
from nltk.tokenize import sent_tokenize, word_tokenize

openai.api_key = os.getenv("OPENAI_API_KEY")
EMBEDDING_MODEL = "text-embedding-3-large"
EMBEDDING_DIM = 3072  # this is correct for "text-embedding-3-large"

# ----------------------------- #
#       TABLE FORMATTER        #
# ----------------------------- #
def format_table_as_markdown(table):
    """Render an extracted table as pipe-delimited text, one line per row.

    The first row of ``table`` is the header. Header cells and data cells are
    normalised the same way: missing values become empty strings, embedded
    newlines become spaces, and surrounding whitespace is stripped. Rows are
    handled by position, so repeated or empty header names are fine, and rows
    shorter than the header are padded with empty cells.

    Args:
        table: Sequence of rows (each a sequence of cells), header row first.
            Cells may be ``None``.

    Returns:
        str: The header line, a rule of dashes as wide as the header line,
        then one line per data row, joined with newlines.

    Raises:
        ValueError: If ``table`` is empty and therefore has no header row.
    """
    def _clean(cell):
        # Missing cells must not leak into the text as "None" / "nan".
        if not pd.notnull(cell):
            return ""
        return str(cell).replace('\n', ' ').strip()

    if not table:
        raise ValueError("table has no header row")

    header_cells = [_clean(cell) for cell in table[0]]
    width = len(header_cells)

    headers = " | ".join(header_cells)
    lines = [headers, "-" * len(headers)]

    for row in table[1:]:
        cells = [_clean(cell) for cell in row]
        # Pad ragged rows so every line has at least as many fields as the header.
        cells.extend([""] * (width - len(cells)))
        lines.append(" | ".join(cells))
    return "\n".join(lines)

# ----------------------------- #
#   TEXT + TABLE PER PAGE       #
# ----------------------------- #
def extract_sections_and_tables(pdf_path):
    """Extract text sections and tables from every page of one PDF.

    Page text is split at each newline that is followed by a run of at least
    six characters made of upper-case letters, digits, whitespace, ``-``,
    ``(``, ``)`` or ``.`` (starting with an upper-case letter or digit).
    Each piece is whitespace-collapsed and kept only if it is longer than
    50 characters. Every table on the page becomes its own chunk; a table
    whose formatting raises is reported on stdout and skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        tuple[list[str], list[dict]]: The chunk texts and a parallel list of
        metadata dicts with keys ``file``, ``page`` (1-based), ``type``
        (``"text"`` or ``"table"``) and ``section`` (always ``None`` here).
    """
    source_name = os.path.basename(pdf_path)
    texts = []
    infos = []

    def _record(content, kind, page_no):
        # Keep the chunk list and the metadata list in lock-step.
        texts.append(content)
        infos.append({
            "file": source_name,
            "page": page_no,
            "type": kind,
            "section": None  # heading extraction is not implemented
        })

    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            page_text = page.extract_text() or ""
            page_tables = page.extract_tables()

            # Text: split on ALL-CAPS / numbered header-like lines.
            for part in re.split(r'\n(?=[A-Z\d][A-Z\d\s\-\(\)\.]{5,})', page_text):
                flattened = re.sub(r'\s+', ' ', part.strip())
                if len(flattened) <= 50:
                    continue
                _record(flattened, "text", page_no)

            # Tables: one standalone chunk per table.
            for raw_table in page_tables:
                try:
                    rendered = format_table_as_markdown(raw_table)
                    _record(f"📊 Table (Page {page_no}):\n{rendered}", "table", page_no)
                except Exception as e:
                    print(f"⚠️ Skipped malformed table on page {page_no}: {e}")

    return texts, infos

# ----------------------------- #
#         CHUNK CLEANER         #
# ----------------------------- #
def chunk_long_text(text, meta, max_tokens=700, overlap=100):
    """Split ``text`` into overlapping windows of word tokens.

    The text is tokenised with ``word_tokenize`` and cut into windows of at
    most ``max_tokens`` tokens, each starting ``max_tokens - overlap`` tokens
    after the previous one. Windowing stops as soon as a window reaches the
    end of the text, so no trailing chunk is produced that consists only of
    tokens already present in the previous window.

    Args:
        text: The text to split.
        meta: Metadata attached to every produced chunk. The same object is
            reused for each chunk, not copied.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        tuple[list[str], list]: Chunk strings (tokens re-joined with single
        spaces) and a parallel list holding ``meta`` once per chunk. Both
        lists are empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. A
            non-positive step would otherwise loop forever, and a negative
            overlap would silently drop tokens between windows.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be positive")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    words = word_tokenize(text)
    step = max_tokens - overlap
    chunks = []
    metas = []

    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_tokens]))
        metas.append(meta)
        # This window already reached the last token; a further window would
        # only repeat the overlap region.
        if start + max_tokens >= len(words):
            break
    return chunks, metas

def clean_and_split_chunks(raw_chunks, raw_metadata, max_tokens=700, overlap=100):
    """Split over-long chunks into overlapping windows, keeping metadata aligned.

    A chunk with at most ``max_tokens`` word tokens is passed through
    unchanged. A longer chunk is handed to ``chunk_long_text`` with the same
    ``max_tokens`` and ``overlap``, and its sub-chunks replace it.

    Args:
        raw_chunks: Iterable of chunk strings.
        raw_metadata: Iterable of metadata entries, paired with ``raw_chunks``
            by position. Pairing stops at the shorter of the two iterables.
        max_tokens: Token limit above which a chunk is split, and the window
            size used for splitting.
        overlap: Number of tokens shared by consecutive windows of a split
            chunk.

    Returns:
        tuple[list[str], list]: The resulting chunks and a parallel list of
        metadata entries.
    """
    all_chunks = []
    all_meta = []

    for chunk, meta in zip(raw_chunks, raw_metadata):
        if len(word_tokenize(chunk)) <= max_tokens:
            all_chunks.append(chunk)
            all_meta.append(meta)
            continue

        # Use one limit for both the "too long?" test and the window size so
        # the two can never drift apart.
        sub_chunks, sub_metas = chunk_long_text(
            chunk, meta, max_tokens=max_tokens, overlap=overlap
        )
        all_chunks.extend(sub_chunks)
        all_meta.extend(sub_metas)

    return all_chunks, all_meta


# ----------------------------- #
#       EMBEDDING GENERATOR     #
# ----------------------------- #
def get_openai_embeddings(texts, model=EMBEDDING_MODEL, batch_size=10,
                          max_retries=3, pause=1.0):
    """Embed ``texts`` with the OpenAI embeddings endpoint, batch by batch.

    Each batch is retried with exponential back-off before it is given up on.
    A batch that still fails is represented by all-zero vectors so that the
    returned array stays row-aligned with ``texts``; the number of such rows
    is reported on stdout.

    NOTE(review): zero vectors are still added to any index built from the
    result, so failed rows should be re-embedded or filtered out before use.

    Args:
        texts: Sequence of strings to embed.
        model: Name of the embedding model.
        batch_size: Number of texts sent per request. The default keeps the
            original value; larger batches mean fewer requests and fewer
            pauses.
        max_retries: Extra attempts per batch after the first failure.
        pause: Seconds to sleep after every batch, and the base delay of the
            retry back-off (``pause * 2 ** attempt``).

    Returns:
        numpy.ndarray: ``float32`` array with one row per input text. For
        empty input the shape is ``(0, EMBEDDING_DIM)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    texts = list(texts)
    if not texts:
        # Keep the result 2-D so callers can still read ``.shape[1]``.
        return np.zeros((0, EMBEDDING_DIM), dtype="float32")

    results = []  # one (batch_len, vectors-or-None) entry per batch
    for i in tqdm(range(0, len(texts), batch_size), desc="🔗 Embedding"):
        batch = texts[i:i + batch_size]
        vectors = None
        for attempt in range(max_retries + 1):
            try:
                response = openai.embeddings.create(input=batch, model=model)
                candidate = [r.embedding for r in response.data]
                if len(candidate) != len(batch):
                    raise ValueError(
                        f"expected {len(batch)} embeddings, got {len(candidate)}"
                    )
                vectors = candidate
                break
            except Exception as e:
                print(f"⚠️ Embedding batch failed: {e}")
                if attempt < max_retries:
                    time.sleep(pause * 2 ** attempt)
        results.append((len(batch), vectors))
        time.sleep(pause)  # stay below rate limit

    # Size the placeholders from a real response when one exists, so a
    # non-default ``model`` cannot produce ragged rows.
    dim = next((len(vecs[0]) for _, vecs in results if vecs), EMBEDDING_DIM)

    rows = []
    failed_rows = 0
    for size, vectors in results:
        if vectors is None:
            failed_rows += size
            rows.extend([[0.0] * dim] * size)
        else:
            rows.extend(vectors)
    if failed_rows:
        print(f"⚠️ {failed_rows} texts could not be embedded and were replaced by zero vectors")

    return np.asarray(rows, dtype="float32")

# ----------------------------- #
#            MAIN               #
# ----------------------------- #
def process_pdf_folder(pdf_dir):
    """Extract and split chunks from every PDF directly inside ``pdf_dir``.

    Files are selected by a case-insensitive ``.pdf`` suffix and processed in
    sorted filename order. ``os.listdir`` returns names in arbitrary order,
    so sorting keeps the chunk order (and any index row ids built from it)
    reproducible across runs and platforms.

    Args:
        pdf_dir: Directory containing the PDF files. Sub-directories are not
            searched.

    Returns:
        tuple[list[str], list[dict]]: All chunks from all PDFs and the
        parallel list of metadata entries. Both are empty when the directory
        holds no PDF.
    """
    all_chunks = []
    all_meta = []

    for filename in sorted(os.listdir(pdf_dir)):
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_dir, filename)
        print(f"📄 Processing: {filename}")
        chunks, metas = extract_sections_and_tables(pdf_path)
        sub_chunks, sub_metas = clean_and_split_chunks(chunks, metas)
        all_chunks.extend(sub_chunks)
        all_meta.extend(sub_metas)

    return all_chunks, all_meta



In [None]:

# Build the FAISS index for the whole medication-guide folder and persist it
# together with the chunks, their metadata and the raw embedding matrix.
PDF_DIR = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\MedicationGuides_2025_05_19"
print(f"📁 Loading PDFs from: {PDF_DIR}")
chunks, metadata = process_pdf_folder(PDF_DIR)
print(f"🔎 Total chunks: {len(chunks)}")

# One embedding row per chunk; the index dimension is read from the matrix.
embeddings = get_openai_embeddings(chunks)
dim = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print(f"✅ FAISS index created with {index.ntotal} entries.")

import pickle

# Row i of the index corresponds to entry i of each pickled object, so the
# three artifacts must always be written from the same run.
for artifact_path, artifact in (
    (r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\faiss_metadata.pkl", metadata),
    (r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\faiss_embeddings.pkl", embeddings),
    (r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\faiss_chunks.pkl", chunks),
):
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

faiss.write_index(index, r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\faiss_index.idx")


📁 Loading PDFs from: C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\MedicationGuides_2025_05_19
📄 Processing: ABACAVIR_and_LAMIVUDINE_ABACAVIR_SULFATE_LAMIVUDINE_TABLET_ORAL_204311_MYLAN_LABORATORIES_LIMITED_12-22-2023.pdf
📄 Processing: ABILIFY_ARIPIPRAZOLE_INJECTABLE_INTRAMUSCULAR_21866_OTSUKA_02-05-2020.pdf
📄 Processing: ABILIFY_ARIPIPRAZOLE_SOLUTION_ORAL_21713_OTSUKA_02-05-2020.pdf
📄 Processing: ABILIFY_ARIPIPRAZOLE_TABLET,_ORALLY_DISINTEGRATING_ORAL_21729_OTSUKA_02-05-2020.pdf
📄 Processing: ABILIFY_ARIPIPRAZOLE_TABLET_ORAL_21436_OTSUKA_01-22-2025.pdf
📄 Processing: ABILIFY_ASIMTUFII_ARIPIPRAZOLE_MONOHYDRATE_EXTENDED-RELEASE,_INJECTABLE_SUSPENSION__INTRAMUSCULAR_217006_OTSUKA_PHARMACEUTICAL_CO.,_LTD_01-22-2025.pdf
📄 Processing: ABILIFY_MAINTENA_KIT_ARIPIPRAZOLE_FOR_SUSPENSION,_EXTENDED_RELEASE_INTRAMUSCULAR_202971_OTSUKA_PHARM_CO_LTD_01-22-2025.pdf
📄 Processing: ABILIFY_MYCITE_KIT_ARIPIPRAZOLE_TABLET_ORAL_207202_OTSUKA_01-22-2025.pdf
📄 Processing: ABRILADA_ADALIMUMAB-AFZB_INJECT

🔗 Embedding:  24%|██▎       | 2961/12553 [1:52:44<5:03:34,  1.90s/it] 

In [None]:
import pdfplumber
import re
from nltk.tokenize import sent_tokenize
import pandas as pd
import os

def format_table_as_markdown(table):
    """Render an extracted table as pipe-delimited text, one line per row.

    The first row of ``table`` is the header. Every cell, header included, is
    normalised: missing values become empty strings instead of the literal
    ``"None"``, embedded newlines become spaces, and surrounding whitespace
    is stripped. Cells are processed with plain Python by position, which
    avoids the deprecated ``DataFrame.applymap`` call and works with repeated
    header names. Rows shorter than the header are padded with empty cells.

    Args:
        table: Sequence of rows (each a sequence of cells), header row first.
            Cells may be ``None``.

    Returns:
        str: The header line, a rule of dashes as wide as the header line,
        then one line per data row, joined with newlines.

    Raises:
        ValueError: If ``table`` is empty and therefore has no header row.
    """
    def _clean(cell):
        if not pd.notnull(cell):
            return ""
        return str(cell).replace('\n', ' ').strip()

    if not table:
        raise ValueError("table has no header row")

    header_cells = [_clean(cell) for cell in table[0]]
    width = len(header_cells)

    headers = " | ".join(header_cells)
    lines = [headers, "-" * len(headers)]

    for row in table[1:]:
        cells = [_clean(cell) for cell in row]
        # Pad ragged rows so every line has at least as many fields as the header.
        cells.extend([""] * (width - len(cells)))
        lines.append(" | ".join(cells))
    return "\n".join(lines)


def extract_text_and_tables_by_page(pdf_path):
    """Return one combined text-plus-tables string per page of a PDF.

    For each page the extracted text is followed by every table that could
    be formatted, each introduced by a ``📊 Table:`` marker. Progress and the
    full combined page content are printed to stdout. A table whose
    formatting raises is reported and skipped.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        list[str]: The stripped combined string of each page, in page order.
    """
    pages_out = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            print(f"📄 Processing page {page_no}/{len(pdf.pages)}")
            body = page.extract_text() or ""

            # Render each table; failures are reported but never abort the page.
            rendered_tables = []
            for raw_table in page.extract_tables():
                try:
                    table_text = format_table_as_markdown(raw_table)
                    rendered_tables.append(f"\n📊 Table:\n{table_text}\n")
                except Exception as e:
                    print(f"⚠️ Skipping malformed table: {e}")

            # Page text first, then all tables, separated by blank lines.
            merged = "\n\n".join([body, "\n\n".join(rendered_tables)])
            print(f"📄 Extracted text from page {page_no}:\n{merged}\n")
            pages_out.append(merged.strip())

    return pages_out

def clean_and_split_chunks(chunks, max_len=1200, sentences_per_chunk=5):
    """Collapse whitespace and regroup each chunk into short sentence groups.

    Every chunk is whitespace-normalised, split into sentences with
    ``sent_tokenize`` and regrouped into windows of ``sentences_per_chunk``
    consecutive sentences. A window whose joined text fits within ``max_len``
    is emitted as is. A longer window is re-packed into several pieces, each
    holding as many whole sentences as fit. A single sentence longer than
    ``max_len`` is kept whole rather than cut.

    NOTE(review): ``max_len`` was previously accepted but ignored; it is
    interpreted here as a character count. Confirm that this is the intended
    unit.

    Args:
        chunks: Iterable of raw text chunks.
        max_len: Maximum length in characters of an emitted piece, or
            ``None`` to disable the limit.
        sentences_per_chunk: Number of sentences grouped into one window.

    Returns:
        list[str]: The non-empty pieces, in input order.
    """
    def _pack(sentences):
        # Greedily join whole sentences without exceeding max_len.
        pieces = []
        current = ""
        for sentence in sentences:
            candidate = f"{current} {sentence}" if current else sentence
            if current and len(candidate) > max_len:
                pieces.append(current)
                current = sentence
            else:
                current = candidate
        if current:
            pieces.append(current)
        return pieces

    all_chunks = []
    for chunk in chunks:
        sentences = sent_tokenize(re.sub(r'\s+', ' ', chunk))
        for i in range(0, len(sentences), sentences_per_chunk):
            window = sentences[i:i + sentences_per_chunk]
            piece = " ".join(window)
            if not piece:
                continue
            if max_len is None or len(piece) <= max_len:
                all_chunks.append(piece)
            else:
                all_chunks.extend(_pack(window))
    return all_chunks

pdf_path = r"C:\Users\FahRe\Desktop\agentic-LLM-app\backend\data\MedicationGuides_2025_05_19\ABACAVIR_and_LAMIVUDINE_ABACAVIR_SULFATE_LAMIVUDINE_TABLET_ORAL_204311_MYLAN_LABORATORIES_LIMITED_12-22-2023.pdf"

# Extract the per-page text and split it into sentence groups. Without these
# two calls ``clean_chunks`` is undefined and the cell fails with NameError.
raw_chunks = extract_text_and_tables_by_page(pdf_path)
clean_chunks = clean_and_split_chunks(raw_chunks)

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# TODO: switch to the OpenAI embedding model used by the main pipeline.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(clean_chunks)
dim = embeddings.shape[1]

# Index row i corresponds to clean_chunks[i].
index = faiss.IndexFlatL2(dim)
index.add(np.array(embeddings))



📄 Processing page 1/41
📄 Extracted text from page 1:
This label may not be the latest approved by FDA.
For current labeling information, please visit https://www.fda.gov/drugsatfda
HIGHLIGHTS OF PRESCRIBING INFORMATION • May be dispersed in water, swallowed whole or broken in half along the
These highlights do not include all the information needed to use score. Do not chew. (2.3).
ABACAVIR and LAMIVUDINE tablets for oral suspension safely and • Because the abacavir and lamivudine tablets for oral suspension is a fixed-
effectively. See full prescribing information for ABACAVIR and dose product and cannot be dose adjusted, abacavir and lamivudine tablets
LAMIVUDINE tablets for oral suspension. for oral suspension is not recommended in patients requiring dosage
adjustment or patients with hepatic impairment. (2.4)
ABACAVIR and LAMIVUDINE tablets for oral suspension
Initial U.S. Approval: 2004 --------------------- DOSAGE FORMS AND STRENGTHS ---------------------
Tablets for Oral Suspens

  df = df.applymap(lambda x: str(x).replace('\n', ' ').strip() if isinstance(x, str) else x)


📄 Extracted text from page 6:
This label may not be the latest approved by FDA.
For current labeling information, please visit https://www.fda.gov/drugsatfda
The tablets are white to off-white, round, scored tablets debossed with AL above the score and 7
below the score on one side of the tablet and M on the other side. The tablets are functionally
scored.
4 CONTRAINDICATIONS
Abacavir and lamivudine tablets for oral suspension is contraindicated in patients:
(5.1)].
• with moderate or severe hepatic impairment [see Use in Specific Populations (8.7)].
5.1 Hypersensitivity Reactions
Serious and sometimes fatal hypersensitivity reactions have occurred with abacavir, a component
of abacavir and lamivudine tablets for oral suspension. These hypersensitivity reactions have
included multi-organ failure and anaphylaxis and typically occurred within the first 6 weeks of
treatment with abacavir (median time to onset was 9 days); although abacavir hypersensitivity
reactions have occurred any time

  df = df.applymap(lambda x: str(x).replace('\n', ' ').strip() if isinstance(x, str) else x)
  df = df.applymap(lambda x: str(x).replace('\n', ' ').strip() if isinstance(x, str) else x)


📄 Extracted text from page 28:
This label may not be the latest approved by FDA.
For current labeling information, please visit https://www.fda.gov/drugsatfda
Respondera 69% (73%) 69% (71%)
Virologic failuresb 6% 4%
Discontinued due to adverse reactions 14% 16%
Discontinued due to other reasonsc 10% 11%
a Subjects achieved and maintained confirmed HIV-1 RNA less than or equal to 50 copies per
mL (less than 400 copies per mL) through Week 48 (Roche AMPLICOR Ultrasensitive HIV-
1 MONITOR standard test 1.0 PCR).
b Includes viral rebound, insufficient viral response according to the investigator, and failure to
achieve confirmed less than or equal to 50 copies per mL by Week 48.
c Includes consent withdrawn, lost to follow up, protocol violations, those with missing data,
clinical progression, and other.
After 48 weeks of therapy, the median CD4+ cell count increases from baseline were 209
cells/mm3 in the group receiving abacavir and 155 cells/mm3 in the zidovudine group. Through
Week 48,