
# 0 env loading (for GROQ_API_KEY, etc.)

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()  # Load environment variables, including GROQ_API_KEY

os.environ['JAVA_HOME'] = r"C:\Program Files\Java\jdk-21"

# 1. Imports

In [2]:
import jpype
import tabula
import base64
import pymupdf
import logging
import warnings
import numpy as np
from tqdm import tqdm
from botocore.exceptions import ClientError

# Summarization w/ GROQ
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Summarization w/ GPT-4o (OpenAI style)
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# For VectorStore & RAG
import uuid
from base64 import b64decode
from langchain_chroma import Chroma  # Updated import
from langchain_openai import OpenAIEmbeddings  # Updated import
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import HumanMessage

In [3]:
# Start JVM for Tabula if not running
jvm_path = r"C:\Program Files\Java\jdk-21\bin\server\jvm.dll"
if not jpype.isJVMStarted():
    jpype.startJVM(jvm_path)

if jpype.isJVMStarted():
    print("JVM loaded successfully!")
else:
    print("JVM not loaded.")

logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

JVM loaded successfully!


# 2. PDF Partitioning

In [4]:
base_dir = "data"
os.makedirs(base_dir, exist_ok=True)

filename = "attention.pdf"
filepath = os.path.join(base_dir, filename)
print("PDF Path:", filepath)

def create_directories(base_dir):
    directories = ["images", "text", "tables", "page_images"]
    for d in directories:
        os.makedirs(os.path.join(base_dir, d), exist_ok=True)

create_directories(base_dir)

PDF Path: data\attention.pdf


In [5]:
def process_tables(doc, page_num, base_dir, items):
    """Extract tables with Tabula and save them as .txt files."""
    try:
        tables = tabula.read_pdf(filepath, pages=page_num + 1, multiple_tables=True)
        if not tables:
            return
        for table_idx, table in enumerate(tables):
            table_text = "\n".join([" | ".join(map(str, row)) for row in table.values])
            table_file_name = f"{base_dir}/tables/{os.path.basename(filepath)}_table_{page_num}_{table_idx}.txt"
            with open(table_file_name, 'w', encoding='utf-8') as f:
                f.write(table_text)
            items.append({"page": page_num, "type": "table", "text": table_text, "path": table_file_name})
    except Exception as e:
        print(f"Error extracting tables from page {page_num}: {str(e)}")

In [6]:
class RecursiveCharacterTextSplitter:
    """A simple text splitter that chunks text by character length."""
    def __init__(self, chunk_size=700, chunk_overlap=200, length_function=len):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.length_function = length_function

    def split_text(self, text):
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.chunk_size
            chunk = text[start:end]
            chunks.append(chunk)
            start += (self.chunk_size - self.chunk_overlap)
        return chunks

In [7]:
def process_text_chunks(text, text_splitter, page_num, base_dir, items):
    chunks = text_splitter.split_text(text)
    for i, chunk in enumerate(chunks):
        text_file_name = f"{base_dir}/text/{os.path.basename(filepath)}_text_{page_num}_{i}.txt"
        with open(text_file_name, 'w', encoding='utf-8') as f:
            f.write(chunk)
        items.append({"page": page_num, "type": "text", "text": chunk, "path": text_file_name})

In [8]:
def process_images(page, page_num, base_dir, items, doc):
    """Extract embedded images on a PDF page."""
    images = page.get_images()
    for idx, image in enumerate(images):
        xref = image[0]
        pix = pymupdf.Pixmap(doc, xref)
        image_name = f"{base_dir}/images/{os.path.basename(filepath)}_image_{page_num}_{idx}_{xref}.png"
        pix.save(image_name)
        with open(image_name, 'rb') as f:
            encoded_image = base64.b64encode(f.read()).decode('utf8')
        items.append({"page": page_num, "type": "image", "path": image_name, "image": encoded_image})

In [9]:
def process_page_images(page, page_num, base_dir, items):
    """Render the entire page as an image and store it."""
    pix = page.get_pixmap()
    page_path = os.path.join(base_dir, f"page_images/page_{page_num:03d}.png")
    pix.save(page_path)
    with open(page_path, 'rb') as f:
        page_image = base64.b64encode(f.read()).decode('utf8')
    items.append({"page": page_num, "type": "page", "path": page_path, "image": page_image})

# 3. Partition the PDF

In [10]:
import pymupdf
doc = pymupdf.open(filepath)
num_pages = len(doc)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=700, chunk_overlap=200)
items = []

print(f"Total pages: {num_pages}")

for page_num in tqdm(range(num_pages), desc="Processing PDF pages"):
    page = doc[page_num]
    text = page.get_text()

    # Table extraction
    process_tables(doc, page_num, base_dir, items)
    # Text chunking
    process_text_chunks(text, text_splitter, page_num, base_dir, items)
    # Images
    process_images(page, page_num, base_dir, items, doc)
    # Full-page images
    process_page_images(page, page_num, base_dir, items)

print("\nPartitioning complete!")
text_items = [i for i in items if i['type'] == 'text']
table_items = [i for i in items if i['type'] == 'table']
image_items = [i for i in items if i['type'] == 'image']

if text_items:
    print("Sample text item:", text_items[0])
if table_items:
    print("Sample table item:", table_items[0])
if image_items:
    print("Sample image item:", image_items[0])

Total pages: 2


KeyboardInterrupt: 

# 4. Gather Text & Table Chunks

In [12]:
text_dir = os.path.join(base_dir, "text")
table_dir = os.path.join(base_dir, "tables")

text_files = [f for f in os.listdir(text_dir) if f.endswith(".txt")]
table_files = [f for f in os.listdir(table_dir) if f.endswith(".txt")]

text_chunks = []
for filename in text_files:
    fp = os.path.join(text_dir, filename)
    with open(fp, 'r', encoding='utf-8') as f:
        content = f.read()
    text_chunks.append(content)

table_chunks = []
for filename in table_files:
    fp = os.path.join(table_dir, filename)
    with open(fp, 'r', encoding='utf-8') as f:
        content = f.read()
    table_chunks.append(content)

print(f"\nNumber of text chunks: {len(text_chunks)}")
print(f"Number of table chunks: {len(table_chunks)}")


Number of text chunks: 123
Number of table chunks: 2


# 5. Summarize Text & Tables (Groq)

In [13]:
prompt_text = """
You are an assistant tasked with summarizing tables and text.
Give a concise summary of the table or text.

Respond only with the summary, no additional comment.
Do not start your message by saying "Here is a summary" or anything like that.
Just give the summary as it is.

Table or text chunk: {element}
"""

groq_prompt = ChatPromptTemplate.from_template(prompt_text)
groq_model = ChatGroq(temperature=0.5, model="llama-3.1-8b-instant")

summarize_chain = (
    {"element": lambda x: x}
    | groq_prompt
    | groq_model
    | StrOutputParser()
)

text_summaries = summarize_chain.batch(text_chunks, {"max_concurrency": 3})
table_summaries = summarize_chain.batch(table_chunks, {"max_concurrency": 3})

print("\n=== EXAMPLE TEXT SUMMARIES ===")
for idx, summary in enumerate(text_summaries[:3]):
    print(f"Text Chunk #{idx+1} SUMMARY:\n{summary}\n---")

print("\n=== EXAMPLE TABLE SUMMARIES ===")
for idx, summary in enumerate(table_summaries[:3]):
    print(f"Table Chunk #{idx+1} SUMMARY:\n{summary}\n---")


=== EXAMPLE TEXT SUMMARIES ===
Text Chunk #1 SUMMARY:
This survey on Agentic AI for scientific discovery at ICLR 2025 highlights the progress, challenges, and future directions of AI systems that enable reasoning, planning, and autonomous decision-making in research automation, transforming tasks such as literature review, hypothesis generation, experimentation, and result analysis.
---
Text Chunk #2 SUMMARY:
The table or text chunk discusses Agentic AI for scientific discovery, providing an overview of existing systems, tools, and recent progress in fields like chemistry, biology, and materials science, while addressing challenges and outlining future research directions.
---
Text Chunk #3 SUMMARY:
The rapid advancements of Large Language Models have opened a new era in scientific discovery, enabling Agentic AI systems to automate complex research workflows with high autonomy.
---

=== EXAMPLE TABLE SUMMARIES ===
Table Chunk #1 SUMMARY:
Recent studies on AI applications were publishe

# 6. Summarize Images (GPT-4o)

In [14]:
img_prompt_template = """Describe the image in detail. For context,
the image is part of a research paper explaining the transformers architecture.
Be specific about graphs, such as bar plots."""

messages = [
    (
        "user",
        [
            {"type": "text", "text": img_prompt_template},
            {
                "type": "image_url",
                "image_url": {"url": "data:image/jpeg;base64,{image}"},
            },
        ],
    )
]
image_prompt = ChatPromptTemplate.from_messages(messages)

image_chain = image_prompt | ChatOpenAI(model="gpt-4o") | StrOutputParser()

image_dir = os.path.join(base_dir, "images")
image_files = [f for f in os.listdir(image_dir) if f.lower().endswith((".png", ".jpg", ".jpeg"))]

images_base64 = []
for filename in image_files:
    file_path = os.path.join(image_dir, filename)
    with open(file_path, 'rb') as f:
        content = f.read()
    images_base64.append(base64.b64encode(content).decode('utf8'))

image_summaries = image_chain.batch(images_base64)
if image_summaries:
    print("\n=== EXAMPLE IMAGE SUMMARY ===")
    print(image_summaries[0])



=== EXAMPLE IMAGE SUMMARY ===
The image represents a cyclic workflow diagram with six stages, depicting the process of research development. Each stage is represented by a circle containing an image, and these circles are connected with green arrows, illustrating the sequence of stages in the workflow.

1. **Idea Generation & Literature Review**:
   - Contains an image of books and a magnifying glass, representing research references and exploration of existing literature.
   - A small robot character is present in the image.

2. **Research Planning & Experiment Design**:
   - Displays bar plots and a line graph, symbolizing data collection and analysis planning.
   - A magnifying glass hovers over these graphs, indicating scrutinization or focus.
   - Includes the robot character.

3. **Data Preparation & Experiment Execution**:
   - Shows laboratory equipment with a liquid being poured into a beaker, symbolizing experimentation and data collection.
   - The robot character is includ

# 7. Vector Store & InMemoryStore Setup

In [17]:
import uuid
from langchain.vectorstores import Chroma
from langchain.storage import InMemoryStore
from langchain.schema.document import Document
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_vector import MultiVectorRetriever

# Define source data
texts = text_chunks      # From earlier PDF processing
tables = table_chunks    # From table extraction
images = images_base64   # From image extraction

# Make sure we have summaries
if not 'text_summaries' in locals():
    text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})
if not 'table_summaries' in locals():
    table_summaries = summarize_chain.batch(tables, {"max_concurrency": 3})
if not 'image_summaries' in locals():
    image_summaries = image_chain.batch(images)

vectorstore = Chroma(
    collection_name="multi_modal_rag",
    embedding_function=OpenAIEmbeddings()
)

store = InMemoryStore()
id_key = "doc_id"

# The retriever (empty to start)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    docstore=store,
    id_key=id_key,
)

# 8. Add Summaries + Link Original Data

In [18]:
# First define your source data
texts = text_chunks  # or however you're getting your text data
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 3})

#  Add texts
doc_ids = [str(uuid.uuid4()) for _ in texts]
summary_texts = [
    Document(page_content=summary, metadata={id_key: doc_ids[i]}) for i, summary in enumerate(text_summaries)
]
retriever.vectorstore.add_documents(summary_texts)
retriever.docstore.mset(list(zip(doc_ids, texts)))

# Add tables
table_ids = [str(uuid.uuid4()) for _ in tables]
summary_tables = [
    Document(page_content=summary, metadata={id_key: table_ids[i]}) for i, summary in enumerate(table_summaries)
]
retriever.vectorstore.add_documents(summary_tables)
retriever.docstore.mset(list(zip(table_ids, tables)))

# Add image summaries
img_ids = [str(uuid.uuid4()) for _ in images]
summary_img = [
    Document(page_content=summary, metadata={id_key: img_ids[i]}) for i, summary in enumerate(image_summaries)
]
retriever.vectorstore.add_documents(summary_img)
retriever.docstore.mset(list(zip(img_ids, images)))

In [22]:
# Retrieve
docs = retriever.invoke(
    "What this document is about?"
)
for doc in docs:
    print(str(doc) + "\n\n" + "-" * 80)

hang Zeng, Zhang-Ren Chen, and Bowen Zhou.
Large language models are zero shot hypothesis proposers. arXiv preprint arXiv:2311.05965,
2023.
Chen Qian, Wei Liu, Hongzhang Liu, Nuo Chen, Yufan Dang, Jiahao Li, Cheng Yang, Weize Chen,
Yusheng Su, Xin Cong, et al. Chatdev: Communicative agents for software development. In Pro-
ceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume
1: Long Papers), pp. 15174–15186, 2024.
Yixiang Ruan, Chenyin Lu, Ning Xu, Jian Zhang, Jun Xuan, Jianzhang Pan, Qun Fang, Hanyu Gao,
Xiaodong Shen, Ning Ye, et al. Accelerated end-to-end chemical synthesis development with
large language models. doi:10.26434/chemrxiv-2024-6wmg4, 20

--------------------------------------------------------------------------------
ction
PubChem Kim et al. (2016)
Chemistry
Molecular feature extraction
Mol-Instructions Fang et al.
(2023)
Biology/Chemistry
Protein
and
biomolecular-
related tasks
MPcules Spotte-Smith et al.
(2023)
Materials Science

# 9. RAG pipeline

In [29]:
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_openai import ChatOpenAI
from base64 import b64decode
from base64 import b64decode
import imghdr

def parse_docs(docs):
    """
    Return {"images": [(b64, mime), ...], "texts": [str, ...]}
    If base64‑decoding succeeds, we treat it as an image and
    guess its mime ('png' / 'jpeg'); otherwise it's text.
    """
    images, texts = [], []
    for d in docs:
        try:
            raw = b64decode(d)
        except Exception:
            texts.append(d)                # not base64 → plain text
            continue

        # Guess the format from the first bytes
        kind = imghdr.what(None, h=raw)    # 'png', 'jpeg', etc.
        if kind in ("png", "jpeg", "gif", "webp"):
            images.append((d, kind))
        else:                              # something weird → treat as text
            texts.append(d)
    return {"images": images, "texts": texts}


def build_prompt(kwargs):
    ctx = kwargs["context"]
    question = kwargs["question"]

    # 2a. Add textual context
    context_text = "\n".join(ctx["texts"])

    prompt_template = f"""
Answer the question using **only** the following context (text, tables,
and any images provided). If the images are irrelevant, ignore them.

Context:
{context_text}

Question: {question}
"""
    prompt_content = [{"type": "text", "text": prompt_template}]

    # 2b. Attach images with the correct mime
    for b64_str, mime in ctx["images"]:
        prompt_content.append({
            "type": "image_url",
            "image_url": {"url": f"data:image/{mime};base64,{b64_str}"}
        })

    return ChatPromptTemplate.from_messages(
        [HumanMessage(content=prompt_content)]
    )

chain = (
    {
        "context": retriever | RunnableLambda(parse_docs),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(build_prompt)
    | ChatOpenAI(model="gpt-4o")
    | StrOutputParser()
)

chain_with_sources = {
    "context": retriever | RunnableLambda(parse_docs),
    "question": RunnablePassthrough(),
} | RunnablePassthrough().assign(
    response=(
        RunnableLambda(build_prompt)
        | ChatOpenAI(model="gpt-4o")
        | StrOutputParser()
    )
)

In [32]:
response = chain.invoke("How many Allrgies they do have?")
print(response)

The difference between single and multi-agent systems lies in their structure and application suitability:

- **Single Agent Systems**: These involve an individual agent capable of achieving its goals independently without relying on assistance or feedback from other AI agents, even if multiple agents are present in the environment. They are ideal for well-defined problems where user feedback is not necessary. A single agent system can have an LLM backbone and is able to perform tasks such as reasoning, planning, and tool execution independently.

- **Multi-Agent Systems**: These systems comprise two or more agents interacting with each other. They are inspired by the theory where smaller agents with specific functions interact to create intelligence. Multi-agent systems require interoperability for communication and information sharing and are suited for jobs that require collaboration across multiple domains, where each agent is an expert in a particular area. Multi-agent systems are

In [33]:
# Utility: show a base‑64 encoded image inline (works in Jupyter / VS Code)
from IPython.display import Image, display
import base64

def display_base64_image(img):
    """
    Accepts either a raw base64 string *or* a (b64, mime) tuple,
    decodes it, and renders the image inline.
    """
    if isinstance(img, tuple):         # the parser may return (b64, mime)
        img_b64, _mime = img
    else:
        img_b64 = img

    display(Image(data=base64.b64decode(img_b64)))

In [35]:
response = chain_with_sources.invoke(
    "What's there Surgical History?"
)

print("Response:", response['response'])

print("\n\nContext:")

Response: The AI agent frameworks for scientific discovery are categorized into autonomous and collaborative frameworks. These frameworks are involved in various domains including materials science, general science, and machine learning. They automate tasks such as hypothesis generation, experiment design, data analysis, and literature review. This automation helps accelerate scientific discoveries, reduce costs, and democratize access to research tools. The systems are designed to collaborate with researchers, generating novel ideas and handling repetitive tasks. AI agents are particularly impactful in biology, where they assist in areas like genomics, drug discovery, and synthetic biology, integrating with laboratory tools to enhance research accuracy and reproducibility.


Context:
