In [1]:
# ! pip install langchain langchain-chroma "unstructured[all-docs]" pydantic lxml

In [2]:
from typing import Any
import requests
import os
from pydantic import BaseModel
from unstructured.partition.pdf import partition_pdf

# URL of the PDF
url = "https://breast-cancer-research.biomedcentral.com/counter/pdf/10.1186/s13058-025-01973-3.pdf"

# Local file path to save the PDF
local_pdf_path = "temp.pdf"  

# Download the PDF
response = requests.get(url, stream=True)
with open(local_pdf_path, "wb") as pdf_file:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:
            pdf_file.write(chunk)

# Get elements using the local file path
raw_pdf_elements = partition_pdf(
    filename=local_pdf_path,
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path=os.path.dirname(local_pdf_path), # Save images in the same directory
)

# Optional: Remove the temporary PDF file
# os.remove(local_pdf_path)

In [3]:
# Create a dictionary to store counts of each type
category_counts = {}

for element in raw_pdf_elements:
    category = str(type(element))
    if category in category_counts:
        category_counts[category] += 1
    else:
        category_counts[category] = 1

# Unique_categories will have unique elements
# TableChunk if Table > max chars set above
unique_categories = set(category_counts.keys())
category_counts

{"<class 'unstructured.documents.elements.CompositeElement'>": 21}

In [4]:
class Element(BaseModel):
    type: str
    text: Any


# Categorize by type
categorized_elements = []
for element in raw_pdf_elements:
    if "unstructured.documents.elements.Table" in str(type(element)):
        categorized_elements.append(Element(type="table", text=str(element)))
    elif "unstructured.documents.elements.CompositeElement" in str(type(element)):
        categorized_elements.append(Element(type="text", text=str(element)))

# Tables
table_elements = [e for e in categorized_elements if e.type == "table"]
print(len(table_elements))

# Text
text_elements = [e for e in categorized_elements if e.type == "text"]
print(len(text_elements))

0
21


In [5]:
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

In [6]:
# Prompt
prompt_text = """You are an assistant tasked with summarizing tables and text. \
Give a concise summary of the table or text. Table or text chunk: {element} """
prompt = ChatPromptTemplate.from_template(prompt_text)

# Summary chain
model = ChatOllama(base_url="http://127.0.0.1:11434", model="llama3")
summarize_chain = {"element": lambda x: x} | prompt | model | StrOutputParser()

  model = ChatOllama(base_url="http://127.0.0.1:11434", model="llama3")


In [7]:
# Apply to text
texts = [i.text for i in text_elements if i.text != ""]
text_summaries = summarize_chain.batch(texts, {"max_concurrency": 5})

In [8]:
print(text_summaries)

['Here is a concise summary of the text:\n\nThe study investigates the role of SMYD4 in breast cancer progression. Researchers found that SMYD4 acts as a tumor suppressor, downregulating its expression being associated with poor prognosis. The study reveals that SMYD4 promotes ubiquitination and degradation of MYH9 through lysine monomethylation modification, thereby inhibiting WNT signaling pathway. This discovery suggests that SMYD4 is a potential therapeutic target for breast cancer treatment.', 'Here is a concise summary of the text:\n\nThe article discusses the importance of understanding breast cancer, which is still the leading cause of cancer death in females worldwide despite advances in treatment. The disease is heterogeneous and influenced by multiple factors such as genetics, environment, and hormone levels. Recent research has highlighted the critical role of epigenetic reprogramming in breast cancer progression, metastasis, and therapy resistance. The article then focuses

In [9]:
# Apply to tables
tables = [i.text for i in table_elements]
table_summaries = summarize_chain.batch(tables, {"max_concurrency": 5})

In [10]:
import os
import subprocess

# Define the directory containing the images
IMG_DIR = r"C:\Users\sarra\OneDrive\Bureau\PFE\Bootcamp\Challenge 2\Medical-Reasearch-papers\Qa-Bot\figures"  # Change to your actual path

# Get all .jpg images in the directory
images = [f for f in os.listdir(IMG_DIR) if f.endswith(".jpg")]

# Define the path to your LLaVA executable
llava_cmd = r"C:\Users\YourUsername\Desktop\Code\llama.cpp\bin\llava.exe"

# Define the model paths
model_path = r"..\models\llava-7b\ggml-model-q5_k.gguf"
mmproj_path = r"..\models\llava-7b\mmproj-model-f16.gguf"

# Loop through each image
for img in images:
    img_path = os.path.join(IMG_DIR, img)
    base_name = os.path.splitext(img)[0]  # Get filename without extension
    output_file = os.path.join(IMG_DIR, f"{base_name}.txt")

    # Construct the command
    command = [
        llava_cmd,
        "-m", model_path,
        "--mmproj", mmproj_path,
        "--temp", "0.1",
        "-p", "Describe the image in detail. Be specific about graphs, such as bar plots.",
        "--image", img_path
    ]

    # Run the command and save output
    with open(output_file, "w") as f:
        subprocess.run(command, stdout=f, text=True)

    print(f"Processed {img} -> {output_file}")


FileNotFoundError: [WinError 2] Le fichier spécifié est introuvable