In [2]:
# Enable autoreload so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import argparse
import logging
import glob
import json

from utils.ingest import ingest_document
from utils.database_utils import generate_database_and_retriever, populate_database
from utils.summarize import summarize_objects


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Module-level logger used by the cells below.
# NOTE(review): no handler or level is configured in the visible cells, so the
# LOGGER.info(...) calls may not display anything — confirm logging is set up
# elsewhere.
LOGGER = logging.getLogger(__name__)

In [5]:
# Folder that holds the PDF documents to ingest.
folder = "./documents"
assert folder is not None and folder != "", "Folder needs to be specified"
# Pass the folder as a logging argument so the message is only formatted when
# the record is actually emitted.
LOGGER.info("Parsing all pdf documents in %s", folder)
# glob returns matches in an unspecified, filesystem-dependent order; sorting
# keeps the ingestion order (and therefore downstream document indices) stable
# across runs and machines.
all_documents = sorted(glob.glob(f"{folder}/*.pdf"))
if not all_documents:
    # Surface the "nothing to ingest" case instead of silently producing
    # empty collections in the following cells.
    LOGGER.warning("No pdf documents found in %s", folder)

In [5]:
# Accumulators for everything extracted from the PDFs, one list per object kind.
all_texts, all_tables, all_images = [], [], []

for doc in all_documents:
    LOGGER.info("Parsing document: {}".format(doc))
    # ingest_document returns three collections per file, unpacked here as
    # texts, tables and images (in that order).
    doc_texts, doc_tables, doc_images = ingest_document(doc)
    all_texts += doc_texts
    all_tables += doc_tables
    all_images += doc_images

In [7]:
# Rebind the three collections to whatever summarize_objects returns for them.
# NOTE(review): arguments are passed as (texts, images, tables), a different
# order from the (texts, tables, images) unpacking used during ingestion —
# confirm this matches summarize_objects' signature.
# NOTE(review): the inputs are rebound to the outputs, so re-running this cell
# feeds already-processed objects back into summarize_objects.
all_texts, all_images, all_tables = summarize_objects(all_texts, all_images, all_tables)

100%|██████████| 4/4 [00:26<00:00,  6.75s/it]
100%|██████████| 3/3 [00:11<00:00,  3.90s/it]
100%|██████████| 2/2 [00:12<00:00,  6.33s/it]


In [6]:
# Folder passed to generate_database_and_retriever as `main_folder`.
data_base = "./localdb"
# NOTE(review): presumably creates or opens the on-disk stores under that
# folder — confirm in utils.database_utils.
retriever = generate_database_and_retriever(main_folder=data_base)

In [9]:
# Hand the retriever and the three object collections to populate_database and
# keep the retriever it returns.
retriever = populate_database(retriever, all_texts, all_images, all_tables)

##### Get all documents in the docstore

In [None]:
# Materialize every key in the retriever's docstore, then fetch all values in
# a single mget call.
all_keys = list(retriever.docstore.yield_keys())
# NOTE(review): this rebinds `all_documents`, which earlier held the PDF paths
# found by glob; re-running the ingestion loop after this cell would iterate
# over docstore records instead of file paths.
all_documents = retriever.docstore.mget(all_keys)

## TODO: fetch the docstore documents in batches instead of loading every key at once


In [14]:
# Inspect the first record returned by the docstore.
all_documents[0]

b'{"content": "0.4\\n0.2\\n0.0 \\u2022\\n0.4 -\\nLos resultados del  an\\u00e1lisis  anterior  proporcionan una  comprensi\\u00f3n  cualitativa  de  los errores de clasificaci\\u00f3n del modelo. Para cuantificar c\\u00f3mo se realizan las predicciones y, en consecuencia, identificar los casos en los que el modelo podr\\u00eda equivocarse, se ha seguido  la  metodolog\\u00eda  descrita  en  la  secci\\u00f3n  4.3.3  para  entrenar  dos  modelos subrogados  interpretables:  un  modelo  de  regresi\\u00f3n  lineal  y  un  modelo  de  \\u00e1rbol  de decisi\\u00f3n. Estos modelos no est\\u00e1n dise\\u00f1ados para diferenciar entre im\\u00e1genes de control y de pacientes, sino para emular las predicciones del modelo original.\\nAs\\u00ed  pues,  utilizando  como  entrada  las  seis  principales  caracter\\u00edsticas  de  imagen identificadas previamente y como salida las predicciones del modelo original para la clase paciente se han entrenado ambos modelos subrogados. La curva ROC resu

In [22]:
import os
from typing import List
from pydantic import BaseModel, Field
from langchain_ollama import ChatOllama
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import PydanticOutputParser


# 1. Define the Schema (Improved version)
# One directed edge of the extracted knowledge graph: head -[relation]-> tail,
# plus a type label for each endpoint and a confidence score.
# NOTE(review): the Field descriptions are presumably surfaced to the LLM
# through parser.get_format_instructions(), so treat them as prompt text —
# confirm before rewording any of them.
class Relationship(BaseModel):
    # Subject entity of the relationship.
    head: str = Field(
        description="Normalized name of the subject (e.g., 'Microsoft') in or translated to english language"
    )
    # Category label for the subject entity.
    head_type: str = Field(
        description="Entity type (e.g., ORGANIZATION, PERSON) in or translated to english language"
    )
    # Verb that links head to tail.
    relation: str = Field(
        description="Relationship verb in CONSTANT_CASE (e.g., FOUNDED_BY) in or translated to english language"
    )
    # Object entity of the relationship.
    tail: str = Field(
        description="Normalized name of the object in or translated to english language"
    )
    # Category label for the object entity.
    tail_type: str = Field(
        description="Entity type of the object in or translated to english language"
    )
    # NOTE(review): the 0.0-1.0 range is only stated in the description; no
    # validation constraint is declared on this field.
    confidence: float = Field(description="Confidence score between 0.0 and 1.0")


# Top-level schema given to PydanticOutputParser: a list of Relationship edges.
# NOTE(review): pydantic may include the class docstring in the generated JSON
# schema, which would make it part of the format instructions sent to the
# LLM — confirm before rewording it.
class KnowledgeGraph(BaseModel):
    """Encapsulates multiple relationships extracted from a text block."""

    # Every relationship extracted from one block of text.
    relationships: List[Relationship]


# 2. Setup the LLM and Parser
# Use a model with high reasoning capabilities for extraction; the parser is
# bound to the KnowledgeGraph schema.
llm = ChatOllama(model="gemma3:12b", temperature=0)
parser = PydanticOutputParser(pydantic_object=KnowledgeGraph)

# 3. Create the Extraction Prompt
# {format_instructions} is filled in once below via .partial(); {text} stays
# open and is supplied on every invocation.
system_message = (
    "You are an expert knowledge graph engineer. "
    "Extract all significant entities and their relationships from the provided text. "
    "Focus on accuracy and normalize entity names (e.g., 'Jeff Bezos' instead of 'Bezos').\n"
    "Even if the text is in spanish, the output should be in english.\n"
    "{format_instructions}"
)
human_message = "Text to analyze: {text}"

prompt = ChatPromptTemplate.from_messages(
    [("system", system_message), ("human", human_message)]
).partial(format_instructions=parser.get_format_instructions())


def extract_graph_from_retriever(retriever, chain=None):
    """Extract knowledge-graph relationships from every text record in the docstore.

    Each docstore value is expected to be a JSON object (bytes or str) with a
    ``type`` and a ``content`` entry. Only records whose ``type`` is ``"text"``
    and whose content is non-empty are sent to the extraction chain; everything
    else is reported and skipped. Failures on a single record are printed and
    do not stop the remaining records from being processed.

    Args:
        retriever: Object exposing ``docstore.yield_keys()`` and
            ``docstore.mget(keys)``.
        chain: Optional object with an ``invoke({"text": ...})`` method whose
            result has a ``relationships`` attribute. When omitted, the
            notebook-level ``prompt | llm | parser`` pipeline is used, which
            keeps existing single-argument calls working unchanged.

    Returns:
        A flat list with the relationships extracted from all processed
        records, in the order the docstore returned them.
    """
    # A. Retrieve ALL documents from the docstore
    print("Fetching documents from docstore...")
    all_keys = list(retriever.docstore.yield_keys())
    all_raw_data = retriever.docstore.mget(all_keys)

    # B. Build the default pipeline only when the caller did not inject one,
    # so the function no longer depends on notebook globals in that case.
    if chain is None:
        chain = prompt | llm | parser

    full_graph = []

    # C. Process each document
    for i, data in enumerate(all_raw_data):
        # A store may hand back None for a key that has no value; report it
        # explicitly instead of failing on `.decode`.
        if data is None:
            print(f"Skipping doc {i}: No data stored for this key.")
            continue

        try:
            # Decode bytes to string (values that are already str pass through)
            if isinstance(data, (bytes, bytearray)):
                decoded_string = data.decode("utf-8")
            else:
                decoded_string = data
            # Parse the JSON string
            json_data = json.loads(decoded_string)
            type_ = json_data.get("type")
            content = json_data.get("content")

            if type_ != "text":
                print(f"Skipping doc {i}: Document type not implemented.")
                continue

            # Do not spend an LLM call on a record that has nothing to
            # analyze; the model would only be able to invent relationships.
            if content is None or (isinstance(content, str) and not content.strip()):
                print(f"Skipping doc {i}: Empty content.")
                continue

            print(f"Processing doc {i}")
            response = chain.invoke({"text": content})
            full_graph.extend(response.relationships)

        except Exception as e:
            # Deliberately best-effort: one malformed record or failed
            # extraction must not abort the whole run.
            print(f"Error processing document {i}: {e}")

    return full_graph


# Run the extraction over the retriever's docstore; only records of type
# "text" are sent to the LLM chain.
extracted_relationships = extract_graph_from_retriever(retriever)

Fetching documents from docstore...
Processing doc 0
Processing doc 1
Skipping doc 2: Document type not implemented.
Processing doc 3
Processing doc 4
Skipping doc 5: Document type not implemented.
Skipping doc 6: Document type not implemented.
Skipping doc 7: Document type not implemented.
Skipping doc 8: Document type not implemented.


In [23]:
# Display the relationships collected by extract_graph_from_retriever.
extracted_relationships

[Relationship(head='analysis', head_type='ACTIVITY', relation='PROVIDES', tail='qualitative understanding', tail_type='CONCEPT', confidence=0.95),
 Relationship(head='model', head_type='ARTIFACT', relation='HAS_ERROR', tail='classification errors', tail_type='CONCEPT', confidence=0.85),
 Relationship(head='methodology', head_type='ACTIVITY', relation='DESCRIBED_IN', tail='section 4.3.3', tail_type='DOCUMENT_SECTION', confidence=0.9),
 Relationship(head='linear regression model', head_type='ARTIFACT', relation='IS_A', tail='subrogated model', tail_type='ARTIFACT', confidence=0.92),
 Relationship(head='decision tree model', head_type='ARTIFACT', relation='IS_A', tail='subrogated model', tail_type='ARTIFACT', confidence=0.92),
 Relationship(head='subrogated model', head_type='ARTIFACT', relation='DOES_NOT_DIFFERENTIATE', tail='images of control and patients', tail_type='ARTIFACT', confidence=0.88),
 Relationship(head='subrogated model', head_type='ARTIFACT', relation='EMULATES', tail='ori

In [18]:
import json

In [7]:
# For each chunk you need to extract the Graph:
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_ollama import ChatOllama
from langchain_core.documents import Document

# NOTE(review): rebinds `llm` with the same model and temperature used where
# the extraction prompt is set up.
llm = ChatOllama(temperature=0, model="gemma3:12b")
# Transformer built on that model; its aconvert_to_graph_documents method is
# called in a later cell.
graph_transformer = LLMGraphTransformer(llm=llm)


In [8]:
# Display the retriever to check which vectorstore and docstore back it.
retriever

MultiVectorRetriever(vectorstore=<langchain_chroma.vectorstores.Chroma object at 0x1333d5730>, docstore=<langchain_classic.storage.file_system.LocalFileStore object at 0x1333d5280>, search_kwargs={})

TypeError: Document.__init__() missing 1 required positional argument: 'page_content'

In [None]:
graph_documents = await graph_transformer.aconvert_to_graph_documents(documents)


'Curva ROC para el modelo Regresión Lineal\n1.0\nVIU\n0.8\n0.6 -\nCurva ROC para el modelo Árbol de Decisión\n1.0\n0.8 -'

In [1]:
# Display the `documents` list that is passed to aconvert_to_graph_documents.
documents

NameError: name 'documents' is not defined

In [None]:
# NOTE(review): scratch fragment — `document`, `self` and `config` are not
# defined in any visible cell, so this cell cannot run as written at notebook
# top level. Confirm whether it should be removed or turned into a function.
text = document.page_content
raw_schema = await self.chain.ainvoke({"input": text}, config=config)