In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import argparse
import logging
import glob
import json

from utils.ingest import ingest_document
from utils.database_utils import generate_database_and_retriever, populate_database
from utils.summarize import summarize_objects


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
LOGGER = logging.getLogger(__name__)

In [4]:
folder = "./documents"
assert folder is not None and folder != "", "Folder needs to be specified"
LOGGER.info("Parsing all pdf documents in {}".format(folder))
all_documents = glob.glob(f"{folder}/*.pdf")

In [5]:
all_texts = []
all_tables = []
all_images = []

for doc in all_documents:
    LOGGER.info("Parsing document: {}".format(doc))
    text_objs, table_objs, images_objs = ingest_document(doc)
    all_texts.extend(text_objs)
    all_tables.extend(table_objs)
    all_images.extend(images_objs)

In [6]:
all_texts, all_images, all_tables = summarize_objects(all_texts, all_images, all_tables)

100%|██████████| 4/4 [00:25<00:00,  6.44s/it]
100%|██████████| 3/3 [00:12<00:00,  4.02s/it]
100%|██████████| 2/2 [00:12<00:00,  6.35s/it]


In [7]:
data_base = "./localdb"
retriever = generate_database_and_retriever(main_folder=data_base)
retriever = populate_database(retriever, all_texts, all_images, all_tables)

##### Get all documents in the docstore

In [12]:
all_keys = list(retriever.docstore.yield_keys())
all_documents = retriever.docstore.mget(all_keys)
docutments_dic = {all_keys[i]: all_documents[i] for i in range(len(all_keys))}
## Maybe you should implement by batch size


In [13]:
from utils.knowledge_graph import convert_to_graph_elements_pipeline
import asyncio

In [None]:
final_data = await convert_to_graph_elements_pipeline(docutments_dic, max_concurrency=1)

  0%|          | 0/9 [00:00<?, ?it/s]

In [9]:
docutments_dic

{'ad4f23557b79a1869da0347cd4f8532ac1232a6dd9bb6075a2cbcd4efe41c446': b'{"content": "0.4\\n0.2\\n0.0 \\u2022\\n0.4 -\\nLos resultados del  an\\u00e1lisis  anterior  proporcionan una  comprensi\\u00f3n  cualitativa  de  los errores de clasificaci\\u00f3n del modelo. Para cuantificar c\\u00f3mo se realizan las predicciones y, en consecuencia, identificar los casos en los que el modelo podr\\u00eda equivocarse, se ha seguido  la  metodolog\\u00eda  descrita  en  la  secci\\u00f3n  4.3.3  para  entrenar  dos  modelos subrogados  interpretables:  un  modelo  de  regresi\\u00f3n  lineal  y  un  modelo  de  \\u00e1rbol  de decisi\\u00f3n. Estos modelos no est\\u00e1n dise\\u00f1ados para diferenciar entre im\\u00e1genes de control y de pacientes, sino para emular las predicciones del modelo original.\\nAs\\u00ed  pues,  utilizando  como  entrada  las  seis  principales  caracter\\u00edsticas  de  imagen identificadas previamente y como salida las predicciones del modelo original para la clase 

In [10]:
json.loads(
    docutments_dic["e57660b2511d9dea96431d00135106b4482c8f83430bb4b60927a78c5458fb12"]
)

{'content': 'iVBORw0KGgoAAAANSUhEUgAAAYQAAAEpCAIAAAAcXcK0AACrcElEQVR4nOydB1QUV9j3t/cCS1l6L9KbIFURG1bsvcYk9h57SdTE3qKxRGMssWFH0YiKooiigiBIkV4XWJbtvcx8J9z325cXkJhEpc3veDzs7OzMnfafe5/7FDQMwygEBASE9gbT3g1AQEBA+AscchoQEDojWq22sLBQo9G4ubnh8XhU5+ezi5FOp5NIJHw+X6vVMplMQ0NDAoGA6gBotVqFQgE3gkajcTgciUTCYDAfOoSGhgYIgoyNjRkMBhaLbbkaBEFSqbShoUGj0TAYDBaL1UGO9B+h1WqfP3+el5c3duxYIyOjL7x3jUZz7949iUQydOhQJpPZ9spcLjc+Pt7Z2TkiIuKf7kir1SqVSp1Oh0Kh0Gg0FoslEok4XId7NysUinv37qFQqAEDBtBoNP1yGIZfv379448/9u/f38XF5d+JkboREonU8sDz8vL+/PPPmJgYR0dH1Oekvr7+4sWLQUFBwcHBn/fsczicCxcuPHr0iMvlolAoMpns4eExfvz40NBQEomEaleysrLWrVunUCjQaDQGg6FSqYGBgVOnTnVwcGi6Wl1d3fnz5+/duycQCGAYZjAYgwYNmjx5srW1ddPVOBzO5cuXExIS6uvrwZG6u7uPHj06MjKSSCSiOg86ne7PP/+8cOFCREREu4jR+fPnKyoqwsLC/laMampq9uzZM2rUqH8hRu/fv9+xY0dFRQW6ETKZ7O3tPXHiRF9fX1RHQiaTnT59GovFBgcHNxWjoqKigwcPWlpazpgx418/SvHx8ZcvX164cGF4eHizr96+fbt27VpnZ+fPLUY1NTWbNm1auXLl5xWjvLy8VatWvXv3LiQkZNCgQQwGo7q6+unTp7t27dqzZ4+npyeqXeHz+QkJCQMHDgwLC1MqlSUlJYcOHXr58uX+/ftdXFzAOhUVFcuWLUtLSwsPDx8