In [1]:
!pip install langchain-text-splitters langchain pymupdf langchain_ollama psycopg2-binary pgvector pulsar-client 

Collecting langchain-text-splitters
  Using cached langchain_text_splitters-0.3.7-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain
  Using cached langchain-0.3.21-py3-none-any.whl.metadata (7.8 kB)
Collecting pymupdf
  Using cached pymupdf-1.25.4-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting langchain_ollama
  Using cached langchain_ollama-0.3.0-py3-none-any.whl.metadata (1.5 kB)
Collecting psycopg2-binary
  Using cached psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pgvector
  Using cached pgvector-0.4.0-py3-none-any.whl.metadata (17 kB)
Collecting pulsar-client
  Using cached pulsar_client-3.6.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Collecting langchain-core<1.0.0,>=0.3.45 (from langchain-text-splitters)
  Downloading langchain_core-0.3.49-py3-none-any.whl.metadata (5.9 kB)
Collecting langsmith<0.4,>=0.1.17 (from langchain)
  Downloadin

In [1]:
import hashlib
import io
import multiprocessing
import os
import signal
import time
from io import BytesIO
from multiprocessing import Process

import psycopg2
import pulsar
import pymupdf  # PyMuPDF
import requests
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from psycopg2.extras import execute_values
from pulsar.schema import JsonSchema

from pdf_schema import PDFInfo
#import logging
#logging.getLogger("pulsar").setLevel(logging.ERROR)

In [8]:
# Parameterised INSERT statements, one per Postgres table.
sources_insert = (
    "INSERT INTO sources (id, uri, title, author, summary) "
    "VALUES (%s, %s, %s, %s, %s)"
)
# Commented-out alternative that also carried title/summary embeddings:
#"""INSERT INTO sources (id, uri, title, summary, title_embedding, summary_embedding)
#VALUES (%s, %s, %s, %s, %s, %s)"""
semantic_insert = (
    "INSERT INTO semantic_search (id, page, chunk, embedding) "
    "VALUES (%s, %s, %s, %s)"
)
# The page text is converted to a tsvector by Postgres at insert time.
keyword_insert = (
    "INSERT INTO keyword_search (id, page, ts) "
    "VALUES (%s, %s, to_tsvector('english', %s))"
)

def pdf_retrieve(url, timeout=(10, 60)):
    """Download a PDF over HTTP and open it with PyMuPDF.

    Args:
        url: Address of the PDF to fetch.
        timeout: Forwarded to ``requests.get`` as its ``timeout`` argument
            (``(connect, read)`` seconds). Without it a stalled server would
            block the calling worker indefinitely.

    Returns:
        The document object produced by ``pymupdf.open``.

    Raises:
        Exception: If the server responds with any status other than 200.
        requests.RequestException: On connection failures or timeouts.
    """
    # Using the response as a context manager releases the streamed
    # connection on every path, including the error ones.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF; received HTTP {response.status_code} from underlying server")
        with BytesIO() as stream_buffer:
            for chunk in response.iter_content(chunk_size=io.DEFAULT_BUFFER_SIZE):
                stream_buffer.write(chunk)
            # Take a bytes copy so the document never depends on a buffer
            # that is closed by the time this function returns.
            pdf_bytes = stream_buffer.getvalue()
    return pymupdf.open(stream=pdf_bytes, filetype="pdf")

def _extract_pdf_rows(pdf_doc, key, text_splitter, embed):
    """Build the keyword-search and semantic-search rows for each page of a PDF.

    Returns:
        A ``(key_data, sem_data)`` pair of tuple lists, shaped for the
        ``keyword_search`` and ``semantic_search`` inserts respectively.
    """
    key_data = []
    sem_data = []
    for page in pdf_doc:
        # Extract once per page; NUL bytes are stripped because Postgres
        # rejects them in text values.
        page_text = page.get_text().replace('\x00', '')
        key_data.append((key, page.number, page_text))
        chunks = text_splitter.split_text(page_text)
        if not chunks:
            # Blank page: nothing to embed, skip the embedding round-trip.
            continue
        embeddings = embed.embed_documents(chunks)
        # NOTE(review): the `chunk` column receives the chunk's index within
        # the page, not the chunk text - confirm this matches the table schema.
        for chunk_id, (_chunk, embedding) in enumerate(zip(chunks, embeddings)):
            sem_data.append((key, page.number, chunk_id, embedding))
    return key_data, sem_data


def _store_pdf(con, pdf_json, key, key_data, sem_data):
    """Insert the source row and its search rows as a single transaction."""
    # `with con` commits when the block succeeds and rolls back when it
    # raises, so a document is never left half-inserted.
    with con, con.cursor() as cur:
        cur.execute(sources_insert, (key, pdf_json.url, pdf_json.title, pdf_json.authors, pdf_json.summary))
        execute_values(cur,
                       "INSERT INTO semantic_search (id, page, chunk, embedding) VALUES %s",
                       sem_data)
        execute_values(cur,
                       "INSERT INTO keyword_search (id, page, ts) VALUES %s",
                       key_data,
                       template="(%s, %s, to_tsvector('english', %s))")


def thread_task(shutdown,db,user,password,host,chunk_size=500, chunk_overlap=50):
    """Worker loop: consume PDF messages from Pulsar and index them in Postgres.

    Despite the name, the caller runs this as the target of a
    ``multiprocessing.Process``. Each worker owns its own database connection,
    Pulsar client/consumer, embedding client and text splitter.

    For every message the worker downloads the PDF, builds per-page keyword
    rows and per-chunk embedding rows, and inserts them together with the
    source row. Message settlement:

    * success -> acknowledged after the rows are committed;
    * per-document failure (download / parse / embed) -> logged and
      acknowledged, i.e. the document is dropped;
    * database error -> negatively acknowledged so the broker redelivers it,
      and the worker exits its loop;
    * any other failure -> logged and negatively acknowledged.

    Args:
        shutdown: Event-like object; the loop ends once ``is_set()`` is true.
        db, user, password, host: Postgres connection parameters.
        chunk_size: Chunk size passed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.
    """
    con = psycopg2.connect(dbname=db, user=user, password=password, host=host)  # Dedicated connection per worker
    pulcli = None
    try:
        # Each document is written inside its own short transaction (see
        # _store_pdf), which is committed or rolled back immediately, so no
        # transaction - and therefore no lock - stays open between messages.
        con.autocommit = False
        pulcli = pulsar.Client('pulsar://pdf-service:6650')
        consumer = pulcli.subscribe(
            topic='scraper-output-test',
            subscription_name="pdf-process-consumer",
            consumer_type=pulsar.ConsumerType.Shared,
            schema=JsonSchema(PDFInfo) )
        # Embeddings connection for each worker
        embed = OllamaEmbeddings(
            model="all-minilm",
            base_url ="http://host.docker.internal:11434"
        )
        # LangChain's Recursive Text Splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],  # Prioritize logical breaks
        )
        while not shutdown.is_set():
            try:
                # The timeout keeps the loop from blocking forever so the
                # shutdown flag is re-evaluated every few seconds.
                msg = consumer.receive(timeout_millis=5000)
            except Exception:
                continue
            try:
                pdf_json = msg.value()
                key = hashlib.md5(pdf_json.url.encode()).hexdigest()
                try:
                    pdf_doc = pdf_retrieve(pdf_json.url)
                    try:
                        print(f"Extracted pdf: {pdf_json.title}, number of pages: {len(pdf_doc)}, hash: {key}")
                        key_data, sem_data = _extract_pdf_rows(pdf_doc, key, text_splitter, embed)
                    finally:
                        pdf_doc.close()
                except Exception as e:
                    print(f"Error downloading PDF from url: {pdf_json.url}: {e}")
                    # Drop the document so a bad URL is not redelivered forever.
                    consumer.acknowledge(msg)
                    continue
                try:
                    _store_pdf(con, pdf_json, key, key_data, sem_data)
                except psycopg2.Error as e:
                    print(f"Database throwing errors, need to recreate connection: {e}")
                    # Nothing was committed; hand the message back to the broker.
                    consumer.negative_acknowledge(msg)
                    break
                # Acknowledge only once the rows are safely committed.
                consumer.acknowledge(msg)
                print(f"Completed insertion of pdf: {pdf_json.title}, hash: {key}")
            except Exception as e:
                print(f"Failed to process message: {e}")
                # Message failed to be processed
                consumer.negative_acknowledge(msg)
    finally:
        # Closing the client also closes the consumer created from it.
        if pulcli is not None:
            pulcli.close()
        con.close()

if __name__ == '__main__':
    # Postgres connection parameters. The standard libpq environment
    # variables take precedence so credentials need not live in the notebook;
    # the previous literals remain as fallbacks.
    host = os.environ.get("PGHOST", "pdf-service")
    user = os.environ.get("PGUSER", "postgres")
    password = os.environ.get("PGPASSWORD", "admin")
    db = os.environ.get("PGDATABASE", "search")

    # Create the event before installing the handler that uses it, so an
    # early SIGINT cannot hit an undefined name.
    shutdown = multiprocessing.Event()

    def signal_handler(sig, frame):
        """Ask every worker to finish its current message and exit."""
        print('Interrupt, shutting down gracefully...')
        shutdown.set()

    signal.signal(signal.SIGINT, signal_handler)
    cores = multiprocessing.cpu_count()

    # One worker process per CPU core, all sharing the same shutdown event.
    join_list = []
    for _ in range(cores):
        worker = Process(target=thread_task, args=(shutdown,db,user,password,host,))
        worker.start()
        join_list.append(worker)

    print(f"Running {cores} workers")

    # TODO: supervise the workers. A worker that leaves its loop after a
    # database error is not restarted; this block only waits for all of
    # them to exit.
    for worker in join_list:
        worker.join()
    time.sleep(1)
    print("Completed")
    shutdown.clear()

2025-03-27 21:13:24.148 INFO  [140480507135808] Client:86 | Subscribing on Topic :scraper-output-test
2025-03-27 21:13:24.150 INFO  [140480507135808] ClientConnection:193 | [<none> -> pulsar://pdf-service:6650] Create ClientConnection, timeout=10000
2025-03-27 21:13:24.150 INFO  [140480507135808] ConnectionPool:124 | Created connection for pulsar://pdf-service:6650-pulsar://pdf-service:6650-0
2025-03-27 21:13:24.152 INFO  [140480507135808] Client:86 | Subscribing on Topic :scraper-output-test
2025-03-27 21:13:24.152 INFO  [140480159397440] ClientConnection:410 | [10.1.1.50:55112 -> 10.106.3.58:6650] Connected to broker
2025-03-27 21:13:24.153 INFO  [140480507135808] ClientConnection:193 | [<none> -> pulsar://pdf-service:6650] Create ClientConnection, timeout=10000
2025-03-27 21:13:24.153 INFO  [140480507135808] ConnectionPool:124 | Created connection for pulsar://pdf-service:6650-pulsar://pdf-service:6650-0
2025-03-27 21:13:24.155 INFO  [140480507135808] Client:86 | Subscribing on Topi