In [None]:
!pip install langchain-text-splitters langchain pymupdf langchain_ollama psycopg2-binary pgvector pulsar-client 

In [None]:
from queue import Queue
import multiprocessing
from threading import Thread
from psycopg2 import pool
import time
import hashlib
from io import BytesIO
import io
import requests
import pymupdf  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
import pulsar

In [None]:
# Graceful shutdown: Ctrl-C (SIGINT) drops a marker into `shutdown`;
# code elsewhere treats a non-empty queue as the signal to stop.
import signal
from queue import Queue

shutdown = Queue()


def signal_handler(sig, frame):
    """SIGINT handler: announce the interrupt and flag the shutdown queue.

    Both arguments are supplied by the `signal` module and are unused here.
    """
    print('Interrupt, shutting down gracefully...')
    # The queue is unbounded, so a non-blocking put always succeeds.
    shutdown.put_nowait(True)


signal.signal(signal.SIGINT, signal_handler)

# Record type that incoming Pulsar messages are decoded into; it is passed
# to JsonSchema(...) when the consumer subscribes to the 'scraper-output' topic.
from pulsar.schema import Record, String, JsonSchema
class PDFInfo(Record):
    # Location of the PDF: downloaded with `requests` and MD5-hashed to form
    # the row id used in all three tables.
    url = String()
    # Descriptive fields, stored verbatim in the `sources` table.
    title = String()
    authors = String()
    summary = String()

# Postgres connection settings; these are interpolated into the pool's DSN.
host, user, password, db = "pdf-service", "postgres", "admin", "search"

# Parameterised INSERT statements, one per target table.
# A variant of the `sources` insert that also fills title_embedding and
# summary_embedding columns is not currently used.
sources_insert = (
    "INSERT INTO sources (id, uri, title, author, summary) "
    "VALUES (%s, %s, %s, %s, %s)"
)
semantic_insert = (
    "INSERT INTO semantic_search (id, page, chunk, embedding) "
    "VALUES (%s, %s, %s, %s)"
)
keyword_insert = (
    "INSERT INTO keyword_search (id, page, ts) "
    "VALUES (%s, %s, to_tsvector('english', %s))"
)

def _download_pdf(url, timeout=60):
    """Download *url* and return the response body as bytes.

    Raises an Exception when the server answers with anything but HTTP 200.
    The timeout keeps a stalled server from blocking the worker (and with it
    a clean shutdown) indefinitely.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            raise Exception(f"Failed to download PDF; received HTTP {response.status_code} from underlying server")
        with BytesIO() as stream_buffer:
            for chunk in response.iter_content(chunk_size=io.DEFAULT_BUFFER_SIZE):
                stream_buffer.write(chunk)
            return stream_buffer.getvalue()


def _index_pdf(cur, key, pdf_json, pdf_bytes, embed, text_splitter):
    """Insert the source row plus per-page keyword and embedding rows.

    Runs inside the caller's transaction; the caller commits or rolls back.
    A page that fails is rolled back to a savepoint and skipped, so one bad
    page neither aborts the transaction nor leaves half of its rows behind.
    """
    with pymupdf.open(stream=pdf_bytes, filetype="pdf") as doc:
        cur.execute(sources_insert, (key, pdf_json.url, pdf_json.title, pdf_json.authors, pdf_json.summary))
        for page in doc:
            cur.execute("SAVEPOINT pdf_page")
            try:
                # Postgres text values cannot contain NUL bytes.
                text = page.get_text().replace('\x00', '')
                cur.execute(keyword_insert, (key, page.number, text))
                chunks = text_splitter.split_text(text)
                if chunks:  # blank pages yield no chunks; nothing to embed
                    embeddings = embed.embed_documents(chunks)
                    # NOTE(review): the `chunk` column receives the chunk's
                    # index within the page, not its text - confirm that
                    # matches the table definition.
                    for chunk_id, embedding in enumerate(embeddings):
                        cur.execute(semantic_insert, (key, page.number, chunk_id, embedding))
            except Exception as e:
                cur.execute("ROLLBACK TO SAVEPOINT pdf_page")
                print(f"Error processing page {page.number} of {pdf_json.url}: {e}")
            else:
                cur.execute("RELEASE SAVEPOINT pdf_page")


def thread_task(squeue,con,core,chunk_size=500, chunk_overlap=50):
    """Worker loop: consume PDF descriptors from Pulsar and index them.

    For every message the PDF is downloaded, a row is written to `sources`,
    and for each page a keyword-search row and one embedding row per text
    chunk are written. Each PDF is committed as one transaction; if it fails
    the transaction is rolled back and the error is logged.

    Args:
        squeue: Queue used as a stop flag; the loop runs while it is empty.
        con: psycopg2 connection owned by this worker. It is closed on exit.
        core: Worker index, used to give the Pulsar consumer a unique name.
        chunk_size: Maximum characters per text chunk.
        chunk_overlap: Characters shared between consecutive chunks.
    """
    try:
        # Embeddings connection for each thread
        embed = OllamaEmbeddings(
            model="all-minilm",
            base_url="http://host.docker.internal:11434",
        )
        # LangChain's Recursive Text Splitter
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],  # Prioritize logical breaks
        )
        # Pulsar client and consumer for each thread
        pulcli = pulsar.Client('pulsar://pdf-service:6650')
        try:
            # All workers join ONE Shared subscription so the broker spreads
            # messages across them. A separate subscription per worker would
            # deliver every message to every worker.
            consumer = pulcli.subscribe(
                topic='scraper-output',
                subscription_name="pdf-process-consumer",
                consumer_type=pulsar.ConsumerType.Shared,
                consumer_name=f"pdf-process-consumer-{core}",
                schema=JsonSchema(PDFInfo),
            )
            while squeue.empty():
                try:
                    msg = consumer.receive_timeout(5000)
                except Exception:
                    # Timing out is expected: it lets the loop re-check the
                    # stop flag instead of blocking forever on an idle topic.
                    continue

                try:
                    pdf_json = msg.value()
                    print(f"Processing message: {msg.message_id()}, PDF url: {pdf_json.url}")
                    # Acknowledged before processing (at-most-once): a PDF
                    # that fails below is logged and dropped, not redelivered.
                    consumer.acknowledge(msg)
                except Exception:
                    # Message could not be decoded or acknowledged
                    consumer.negative_acknowledge(msg)
                    continue

                try:
                    key = hashlib.md5(pdf_json.url.encode()).hexdigest()
                    pdf_bytes = _download_pdf(pdf_json.url)
                    with con.cursor() as cur:
                        _index_pdf(cur, key, pdf_json, pdf_bytes, embed, text_splitter)
                    # Leaving the cursor block only closes the cursor; without
                    # an explicit commit the inserts are discarded when the
                    # connection closes.
                    con.commit()
                except Exception as e:
                    con.rollback()
                    print(f"Error processing PDF from url: {pdf_json.url}: {e}")
            consumer.close()
        finally:
            pulcli.close()
    finally:
        con.close()
        

# One worker thread (and one Postgres connection) per CPU core.
cores = multiprocessing.cpu_count()

# Create a connection pool sized so every worker gets its own connection
pg_pool = pool.SimpleConnectionPool(
    minconn=cores,  # Minimum connections
    maxconn=cores,  # Maximum connections
    dsn=f"dbname={db} user={user} password={password} host={host}"
)

# Connections are checked out here, on the main thread, and handed to the
# workers, so the pool itself is never touched concurrently.
join_list = []
for core in range(cores):
    worker = Thread(target=thread_task, args=(shutdown, pg_pool.getconn(), core))
    worker.start()
    join_list.append(worker)

print(f"Running {cores} workers") 

try:
    # Block until every worker has seen the shutdown flag and exited.
    for thread in join_list:
        thread.join()
finally:
    # Release whatever the pool still holds; connections already closed by
    # the workers are simply closed again.
    pg_pool.closeall()

print("Completed")