In [None]:
# Loader
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from PyPDF2 import PdfReader
import fitz

# Splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Store
from langchain_community.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer, util
from langchain_community.vectorstores import Chroma

# additional libraries
import os
import nltk
from nltk.tokenize import sent_tokenize
import re
import logging

In [None]:
# Download the sentence-tokenizer data used by sent_tokenize.
# 'punkt' serves older NLTK releases; recent releases (3.9+) look for
# 'punkt_tab' instead, so both are fetched to keep the notebook runnable
# on either version.
nltk.download('punkt')
nltk.download('punkt_tab')

## **Vector DB with Langchain chunking**

#### **Load and split PDFs with Langchain**

In [None]:
# Splitter used by load_pdfs_langchain to cut the loaded pages into chunks
# (chunk_size 400, chunk_overlap 100).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=100,
    add_start_index=False,
)

def load_pdfs_langchain(directory_path):
    """Load every PDF in a directory and split it with the notebook's ``text_splitter``.

    Args:
        directory_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.pdf`` (case-insensitive).

    Returns:
        A flat list holding the split documents of all PDFs that could be
        processed, in file-name order. The list is empty when
        ``directory_path`` is not an existing directory. A PDF that fails to
        load or split is logged and skipped.
    """
    chunks = []
    # isdir rather than exists: a path pointing at a regular file passes an
    # existence check and then makes os.listdir raise.
    if not os.path.isdir(directory_path):
        logging.error(f"Directory path does not exist or is not a directory: {directory_path}")
        return chunks

    # Sorted so the chunk order does not depend on the file system's listing order.
    for fn in sorted(os.listdir(directory_path)):
        if not fn.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(directory_path, fn)
        try:
            pages = PyPDFLoader(filepath).load()
            split_text = text_splitter.split_documents(pages)
            chunks.extend(split_text)
            logging.info(f"Processed {fn} with {len(split_text)} chunks.")
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            logging.error(f"Failed to process {fn}: {e}")
    return chunks

In [None]:
# Folder holding the source PDFs; its contents are loaded and split into `chunks`.
directory_path = "/home/ssever/ContraDoc/data/PDFs"
chunks = load_pdfs_langchain(directory_path)

#### **Embedding and storing in vector database**

In [None]:
# Embedding model used to vectorise the chunks
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")

# Embed the LangChain chunks and store them in a Chroma collection that is
# persisted under `destination`
destination = "./vector_store/kb_langchain"
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_function,
    persist_directory=destination,
)

## **Vector DB with custom chunking**

#### **Load and split PDFs with custom chunking**

In [None]:
def chunk_text_by_sentences(text, max_chunk_length=400):
    """Group the sentences of ``text`` into chunks with a one-sentence overlap.

    Sentences (from ``sent_tokenize``) are joined with single spaces until
    adding the next one would push the chunk past ``max_chunk_length``
    characters. A new chunk then starts with the previous sentence repeated as
    overlap, provided the overlap plus the new sentence still fits; otherwise
    the new chunk starts with the new sentence alone.

    Args:
        text: The document text to split.
        max_chunk_length: Upper bound on the chunk length in characters,
            including the spaces that join the sentences.

    Returns:
        List of chunk strings in document order. Empty for empty or
        whitespace-only input. A single sentence longer than
        ``max_chunk_length`` is kept whole and is the only way a chunk can
        exceed the limit.
    """
    # Nothing to tokenize; also avoids handing None to the tokenizer.
    if not text or not text.strip():
        return []

    sentences = sent_tokenize(text)

    chunks = []
    current_chunk = ""
    last_sentence = ""

    for sentence in sentences:
        # Measure the chunk as it would actually be stored, joining space included.
        candidate = f"{current_chunk} {sentence}" if current_chunk else sentence
        if len(candidate) <= max_chunk_length:
            current_chunk = candidate
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            # Carry the previous sentence over only when the result respects the
            # limit; otherwise the overlap would create an oversized chunk that
            # merely repeats the one just emitted.
            overlapped = f"{last_sentence} {sentence}" if last_sentence else sentence
            if len(overlapped) <= max_chunk_length:
                current_chunk = overlapped
            else:
                current_chunk = sentence
        last_sentence = sentence

    # Flush the trailing chunk.
    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks

In [None]:
def load_pdfs_custom_chunks(directory_path):
    """Load every PDF in a directory and chunk its text with ``chunk_text_by_sentences``.

    The page texts of each PDF are joined with single spaces into one string
    before chunking, so chunks may span page boundaries.

    Args:
        directory_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.pdf`` (case-insensitive).

    Returns:
        A flat list of chunk strings from all PDFs that could be processed, in
        file-name order. The list is empty when ``directory_path`` is not an
        existing directory. A PDF that fails to load is logged and skipped.
    """
    all_chunks = []
    # isdir rather than exists: a path pointing at a regular file passes an
    # existence check and then makes os.listdir raise.
    if not os.path.isdir(directory_path):
        logging.error(f"Directory path does not exist or is not a directory: {directory_path}")
        # Return the (empty) accumulator; the name `chunks` is not bound yet here.
        return all_chunks

    # Sorted so the chunk order does not depend on the file system's listing order.
    for fn in sorted(os.listdir(directory_path)):
        if not fn.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(directory_path, fn)
        try:
            pages = PyPDFLoader(filepath).load()
            full_text = ' '.join(page.page_content for page in pages)
            file_chunks = chunk_text_by_sentences(full_text)
            all_chunks.extend(file_chunks)
            # Report this file's contribution, not the running total.
            logging.info(f"Processed {fn} with {len(file_chunks)} chunks.")
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            logging.error(f"Failed to process {fn}: {e}")
    return all_chunks

In [None]:
# Folder holding the source PDFs; its contents are chunked sentence-wise into `chunked_sentences`.
directory_path = "/home/ssever/ContraDoc/data/PDFs"
chunked_sentences = load_pdfs_custom_chunks(directory_path)

In [None]:
# Lightweight document wrapper exposing `page_content` and `metadata`,
# the two attributes this notebook passes on to the vector store.
class Document:
    """Plain container pairing a piece of text with its metadata.

    Attributes:
        page_content: The text of the document or chunk.
        metadata: Dictionary with additional information about the text;
            an empty dictionary when none was given.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # A fresh dict per instance (never None, never shared) so code that
        # reads or updates metadata works without a None check.
        self.metadata = {} if metadata is None else metadata

    def __str__(self):
        return f"Content: {self.page_content}, Metadata: {self.metadata}"

    def __repr__(self):
        # Keeps lists of documents readable when displayed as a cell result.
        return f"Document(page_content={self.page_content!r}, metadata={self.metadata!r})"

In [None]:
# Wrap each chunk string in a Document so it exposes `page_content`
chunk_content = [Document(page_content=text) for text in chunked_sentences]

#### **Embedding and storing into vector database**

In [None]:
# Embedding model used to vectorise the chunks
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")

# Embed the sentence-based chunks and store them in a Chroma collection that
# is persisted under `destination`
destination = "./vector_store/kb_custom_chunks"
db = Chroma.from_documents(
    documents=chunk_content,
    embedding=embedding_function,
    persist_directory=destination,
)

## **Vector DB with custom sentence chunking**

#### **Load and split PDFs with sentence tokenization**

In [None]:
def load_pdfs_custom_sentences(directory_path):
    """Load every PDF in a directory and split its text into single sentences.

    The page texts of each PDF are joined with single spaces into one string
    and then split with ``sent_tokenize``, so a sentence may span a page
    boundary.

    Args:
        directory_path: Directory that is scanned (non-recursively) for files
            whose name ends in ``.pdf`` (case-insensitive).

    Returns:
        A flat list of sentence strings from all PDFs that could be processed,
        in file-name order. The list is empty when ``directory_path`` is not an
        existing directory. A PDF that fails to load is logged and skipped.
    """
    all_sentences = []
    # isdir rather than exists: a path pointing at a regular file passes an
    # existence check and then makes os.listdir raise.
    if not os.path.isdir(directory_path):
        logging.error(f"Directory path does not exist or is not a directory: {directory_path}")
        # Return this function's own (empty) result instead of an unrelated global.
        return all_sentences

    # Sorted so the sentence order does not depend on the file system's listing order.
    for fn in sorted(os.listdir(directory_path)):
        if not fn.lower().endswith(".pdf"):
            continue
        filepath = os.path.join(directory_path, fn)
        try:
            pages = PyPDFLoader(filepath).load()
            full_text = ' '.join(page.page_content for page in pages)
            file_sentences = sent_tokenize(full_text)
            all_sentences.extend(file_sentences)
            # Report this file's contribution, not the running total.
            logging.info(f"Processed {fn} with {len(file_sentences)} sentences.")
        except Exception as e:
            # Best effort: one unreadable PDF must not abort the whole batch.
            logging.error(f"Failed to process {fn}: {e}")
    return all_sentences

In [None]:
# Folder holding the source PDFs; its contents are split into single sentences.
directory_path = "/home/ssever/ContraDoc/data/PDFs"
sentences = load_pdfs_custom_sentences(directory_path)

In [None]:
# Wrap each sentence in a Document so it exposes `page_content`
sentences_content = [Document(page_content=text) for text in sentences]

#### **Embedding and storing into vector database**

In [None]:
# Embedding model used to vectorise the sentences
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L12-v2")

# Embed the single sentences and store them in a Chroma collection that is
# persisted under `destination`
destination = "./vector_store/kb_custom_sentences"
db = Chroma.from_documents(
    documents=sentences_content,
    embedding=embedding_function,
    persist_directory=destination,
)

## **Appendix**

#### **PDF highlighting test**

In [None]:
from fitz.utils import getColorList
# Result of getColorList(), kept for interactive inspection.
# NOTE(review): `cl` is not referenced by any other cell visible in this notebook.
cl = getColorList()

In [None]:
# Highlight every occurrence of one known sentence on a page of the transcript
# and save the result next to the original as "<name>_highlighted.pdf".
pdf_path = "/home/user123/ContraDoc/data/PDFs/Transkript.pdf"

# Adjacent string literals instead of a backslash continuation inside the
# literal: the continuation used to embed the next line's indentation (a long
# run of spaces) in the search text.
needle = (
    "Before we go further, I would really like to thank and recognize our employees, "
    "our customers, our business partners all around "
    "the world for supporting our business in these extraordinary times."
)

# The context manager closes the document even if searching or saving fails.
with fitz.open(pdf_path) as pdf_document:
    page = pdf_document.load_page(1)  # page numbers are 0-based, so this is the second page
    matches = page.search_for(needle)

    # One highlight annotation per matched rectangle.
    for match in matches:
        highlight = page.add_highlight_annot(match)
        highlight.set_colors(stroke=fitz.pdfcolor["skyblue"])
        highlight.update()

    output_pdf_path = os.path.splitext(pdf_path)[0] + "_highlighted.pdf"
    pdf_document.save(output_pdf_path)

#### **Query**

In [None]:
# Fetch the five best-scoring matches for the query from `db`
# (whichever store was assigned to `db` most recently) and show the top one.
query = "Order intake grew 31% in EMEA, 54% in Americas, 15% in  APAC,  and  globally,  orders  were  up  29%,  reaching  87,000 units, and 87% of those units were electrified."
docs = db.similarity_search_with_score(query=query, k=5)
docs[0]

#### **Backup**

In [None]:
# Backup experiment: index a single transcript in a Chroma store created
# without a persist_directory and query it.
loader = PyPDFLoader("/home/user123/ContraDoc/data/PDFs/Transkript.pdf")

pages = loader.load_and_split()

# create the open-source embedding function
# HuggingFaceEmbeddings is imported at the top of the notebook; the
# SentenceTransformerEmbeddings name used here before was never imported and
# raised a NameError.
embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
db2 = Chroma.from_documents(pages, embedding_function)

# query it
query = "Order intake grew 31% in EMEA, 54% in Americas, 15% in  APAC,  and  globally,  orders  were  up  29%,  reaching  87,000 units, and 87% of those units were electrified."
docs = db2.similarity_search(query)

# print results
print(docs[0].page_content)

In [None]:
# Split the transcript into chunks (chunk_size 400, chunk_overlap 50) and index them.
# Note: this rebinds the notebook-wide names `text_splitter`, `loader` and `db`;
# load_pdfs_langchain reads `text_splitter` at call time.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=50,
    add_start_index=False,
)

loader = PyPDFLoader("/home/user123/ContraDoc/data/PDFs/Transkript.pdf").load()
text = text_splitter.split_documents(loader)

db = Chroma.from_documents(documents=text, embedding=embedding_function)

In [None]:
# Query the store currently bound to `db` and print the text of the best match.
query = "Order intake grew 31% in EMEA, 54% in Americas, 15% in  APAC,  and  globally,  orders  were  up  29%,  reaching  87,000 units, and 87% of those units were electrified."
docs = db.similarity_search(query=query)
print(docs[0].page_content)