In [3]:
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv

# Load variables from ../.env into the process environment.
# NOTE(review): presumably this supplies the OpenAI API key used by the
# OpenAI-backed cells further down — confirm against the .env file.
load_dotenv("../.env")

True

In [4]:
# Sample passage used by the chunking cells below, so the different strategies
# can be compared on the same input.
# Source: the first chapter of SLP (per the original author's note).
# NOTE(review): a few phrases look like extraction artifacts (e.g. "text search
# regular expression strings"); the passage is left verbatim because it is the
# input to every splitter and editing it would change their output.
text = """
## About Regular Expressions
One of the unsung successes in standardization in computer science has been the regular expression (often shortened to **regex**), a language for specifying text search regular expression strings. This practical language is used in every computer language, word processor, and text processing tools like the Unix tools grep or Emacs. Formally, a regular expression is an algebraic notation for characterizing a set of strings. Regular expressions are particularly useful for searching in texts, when we have a **pattern** to search for and a **corpus** of texts to search through. A regular expression search function corpus will search through the corpus, returning all texts that match the pattern. The corpus can be a single document or a collection. For example, the Unix command-line tool grep takes a regular expression and returns every line of the input document that matches the expression.

A search can be designed to return every match on a line, if there are more than one, or just the first match. In the following examples we generally underline the exact part of the pattern that matches the regular expression and show only the first match. We'll show regular expressions delimited by slashes but note that slashes are not part of the regular expressions.

Regular expressions come in many variants. We'll be describing extended regular expressions; different regular expression parsers may only recognize subsets of these, or treat some expressions slightly differently. Using an online regular expression tester is a handy way to test out your expressions and explore these variations.
"""

In [6]:
# 1. Character Text Splitting
print("#### Character Text Splitting ####")

# Manual splitting: cut the text into consecutive fixed-width slices, with no
# regard for word or sentence boundaries (the last slice may be shorter).
chunk_size = 35  # characters per chunk
chunks = [
    text[start:start + chunk_size]
    for start in range(0, len(text), chunk_size)
]

# Wrap every slice in a Document tagged with its source.
documents = [
    Document(page_content=piece, metadata={"source": "slp"})
    for piece in chunks
]
print(documents)

In [7]:
# Automatic Text Splitting
from langchain.text_splitter import CharacterTextSplitter

# Library counterpart of the manual loop: 35-character chunks, no overlap,
# an empty separator, and whitespace at chunk edges left untouched.
text_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=35,
    chunk_overlap=0,
    strip_whitespace=False,
)

documents = text_splitter.create_documents([text])
print(documents)

In [8]:
# 2. Recursive Character Text Splitting
print("#### Recursive Character Text Splitting ####")

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Original author's note: ["\n\n", "\n", " ", ""] 65,450
# NOTE(review): presumably the separators the splitter tries in order, followed
# by the chunk sizes that were experimented with — confirm.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=65, chunk_overlap=0)

print(text_splitter.create_documents([text]))

In [9]:

# 3. Document Specific Splitting
print("#### Document Specific Splitting ####")

# Document Specific Splitting - Markdown
from langchain.text_splitter import MarkdownHeaderTextSplitter

# (markdown prefix, metadata key) pairs for heading levels 1-3, i.e.
# ("#", "Header 1"), ("##", "Header 2"), ("###", "Header 3").
headers_to_split_on = [("#" * level, f"Header {level}") for level in range(1, 4)]

splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

documents = splitter.split_text(text)
print(documents)

In [10]:
# 4. Semantic Chunking
print("#### Semantic Chunking ####")

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

# Percentile - all differences between sentences are calculated, and then any
# difference greater than the X percentile is split.
# Other threshold types to try: "standard_deviation", "interquartile".
# The chunker (and its embeddings client) is constructed exactly once.
text_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)
documents = text_splitter.create_documents([text])
print(documents)

In [11]:
# 5. Agentic Chunking
print("#### Proposition-Based Chunking ####")

# Paper: https://arxiv.org/pdf/2312.06648.pdf

from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub

# Prompt fetched from the LangChain Hub; it is piped into the chat model below.
proposition_prompt = hub.pull("wfh/proposal-indexing")

# define which llm to use for propositional chunking
MODEL = "gpt-4-0125-preview"
llm = ChatOpenAI(model=MODEL)

# prompt -> chat model; invoked with {"input": <paragraph>} by get_propositions.
runnable = proposition_prompt | llm

# Schema handed to the extraction chain: a flat list of sentence strings.
# NOTE(review): documented with `#` comments rather than a class docstring on
# purpose — a docstring on a pydantic model may be surfaced in the schema that
# is sent to the LLM; confirm before adding one.
class Sentences(BaseModel):
    sentences: List[str]

# Extraction
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)


def get_propositions(text):
    """Turn a passage into a list of proposition strings.

    The passage is first sent through ``runnable`` (prompt | chat model); the
    model's reply is then passed to ``extraction_chain`` to pull the sentences
    out as ``Sentences`` objects.

    Args:
        text: The passage (here: one paragraph) to decompose.

    Returns:
        A list of sentence strings. The list is empty when the extraction
        chain returns no ``Sentences`` object.
    """
    runnable_output = runnable.invoke({"input": text}).content
    extracted = extraction_chain.invoke(runnable_output)["text"]
    # Collect the sentences of every returned object instead of indexing [0]:
    # that raised IndexError on an empty result and ignored any object after
    # the first one.
    return [sentence for item in extracted for sentence in item.sentences]

# Split the passage into paragraphs on blank lines. The stripped copy is not
# assigned back to `text`, so the chunking cells above still receive the
# original string if they are re-run after this cell.
paragraphs = text.strip().split("\n\n")

print(paragraphs)

# One LLM round-trip per paragraph; all propositions end up in a single flat list.
text_propositions = []
for i, para in enumerate(paragraphs):
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print(f"Done with {i}")

print(text_propositions)
print(f"You have {len(text_propositions)} propositions")

print("#### Agentic Chunking ####")

from agenticchunker import AgenticChunker

# Feed every proposition to the chunker, which groups them into chunks.
ac = AgenticChunker()
ac.add_propositions(text_propositions)
# NOTE(review): if pretty_print_chunks() prints on its own and returns None,
# this outer print() additionally emits "None" — confirm against agenticchunker.
print(ac.pretty_print_chunks())

chunks = ac.get_chunks(get_type='list_of_strings')
print(chunks)

# Wrap each chunk string in a Document so it can be indexed by the RAG step.
documents = [
    Document(page_content=chunk_text, metadata={"source": "local"})
    for chunk_text in chunks
]

print(documents)

In [12]:
# Settings for the local Ollama chat model used by rag().
# Note: this rebinds MODEL, which the proposition cell set to an OpenAI model name.
MODEL = "mistral"
TOP_P = 0.9
TOP_K = 100
MAX_TOKENS = 4096  # handed to ChatOllama as num_ctx

local_llm = ChatOllama(model=MODEL, top_p=TOP_P, top_k=TOP_K, num_ctx=MAX_TOKENS)

# RAG
def rag(
    chunks,
    collection_name,
    question="What are Regular Expressions? Give me some ways I can use regular expressions?",
):
    """Index ``chunks`` in a Chroma collection and answer ``question`` from them.

    Args:
        chunks: The Documents to embed and index. The function indexes exactly
            what it is given and does not read the notebook-level ``documents``.
        collection_name: Passed to Chroma as the name of the collection.
        question: The query used for both the retrieval preview and the final
            answer. Defaults to the regular-expression question this notebook
            has always asked.

    Returns:
        None. The retrieved documents and the model's answer are printed.
    """
    vectorstore = Chroma.from_documents(
        documents=chunks,
        collection_name=collection_name,
        embedding=OpenAIEmbeddings(),
    )
    retriever = vectorstore.as_retriever()

    # Preview what the retriever returns for the question before the LLM sees it.
    print(retriever.invoke(question))

    prompt_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(prompt_template)

    # The retriever fills {context}; the raw question is passed through unchanged.
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | local_llm
        | StrOutputParser()
    )
    result = chain.invoke(question)
    print(result)

In [13]:
# Run the RAG demo on `documents`, i.e. whatever the most recently executed
# chunking cell assigned to that name.
rag(documents, collection_name="test-x-3")

Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3


Number of requested results 4 is greater than number of elements in index 3, updating n_results = 3
