In [2]:
# (Jina API token redacted — never commit credentials to a notebook; load them from an environment variable instead.)

In [6]:
#!pip install -U chromadb langchain llama-index langchain_experimental langchain_openai


## IMPORT LIBRARIES

In [21]:
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter, MarkdownTextSplitter, PythonCodeTextSplitter
from langchain.schema import Document
from PyPDF2 import PdfReader
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
from langchain.schema import Document


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


## RAG

In [5]:
# Chat model that rag() places in its chain, right after the prompt, to generate the answer.
# NOTE(review): the cell output flags this line with a warning (message text not shown) —
# presumably a deprecation notice for the langchain_community ChatOllama class; confirm
# before upgrading dependencies.
local_llm = ChatOllama(model="mistral")

# RAG
def rag(chunks, collection_name, question="What is the use of Text Splitting?"):
    """Index ``chunks`` in a Chroma collection and answer ``question`` from them.

    Args:
        chunks: Documents to embed and store. These are the documents that get
            indexed; the function no longer reads the notebook-level
            ``documents`` variable, so its result does not depend on which
            splitting cell happened to run last.
        collection_name: Name of the Chroma collection to create or reuse.
        question: Question passed through the retrieval chain. Defaults to the
            question the notebook originally hard-coded.

    Returns:
        The answer string produced by the chain (it is also printed).
    """
    vectorstore = Chroma.from_documents(
        documents=chunks,
        collection_name=collection_name,
        embedding=embeddings.ollama.OllamaEmbeddings(model='nomic-embed-text'),
    )
    retriever = vectorstore.as_retriever()

    prompt_template = """Answer the question based only on the following context:
    {context}
    Question: {question}
    """
    prompt = ChatPromptTemplate.from_template(prompt_template)

    # The retriever fills {context}; the raw question is passed through unchanged.
    chain = (
        {"context": retriever, "question": RunnablePassthrough()}
        | prompt
        | local_llm
        | StrOutputParser()
    )
    result = chain.invoke(question)
    print(result)
    return result

  local_llm = ChatOllama(model="mistral")


## Manual Text Splitting

In [10]:
# Read the paper and concatenate the text of every page (no separator between pages).
pdf_file_path = "NIPS-2017-attention-is-all-you-need-Paper.pdf"
reader = PdfReader(pdf_file_path)
extracted_text = "".join(page.extract_text() for page in reader.pages)

# 1. Manual Splitting: slice the text into consecutive fixed-width windows.
chunk_size = 35  # Characters
chunks = []
for start in range(0, len(extracted_text), chunk_size):
    chunks.append(extracted_text[start:start + chunk_size])

# Wrap every slice in a Document so it can be fed to a vector store.
documents = [
    Document(page_content=piece, metadata={"source": "local"})
    for piece in chunks
]
print("Manual Splitting:", documents)

## Character Text Splitting

In [12]:
# Character splitter configured for 35-character chunks, no overlap, an empty
# separator, and whitespace stripping turned off.
text_splitter = CharacterTextSplitter(
    separator='',
    chunk_size=35,
    chunk_overlap=0,
    strip_whitespace=False,
)
documents = text_splitter.create_documents([extracted_text])
print("Automatic Text Splitting:", documents)

## Recursive Character Splitting

In [14]:
# Recursive splitter with a 65-character chunk size and no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=65,
)
documents = text_splitter.create_documents([extracted_text])
print("Recursive Character Text Splitting:", documents)

## Document Specific Splitter - Markdown

In [17]:
# Small Markdown sample: a header, a subheader, and one line of body text.
markdown_text = (
    "\n"
    "# Example Header\n"
    "## Subheader\n"
    "\n"
    "Some markdown content goes here.\n"
)

# Markdown splitter limited to 40-character chunks with no overlap.
splitter = MarkdownTextSplitter(
    chunk_overlap=0,
    chunk_size=40,
)
markdown_documents = splitter.create_documents([markdown_text])
print("Markdown Splitting:", markdown_documents)

## Document Specific Splitter - Python

In [20]:
# Small Python sample: one class with a single method.
python_text = (
    "\n"
    "class Example:\n"
    "    def example_method(self):\n"
    '        print("This is a sample method.")\n'
)

# Python-code splitter limited to 100-character chunks with no overlap.
python_splitter = PythonCodeTextSplitter(
    chunk_overlap=0,
    chunk_size=100,
)
python_documents = python_splitter.create_documents([python_text])
print("Python Code Splitting:", python_documents)

## Semantic Chunking

In [23]:
import os
from getpass import getpass

# Never hard-code API keys in a notebook: they end up in version control and in
# shared copies. Use the key already present in the environment, and only
# prompt (without echoing the input) when it is missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("Enter your OpenAI API key: ")

In [24]:
print("#### Semantic Chunking ####")
# Build the chunker once. The earlier duplicate construction was overwritten on
# the very next statement and only created a throwaway embeddings client.
# Other breakpoint_threshold_type values noted by the author:
# "standard_deviation", "interquartile".
text_splitter = SemanticChunker(
    OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
)
documents = text_splitter.create_documents([extracted_text])
print("Semantic Chunking Documents:", documents)

## Agentic Chunking

In [34]:
from PyPDF2 import PdfReader
from langchain_openai import ChatOpenAI
from langchain.schema import Document
from typing import List

# Step 1: Extract text from PDF
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    Pages are read in order and their text is joined without any separator.
    """
    pdf_pages = PdfReader(pdf_path).pages
    return "".join(pdf_page.extract_text() for pdf_page in pdf_pages)

# Step 2: Initialize OpenAI model
def initialize_openai_model(model: str = "gpt-3.5-turbo", temperature: float = 0):
    """Create the ChatOpenAI client used for proposition extraction.

    Args:
        model: Name of the OpenAI chat model. Defaults to the model the
            notebook originally hard-coded.
        temperature: Sampling temperature passed to the client. Defaults to 0,
            the value the notebook originally hard-coded.

    Returns:
        A ``ChatOpenAI`` instance configured with the given settings.
    """
    return ChatOpenAI(model=model, temperature=temperature)

# Step 3: Extract propositions using GPT
def extract_propositions(llm, text: str) -> List[str]:
    """Ask the model for the propositions / key ideas contained in ``text``.

    Args:
        llm: Model object exposing ``invoke(prompt)``. The result may be a
            message object with a ``content`` attribute (chat models) or a
            plain string.
        text: Passage to analyse.

    Returns:
        One entry per non-blank line of the model's reply, with surrounding
        whitespace stripped. Blank or empty ``text`` yields an empty list
        without calling the model.
    """
    # Nothing to extract from an empty paragraph; skip the (paid) model call.
    if not text or not text.strip():
        return []

    prompt = f"""
    Extract logical propositions or key ideas from the following text:
    Text: {text}
    Propositions:
    """
    # `predict` is deprecated (the cell output flags that call); `invoke` is
    # the supported entry point.
    response = llm.invoke(prompt)
    content = getattr(response, "content", response)
    if not isinstance(content, str):
        content = str(content)
    return [line.strip() for line in content.split("\n") if line.strip()]

# Step 4: Perform agentic chunking
def perform_agentic_chunking(propositions: List[str], chunk_size: int = 5) -> List[Document]:
    """Bundle consecutive propositions into groups and wrap each group in a Document.

    Every group holds up to ``chunk_size`` propositions joined by single spaces;
    the last group may be shorter. Each Document carries the metadata
    ``{"source": "local"}``.
    """
    documents = []
    for start in range(0, len(propositions), chunk_size):
        merged = " ".join(propositions[start:start + chunk_size])
        documents.append(Document(page_content=merged, metadata={"source": "local"}))
    return documents

# Step 5: Main function
def main(pdf_path: str, max_paragraphs: int = 5, chunk_size: int = 5):
    """Run the agentic-chunking pipeline on one PDF file.

    Steps: extract the PDF text, split it on blank lines, ask the model for
    the propositions of the first ``max_paragraphs`` non-empty paragraphs, then
    group those propositions into documents.

    Args:
        pdf_path: Path of the PDF file to process.
        max_paragraphs: Number of leading non-empty paragraphs sent to the
            model. Defaults to 5, the demonstration limit originally hard-coded.
        chunk_size: Number of propositions grouped into each document.

    Returns:
        The documents produced by ``perform_agentic_chunking``.
    """
    # Extract text from PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    # Initialize GPT model
    llm = initialize_openai_model()

    # Split on blank lines and drop empty fragments so no model call is spent
    # on a paragraph that contains nothing.
    paragraphs = [para for para in pdf_text.split("\n\n") if para.strip()]
    selected = paragraphs[:max_paragraphs]

    text_propositions = []
    for i, para in enumerate(selected):
        propositions = extract_propositions(llm, para)
        text_propositions.extend(propositions)
        # Report progress against the paragraphs actually being processed,
        # not against the whole document.
        print(f"Processed paragraph {i + 1}/{len(selected)}")

    print(f"\nYou have {len(text_propositions)} propositions:")
    print("Sample Propositions:", text_propositions[:10])

    # Perform agentic chunking
    documents = perform_agentic_chunking(text_propositions, chunk_size=chunk_size)

    # Output results
    print(f"\nGenerated {len(documents)} chunks:")
    for doc in documents:
        print(f"- {doc.page_content}")
    return documents

# Run the agentic chunking process
# Entry point: runs the pipeline on the sample paper when this code executes as the
# main module. Note that `documents` rebinds the module-level name that the earlier
# splitting cells also assign.
if __name__ == "__main__":
    # Replace with the path to your PDF file
    pdf_path = "NIPS-2017-attention-is-all-you-need-Paper.pdf"
    documents = main(pdf_path)


  response = llm.predict(prompt)
