In [2]:
# !pip install -qU langchain_experimental langchain_openai langchain_community langchain ragas chromadb langchain-groq fastembed pypdf openai

In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import torch

from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [22]:


class TextChunker:
    """Collection of text-chunking strategies built on LangChain splitters.

    The character-count methods return a list of chunk strings, while
    ``semantic_section_chunking`` returns whatever ``SemanticChunker.create_documents``
    produces (LangChain ``Document`` objects).
    """

    def __init__(self):
        pass

    def _split_to_strings(self, text_splitter, text):
        """Run ``text_splitter`` over ``text`` and return the chunk strings."""
        return [doc.page_content for doc in text_splitter.create_documents([text])]

    def extract_text_from_pdf(self, path = "/"):
        """Load a PDF and return its text together with the loaded documents.

        Args:
            path: Filesystem path of the PDF handed to ``PyPDFLoader``.

        Returns:
            A ``(text, documents)`` tuple. ``documents`` is the list returned by
            ``PyPDFLoader.load()``; ``text`` is every document's ``page_content``
            concatenated, each one preceded by a newline (empty string when the
            loader yields nothing).
        """
        loader = PyPDFLoader(path)
        documents = loader.load()
        # A single join avoids re-copying the growing string once per page.
        text = "".join("\n" + str(doc.page_content) for doc in documents)
        return text, documents

    def char_count_chunking_with_overlap(self, text, chunk_size=200, chunk_overlap=50, Recursive = False):
        """Split ``text`` into size-bounded chunks that share ``chunk_overlap`` characters.

        Args:
            text: The string to split.
            chunk_size: Target maximum chunk length passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks passed to the splitter.
            Recursive: Use ``RecursiveCharacterTextSplitter`` instead of
                ``CharacterTextSplitter`` when true.

        Returns:
            List of chunk strings.
        """
        splitter_cls = RecursiveCharacterTextSplitter if Recursive else CharacterTextSplitter
        text_splitter = splitter_cls(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        return self._split_to_strings(text_splitter, text)

    def char_count_chunking_with_nonoveralp(self, text, chunk_size=200, Recursive = False):
        """Split ``text`` into size-bounded chunks with no overlap between them.

        Args:
            text: The string to split.
            chunk_size: Target maximum chunk length passed to the splitter.
            Recursive: Use ``RecursiveCharacterTextSplitter`` instead of
                ``CharacterTextSplitter`` when true.

        Returns:
            List of chunk strings.
        """
        splitter_cls = RecursiveCharacterTextSplitter if Recursive else CharacterTextSplitter
        # chunk_overlap must be given explicitly: when omitted the splitter falls
        # back to its own non-zero default, so the chunks would still overlap
        # (and a chunk_size below that default would be rejected).
        text_splitter = splitter_cls(chunk_size=chunk_size, chunk_overlap=0)
        return self._split_to_strings(text_splitter, text)

    def char_count_chunking_with_custom_delimiter(self, text, chunk_size=200, chunk_overlap=50, delimiter="\n\n",Recursive = False):
        """Split ``text`` on a caller-supplied delimiter.

        Args:
            text: The string to split.
            chunk_size: Target maximum chunk length passed to the splitter.
            chunk_overlap: Overlap between consecutive chunks passed to the splitter.
            delimiter: Separator string to split on.
            Recursive: Use ``RecursiveCharacterTextSplitter`` when true. The
                delimiter is tried first, then the usual paragraph / line /
                word / character fallbacks for pieces that are still too long.

        Returns:
            List of chunk strings.
        """
        if Recursive:
            # The recursive splitter takes an ordered list via ``separators``;
            # it has no ``separator`` keyword, so the delimiter goes first and
            # the standard fallbacks follow (skipping a duplicate).
            fallbacks = [sep for sep in ("\n\n", "\n", " ", "") if sep != delimiter]
            text_splitter = RecursiveCharacterTextSplitter(
                separators=[delimiter] + fallbacks,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            text_splitter = CharacterTextSplitter(
                separator=delimiter,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        return self._split_to_strings(text_splitter, text)

    def semantic_section_chunking(self, text , text_embedding_model_name,  breakpoint_threshold_type = "percentile"):
        """Chunk text at semantic breakpoints found with an embedding model.

        Args:
            text: Either a single string, or an iterable whose items are strings
                or objects exposing ``page_content`` (e.g. the documents returned
                by ``extract_text_from_pdf``).
            text_embedding_model_name: Model name given to ``FastEmbedEmbeddings``.
            breakpoint_threshold_type: Forwarded to ``SemanticChunker``.

        Returns:
            The documents produced by ``SemanticChunker.create_documents``.
        """
        # Use the argument itself rather than a notebook-level ``documents``
        # variable, so the method works on a fresh kernel and on any input.
        if isinstance(text, str):
            texts = [text]
        else:
            texts = [getattr(item, "page_content", item) for item in text]
        embed_model = FastEmbedEmbeddings(model_name = text_embedding_model_name)
        semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type=breakpoint_threshold_type)
        semantic_chunks = semantic_chunker.create_documents(texts)
        return semantic_chunks



In [24]:
# Source PDF; the path looks like a Colab Google Drive mount, so adjust it when running elsewhere.
pdf_path = r"/content/drive/MyDrive/chunking/ds (1).pdf"

# Build the chunker, then pull the concatenated text and the loaded documents out of the PDF.
text_chunker = TextChunker()
extraction = text_chunker.extract_text_from_pdf(path=pdf_path)
text, documents = extraction

In [20]:
# Character-count chunking: chunk_size 500 with a 40-character overlap.
chunks_char_count = text_chunker.char_count_chunking_with_overlap(
    text,
    chunk_size=500,
    chunk_overlap=40,
)
print("Char count chunking _with_overlap:")
# Number the chunks from 1 for display.
for position, piece in enumerate(chunks_char_count, start=1):
    print(f"Chunk {position}: {piece}")

Char count chunking _with_overlap:
Chunk 1: Mathematical Foundations of Data Sciences
Gabriel Peyr´ e
CNRS & DMA
´Ecole Normale Sup´ erieure
gabriel.peyre@ens.fr
https://mathematical-tours.github.io
www.numerical-tours.com
August 14, 2019
2
Chapter 1
Optimal Transport
1.1 Radon Measures
Measures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat
belongs to the probability simplex
Σndef.={
a∈Rn
+;n∑
i=1ai= 1}
.
A discrete measure with weights aand locations x1,...,xn∈X reads
α=n∑
i=1aiδxi (1.1)
whereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location
x. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive
measure if each of the “weights” described in vector ais positive itself.
Remark 1 (General measures) .A convenient feature of OT is that it can deal with discrete and continuous
“objects” within the same framework. Such objects only need to be model

In [21]:
# Character-count chunking via the non-overlap variant (only chunk_size is supplied).
chunks_char_count = text_chunker.char_count_chunking_with_nonoveralp(
    text,
    chunk_size=500,
)
print("Char count chunking _with_nonoverlap:")
# Number the chunks from 1 for display.
for position, piece in enumerate(chunks_char_count, start=1):
    print(f"Chunk {position}: {piece}")

Char count chunking _with_nonoverlap:
Chunk 1: Mathematical Foundations of Data Sciences
Gabriel Peyr´ e
CNRS & DMA
´Ecole Normale Sup´ erieure
gabriel.peyre@ens.fr
https://mathematical-tours.github.io
www.numerical-tours.com
August 14, 2019
2
Chapter 1
Optimal Transport
1.1 Radon Measures
Measures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat
belongs to the probability simplex
Σndef.={
a∈Rn
+;n∑
i=1ai= 1}
.
A discrete measure with weights aand locations x1,...,xn∈X reads
α=n∑
i=1aiδxi (1.1)
whereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location
x. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive
measure if each of the “weights” described in vector ais positive itself.
Remark 1 (General measures) .A convenient feature of OT is that it can deal with discrete and continuous
“objects” within the same framework. Such objects only need to be mo

In [25]:
# Character-count chunking that splits on a blank line ("\n\n") using the non-recursive splitter.
chunks_char_count = text_chunker.char_count_chunking_with_custom_delimiter(
    text,
    chunk_size=200,
    chunk_overlap=50,
    delimiter="\n\n",
    Recursive=False,
)
print("Char count chunking _with_ _with_custom_delimiter:")
# Number the chunks from 1 for display.
for position, piece in enumerate(chunks_char_count, start=1):
    print(f"Chunk {position}: {piece}")

Char count chunking _with_ _with_custom_delimiter:
Chunk 1: Mathematical Foundations of Data Sciences
Gabriel Peyr´ e
CNRS & DMA
´Ecole Normale Sup´ erieure
gabriel.peyre@ens.fr
https://mathematical-tours.github.io
www.numerical-tours.com
August 14, 2019
2
Chapter 1
Optimal Transport
1.1 Radon Measures
Measures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat
belongs to the probability simplex
Σndef.={
a∈Rn
+;n∑
i=1ai= 1}
.
A discrete measure with weights aand locations x1,...,xn∈X reads
α=n∑
i=1aiδxi (1.1)
whereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location
x. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive
measure if each of the “weights” described in vector ais positive itself.
Remark 1 (General measures) .A convenient feature of OT is that it can deal with discrete and continuous
“objects” within the same framework. Such objects only 

In [None]:
# Embedding model used to locate semantic breakpoints.
text_embedding_model_name = "BAAI/bge-base-en-v1.5"

# Semantic chunking over the documents loaded from the PDF.
semantic_chunks = text_chunker.semantic_section_chunking(
    documents,
    text_embedding_model_name,
    breakpoint_threshold_type="percentile",
)

print("\nsemantic chunking with text embeddings:")
# Number the chunks from 1 for display.
for position, piece in enumerate(semantic_chunks, start=1):
    print(f"Chunk {position}: {piece}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

In [None]:
# Second run of semantic chunking, this time with a different embedding model.
text_embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Semantic chunking over the documents loaded from the PDF.
semantic_chunks = text_chunker.semantic_section_chunking(
    documents,
    text_embedding_model_name,
    breakpoint_threshold_type="percentile",
)

print("\nsemantic chunking with text embeddings:")
# Number the chunks from 1 for display.
for position, piece in enumerate(semantic_chunks, start=1):
    print(f"Chunk {position}: {piece}")