In [2]:

!pip install -qU langchain_experimental langchain_openai langchain_community langchain ragas chromadb langchain-groq fastembed pypdf openai

In [4]:
!pip install transformers -U

Collecting transformers
  Downloading transformers-4.41.0-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.23.0 (from transformers)
  Downloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m401.2/401.2 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19 (from transformers)
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface-hub, tokenizers, transformers
  Attempting uninstall: huggingface-hub
    Found existing installation: huggingface-hub 0.20.3
    Uninstalling huggingface-hub-0.20.3:
      Successfully uninstalled huggingface-hub-0.20.3
  Attempting u

In [21]:


from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
import torch
from transformers import GPT2Tokenizer, GPT2Model
from transformers import BertTokenizer, BertModel
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

class TextChunker:
    def __init__(self, model_name="", tokenizer = "", model = ""):
        # Load pre-trained GPT-2 model and tokenizer
        self.tokenizer = tokenizer #GPT2Tokenizer.from_pretrained(model_name)
        self.model = model #GPT2Model.from_pretrained(model_name)
        self.model_name = model_name

    def extract_text_from_pdf(self, path = "/"):
        text = ""
        loader = PyPDFLoader(path)
        documents = loader.load()
        for doc in documents:
            text = text + "\n" + str(doc.page_content)
        return text,documents

    def char_count_chunking(self, text, chunk_size=200, chunk_overlap=50):
        # Instantiate the RecursiveCharacterTextSplitter class
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        # Create documents using the text splitter
        docs = text_splitter.create_documents([text])
        chunks = [doc.page_content for doc in docs]
        return chunks

    def char_count_chunking_with_custom_delimiter(self, text, chunk_size=200, chunk_overlap=50, delimiter="\n\n"):
        # Instantiate the CharacterTextSplitter class
        text_splitter = CharacterTextSplitter(separator=delimiter, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        # Create documents using the text splitter
        docs = text_splitter.create_documents([text])
        chunks = [doc.page_content for doc in docs]
        return chunks

    def semantic_section_chunking(self, text, max_chunk_size=200, overlap_size=50):
        # Tokenize the text
        input_ids = self.tokenizer.encode(text, return_tensors="pt", add_special_tokens=False)
        # Define the chunking parameters
        chunk_size = max_chunk_size - overlap_size
        stride = chunk_size
        # Perform chunking
        chunks = []
        for i in range(0, input_ids.size(1), stride):
            # Slice the input_ids to form a chunk
            chunk_input_ids = input_ids[:, i:i+chunk_size]
            # Decode the chunk
            chunk_text = self.tokenizer.decode(chunk_input_ids[0], skip_special_tokens=True)
            # Add the chunk to the list
            chunks.append(chunk_text)
        return chunks

    def semantic_section_chunking_with_TextEmbedding(self, text , text_embedding_model_name,  breakpoint_threshold_type = "percentile"):
        embed_model = FastEmbedEmbeddings(model_name = text_embedding_model_name)
        semantic_chunker = SemanticChunker(embed_model, breakpoint_threshold_type=breakpoint_threshold_type)
        semantic_chunks = semantic_chunker.create_documents([d.page_content for d in documents])
        return semantic_chunks



In [None]:
pdf_path =r"/content/drive/MyDrive/chunking/ds (1).pdf"


In [17]:
#using gpt2
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model  = GPT2Model.from_pretrained(model_name)
text_chunker = TextChunker(model_name= model_name, tokenizer = tokenizer, model = model)
text, documents  = text_chunker.extract_text_from_pdf(path = pdf_path )

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=40)
print("Char count chunking:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=0)
print("Char count chunking non overlap:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_custom_delimiter = text_chunker.char_count_chunking_with_custom_delimiter(text, chunk_size=200, chunk_overlap=50, delimiter="@")
print("\nChar count chunking with custom delimiter:")
for i, chunk in enumerate(chunks_custom_delimiter):
    print(f"Chunk {i+1}: {chunk}")


chunks_semantic = text_chunker.semantic_section_chunking(text, max_chunk_size=200, overlap_size=50)
print("\nSemantic section chunking:")
for i, chunk in enumerate(chunks_semantic):
    print(f"Chunk {i+1}: {chunk}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Chunk 2836: lar (??)). Using these notations, Sinkhorn’s
Chunk 2837: Using these notations, Sinkhorn’s iterates read
Chunk 2838: f(ℓ+1)= Minrow
ε(C−1ng(ℓ)T) +εloga, (1.65)
Chunk 2839: ε(C−1ng(ℓ)T) +εloga, (1.65)
g(ℓ+1)= Mincol
Chunk 2840: g(ℓ+1)= Mincol
ε(C−f(ℓ)1mT) +εlogb. (1.66)
Chunk 2841: Note that as ε→0, minεconverges to min, but the
Chunk 2842: as ε→0, minεconverges to min, but the iterations
Chunk 2843: to min, but the iterations do not converge
Chunk 2844: min, but the iterations do not converge anymore
Chunk 2845: the iterations do not converge anymore in the
Chunk 2846: do not converge anymore in the limit ε= 0,
Chunk 2847: because alternate minimization does not converge
Chunk 2848: minimization does not converge for constrained
Chunk 2849: does not converge for constrained problems (which
Chunk 2850: for constrained problems (which is the case for
Chunk 2851: problems (which is the case for the un-
Chunk 2852

Token indices sequence length is longer than the specified maximum sequence length for this model (26209 > 1024). Running this sequence through the model will result in indexing errors



Semantic section chunking:
Chunk 1: 
Mathematical Foundations of Data Sciences
Gabriel Peyr´ e
CNRS & DMA
´Ecole Normale Sup´ erieure
gabriel.peyre@ens.fr
https://mathematical-tours.github.io
www.numerical-tours.com
August 14, 2019
2
Chapter 1
Optimal Transport
1.1 Radon Measures
Measures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat
belongs to the probability simplex
Σndef.={
a∈Rn
+;n∑
i=1ai= 1}
Chunk 2: 
.
A discrete measure with weights aand locations x1,...,xn∈X reads
α=n∑
i=1aiδxi (1.1)
whereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location
x. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive
measure if each of the “weights” described in vector ais positive itself.
Remark 1 (General measures).A convenient feature of OT is that it can deal with discrete and continuous
“objects” within the same framework.
Chunk 3:  Such objects only nee

In [19]:
#using bert
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model  = BertModel.from_pretrained(model_name)
text_chunker = TextChunker(model_name= model_name, tokenizer = tokenizer, model = model)

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=40)
print("Char count chunking:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=0)
print("Char count chunking non overlap:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_custom_delimiter = text_chunker.char_count_chunking_with_custom_delimiter(text, chunk_size=200, chunk_overlap=50, delimiter="@")
print("\nChar count chunking with custom delimiter:")
for i, chunk in enumerate(chunks_custom_delimiter):
    print(f"Chunk {i+1}: {chunk}")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Chunk 2836: lar (??)). Using these notations, Sinkhorn’s
Chunk 2837: Using these notations, Sinkhorn’s iterates read
Chunk 2838: f(ℓ+1)= Minrow
ε(C−1ng(ℓ)T) +εloga, (1.65)
Chunk 2839: ε(C−1ng(ℓ)T) +εloga, (1.65)
g(ℓ+1)= Mincol
Chunk 2840: g(ℓ+1)= Mincol
ε(C−f(ℓ)1mT) +εlogb. (1.66)
Chunk 2841: Note that as ε→0, minεconverges to min, but the
Chunk 2842: as ε→0, minεconverges to min, but the iterations
Chunk 2843: to min, but the iterations do not converge
Chunk 2844: min, but the iterations do not converge anymore
Chunk 2845: the iterations do not converge anymore in the
Chunk 2846: do not converge anymore in the limit ε= 0,
Chunk 2847: because alternate minimization does not converge
Chunk 2848: minimization does not converge for constrained
Chunk 2849: does not converge for constrained problems (which
Chunk 2850: for constrained problems (which is the case for
Chunk 2851: problems (which is the case for the un-
Chunk 2852

In [24]:
#using text embeddings
pdf_path =r"/content/drive/MyDrive/chunking/ds (1).pdf"
text_chunker = TextChunker()
text, documents  = text_chunker.extract_text_from_pdf(path = pdf_path )

text_embedding_model_name = "BAAI/bge-base-en-v1.5"

semantic_chunks = text_chunker.semantic_section_chunking_with_TextEmbedding( documents , text_embedding_model_name,  breakpoint_threshold_type = "percentile")


print("\nsemantic chunking with text embeddings:")
for i, chunk in enumerate(semantic_chunks):
    print(f"Chunk {i+1}: {chunk}")


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

model_optimized.onnx:   0%|          | 0.00/218M [00:00<?, ?B/s]

[Document(page_content='Mathematical Foundations of Data Sciences\nGabriel Peyr´ e\nCNRS & DMA\n´Ecole Normale Sup´ erieure\ngabriel.peyre@ens.fr\nhttps://mathematical-tours.github.io\nwww.numerical-tours.com\nAugust 14, 2019'),
 Document(page_content='2'),
 Document(page_content='Chapter 1\nOptimal Transport\n1.1 Radon Measures\nMeasures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat\nbelongs to the probability simplex\nΣndef.={\na∈Rn\n+;n∑\ni=1ai= 1}\n. A discrete measure with weights aand locations x1,...,xn∈X reads\nα=n∑\ni=1aiδxi (1.1)\nwhereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location\nx. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive\nmeasure if each of the “weights” described in vector ais positive itself. Remark 1 (General measures) .A convenient feature of OT is that it can deal with discrete and continuous\n“objects” withi

In [26]:
#using text embeddings

text_embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"

semantic_chunks = text_chunker.semantic_section_chunking_with_TextEmbedding( documents , text_embedding_model_name,  breakpoint_threshold_type = "percentile")


print("\nsemantic chunking with text embeddings:")
for i, chunk in enumerate(semantic_chunks):
    print(f"Chunk {i+1}: {chunk}")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]


semantic chunking with text embeddings:
Chunk 1: page_content='Mathematical Foundations of Data Sciences\nGabriel Peyr´ e\nCNRS & DMA\n´Ecole Normale Sup´ erieure\ngabriel.peyre@ens.fr\nhttps://mathematical-tours.github.io\nwww.numerical-tours.com\nAugust 14, 2019'
Chunk 2: page_content='2'
Chunk 3: page_content='Chapter 1\nOptimal Transport\n1.1 Radon Measures\nMeasures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat\nbelongs to the probability simplex\nΣndef.={\na∈Rn\n+;n∑\ni=1ai= 1}\n. A discrete measure with weights aand locations x1,...,xn∈X reads\nα=n∑\ni=1aiδxi (1.1)\nwhereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location\nx. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive\nmeasure if each of the “weights” described in vector ais positive itself. Remark 1 (General measures) .A convenient feature of OT is that it can deal with discre