In [53]:
#!pip install langchain
#!pip install langchain_community
#!pip install pypdf

from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from transformers import GPT2Tokenizer, GPT2Model

class TextChunker:
    def __init__(self, model_name="gpt2"):
        # Load pre-trained GPT-2 model and tokenizer
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2Model.from_pretrained(model_name)

    def extract_text_from_pdf(self, path = "/"):
        text = ""
        loader = PyPDFLoader(path)
        documents = loader.load()
        for doc in documents:
            text = text + "\n" + str(doc.page_content)
        return text

    def char_count_chunking(self, text, chunk_size=200, chunk_overlap=50):
        # Instantiate the RecursiveCharacterTextSplitter class
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        # Create documents using the text splitter
        docs = text_splitter.create_documents([text])
        chunks = [doc.page_content for doc in docs]
        return chunks

    def char_count_chunking_with_custom_delimiter(self, text, chunk_size=200, chunk_overlap=50, delimiter="\n\n"):
        # Instantiate the CharacterTextSplitter class
        text_splitter = CharacterTextSplitter(separator=delimiter, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        # Create documents using the text splitter
        docs = text_splitter.create_documents([text])
        chunks = [doc.page_content for doc in docs]
        return chunks

    def semantic_section_chunking(self, text, max_chunk_size=200, overlap_size=50):
        # Tokenize the text
        input_ids = self.tokenizer.encode(text, return_tensors="pt", add_special_tokens=False)
        # Define the chunking parameters
        chunk_size = max_chunk_size - overlap_size
        stride = chunk_size
        # Perform chunking
        chunks = []
        for i in range(0, input_ids.size(1), stride):
            # Slice the input_ids to form a chunk
            chunk_input_ids = input_ids[:, i:i+chunk_size]
            # Decode the chunk
            chunk_text = self.tokenizer.decode(chunk_input_ids[0], skip_special_tokens=True)
            # Add the chunk to the list
            chunks.append(chunk_text)
        return chunks



In [57]:
# Example usage
text_chunker = TextChunker()

pdf_path =r"/content/drive/MyDrive/chunking/ds (1).pdf"
text = text_chunker.extract_text_from_pdf(path = pdf_path )

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=40)
print("Char count chunking:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_char_count = text_chunker.char_count_chunking(text, chunk_size=50, chunk_overlap=0)
print("Char count chunking non overlap:")
for i, chunk in enumerate(chunks_char_count):
    print(f"Chunk {i+1}: {chunk}")

chunks_custom_delimiter = text_chunker.char_count_chunking_with_custom_delimiter(text, chunk_size=200, chunk_overlap=50, delimiter="@")
print("\nChar count chunking with custom delimiter:")
for i, chunk in enumerate(chunks_custom_delimiter):
    print(f"Chunk {i+1}: {chunk}")

chunks_semantic = text_chunker.semantic_section_chunking(text, max_chunk_size=200, overlap_size=50)
print("\nSemantic section chunking:")
for i, chunk in enumerate(chunks_semantic):
    print(f"Chunk {i+1}: {chunk}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Chunk 2836: lar (??)). Using these notations, Sinkhorn’s
Chunk 2837: Using these notations, Sinkhorn’s iterates read
Chunk 2838: f(ℓ+1)= Minrow
ε(C−1ng(ℓ)T) +εloga, (1.65)
Chunk 2839: ε(C−1ng(ℓ)T) +εloga, (1.65)
g(ℓ+1)= Mincol
Chunk 2840: g(ℓ+1)= Mincol
ε(C−f(ℓ)1mT) +εlogb. (1.66)
Chunk 2841: Note that as ε→0, minεconverges to min, but the
Chunk 2842: as ε→0, minεconverges to min, but the iterations
Chunk 2843: to min, but the iterations do not converge
Chunk 2844: min, but the iterations do not converge anymore
Chunk 2845: the iterations do not converge anymore in the
Chunk 2846: do not converge anymore in the limit ε= 0,
Chunk 2847: because alternate minimization does not converge
Chunk 2848: minimization does not converge for constrained
Chunk 2849: does not converge for constrained problems (which
Chunk 2850: for constrained problems (which is the case for
Chunk 2851: problems (which is the case for the un-
Chunk 2852

Token indices sequence length is longer than the specified maximum sequence length for this model (26209 > 1024). Running this sequence through the model will result in indexing errors



Semantic section chunking:
Chunk 1: 
Mathematical Foundations of Data Sciences
Gabriel Peyr´ e
CNRS & DMA
´Ecole Normale Sup´ erieure
gabriel.peyre@ens.fr
https://mathematical-tours.github.io
www.numerical-tours.com
August 14, 2019
2
Chapter 1
Optimal Transport
1.1 Radon Measures
Measures. We will interchangeably the term histogram or probability vector for any element a∈Σnthat
belongs to the probability simplex
Σndef.={
a∈Rn
+;n∑
i=1ai= 1}
Chunk 2: 
.
A discrete measure with weights aand locations x1,...,xn∈X reads
α=n∑
i=1aiδxi (1.1)
whereδxis the Dirac at position x, intuitively a unit of mass which is inﬁnitely concentrated at location
x. Such as measure describes a probability measure if, additionally, a∈Σn, and more generally a positive
measure if each of the “weights” described in vector ais positive itself.
Remark 1 (General measures).A convenient feature of OT is that it can deal with discrete and continuous
“objects” within the same framework.
Chunk 3:  Such objects only nee