In [3]:
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter
import time
from pinecone import Pinecone, ServerlessSpec
from pinecone.exceptions import PineconeApiException
import pandas as pd
import numpy as np
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModel
from dotenv import load_dotenv

In [44]:

# Function to split by sections and subsections using regex
def regex_split_documents(documents: list[Document]):
    """Split documents on numbered section/subsection headers, then into small chunks.

    For every input document, ``page_content`` is split with ``re.split`` on
    ``section_regex`` (a 1-2 digit number, whitespace, then letters) and each
    resulting piece is split again on ``subsection_regex`` (e.g. ``1.1``).
    Pieces whose stripped length is under 100 characters are skipped; because
    both patterns use a capturing group, ``re.split`` also returns the matched
    header text as its own piece, and those short pieces are normally dropped
    by the same length check. Each surviving piece is chunked with
    ``RecursiveCharacterTextSplitter`` (600 characters, 60 overlap).

    Args:
        documents: Objects exposing ``page_content`` (str) and ``metadata``.

    Returns:
        A list containing one ``Document`` per chunk, in input order. Every
        chunk gets its own shallow copy of the source document's metadata.
    """
    # Section headers, e.g. "1 Introduction"
    section_regex = r"(\d{1,2}\s{1,}[A-Za-z]+)"
    # Subsection numbers, e.g. "1.1", "2.2"
    subsection_regex = r"(\d{1,2}\.\d{1,2})"
    min_piece_chars = 100

    # The splitter configuration never changes, so build it once up front
    # instead of once per subsection.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=60,
        length_function=len,
        is_separator_regex=False,
    )

    all_documents = []
    for document in documents:
        metadata = document.metadata
        sections = re.split(section_regex, document.page_content, flags=re.IGNORECASE)

        for section in sections:
            if len(section.strip()) < min_piece_chars:
                continue

            subsections = re.split(subsection_regex, section, flags=re.IGNORECASE)
            for subsection in subsections:
                if len(subsection.strip()) < min_piece_chars:
                    continue

                for chunk in text_splitter.split_text(subsection):
                    # Append inside the chunk loop so that every chunk is kept,
                    # not just the last one produced for the subsection.
                    # dict(...) avoids all chunks aliasing one metadata object.
                    all_documents.append(
                        Document(metadata=dict(metadata), page_content=chunk)
                    )

    return all_documents

In [48]:
import re
from langchain.docstore.document import Document

# Function to split by sections and subsections using regex
def regex_split_documents(documents: list[Document]):
    """Emit title/remainder documents plus regex-driven chunks for each input document.

    For every input document this function:

    1. Takes the first line of ``page_content`` as a "title" and everything
       after the first newline as the "abstract", and emits each (when
       non-empty after stripping) as its own ``Document``.
    2. Splits the full ``page_content`` on ``section_regex``, splits each
       non-blank piece on ``subsection_regex``, skips pieces whose stripped
       length is under 100 characters, and chunks the rest with
       ``RecursiveCharacterTextSplitter`` (600 characters, 60 overlap).

    NOTE(review): step 2 runs on the complete text, so the content emitted in
    step 1 appears again in the chunked output, and the "abstract" document is
    the whole remainder of the text rather than a bounded chunk. Confirm this
    duplication is intended before indexing the result.

    Args:
        documents: Objects exposing ``page_content`` (str) and ``metadata``.

    Returns:
        A list of ``Document`` objects in input order. Each one carries its own
        shallow copy of the source document's metadata.
    """
    # Section headers: e.g., 1 Introduction
    section_regex = r"(\n?\d{1,2}\s{1,}[A-Za-z ]+)"
    # Subsection headers: e.g., 1.1 Introduction
    subsection_regex = r"(\n?\d{1,2}\.\d{1,2}\s{1,}[A-Za-z ]+)"

    # Loop-invariant configuration: construct the splitter a single time.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,  # Adjust the chunk size as necessary
        chunk_overlap=60,
        length_function=len,
        is_separator_regex=False,
    )

    all_documents = []

    for document in documents:
        text = document.page_content
        metadata = document.metadata

        # First line vs. everything after the first newline.
        first_line, _, remainder = text.partition("\n")
        title = first_line.strip()
        abstract = remainder.strip()

        # Guard against blank pages / leading newlines, which would otherwise
        # produce a Document with empty page_content.
        if title:
            all_documents.append(
                Document(metadata=dict(metadata), page_content=title)
            )
        if abstract:
            all_documents.append(
                Document(metadata=dict(metadata), page_content=abstract)
            )

        # Now split the whole text by sections, then by subsections.
        for section in re.split(section_regex, text, flags=re.IGNORECASE):
            if not section.strip():
                continue

            for subsection in re.split(subsection_regex, section, flags=re.IGNORECASE):
                # Skip pieces that are too small to be useful chunks.
                if len(subsection.strip()) < 100:
                    continue

                all_documents.extend(
                    # dict(...) keeps chunks from sharing one metadata object.
                    Document(metadata=dict(metadata), page_content=chunk)
                    for chunk in text_splitter.split_text(subsection)
                )

    return all_documents

In [4]:
# Directory handed to PyPDFDirectoryLoader in load_documents().
# NOTE(review): hard-coded absolute Windows path; consider making it configurable.
DATA_PATH = r'C:\QpiAi'

# Load PDF documents
def load_documents():
    """Run ``PyPDFDirectoryLoader`` over ``DATA_PATH`` and return what it loads."""
    return PyPDFDirectoryLoader(DATA_PATH).load()


In [10]:
# Load the PDFs under DATA_PATH. Despite the singular name, this holds the
# loader's full result, which regex_split_documents() later iterates over.
document = load_documents()

In [49]:
# Split the loaded documents; `chunk` is the list returned by regex_split_documents().
chunk = regex_split_documents(document)

In [50]:
# Display the split result.
# NOTE(review): this echoes the entire list; a slice would keep the output short.
chunk

[Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0}, page_content='Multi-View and Multi-Scale Alignment for Contrastive'),
 Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0}, page_content='Language-Image Pre-training in Mammography\nYuexi Du1, John Onofrey1,2,3, Nicha C. Dvornek1,2\n1Department of Biomedical Engineering,\n2Department of Radiology & Biomedical Imaging,3Department of Urology,\nYale University, New Haven, CT, USA\nAbstract\nContrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to these\nrestrictions, existing CLIP applications in medical imaging focus mainly on modal-\nities like chest X-rays that have abundant image-report data available, leaving many\nother important modalities under-explored. Here, we propose the first adaptation of\nthe full CLIP model to mammography, which presents significant challenges due to\nlabeled dat

In [None]:
def split_documents_by_sections(documents: list):
    """Split documents at numbered, well-known section headings and chunk each section.

    A heading is a newline, a 1-2 digit number, whitespace, and one of the
    keywords in ``section_keywords`` (e.g. ``"\\n1 Introduction"``). The text
    of each document is split at those headings; the heading text itself is
    consumed by the split and does not appear in the output. Sections whose
    stripped length is under 100 characters are skipped, and the rest are
    chunked with ``RecursiveCharacterTextSplitter`` (800 characters, 100
    overlap).

    Args:
        documents: Objects exposing a ``page_content`` string.

    Returns:
        A flat list of chunk strings in input order. Plain strings are
        returned, so no metadata is carried over from the inputs.
    """
    # List of section keywords
    section_keywords = [
        'Introduction', 'Methodology', 'Results', 'Discussion', 'Conclusion', 'Related Work', 'References'
    ]

    # Join the section keywords into a regex alternation
    section_keywords_pattern = r"|".join(section_keywords)

    # Non-capturing group on purpose: with capturing groups re.split() also
    # returns every captured header/keyword as a separate list item, which
    # would then have to be filtered out again as a pseudo-section.
    section_regex = fr"\n\d{{1,2}}\s+(?:{section_keywords_pattern})"

    # The splitter settings are constant, so create the splitter once.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100,
        length_function=len
    )

    all_splits = []

    for document in documents:
        for section in re.split(section_regex, document.page_content):
            # Skip sections that are too small
            if len(section.strip()) < 100:
                continue

            all_splits.extend(text_splitter.split_text(section))

    return all_splits

In [47]:
# Number of items in `chunk`.
# NOTE(review): this cell's execution count (47) is lower than that of the cell
# assigning `chunk` (49), so the recorded value may come from an earlier run.
len(chunk)

479

In [26]:
def split_documents(documents: list[Document]):
    """Chunk documents with a plain ``RecursiveCharacterTextSplitter``.

    Uses 600-character chunks with a 60-character overlap, measured with
    ``len``, and treats separators as literal strings rather than regexes.
    Returns the result of ``split_documents`` on that splitter.
    """
    splitter_options = {
        "chunk_size": 600,
        "chunk_overlap": 60,
        "length_function": len,
        "is_separator_regex": False,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_documents(documents)


In [28]:
from langchain.schema.document import Document

In [27]:
# Run the plain character-based splitter on the loaded documents and display the result.
# NOTE(review): this cell's execution count (27) is lower than that of the cell
# importing Document (28); confirm the notebook runs top to bottom on a fresh kernel.
split_documents(document)

[Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0}, page_content='Multi-View and Multi-Scale Alignment for Contrastive\nLanguage-Image Pre-training in Mammography\nYuexi Du1, John Onofrey1,2,3, Nicha C. Dvornek1,2\n1Department of Biomedical Engineering,\n2Department of Radiology & Biomedical Imaging,3Department of Urology,\nYale University, New Haven, CT, USA\nAbstract\nContrastive Language-Image Pre-training (CLIP) shows promise in medical image\nanalysis but requires substantial data and computational resources. Due to these\nrestrictions, existing CLIP applications in medical imaging focus mainly on modal-'),
 Document(metadata={'source': 'C:\\QpiAi\\paper_2409.18119.pdf', 'page': 0}, page_content='ities like chest X-rays that have abundant image-report data available, leaving many\nother important modalities under-explored. Here, we propose the first adaptation of\nthe full CLIP model to mammography, which presents significant challenges due to\nlabeled dat