In [1]:
import os
import re
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from langchain_experimental.text_splitter import SemanticChunker
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import SpacyTextSplitter

2024-10-11 19:46:29.223628: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-11 19:46:29.223669: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-11 19:46:29.225033: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-11 19:46:29.232917: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  warn(f"Failed to load image Python extension: {e}")

In [2]:
# Name of the Pinecone index that the cells below create and write to.
index_name = "document-structure-index"
# Chunk size setting.
# NOTE(review): none of the visible cells pass this value to a chunker — confirm it is still needed.
chunk_size = 200
# SentenceTransformer model name; the loaded model's embedding dimension is used as the index dimension.
embedding_model = "all-MiniLM-L6-v2"

In [3]:
# Root folder of the markdown corpus; passed to load_data() in the loading cell.
data = "only_english_data"
# Load API keys
# The key is read from a local text file rather than being written into the notebook;
# surrounding whitespace/newlines are stripped.
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

# Load the models
sentence_transformer_model = SentenceTransformer(embedding_model)
# The model's embedding dimension is reused as the dimension of the Pinecone index.
dimension = sentence_transformer_model.get_sentence_embedding_dimension() 

# Initialize Pinecone
pc = Pinecone(
        api_key=PINECONE_API_KEY
    )
def create_index(pc, index_name, dimension, metric="cosine", cloud="aws", region="us-east-1"):
    """Create a serverless Pinecone index unless one with that name already exists.

    Args:
        pc: Pinecone client. Only ``list_indexes()`` and ``create_index(...)``
            are called on it.
        index_name: Name of the index to look for / create.
        dimension: Vector dimension of the new index.
        metric: Similarity metric forwarded to ``create_index``.
        cloud: Cloud provider forwarded to ``ServerlessSpec``.
        region: Cloud region forwarded to ``ServerlessSpec``.

    Returns:
        None.

    Note:
        An index that already exists is reused as-is; its dimension and metric
        are not compared against the arguments.
    """
    existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]
    if index_name in existing_indexes:
        return

    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )



In [4]:
# Helper functions
def load_data(directory: str):
    """Load every ``.md`` file found in the subdirectories of ``directory``.

    Files that sit directly in ``directory`` itself are skipped; each loaded
    document is labelled with the name of the folder that contains it.

    Args:
        directory: Root folder of the corpus. A trailing path separator is
            tolerated.

    Returns:
        A tuple ``(documents, company_names)`` of two equally long lists:
        the file contents and, for each document, the name of its containing
        folder.

    Raises:
        OSError / UnicodeDecodeError: propagated from opening or decoding a file.
    """
    documents = []
    company_names = []
    # Compare against the actual root instead of a hard-coded folder name so the
    # function works for any corpus directory.
    root = os.path.normpath(directory)

    for foldername, dirnames, filenames in os.walk(directory):
        # Sorting in place makes os.walk descend in a stable order, so the
        # document order (and any positional IDs derived from it) is reproducible.
        dirnames.sort()

        if os.path.normpath(foldername) == root:
            continue

        company_name = os.path.basename(foldername)
        for filename in sorted(filenames):
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(foldername, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                documents.append(f.read())
            company_names.append(company_name)  # one company label per document

    return documents, company_names

def simple_chunk_data(documents: list, chunk_size: int, company_names: list):
    """Cut each document into consecutive, non-overlapping windows of words.

    Words are obtained with ``str.split()`` and re-joined with single spaces.
    The company name is not inserted into the chunk text; it is returned in a
    parallel list instead.

    Args:
        documents: Document texts.
        chunk_size: Number of words per chunk (the last chunk of a document
            may be shorter).
        company_names: Company label for each document, paired by position.

    Returns:
        ``(chunks, chunks_company)`` — the chunk texts and, aligned with them,
        the company label of the document each chunk came from.
    """
    pieces = []
    owners = []

    for text, owner in zip(documents, company_names):
        tokens = text.split()
        windows = [
            ' '.join(tokens[start:start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        ]
        pieces.extend(windows)
        owners.extend([owner] * len(windows))

    return pieces, owners

def overlapping_chunking(documents: list, chunk_size: int, overlap_size: int, company_names: list):
    """Split documents into word windows that overlap by ``overlap_size`` words.

    Each window holds up to ``chunk_size`` words and starts
    ``chunk_size - overlap_size`` words after the previous one. Chunking of a
    document stops as soon as a window reaches the document's last word, so no
    trailing chunk is emitted that is entirely contained in the previous one.
    The company name is not inserted into the chunk text; it is returned in a
    parallel list.

    Args:
        documents: Document texts.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap_size: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap_size < chunk_size``.
        company_names: Company label for each document, paired by position.

    Returns:
        ``(chunks, chunks_company)`` — chunk texts and their company labels.

    Raises:
        ValueError: if ``chunk_size`` or ``overlap_size`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size!r}")
    if not 0 <= overlap_size < chunk_size:
        # A non-positive step would either crash range() or silently yield nothing.
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < chunk_size, "
            f"got overlap_size={overlap_size!r}, chunk_size={chunk_size!r}"
        )

    step = chunk_size - overlap_size
    chunks = []
    chunks_company = []

    for doc, company in zip(documents, company_names):
        words = doc.split()

        for start in range(0, len(words), step):
            chunks.append(' '.join(words[start:start + chunk_size]))
            chunks_company.append(company)
            # This window already covers the tail; later windows would only
            # repeat words that are fully contained in it.
            if start + chunk_size >= len(words):
                break

    return chunks, chunks_company

def semantic_chunking(documents: list, company_names: list, model_embedding_name: str):
    """Split documents with LangChain's ``SemanticChunker``.

    The chunker is built on a ``HuggingFaceEmbeddings`` instance for
    ``model_embedding_name`` and uses the ``'percentile'`` breakpoint type with
    a threshold amount of 90. The company name is not inserted into the chunk
    text; it is returned in a parallel list.

    Args:
        documents: Document texts.
        company_names: Company label for each document, paired by position.
        model_embedding_name: Model name handed to ``HuggingFaceEmbeddings``.

    Returns:
        ``(chunks, chunks_company)`` — chunk texts and their company labels.
    """
    splitter = SemanticChunker(
        HuggingFaceEmbeddings(model_name=model_embedding_name),
        breakpoint_threshold_type='percentile',
        breakpoint_threshold_amount=90,
    )

    pieces, owners = [], []
    for text, owner in zip(documents, company_names):
        # Tag every piece of this document with the document's company.
        for piece in splitter.split_text(text):
            pieces.append(piece)
            owners.append(owner)

    return pieces, owners

def spacy_chunking(documents: list, company_names: list, chunk_size: int=1000, overlap_size: int=500):
    """Split documents with LangChain's ``SpacyTextSplitter``.

    The company name is not inserted into the chunk text; it is returned in a
    parallel list.

    Args:
        documents: Document texts.
        company_names: Company label for each document, paired by position.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
            NOTE(review): presumably measured in characters, not words — confirm
            against the splitter's length function.
        overlap_size: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        ``(chunks, chunks_company)`` — chunk texts and their company labels.
    """
    splitter = SpacyTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap_size)

    # Pair every piece with the company of the document it was cut from.
    tagged = [
        (piece, owner)
        for text, owner in zip(documents, company_names)
        for piece in splitter.split_text(text)
    ]

    chunks = [piece for piece, _ in tagged]
    chunks_company = [owner for _, owner in tagged]
    return chunks, chunks_company



def embed_text(texts: list):
    """Embed texts with the module-level ``sentence_transformer_model``.

    Args:
        texts: Strings to encode.

    Returns:
        The encoder output converted with ``.tolist()`` (presumably one list of
        floats per input text).
    """
    # Request a tensor from the encoder, then convert it to plain Python lists.
    encoded = sentence_transformer_model.encode(texts, convert_to_tensor=True)
    return encoded.tolist()

def upsert_index(index, embeddings, metadata, company_names, batch_size=100):
    """Upsert embeddings into a Pinecone index in batches, with metadata.

    Each vector gets the ID ``str(position)`` (its running position in the
    input), the embedding as ``values``, and the metadata
    ``{"text": ..., "company_name": ...}``.

    Args:
        index: Index handle; only ``index.upsert(vectors=...)`` is called.
        embeddings: Iterable of embedding vectors.
        metadata: Iterable of chunk texts, paired by position with ``embeddings``.
        company_names: Iterable of company labels, paired by position.
        batch_size: Maximum number of vectors per ``upsert`` call; must be >= 1.

    Returns:
        None.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.

    Note:
        The three iterables are combined with ``zip``, so iteration stops at the
        shortest one.
    """
    if batch_size < 1:
        # Otherwise the "batch is full" check would never fire and everything
        # would be sent in a single oversized request.
        raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")

    batch = []

    for idx, (emb, md, cn) in enumerate(zip(embeddings, metadata, company_names)):
        batch.append(
            {"id": str(idx), "values": emb, "metadata": {"text": md, "company_name": cn}}
        )

        # Flush as soon as the batch is full.
        if len(batch) >= batch_size:
            index.upsert(vectors=batch)
            batch = []  # start a fresh list; the flushed one is left untouched

    # Flush the final, partially filled batch.
    if batch:
        index.upsert(vectors=batch)


In [7]:
# Load and embed the data
# 1. Read the markdown documents together with the name of the folder each one came from.
documents, company_names = load_data(data)   
# 2. Split with the spaCy-based splitter using its default sizes (chunk_size=1000,
#    overlap_size=500); the `chunk_size` value from the config cell is not passed here.
chunks, company_names_chunks = spacy_chunking(documents, company_names)
# 3. Embed all chunks in a single call.
embeddings = embed_text(chunks)

In [25]:
# Index the data
# Create the index if it does not exist yet, then open a handle to it.
create_index(pc, index_name, dimension)
index = pc.Index(index_name)
# Upload every embedding with its chunk text and company name as metadata.
upsert_index(index, embeddings, chunks, company_names_chunks)