In [54]:
import os
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer


In [55]:
# Name of the Pinecone index that is created (when missing) and written to below.
# NOTE(review): the name mentions "200" while chunk_size is 100 — confirm this is intended.
index_name = "basic-200-index"
# Maximum number of whitespace-separated words per chunk (passed to chunk_data).
chunk_size = 100
# Model identifier handed to SentenceTransformer to build the embedding model.
embedding_model = "all-MiniLM-L6-v2"

In [56]:

# Root folder that load_data walks to find the markdown documents
data = "only_english_data"

# Read the Pinecone API key from a local text file, dropping surrounding whitespace
with open("pinecone_api_key.txt") as f:
    PINECONE_API_KEY = f.read().strip()

# Build the embedding model and ask it for the size of the vectors it produces;
# that size is later used as the dimension of the Pinecone index
sentence_transformer_model = SentenceTransformer(embedding_model)
dimension = sentence_transformer_model.get_sentence_embedding_dimension()

# Create the Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)
def create_index(pc, index_name, dimension, metric='cosine', cloud="aws", region="us-east-1"):
    """Create a serverless Pinecone index unless one with that name already exists.

    Args:
        pc: Pinecone client; must provide ``list_indexes()`` and ``create_index()``.
        index_name: Name of the index to look for and, if absent, create.
        dimension: Vector dimension of the index to create.
        metric: Similarity metric of the new index (default ``'cosine'``).
        cloud: Cloud provider given to ``ServerlessSpec`` (default ``"aws"``).
        region: Cloud region given to ``ServerlessSpec`` (default ``"us-east-1"``).

    Returns:
        None. An index that already exists is left untouched; its dimension
        and metric are NOT compared against the arguments.
    """
    existing_indexes = {index_info["name"] for index_info in pc.list_indexes()}
    if index_name in existing_indexes:
        return

    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric=metric,
        spec=ServerlessSpec(cloud=cloud, region=region),
    )



In [57]:

# Helper functions
def load_data(directory: str):
    """Load every ``.md`` file found in the sub-folders of ``directory``.

    Files that sit directly in ``directory`` itself are skipped, because they
    have no company folder to take a name from. Folders and files are visited
    in sorted order so the result (and any ids later derived from positions
    in it) is the same on every run and every platform.

    Args:
        directory: Root folder to walk recursively.

    Returns:
        A ``(documents, company_names)`` tuple of two parallel lists:
        ``documents[i]`` is the UTF-8 text of one markdown file and
        ``company_names[i]`` is the name of the folder that directly
        contains that file (for nested folders this is the innermost one).
    """
    root = os.fspath(directory)
    documents = []
    company_names = []
    for foldername, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend into sub-folders in a stable order.
        dirnames.sort()
        # os.walk yields the root exactly as it was passed in, so this matches
        # only the top-level folder, whatever it is called.
        if foldername == root:
            continue
        company_name = os.path.basename(foldername)
        for filename in sorted(filenames):
            if not filename.endswith(".md"):
                continue
            filepath = os.path.join(foldername, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                documents.append(f.read())
            company_names.append(company_name)  # one company name per document
    return documents, company_names

def chunk_data(documents: list, chunk_size: int, company_names: list):
    """Cut every document into pieces of at most ``chunk_size`` words.

    Each document is split on whitespace and its words are re-joined with
    single spaces, ``chunk_size`` words at a time. The company name of the
    source document is recorded once for every piece.

    Returns:
        ``(chunks, chunks_company)`` — two parallel lists, where
        ``chunks_company[i]`` is the company of the document that
        ``chunks[i]`` was cut from.
    """
    chunks = []
    chunks_company = []

    for doc, company in zip(documents, company_names):
        tokens = doc.split()
        pieces = [
            ' '.join(tokens[start:start + chunk_size])
            for start in range(0, len(tokens), chunk_size)
        ]
        chunks.extend(pieces)
        chunks_company.extend([company] * len(pieces))

    return chunks, chunks_company

def embed_text(texts: list):
    """Embed texts with the module-level SentenceTransformer model.

    Args:
        texts: List of strings to embed.

    Returns:
        The result of ``encode(..., convert_to_tensor=True).tolist()`` — a
        plain nested Python list (presumably one vector per input text).
        An empty list/tuple returns ``[]`` without calling the model.
    """
    # An empty batch needs no model call; returning early avoids depending on
    # how the installed sentence-transformers version handles empty input.
    if isinstance(texts, (list, tuple)) and len(texts) == 0:
        return []
    return sentence_transformer_model.encode(texts, convert_to_tensor=True).tolist()

def upsert_index(index, embeddings, metadata, company_names, batch_size=100):
    """Write vectors to the index in groups of ``batch_size``.

    The three input sequences are read in lockstep. Every record gets its
    running position (as a string) as ``id``, the embedding as ``values``,
    and a metadata dict holding the ``metadata`` entry under ``"text"`` and
    the company under ``"company_name"``. ``index.upsert`` is called each
    time ``batch_size`` records have accumulated, and once more at the end
    for any leftover records.
    """
    pending = []
    records = zip(embeddings, metadata, company_names)

    for position, (values, text, company) in enumerate(records):
        pending.append({
            "id": str(position),
            "values": values,
            "metadata": {"text": text, "company_name": company},
        })

        # Keep collecting until a full batch is available
        if len(pending) != batch_size:
            continue

        index.upsert(vectors=pending)
        pending = []  # start a fresh batch

    # Flush whatever did not fill a complete batch
    if pending:
        index.upsert(vectors=pending)






In [58]:
# Load the markdown documents, split them into word-based chunks, and embed every chunk.
# (Writing the vectors to the Pinecone index happens in the following cell.)
documents, company_names = load_data(data)   
# company_names_chunks is the per-chunk counterpart of the per-document company_names.
chunks, company_names_chunks = chunk_data(documents, chunk_size, company_names)
embeddings = embed_text(chunks)


In [59]:
# Make sure the index exists (it is only created when missing), then open a handle to it.
create_index(pc, index_name, dimension)
index = pc.Index(index_name)
# Upload every chunk embedding; the chunk text and its company name are stored as metadata.
upsert_index(index, embeddings, chunks, company_names_chunks)