In [None]:
import json
from pinecone import Pinecone, ServerlessSpec
from openai import AzureOpenAI
import os

def _build_chunk_vector(position, chunk, explanation, embedding):
    """
    Build the Pinecone upsert record for a single chunk.

    Args:
        position: Zero-based position of the chunk in the JSON list; used
            to derive the vector id ("chunk_<position>")
        chunk: Chunk dict loaded from the JSON file
        explanation: Text that was embedded for this chunk
        embedding: Embedding vector returned for `explanation`

    Returns:
        dict with 'id', 'values' and 'metadata' keys
    """
    metadata = {
        'chunk_type': chunk['chunk_type'],
        'name': chunk['name'] or '',
        # Long text fields are truncated to keep the metadata payload small.
        # `code` may be null in the JSON, just like `name` and `comments`.
        'code': (chunk['code'] or '')[:1000],
        'explanation': explanation[:1000],
        'comments': (chunk['comments'] or '')[:500],
        'line_start': chunk['line_start'],
        'line_end': chunk['line_end'],
    }
    return {
        'id': f"chunk_{position}",
        'values': embedding,
        # Pinecone does not accept null metadata values; omit such keys
        # instead of letting one bad chunk fail the whole upsert batch.
        'metadata': {key: value for key, value in metadata.items()
                     if value is not None},
    }


def create_pinecone_index(json_file: str, client: "AzureOpenAI",
                         embedding_model: str,
                         index_name: str = "sas-code-chunks",
                         dimension: int = 1536,
                         batch_size: int = 100):
    """
    Create Pinecone index from chunks JSON

    Loads the chunks, creates the serverless index if it does not exist yet,
    embeds each chunk's explanation and upserts the vectors in batches.

    Args:
        json_file: Path to chunks JSON file (a list of chunk dicts with the
            keys 'chunk_type', 'name', 'code', 'explanation', 'comments',
            'line_start' and 'line_end')
        client: AzureOpenAI client
        embedding_model: Embedding model deployment name
        index_name: Name for the Pinecone index
        dimension: Vector dimension used when a new index is created; must
            match the output size of `embedding_model` (1536 for
            text-embedding-ada-002). Ignored when the index already exists.
        batch_size: Number of vectors sent per upsert request

    Returns:
        index: Pinecone index

    Raises:
        ValueError: If `batch_size` is smaller than 1
        KeyError: If PINECONE_API_KEY is not set in the environment, or a
            chunk is missing one of the expected keys
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Load chunks
    with open(json_file, 'r', encoding='utf-8') as f:
        chunks = json.load(f)

    print(f"Loaded {len(chunks)} chunks")

    # Initialize Pinecone
    pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

    # Check if index exists, if not create it
    if index_name not in pc.list_indexes().names():
        print(f"Creating new index: {index_name}")
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region=os.environ.get("PINECONE_REGION", "us-east-1")
            )
        )
    else:
        print(f"Using existing index: {index_name}")

    # Connect to index
    index = pc.Index(index_name)

    # Vectors waiting to be sent in the next upsert request
    pending_vectors = []

    print("Creating embeddings and preparing vectors...")
    for i, chunk in enumerate(chunks):
        print(f"Processing chunk {i+1}/{len(chunks)}...")

        # Embed the explanation; fall back to a placeholder because the
        # explanation may be null or empty in the JSON.
        explanation = chunk['explanation'] or "No explanation"
        response = client.embeddings.create(
            model=embedding_model,
            input=explanation
        )
        embedding = response.data[0].embedding

        pending_vectors.append(
            _build_chunk_vector(i, chunk, explanation, embedding)
        )

        # Flush a full batch so the pending list stays bounded
        if len(pending_vectors) >= batch_size:
            index.upsert(vectors=pending_vectors)
            pending_vectors = []

    # Upsert remaining vectors
    if pending_vectors:
        index.upsert(vectors=pending_vectors)

    print(f"✓ Added {len(chunks)} chunks to Pinecone index '{index_name}'")
    print(f"✓ Index stats: {index.describe_index_stats()}")

    return index



In [None]:
# Build (or reuse) the Pinecone index from the explained SAS chunks.
# The embedding deployment name comes from the environment, defaulting to ada-002.
index = create_pinecone_index(
    json_file='all_sas_chunks_with_explanations.json',
    client=client,
    embedding_model=os.environ.get(
        "AZURE_OPENAI_EMBEDDING_MODEL",
        "text-embedding-ada-002",
    ),
)