In [2]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

import os

# Initialize the Pinecone client. The API key is read from the PINECONE_API_KEY
# environment variable so the secret never lives in the notebook itself; a
# missing variable fails fast with a KeyError instead of an opaque auth error.
# NOTE: a key that was previously hard-coded here should be considered leaked
# and rotated.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])


In [3]:
index_name = "investments"  # Name of the Pinecone index used by the rest of the notebook

# Only create the index when it is not already listed, so this cell can be
# re-run without attempting to create a duplicate.
existing_index_names = pc.list_indexes().names()

if index_name not in existing_index_names:
    # Serverless deployment target; cloud/region can be changed if needed.
    serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')

    pc.create_index(
        name=index_name,
        dimension=384,    # Vector dimension; set to 384 for the embeddings used here
        metric="cosine",  # "euclidean" or "dotproduct" are the alternatives
        spec=serverless_spec,
    )


In [4]:
import sys
import json

def estimate_record_size(record):
    """Estimate the size of one record in bytes.

    The estimate is the sum of three components:
      * the UTF-8 encoded length of ``record['id']``,
      * 4 bytes (one float32) per entry of ``record['embedding']``,
      * the UTF-8 encoded length of ``record['metadata']`` serialized with
        ``json.dumps``.

    Args:
        record: Mapping with ``'id'`` (str), ``'embedding'`` (sized sequence)
            and ``'metadata'`` (JSON-serializable) entries.

    Returns:
        The estimated size as an int number of bytes.
    """
    bytes_per_float32 = 4
    component_sizes = (
        len(record['id'].encode('utf-8')),
        bytes_per_float32 * len(record['embedding']),
        len(json.dumps(record['metadata']).encode('utf-8')),
    )
    return sum(component_sizes)


In [5]:
MAX_BATCH_SIZE = 2 * 1024 * 1024  # 2 MB in bytes

def create_batches(data, max_batch_size=None, size_fn=None):
    """Group records into consecutive batches whose estimated size stays under a limit.

    Records are consumed in order. A batch is closed as soon as adding the next
    record would push its accumulated size above ``max_batch_size``. A single
    record that is larger than the limit on its own is still emitted, alone in
    its batch. Empty batches are never produced.

    NOTE: a later cell in this notebook defines ``create_batches`` again (adding
    a per-batch vector-count limit); once that cell runs it replaces this
    definition.

    Args:
        data: Iterable of records.
        max_batch_size: Size limit per batch in bytes. Defaults to the
            module-level ``MAX_BATCH_SIZE`` (looked up at call time).
        size_fn: Callable returning the size in bytes of one record. Defaults
            to ``estimate_record_size``.

    Returns:
        A list of batches, each a non-empty list of records.
    """
    if max_batch_size is None:
        max_batch_size = MAX_BATCH_SIZE
    if size_fn is None:
        size_fn = estimate_record_size

    batches = []
    current_batch = []
    current_batch_size = 0

    for record in data:
        record_size = size_fn(record)

        # Close the running batch only if it actually holds something;
        # otherwise an oversized first record would emit an empty batch.
        if current_batch and current_batch_size + record_size > max_batch_size:
            batches.append(current_batch)
            current_batch = []
            current_batch_size = 0

        current_batch.append(record)
        current_batch_size += record_size

    # Flush whatever is left after the last record.
    if current_batch:
        batches.append(current_batch)

    return batches


In [13]:
import pickle
# Load the previously saved combined_data object from a pickle file in the
# current working directory.
filename = 'combined_data.pkl'

# SECURITY: pickle.load can execute arbitrary code while deserializing. Only
# load this file if it was produced by a trusted source.
with open(filename, 'rb') as file:
    combined_data = pickle.load(file)

print(f"Combined data loaded from {filename}")

Combined data loaded from combined_data.pkl


In [14]:
import json

def split_large_metadata(record, max_size=40000):
    """Split a record whose JSON-encoded metadata is larger than ``max_size`` bytes.

    When the metadata is oversized and contains a ``'narrative_texts'`` entry,
    that entry is cut into consecutive slices of at most ``max_size`` items and
    one new record is produced per slice. Each new record gets the id
    ``"<original id>_part_<i>"``, a shallow copy of the metadata with
    ``'narrative_texts'`` replaced by the slice, and the original record's
    ``'embedding'`` and ``'text'`` unchanged. The input record is not modified.

    The record is returned unchanged (as a one-element list) when:
      * the metadata already fits within ``max_size``,
      * the metadata has no ``'narrative_texts'`` entry, or
      * ``'narrative_texts'`` is empty, so there is nothing to split. (Previously
        this case returned an empty list and the record was silently dropped.)

    NOTE: the size check is on UTF-8 bytes of the whole JSON-encoded metadata,
    while the slicing is by index (``narratives[:max_size]``). A slice can
    therefore still exceed ``max_size`` bytes once the other metadata fields and
    JSON escaping are added, so callers should re-verify sizes afterwards.

    Args:
        record: Mapping with ``'id'``, ``'embedding'``, ``'metadata'`` and (when
            a split happens) ``'text'`` entries.
        max_size: Byte threshold for the encoded metadata and slice length for
            ``'narrative_texts'``.

    Returns:
        A list of records: either ``[record]`` or the split parts.

    Raises:
        ValueError: If a split is required but ``max_size`` is not positive
            (slicing would never make progress).
    """
    metadata = record['metadata']
    encoded_size = len(json.dumps(metadata).encode('utf-8'))

    # Nothing to do when the metadata fits or there is no field we can split.
    if encoded_size <= max_size or 'narrative_texts' not in metadata:
        return [record]

    if max_size <= 0:
        raise ValueError("max_size must be a positive integer")

    # Cut narrative_texts into consecutive slices of at most max_size items.
    narratives = metadata['narrative_texts']
    parts = []
    while narratives:
        parts.append(narratives[:max_size])
        narratives = narratives[max_size:]

    # An empty narrative cannot be split; keep the record rather than lose it.
    if not parts:
        return [record]

    new_records = []
    for i, part in enumerate(parts):
        new_metadata = metadata.copy()
        new_metadata['narrative_texts'] = part
        new_records.append({
            'id': f"{record['id']}_part_{i}",
            'embedding': record['embedding'],
            'metadata': new_metadata,
            'text': record['text']
        })
    return new_records


In [15]:
# Run every record through split_large_metadata and flatten the resulting
# lists of parts into one list of records.
new_combined_data = [
    piece
    for record in combined_data
    for piece in split_large_metadata(record)
]

# From here on, combined_data refers to the split version of the data.
combined_data = new_combined_data

print(f"Data has been split into {len(combined_data)} records.")


Data has been split into 54371 records.


In [16]:
# Sanity check: report every record whose JSON-encoded metadata is still
# larger than 40960 bytes after splitting. No output means all records fit.
for i, record in enumerate(combined_data):
    metadata_bytes = json.dumps(record['metadata']).encode('utf-8')
    if len(metadata_bytes) <= 40960:
        continue
    print(f"Record {i} exceeds the size limit with ID: {record['id']}")


In [18]:
MAX_BATCH_SIZE = 2 * 1024 * 1024  # 2 MB in bytes
MAX_VECTORS_PER_BATCH = 1000  # Pinecone's limit

def create_batches(data, max_batch_size=None, max_vectors_per_batch=None, size_fn=None):
    """Group records into consecutive batches bounded by byte size and record count.

    Records are consumed in order. A batch is closed as soon as adding the next
    record would push its accumulated size above ``max_batch_size`` or it
    already holds ``max_vectors_per_batch`` records. A single record that is
    larger than the size limit on its own is still emitted, alone in its batch.
    Empty batches are never produced.

    Args:
        data: Iterable of records.
        max_batch_size: Size limit per batch in bytes. Defaults to the
            module-level ``MAX_BATCH_SIZE`` (looked up at call time).
        max_vectors_per_batch: Maximum number of records per batch. Defaults to
            the module-level ``MAX_VECTORS_PER_BATCH`` (looked up at call time).
        size_fn: Callable returning the size in bytes of one record. Defaults
            to ``estimate_record_size``.

    Returns:
        A list of batches, each a non-empty list of records.
    """
    if max_batch_size is None:
        max_batch_size = MAX_BATCH_SIZE
    if max_vectors_per_batch is None:
        max_vectors_per_batch = MAX_VECTORS_PER_BATCH
    if size_fn is None:
        size_fn = estimate_record_size

    batches = []
    current_batch = []
    current_batch_size = 0

    for record in data:
        record_size = size_fn(record)

        batch_is_full = (
            current_batch_size + record_size > max_batch_size
            or len(current_batch) >= max_vectors_per_batch
        )

        # Close the running batch only if it actually holds something;
        # otherwise an oversized first record would emit an empty batch.
        if current_batch and batch_is_full:
            batches.append(current_batch)
            current_batch = []
            current_batch_size = 0

        current_batch.append(record)
        current_batch_size += record_size

    # Flush whatever is left after the last record.
    if current_batch:
        batches.append(current_batch)

    return batches


In [19]:
# Open a handle to the target Pinecone index.
index = pc.Index(index_name)

# Partition the records into batches, then upsert them one batch at a time.
batches = create_batches(combined_data)

for i, batch in enumerate(batches):
    # Convert each record into the id/values/metadata dict passed to upsert.
    vectors = []
    for record in batch:
        vectors.append({
            "id": record['id'],
            "values": record['embedding'],
            "metadata": record['metadata'],
        })

    index.upsert(vectors=vectors)
    print(f"Batch {i+1}/{len(batches)} uploaded successfully!")

print("All data has been uploaded to Pinecone successfully!")


Batch 1/55 uploaded successfully!
Batch 2/55 uploaded successfully!
Batch 3/55 uploaded successfully!
Batch 4/55 uploaded successfully!
Batch 5/55 uploaded successfully!
Batch 6/55 uploaded successfully!
Batch 7/55 uploaded successfully!
Batch 8/55 uploaded successfully!
Batch 9/55 uploaded successfully!
Batch 10/55 uploaded successfully!
Batch 11/55 uploaded successfully!
Batch 12/55 uploaded successfully!
Batch 13/55 uploaded successfully!
Batch 14/55 uploaded successfully!
Batch 15/55 uploaded successfully!
Batch 16/55 uploaded successfully!
Batch 17/55 uploaded successfully!
Batch 18/55 uploaded successfully!
Batch 19/55 uploaded successfully!
Batch 20/55 uploaded successfully!
Batch 21/55 uploaded successfully!
Batch 22/55 uploaded successfully!
Batch 23/55 uploaded successfully!
Batch 24/55 uploaded successfully!
Batch 25/55 uploaded successfully!
Batch 26/55 uploaded successfully!
Batch 27/55 uploaded successfully!
Batch 28/55 uploaded successfully!
Batch 29/55 uploaded successf