In [None]:
import os
import hashlib
import base64
import numpy as np
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
from nomic import atlas
import nomic

# Load environment variables from the .env file
load_dotenv()

# Retrieve the API keys and the index name from environment variables
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
NOMIC_API_KEY = os.getenv('NOMIC_API_KEY')
INDEX_NAME_KEY = os.getenv('INDEX_NAME_KEY')

# Fail fast with an explicit message when a setting is absent or empty,
# instead of handing None to the client constructors below.
_missing_settings = [
    name
    for name, value in (
        ('PINECONE_API_KEY', PINECONE_API_KEY),
        ('NOMIC_API_KEY', NOMIC_API_KEY),
        ('INDEX_NAME_KEY', INDEX_NAME_KEY),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Define them in the environment or in the .env file."
    )

# Initialize Pinecone and Nomic clients, then open the target index
pc = Pinecone(api_key=PINECONE_API_KEY)
nomic.login(NOMIC_API_KEY)
index = pc.Index(INDEX_NAME_KEY)

# Fetch all vector IDs from the index. The result is treated as a sequence
# of ID lists (one list per page returned by index.list()).
vectors_ids = list(index.list())

# Flatten the pages into a single list of vector IDs
flatten_list = [vector_id for page in vectors_ids for vector_id in page]

# Chunk the flattened list into groups of 20 IDs
FETCH_BATCH_SIZE = 20
id_list = [
    flatten_list[start:start + FETCH_BATCH_SIZE]
    for start in range(0, len(flatten_list), FETCH_BATCH_SIZE)
]

# Fetch the vectors one batch at a time. Sending every ID in a single
# fetch call can exceed the service's per-request limits on larger indexes,
# which is what the batches in id_list are for.
# The merged result is a plain dict that keeps the 'vectors' key, so code
# that reads vectors['vectors'] keeps working. Other fields of the
# individual fetch responses are not carried over.
vectors = {'vectors': {}}
for id_batch in id_list:
    batch_response = index.fetch(ids=id_batch)
    vectors['vectors'].update(batch_response['vectors'])

def compress_string(original_string: str) -> str:
    """Derive a short, fixed-length identifier from a string.

    The input is encoded to bytes, hashed with SHA-256, and the digest is
    encoded with URL-safe base64. Only the first 36 characters of that
    encoding are kept. The result is deterministic but one-way: the
    original string cannot be recovered from it, so callers that need the
    original must keep their own mapping.

    Args:
        original_string: The text to shorten.

    Returns:
        A 36-character string made of URL-safe base64 characters.
    """
    digest = hashlib.sha256(original_string.encode()).digest()
    encoded = base64.urlsafe_b64encode(digest)
    # Base64 output is pure ASCII, so truncating the bytes before decoding
    # yields the same text as truncating after decoding.
    return encoded[:36].decode()

# Create a lookup table that maps each shortened ID back to its original
# vector ID (compress_string is one-way, so this table is the only way back).
lookup_table = {}

ids = []
embeddings = []
# The loop variable is intentionally not called `id`: at module level that
# would rebind the built-in id() for everything that runs afterwards.
for vector_id, vector in vectors['vectors'].items():
    compressed_string = compress_string(vector_id)
    lookup_table[compressed_string] = vector_id
    print(f"Compressed String: {compressed_string}, id: {vector_id}")  # For Mapping
    ids.append(compressed_string)
    embeddings.append(vector['values'])

# Convert embeddings to a numpy array (one row per fetched vector)
embeddings = np.array(embeddings)

# Map data to Atlas, using the shortened IDs as the unique id field.
# NOTE(review): the IDs are presumably shortened to satisfy a length limit
# on Atlas id_field values — confirm against the Atlas documentation.
atlas.map_data(
    embeddings=embeddings,
    data=[{'id': compressed_id} for compressed_id in ids],
    id_field='id',
)