In [2]:
import os
import pandas as pd
import pinecone
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from tqdm import tqdm

# Load environment variables; load_dotenv() runs first so the lookup below
# can also see values supplied through a local .env file.
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    # Fail here with an actionable message instead of deeper inside the client.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to your .env file"
    )
index_name = "medical-supply-chain"

# Initialize Pinecone, reusing the key read above rather than looking it up twice
pc = Pinecone(api_key=api_key)
# Create the index only on the first run; later runs reuse the existing one.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,  # 384 for MiniLM; must match the embedding model loaded below
        metric="cosine",
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
index = pc.Index(index_name)

# Load datasets
inventory_df = pd.read_csv("datasets/inventory_data.csv")
transport_df = pd.read_csv("datasets/transport_history.csv")

# Load embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

def generate_embeddings(data, namespace, batch_size=100):
    """Embed every row of ``data`` and upsert the vectors into Pinecone in batches.

    Each row is flattened into one space-separated string of all its values,
    encoded with the module-level ``model``, and sent to the module-level
    ``index`` together with the row's columns (stringified) as metadata.

    Args:
        data: DataFrame to embed. Its ``ItemID`` column is used as the vector
            ID when present; otherwise ``ShipmentID`` is used.
        namespace: Pinecone namespace the vectors are written to.
        batch_size: Number of vectors accumulated before each upsert call.

    Raises:
        KeyError: If ``data`` has rows but neither an ``ItemID`` nor a
            ``ShipmentID`` column.
    """
    # The ID column depends only on the frame's columns, so pick it once
    # instead of re-checking on every row.
    id_column = "ItemID" if "ItemID" in data.columns else "ShipmentID"

    vectors = []
    for _, row in tqdm(data.iterrows(), total=len(data)):
        text = " ".join(map(str, row.values))  # Combine all values into a single text string
        embedding = model.encode(text).tolist()
        metadata = {col: str(row[col]) for col in data.columns}  # Store all columns as metadata
        # Pinecone vector IDs must be strings; numeric ID columns would
        # otherwise be passed through as ints and rejected by the client.
        vectors.append((str(row[id_column]), embedding, metadata))

        # If batch size reached, upsert and start a fresh batch
        if len(vectors) >= batch_size:
            index.upsert(vectors, namespace=namespace)
            print(f"Upserted {len(vectors)} vectors to namespace '{namespace}'")
            vectors = []

    # Upsert whatever is left over from the last, partial batch
    if vectors:
        index.upsert(vectors, namespace=namespace)
        print(f"Upserted {len(vectors)} remaining vectors to namespace '{namespace}'")

# Embed each dataset into its own Pinecone namespace: inventory first, then transport
generate_embeddings(data=inventory_df, namespace="inventory")
generate_embeddings(data=transport_df, namespace="transport")

# Reached only if both runs above finished without raising
print("Embeddings stored successfully!")


100%|██████████| 19/19 [00:00<00:00, 23.22it/s]


Upserted 19 remaining vectors to namespace 'inventory'


 11%|█         | 100/920 [00:13<03:46,  3.62it/s]

Upserted 100 vectors to namespace 'transport'


 22%|██▏       | 201/920 [00:31<03:57,  3.03it/s]

Upserted 100 vectors to namespace 'transport'


 33%|███▎      | 301/920 [00:45<02:43,  3.78it/s]

Upserted 100 vectors to namespace 'transport'


 43%|████▎     | 400/920 [01:04<02:57,  2.93it/s]

Upserted 100 vectors to namespace 'transport'


 54%|█████▍    | 500/920 [01:18<02:21,  2.97it/s]

Upserted 100 vectors to namespace 'transport'


 65%|██████▌   | 601/920 [01:33<01:17,  4.10it/s]

Upserted 100 vectors to namespace 'transport'


 76%|███████▌  | 701/920 [01:47<00:53,  4.12it/s]

Upserted 100 vectors to namespace 'transport'


 87%|████████▋ | 801/920 [02:02<00:30,  3.88it/s]

Upserted 100 vectors to namespace 'transport'


 98%|█████████▊| 901/920 [02:17<00:05,  3.30it/s]

Upserted 100 vectors to namespace 'transport'


100%|██████████| 920/920 [02:21<00:00,  6.52it/s]

Upserted 20 remaining vectors to namespace 'transport'
Embeddings stored successfully!



