Vectorize fulltext and upload to Neon

In [None]:
import psycopg2
import os
import numpy as np
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv
from urllib.parse import quote_plus

# Purpose: compute a sentence embedding for every `sep_embeddings` row whose
# `embedding` column is still NULL and write it back to the Neon/PostgreSQL table.

# 🔥 Load environment variables (values in .env override the process environment)
load_dotenv(override=True)

# ✅ Retrieve credentials
dbname = os.getenv("NEON_DBNAME", "neondb")
user = os.getenv("NEON_USER", "neondb_owner")
password = os.getenv("NEON_PASSWORD")
host = os.getenv("NEON_HOST", "ep-morning-bonus-a8s0mhpb-pooler.eastus2.azure.neon.tech")
port = os.getenv("NEON_PORT", "5432")

# Fail early with an actionable message: quote_plus(None) would otherwise raise
# an opaque TypeError.
if not password:
    raise RuntimeError("NEON_PASSWORD is not set; add it to the environment or the .env file.")

# 🔥 URL-encode the password so special characters survive inside the connection URI
encoded_password = quote_plus(password)

# ✅ Construct connection string
conn_str = f"postgresql://{user}:{encoded_password}@{host}:{port}/{dbname}?sslmode=require"

# ✅ Load embedding model (Choose BGE or E5)
MODEL_NAME = "BAAI/bge-base-en"  # Change to "intfloat/e5-large-v2" if you prefer E5
model = SentenceTransformer(MODEL_NAME)

# ✅ Number of rows between intermediate commits
BATCH_SIZE = 100

# Pre-declare the handles so the `finally` clause can release whatever was opened,
# even when connect() itself fails.
conn = None
cursor = None

try:
    # ✅ Connect to PostgreSQL
    conn = psycopg2.connect(conn_str)
    cursor = conn.cursor()
    print("✅ Connected to PostgreSQL!")

    # 1️⃣ Fetch only entries where embedding is NULL
    cursor.execute("SELECT id, content FROM sep_embeddings WHERE embedding IS NULL;")
    rows = cursor.fetchall()

    total_rows = len(rows)
    if total_rows == 0:
        print("🎉 All embeddings are already computed. No updates needed.")
    else:
        print(f"⚙️ Computing embeddings for {total_rows} new entries...")

        # 2️⃣ Compute embeddings row by row, committing every BATCH_SIZE rows
        batch_count = 0
        for index, (text_id, content) in enumerate(rows, start=1):
            if content:  # Rows with empty/NULL content are left untouched
                embedding = model.encode(content).tolist()  # Convert numpy array to list

                # ✅ Text form "[v1,v2,...]" (the format pgvector accepts, if the column is a vector)
                embedding_str = "[" + ",".join(map(str, embedding)) + "]"

                # Queue update (part of the current transaction until commit)
                cursor.execute(
                    "UPDATE sep_embeddings SET embedding = %s WHERE id = %s;",
                    (embedding_str, text_id)
                )

            # 3️⃣ Commit on every BATCH_SIZE-th row. This check is deliberately outside
            # `if content:` so an empty row at a batch boundary cannot skip the commit.
            if index % BATCH_SIZE == 0:
                conn.commit()
                batch_count += 1
                print(f"✅ Committed batch {batch_count} ({index}/{total_rows} processed)")

        # 4️⃣ Final commit for remaining updates
        conn.commit()
        print("🚀 All embeddings successfully updated!")

except psycopg2.OperationalError as e:
    print("🚨 Connection failed! Error:", e)

finally:
    # Always release the cursor and connection, including when an unexpected
    # exception propagates; updates that were not committed are discarded.
    if cursor is not None:
        cursor.close()
    if conn is not None:
        conn.close()


Add JSON embeddings to Pinecone

In [None]:
import os
import json
import sys
import psycopg2
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer
from urllib.parse import quote_plus

# Purpose: open the PostgreSQL connection and the Pinecone index handle that the
# following cell uses, and reset the `pinecone_log` table.
# Leaves `conn`, `cursor`, `model`, `pc`, `INDEX_NAME` and `index` defined for later cells.

# 🔥 Load environment variables
load_dotenv()

# ✅ Retrieve PostgreSQL credentials
dbname = os.getenv("NEON_DBNAME", "neondb")
user = os.getenv("NEON_USER", "neondb_owner")
password = os.getenv("NEON_PASSWORD")
host = os.getenv("NEON_HOST", "ep-morning-bonus-a8s0mhpb-pooler.eastus2.azure.neon.tech")
port = os.getenv("NEON_PORT", "5432")

# ✅ Retrieve Pinecone API key
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Validate configuration before opening any connection, so a missing secret
# produces a clear message instead of a TypeError from quote_plus(None).
if not password:
    raise RuntimeError("NEON_PASSWORD is not set; add it to the environment or the .env file.")
if not PINECONE_API_KEY:
    raise RuntimeError("PINECONE_API_KEY is not set; add it to the environment or the .env file.")

# 🔥 URL-encode the password so special characters survive inside the connection URI
encoded_password = quote_plus(password)

# ✅ Construct PostgreSQL connection string
conn_str = f"postgresql://{user}:{encoded_password}@{host}:{port}/{dbname}?sslmode=require"

# ✅ Load embedding model
MODEL_NAME = "BAAI/bge-base-en"
model = SentenceTransformer(MODEL_NAME)

# ✅ Connect to PostgreSQL (left open on purpose: the next cell reads through `cursor`)
conn = psycopg2.connect(conn_str)
cursor = conn.cursor()
print("✅ Connected to PostgreSQL!")

# ✅ Create the Pinecone client and name the index.
# `pc` and `INDEX_NAME` were previously used here without being defined in this
# cell or any earlier one, so the cell failed on a fresh kernel.
pc = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = "belief-embeddings"  # Same index name the stats cell at the end of the notebook queries

# ✅ Create the index only when it does not exist yet.
# NOTE(review): cloud/region defaults and the cosine metric are assumptions —
# set PINECONE_CLOUD / PINECONE_REGION or adjust to match your Pinecone project.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=model.get_sentence_embedding_dimension(),  # must match the embedding model
        metric="cosine",
        spec=ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD", "aws"),
            region=os.getenv("PINECONE_REGION", "us-east-1"),
        ),
    )
    print(f"✅ Created new Pinecone index: {INDEX_NAME}")
else:
    print(f"ℹ️ Pinecone index already exists: {INDEX_NAME}")

# ✅ Connect to Pinecone index
index = pc.Index(INDEX_NAME)
print(f"✅ Connected to Pinecone index: {INDEX_NAME}")

# ✅ Delete PostgreSQL tables and start fresh
# WARNING: this discards any existing contents of `pinecone_log`.
print("⚠️ Dropping and recreating PostgreSQL tables...")
cursor.execute("DROP TABLE IF EXISTS pinecone_log;")
cursor.execute("""
    CREATE TABLE pinecone_log (
        sep_id INTEGER PRIMARY KEY
    );
""")
conn.commit()
print("✅ PostgreSQL tables reset.")



In [None]:
# Purpose: read each row's `mistral_output` JSON from `sep_embeddings`, embed its
# key beliefs and key concepts, and upsert the vectors into the Pinecone index.
# Relies on `cursor`, `conn`, `model` and `index` created by the setup cell.
import hashlib

# Track errors
error_entries = []        # sep_embeddings ids that failed to parse, embed or upsert
last_processed_id = None  # Most recent sep_embeddings id the loop attempted

batch_count = 0
BATCH_SIZE = 100          # Rows fetched from PostgreSQL per page
UPSERT_THRESHOLD = 100    # Maximum number of vectors sent to Pinecone per upsert call

# Every page, including the first, comes from this one query. Previously the first
# fetch loaded ALL rows and the loop then paged from OFFSET 100 onward, so every
# row after the first 100 was embedded and upserted a second time.
PAGE_QUERY = "SELECT id, title, mistral_output FROM sep_embeddings ORDER BY id LIMIT %s OFFSET %s;"


def _clean(value):
    """Return `value` stripped of surrounding whitespace, or "" when it is not a string.

    Guards against JSON nulls and non-string values, on which `.strip()` would raise.
    """
    return value.strip() if isinstance(value, str) else ""


def _vector_id(sep_id, kind, text):
    """Build a vector ID that is stable across kernel restarts.

    Built-in `hash()` on strings is salted per Python process, so IDs derived from
    it change on every run and re-runs add duplicates instead of overwriting.
    """
    digest = hashlib.sha1(text.encode("utf-8")).hexdigest()[:16]
    return f"{sep_id}_{kind}_{digest}"


def _record_error(sep_id):
    """Add `sep_id` to `error_entries` once."""
    if sep_id not in error_entries:
        error_entries.append(sep_id)


def _flush_upserts(pending, label=""):
    """Send `pending` to Pinecone in slices of UPSERT_THRESHOLD and empty the list.

    A slice that fails is logged, its sep_ids are recorded in `error_entries`, and
    it is dropped, so one bad slice cannot block every later upsert.
    """
    while pending:
        chunk = pending[:UPSERT_THRESHOLD]
        del pending[:UPSERT_THRESHOLD]
        print(f"🚀 🔄 {label}Upserting {len(chunk)} items into Pinecone...")
        try:
            response = index.upsert(chunk)
            print(f"🔄 Pinecone Upsert Response: {response}")
        except Exception as e:
            print(f"🚨 ERROR! Pinecone upsert failed: {e}")
            for _vec_id, _values, metadata in chunk:
                _record_error(metadata["sep_id"])

    # Sanity check after the flush; a failure here must not abort the run.
    try:
        index_stats = index.describe_index_stats()
        print(f"📊 Pinecone Index Stats After Upsert: {index_stats}")
        if index_stats["total_vector_count"] == 0:
            print("🚨 ERROR: Pinecone is not storing the embeddings!")
    except Exception as e:
        print(f"⚠️ Could not read Pinecone index stats: {e}")


def _build_vectors(text_id, document_title, mistral_json):
    """Return the (id, values, metadata) tuples for one document.

    Raises ValueError when `mistral_json` is not a JSON object (or a string holding one).
    """
    if isinstance(mistral_json, str):
        try:
            mistral_json = json.loads(mistral_json)
        except json.JSONDecodeError:
            raise ValueError("Invalid JSON format")

    if not isinstance(mistral_json, dict):
        raise ValueError(f"Expected dictionary, got {type(mistral_json)}")

    # A None title becomes "" because Pinecone does not accept null metadata values.
    title = document_title or ""
    vectors = []

    # Process beliefs (`or []` also covers an explicit JSON null)
    for belief in mistral_json.get("key_beliefs") or []:
        if not isinstance(belief, dict):
            print(f"⚠️ Skipping belief: Expected dict, got {type(belief)}")
            continue

        belief_text = _clean(belief.get("belief"))
        justification = _clean(belief.get("justification"))
        related_concepts = ", ".join(
            _clean(c.get("name"))
            for c in (belief.get("related_concepts") or [])
            if isinstance(c, dict)
        )

        if not belief_text:
            print("⚠️ Skipping belief: Text is missing or empty")
            continue

        print(f"🟢 Belief: {belief_text} (Generating Embedding...)")

        # The embedding covers the belief together with its justification and concepts
        combined_text = f"{belief_text}. Justification: {justification}. Related concepts: {related_concepts}"
        belief_embedding = model.encode(combined_text).tolist()

        if not belief_embedding:
            print(f"❌ ERROR: Empty embedding for belief `{belief_text}` (ID {text_id})")
            continue

        print(f"✅ Embedding Generated: {belief_embedding[:5]}...")  # First 5 dimensions as a sanity check

        vectors.append((
            _vector_id(text_id, "belief", belief_text),
            belief_embedding,
            {
                "sep_id": text_id,
                "document_title": title,
                "type": "belief",
                "belief": belief_text,
                "justification": justification,
                "related_concepts": related_concepts
            }
        ))

    # Process key concepts
    for concept in mistral_json.get("key_concepts") or []:
        if not isinstance(concept, dict):
            print(f"⚠️ Skipping concept: Expected dict, got {type(concept)}")
            continue

        concept_text = _clean(concept.get("name"))

        if not concept_text:
            print("⚠️ Skipping concept: Text is missing or empty")
            continue

        print(f"🔴 Concept: {concept_text} (Generating Embedding...)")

        concept_embedding = model.encode(concept_text).tolist()

        if not concept_embedding:
            print(f"❌ ERROR: Empty embedding for concept `{concept_text}` (ID {text_id})")
            continue

        print(f"✅ Embedding Generated: {concept_embedding[:5]}...")  # First 5 dimensions as a sanity check

        vectors.append((
            _vector_id(text_id, "concept", concept_text),
            concept_embedding,
            {
                "sep_id": text_id,
                "document_title": title,
                "type": "concept",
                "concept": concept_text
            }
        ))

    return vectors


try:
    offset = 0
    while True:
        # 1️⃣ Fetch the next page of JSON entries from PostgreSQL
        cursor.execute(PAGE_QUERY, (BATCH_SIZE, offset))
        rows = cursor.fetchall()
        if not rows:
            break

        print(f"⚙️ Processing {len(rows)} entries in batch {batch_count + 1}...")

        upserts = []  # Vectors waiting to be sent to Pinecone

        for text_id, document_title, mistral_json in rows:
            last_processed_id = text_id
            try:
                print(f"\n🔍 Processing ID {text_id} - Raw Mistral JSON Type: {type(mistral_json)}")
                doc_vectors = _build_vectors(text_id, document_title, mistral_json)
            except Exception as e:
                # The document's vectors are queued only after it was processed in
                # full, so a failing document contributes nothing to the batch.
                print(f"\n🚨 ERROR! Skipping ID {text_id} due to error:\n{e}")
                _record_error(text_id)
                continue  # Skip to next entry

            upserts.extend(doc_vectors)
            print(f"🔹 Added {len(doc_vectors)} vectors for ID {text_id} to upserts! ({len(upserts)} total)")

            # ✅ Flush once enough vectors have accumulated
            if len(upserts) >= UPSERT_THRESHOLD:
                _flush_upserts(upserts)

        # **Final Batch Upsert** for whatever is left from this page
        if upserts:
            _flush_upserts(upserts, label="FINAL ")

        batch_count += 1
        print(f"🚀 Committed batch {batch_count}")

        offset += len(rows)
finally:
    # ✅ Close PostgreSQL connection even when a query fails midway
    cursor.close()
    conn.close()

print("🚀 All embeddings successfully updated in Pinecone!")

# Print error report
if error_entries:
    print("\n⚠️ The following entries failed to process:")
    print(f"🛑 {len(error_entries)} error entries.")
    print(error_entries)

# ✅ Print last processed JSON ID before exit (`is not None` so an ID of 0 is still reported)
if last_processed_id is not None:
    print(f"\n🔄 Last successfully processed JSON ID before exit: {last_processed_id}")


In [None]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone

# Purpose: report how many vectors the Pinecone index currently holds.

# Read the Pinecone API key from the environment / .env file
load_dotenv()
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Index to inspect
INDEX_NAME = "belief-embeddings"  # Change this if necessary

# Open a client and a handle on the index
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(INDEX_NAME)

# Ask Pinecone for the index statistics and pull out the vector count
stats = index.describe_index_stats()
num_entries = stats["total_vector_count"]

print(f"Total number of entries in Pinecone index '{INDEX_NAME}': {num_entries}")
