Iterate over the JSON entries stored in PostgreSQL and extract the valuable information from each one. This is the main knowledge-transformation step.

In [14]:
import os
import json
import psycopg2
import requests
import time
from tqdm import tqdm  # Progress bar
from dotenv import load_dotenv
from queue import Queue

# ✅ Load environment variables from .env file
load_dotenv()

# ✅ Ensure required environment variables are loaded
# MISTRAL_API_KEY_3 is read and sliced below just like the other two keys, so it
# must be validated here as well; otherwise a missing key surfaces as a TypeError
# on the print line instead of a clear EnvironmentError.
REQUIRED_ENV_VARS = [
    "MISTRAL_API_KEY",
    "MISTRAL_API_KEY_2",
    "MISTRAL_API_KEY_3",
    "POSTGRES_PASSWORD",
    "POSTGRES_DB",
]
for var in REQUIRED_ENV_VARS:
    if not os.getenv(var):
        raise EnvironmentError(f"❌ Missing required environment variable: {var}")

MISTRAL_API_KEY_1 = os.getenv("MISTRAL_API_KEY")  # First API key
MISTRAL_API_KEY_2 = os.getenv("MISTRAL_API_KEY_2")  # Second API key
MISTRAL_API_KEY_3 = os.getenv("MISTRAL_API_KEY_3")  # Third API key
BATCH_SIZE = 10  # Upload to PostgreSQL every 10 processed entries

# Only the last six characters of each key are printed, to tell the keys apart.
print("🔑 API Keys Loaded:")
print(f"MISTRAL_API_KEY_1: {MISTRAL_API_KEY_1[-6:]}")
print(f"MISTRAL_API_KEY_2: {MISTRAL_API_KEY_2[-6:]}")
print(f"MISTRAL_API_KEY_3: {MISTRAL_API_KEY_3[-6:]}")


# ✅ PostgreSQL Connection
# Opens the module-level `conn` / `cursor` used by the rest of the notebook.
# Database name and password come from the environment; user, host and port are fixed.
try:
    conn = psycopg2.connect(
        dbname=os.getenv("POSTGRES_DB"),
        user="rohansharma",
        password=os.getenv("POSTGRES_PASSWORD"),
        host="localhost",
        port="5432"
    )
    cursor = conn.cursor()
    print("✅ Successfully connected to PostgreSQL!")
except psycopg2.Error as e:
    print(f"❌ Failed to connect to PostgreSQL: {e}")
    # `exit()` is a helper meant for the interactive shell and is not guaranteed
    # to halt a running notebook cell; raising SystemExit stops execution here
    # (exit status 1 when run as a script) so nothing below runs without `cursor`.
    raise SystemExit(1) from e

# ✅ Fetch unprocessed rows (id, content) in id order
# The rows are fetched once and the count is derived from them, so the number
# reported here and used as the progress-bar total always matches what is queued.
try:
    cursor.execute("""
        SELECT id, content FROM sep_embeddings
        WHERE mistral_output IS NULL
        ORDER BY id;
    """)
    rows = cursor.fetchall()
except psycopg2.Error as e:
    print(f"❌ Error executing query: {e}")
    # `exit()` is an interactive-shell helper and may not stop a notebook cell;
    # SystemExit halts execution here so later code never runs without `rows`.
    raise SystemExit(1) from e

# ✅ Count total unprocessed entries
total_entries = len(rows)
print(f"📊 Total unprocessed entries: {total_entries}")

batch_updates = []

# ✅ Initialize progress bar
progress_bar = tqdm(total=total_entries, desc="Processing Entries", unit="entry")

# ✅ One input queue per API key, plus a shared queue for the parsed responses
queue_1, queue_2, queue_3, response_queue = Queue(), Queue(), Queue(), Queue()

# ✅ Round-robin the rows over the three key queues.
# Counting from 1, the remainder of the position modulo 3 picks the queue:
#   remainder 0 → queue_1 (API Key 1), 1 → queue_2 (API Key 2), 2 → queue_3 (API Key 3)
for i, (entry_id, content) in enumerate(rows, start=1):
    (queue_1, queue_2, queue_3)[i % 3].put((entry_id, content))


🔑 API Keys Loaded:
MISTRAL_API_KEY_1: ptbzbx
MISTRAL_API_KEY_2: pGxUxC
MISTRAL_API_KEY_3: m9WQW8
✅ Successfully connected to PostgreSQL!
📊 Total unprocessed entries: 34723


Processing Entries:   0%|          | 121/34828 [07:23<35:21:42,  3.67s/entry]


In [16]:
import requests
import json
import sys
import threading
import time
from queue import Empty

# ✅ One independent lock per API key; call_mistral_api holds the lock during
#    its pre-request pause to enforce **1 RPS per key**
api_lock_1, api_lock_2, api_lock_3 = (threading.Lock() for _ in range(3))

def call_mistral_api(api_key, lock, entry_id, content, attempt=0, timeout=120):
    """Ask the Mistral chat API for a structured JSON extraction of one entry.

    Args:
        api_key: Bearer token sent in the Authorization header.
        lock: Lock held during a 1.1 s pause before every request, so callers
            sharing the same lock start their requests at least 1.1 s apart.
        entry_id: Identifier returned with the result and used in log lines.
        content: Text inserted into the user prompt.
        attempt: Number of 429 retries already consumed (normally left at 0).
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        ``(entry_id, parsed_json)`` on success. ``None`` when the request fails
        (network error, timeout, non-200/429 status), when the reply cannot be
        parsed as the expected JSON, or when the API still answers 429 after
        5 retries.
    """
    url = "https://api.mistral.ai/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json"
    }

    messages = [
        {"role": "system", "content": "Extract structured philosophical knowledge. Your response should always be a single valid JSON object."},
        {"role": "user", "content": f"""
        You are an AI tasked with extracting structured philosophical knowledge from a given text.
        Your goal is to **categorize the entry and extract key information in a structured format.**  
        Be **as detailed as possible while remaining concise.**  
        **Always return a single JSON object, never multiple JSON objects.**

        ## **Input:**
        {content}

        ## **Output Format (JSON):**
        {{
            "category": "thinker" | "concept" | "era",
            "metadata": {{
                "name": "...", 
                "description": "...",
                "time_period": "..."  
            }},
            "key_beliefs": [  
                {{ "belief": "...", "justification": "...", "related_concepts": ["...", "..."] }}
            ],
            "key_concepts": [  
                {{ "name": "...", "definition": "...", "related_fields": ["...", "..."] }}
            ],
            "associated_thinkers": ["...", "..."],
            "associated_eras": ["...", "..."]
        }}
        **Never return multiple JSON objects.**
        """}
    ]

    data = {
        "model": "mistral-medium-2312",
        "messages": messages,
        "temperature": 0.4
    }

    while True:
        with lock:  # ✅ Ensures strict **1 request per second per key**
            time.sleep(1.1)  # ✅ Strict rate limit enforcement

        try:
            # A timeout keeps a stalled connection from blocking the worker forever.
            response = requests.post(url, headers=headers, json=data, timeout=timeout)
        except requests.RequestException as exc:
            # Network failures are reported like any other failed request instead
            # of propagating and killing the calling worker thread.
            print(f"\n❌ API request failed for entry ID {entry_id}: {exc}")
            return None

        if response.status_code == 200:
            try:
                parsed_response = response.json()
                return entry_id, json.loads(parsed_response["choices"][0]["message"]["content"])  # ✅ Ensure Valid JSON
            except (ValueError, KeyError, IndexError, TypeError):
                # ValueError covers JSON decoding errors; the others cover a
                # well-formed body that lacks the expected choices/message shape.
                print(f"❌ Failed to parse API response for entry ID {entry_id}.")
                return None

        if response.status_code != 429:
            print(f"\n❌ API request failed for entry ID {entry_id}: {response.status_code} {response.text}")
            return None

        if attempt >= 5:  # 🚀 Retry up to 5 times with exponential backoff
            print(f"\n❌ Entry {entry_id} failed after 5 retries. Skipping...")
            return None

        wait_time = 2 ** attempt  # Exponential backoff: 1s → 2s → 4s → 8s → 16s
        print(f"\n⏳ Rate limited (429) for entry {entry_id}. Retrying in {wait_time} seconds...")
        time.sleep(wait_time)
        attempt += 1


In [None]:
import threading
import json
import time
import psycopg2
from queue import Queue, Empty

# ✅ Define API worker function
def api_worker(queue, api_key, lock):
    """Drain one input queue, calling the API for each entry.

    Args:
        queue: Queue of ``(entry_id, content)`` tuples, filled before the worker starts.
        api_key: API key passed to ``call_mistral_api`` (its last 4 characters are logged).
        lock: Lock passed to ``call_mistral_api`` for its per-key pacing.

    Successful results are put on the module-level ``response_queue``. The worker
    returns once the queue is empty. An unexpected error on one entry is logged
    and the worker moves on, so a single bad entry cannot strand the rest of the queue.
    """
    while True:
        try:
            # Non-blocking get: an empty queue means this worker's share is done.
            entry_id, content = queue.get_nowait()
        except Empty:
            break

        try:
            response = call_mistral_api(api_key, lock, entry_id, content)
            if response:
                response_queue.put(response)  # ✅ Add only successful responses
                print(f"✅ Processed entry {entry_id} with {api_key[-4:]}")
        except Exception as exc:
            print(f"\n❌ Unexpected error for entry ID {entry_id}: {exc}")
        finally:
            # Always balance the get() so queue.join() / task accounting stays correct.
            queue.task_done()


# ✅ Define response processing function
def process_responses(workers=None, poll_timeout=10):
    """Consume parsed API responses and write them to PostgreSQL in batches.

    Args:
        workers: Threads that produce into ``response_queue``. When omitted, the
            notebook-level ``api_thread_1``..``api_thread_3`` are used if they
            exist. If no workers are known, completion falls back to checking
            that the three input queues are empty.
        poll_timeout: Seconds to wait for a response before re-checking whether
            the producers have finished.

    Each response is serialised with ``json.dumps`` and stored in
    ``sep_embeddings.mistral_output``; a batch is committed every ``BATCH_SIZE``
    responses and once more for any remainder. The progress bar, cursor and
    connection are closed on exit, including when a database call raises.
    """
    if workers is None:
        try:
            workers = (api_thread_1, api_thread_2, api_thread_3)
        except NameError:
            workers = ()

    def upload(batch):
        """Write one batch of (json_text, id) pairs and commit it."""
        cursor.executemany("""
            UPDATE sep_embeddings 
            SET mistral_output = %s 
            WHERE id = %s;
        """, batch)
        conn.commit()

    batch_updates = []

    try:
        while True:
            try:
                entry_id, mistral_output_json = response_queue.get(timeout=poll_timeout)
            except Empty:
                # An empty input queue does not mean the work is done: a worker may
                # still be waiting on a request or a 429 backoff. Worker liveness is
                # checked first, then the response queue, so a response published
                # just before a worker exits is never missed.
                if workers:
                    producers_done = not any(worker.is_alive() for worker in workers)
                else:
                    producers_done = queue_1.empty() and queue_2.empty() and queue_3.empty()
                if producers_done and response_queue.empty():
                    break  # ✅ Exit if everything is processed
                continue

            batch_updates.append((json.dumps(mistral_output_json), entry_id))
            progress_bar.update(1)
            print(f"📦 Current batch size: {len(batch_updates)} | ⬆️ Processing entry {entry_id}")

            # ✅ Upload batch every BATCH_SIZE responses
            if len(batch_updates) >= BATCH_SIZE:
                upload(batch_updates)
                print(f"\n✅ Uploaded {BATCH_SIZE} entries to the database.")
                batch_updates = []  # ✅ Reset batch

        # ✅ Upload any remaining entries
        if batch_updates:
            upload(batch_updates)
            print(f"\n✅ Final batch uploaded successfully ({len(batch_updates)} entries).")

        print("\n🎉 Processing complete!")
    finally:
        # Release resources even if a database call failed part-way through.
        progress_bar.close()
        cursor.close()
        conn.close()


# ✅ Build one API worker thread per key, each bound to its own queue and lock
api_thread_1, api_thread_2, api_thread_3 = (
    threading.Thread(target=api_worker, args=worker_args)
    for worker_args in (
        (queue_1, MISTRAL_API_KEY_1, api_lock_1),
        (queue_2, MISTRAL_API_KEY_2, api_lock_2),
        (queue_3, MISTRAL_API_KEY_3, api_lock_3),
    )
)

# ✅ Single consumer thread that writes the responses to the database
response_thread = threading.Thread(target=process_responses)

# ✅ Start the three API workers first, then the response consumer
for thread in (api_thread_1, api_thread_2, api_thread_3, response_thread):
    thread.start()

# ✅ Wait for the API workers to finish, then for the consumer to drain
#    and upload whatever responses remain
for thread in (api_thread_1, api_thread_2, api_thread_3, response_thread):
    thread.join()


Processing Entries:   0%|          | 8/34723 [00:20<22:02:44,  2.29s/entry]

📦 Current batch size: 1 | ⬆️ Processing entry 210


Processing Entries:   0%|          | 10/34723 [00:28<29:49:44,  3.09s/entry]

✅ Processed entry 118 with bzbx
📦 Current batch size: 2 | ⬆️ Processing entry 118
✅ Processed entry 115 with WQW8


Processing Entries:   0%|          | 12/34723 [00:29<19:23:08,  2.01s/entry]

📦 Current batch size: 3 | ⬆️ Processing entry 250


Processing Entries:   0%|          | 14/34723 [00:31<13:46:11,  1.43s/entry]

📦 Current batch size: 4 | ⬆️ Processing entry 239
✅ Processed entry 112 with xUxC


Processing Entries:   0%|          | 16/34723 [00:40<29:47:59,  3.09s/entry]

✅ Processed entry 124 with WQW8
📦 Current batch size: 5 | ⬆️ Processing entry 124
✅ Processed entry 127 with bzbx


Processing Entries:   0%|          | 18/34723 [00:43<23:18:05,  2.42s/entry]

✅ Processed entry 121 with xUxC
📦 Current batch size: 6 | ⬆️ Processing entry 121


Processing Entries:   0%|          | 20/34723 [00:51<32:32:01,  3.37s/entry]

📦 Current batch size: 7 | ⬆️ Processing entry 212


Processing Entries:   0%|          | 22/34723 [00:54<23:03:44,  2.39s/entry]

✅ Processed entry 130 with xUxC
📦 Current batch size: 8 | ⬆️ Processing entry 130
✅ Processed entry 133 with WQW8


Processing Entries:   0%|          | 24/34723 [00:57<20:06:56,  2.09s/entry]

✅ Processed entry 136 with bzbx
📦 Current batch size: 9 | ⬆️ Processing entry 136

⏳ Rate limited (429) for entry 145. Retrying in 1 seconds...


Processing Entries:   0%|          | 26/34723 [01:02<23:31:38,  2.44s/entry]

📦 Current batch size: 10 | ⬆️ Processing entry 215

✅ Uploaded 10 entries to the database.


Processing Entries:   0%|          | 27/34723 [01:09<34:44:11,  3.60s/entry]

📦 Current batch size: 1 | ⬆️ Processing entry 254
✅ Processed entry 142 with WQW8


Processing Entries:   0%|          | 29/34723 [01:10<21:11:05,  2.20s/entry]

✅ Processed entry 139 with xUxC
📦 Current batch size: 2 | ⬆️ Processing entry 139


Re-upload this data to PostgreSQL.

Compute and upload the vector embeddings. 