diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb index a6b6d68..9b0f054 100644 --- a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install pdfplumber" + "!pip install pdfplumber==0.11.0" ] }, { @@ -53,12 +53,12 @@ { "cell_type": "code", "execution_count": 2, + "id": "87c6c286", "metadata": {}, "outputs": [], "source": [ - "!pip install \"openai\"" - ], - "id": "87c6c286" + "!pip install \"openai==0.28.1\"" + ] }, { "cell_type": "code", @@ -94,14 +94,14 @@ { "cell_type": "code", "execution_count": 4, + "id": "9dbe989a", "metadata": {}, "outputs": [], "source": [ "import os\n", "from getpass import getpass\n", "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")" - ], - "id": "9dbe989a" + ] }, { "cell_type": "code", @@ -135,12 +135,13 @@ "\n", "\n", "References:\n", - "- pdfplumber: https://github.com/jsvine/pdfplumber\n", - "- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/" + "- pdfplumber: [https://github.com/jsvine/pdfplumber](https://github.com/jsvine/pdfplumber)\n", + "- PyMuPDF (optional alternative): [https://pymupdf.readthedocs.io/en/latest/](https://pymupdf.readthedocs.io/en/latest/)" ] }, { "cell_type": "markdown", + "id": "050463a1", "metadata": {}, "source": [ "## Uploading PDF File to Stage\n", @@ -149,19 +150,18 @@ "\n", "References:\n", "- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)" - ], - "id": "050463a1" + ] }, { "cell_type": "code", "execution_count": 6, + "id": "91b47930", "metadata": {}, "outputs": [], "source": [ "%%sql\n", - "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE" - ], - "id": "91b47930" + "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE" + ] }, { "cell_type": "code", @@ -176,6 +176,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "53fc1109", 
"metadata": {}, "outputs": [], "source": [ @@ -229,8 +230,7 @@ " })\n", "\n", "print(f\"Chunking produced {len(chunks)} chunks.\")" - ], - "id": "53fc1109" + ] }, { "attachments": {}, @@ -358,57 +358,85 @@ { "cell_type": "code", "execution_count": 13, - "id": "00b7c77b", + "id": "2a82d48e", "metadata": {}, "outputs": [], "source": [ - "import time, os\n", - "\n", - "# Ensure API key is set (fallback to environment if not already assigned)\n", - "if not getattr(openai, 'api_key', None):\n", - " env_key = os.getenv('OPENAI_API_KEY')\n", - " if env_key:\n", - " openai.api_key = env_key.strip()\n", - " print('Hydrated openai.api_key from environment variable.')\n", - " else:\n", - " raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n", - "\n", - "# Re-initialize new SDK client if available and was None\n", - "if _use_new and _openai_client is None:\n", + "import os, time, json\n", + "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n", + "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n", + "\n", + "_openai_client = None\n", + "_use_new = False\n", + "\n", + "def _ensure_key():\n", + " key = os.getenv(\"OPENAI_API_KEY\")\n", + " if key and not getattr(openai, 'api_key', None):\n", + " openai.api_key = key.strip()\n", + " if not getattr(openai, 'api_key', None):\n", + " raise ValueError(\"OpenAI API key not set. 
Set OPENAI_API_KEY env or run the key input cell.\")\n", + "\n", + "def _init_client():\n", + " global _openai_client, _use_new\n", + " if _openai_client is not None:\n", + " return\n", " try:\n", " from openai import OpenAI\n", " _openai_client = OpenAI(api_key=openai.api_key)\n", - " print('Reinitialized OpenAI client.')\n", - " except Exception as e:\n", - " print(f'Failed to reinitialize OpenAI client: {e}')\n", + " _use_new = True\n", + " except Exception:\n", + " _openai_client = None\n", " _use_new = False\n", "\n", + "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.embeddings.create(model=model, input=text_list)\n", + " return [d.embedding for d in resp.data]\n", + " else:\n", + " resp = openai.Embedding.create(model=model, input=text_list)\n", + " return [d['embedding'] for d in resp['data']]\n", + "\n", + "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n", + " return embed_texts([text], model=model)[0]\n", + "\n", + "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n", + " return resp.choices[0].message.content\n", + " else:\n", + " resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n", + " return resp['choices'][0]['message']['content']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "00b7c77b", + "metadata": {}, + "outputs": [], + "source": [ + "import json, time\n", + "\n", + "EMBED_MODEL = \"text-embedding-3-small\"\n", "BATCH_SIZE = 10\n", - "MODEL = EMBED_MODEL\n", "MAX_RETRIES = 3\n", "\n", + "# Fetch rows needing embeddings\n", "s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding 
= '';\")\n", "rows = s2_cur.fetchall()\n", "print(f\"Rows needing embeddings: {len(rows)}\")\n", "\n", - "use_new = _use_new\n", - "\n", - "def embed_batch(text_list):\n", - " if use_new and _openai_client is not None:\n", - " resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n", - " return [item.embedding for item in resp.data]\n", - " else:\n", - " resp = openai.Embedding.create(model=MODEL, input=text_list)\n", - " return [item['embedding'] for item in resp['data']]\n", - "\n", "for i in range(0, len(rows), BATCH_SIZE):\n", " batch = rows[i:i+BATCH_SIZE]\n", " texts = [t for _, t in batch]\n", " attempt = 0\n", " while True:\n", " try:\n", - " embeddings = embed_batch(texts)\n", + " embeddings = embed_texts(texts, model=EMBED_MODEL)\n", " break\n", + "\n", " except Exception as e:\n", " attempt += 1\n", " if attempt >= MAX_RETRIES:\n", @@ -440,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "35e10fa7", "metadata": {}, "outputs": [], @@ -452,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "876a636b", "metadata": {}, "outputs": [], @@ -491,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "8a57d965", "metadata": {}, "outputs": [],