Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 75 additions & 47 deletions notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
"metadata": {},
"outputs": [],
"source": [
"!pip install pdfplumber"
"!pip install pdfplumber==0.11.0"
]
},
{
Expand All @@ -53,12 +53,12 @@
{
"cell_type": "code",
"execution_count": 2,
"id": "87c6c286",
"metadata": {},
"outputs": [],
"source": [
"!pip install \"openai\""
],
"id": "87c6c286"
"!pip install \"openai==0.28.1\""
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -94,14 +94,14 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "9dbe989a",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from getpass import getpass\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")"
],
"id": "9dbe989a"
]
},
{
"cell_type": "code",
Expand Down Expand Up @@ -135,12 +135,13 @@
"\n",
"\n",
"References:\n",
"- pdfplumber: https://github.com/jsvine/pdfplumber\n",
"- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/"
"- pdfplumber: [https://github.com/jsvine/pdfplumber]\n",
"- PyMuPDF (optional alternative): [https://pymupdf.readthedocs.io/en/latest/]"
]
},
{
"cell_type": "markdown",
"id": "050463a1",
"metadata": {},
"source": [
"## Uploading PDF File to Stage\n",
Expand All @@ -149,19 +150,18 @@
"\n",
"References:\n",
"- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)"
],
"id": "050463a1"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "91b47930",
"metadata": {},
"outputs": [],
"source": [
"%%sql\n",
"DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE"
],
"id": "91b47930"
"DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE"
]
},
{
"cell_type": "code",
Expand All @@ -176,6 +176,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "53fc1109",
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -229,8 +230,7 @@
" })\n",
"\n",
"print(f\"Chunking produced {len(chunks)} chunks.\")"
],
"id": "53fc1109"
]
},
{
"attachments": {},
Expand Down Expand Up @@ -358,57 +358,85 @@
{
"cell_type": "code",
"execution_count": 13,
"id": "00b7c77b",
"id": "2a82d48e",
"metadata": {},
"outputs": [],
"source": [
"import time, os\n",
"\n",
"# Ensure API key is set (fallback to environment if not already assigned)\n",
"if not getattr(openai, 'api_key', None):\n",
" env_key = os.getenv('OPENAI_API_KEY')\n",
" if env_key:\n",
" openai.api_key = env_key.strip()\n",
" print('Hydrated openai.api_key from environment variable.')\n",
" else:\n",
" raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n",
"\n",
"# Re-initialize new SDK client if available and was None\n",
"if _use_new and _openai_client is None:\n",
"import os, time, json\n",
"DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n",
"DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n",
"\n",
"_openai_client = None\n",
"_use_new = False\n",
"\n",
"def _ensure_key():\n",
" key = os.getenv(\"OPENAI_API_KEY\")\n",
" if key and not getattr(openai, 'api_key', None):\n",
" openai.api_key = key.strip()\n",
" if not getattr(openai, 'api_key', None):\n",
" raise ValueError(\"OpenAI API key not set. Set OPENAI_API_KEY env or run the key input cell.\")\n",
"\n",
"def _init_client():\n",
" global _openai_client, _use_new\n",
" if _openai_client is not None:\n",
" return\n",
" try:\n",
" from openai import OpenAI\n",
" _openai_client = OpenAI(api_key=openai.api_key)\n",
" print('Reinitialized OpenAI client.')\n",
" except Exception as e:\n",
" print(f'Failed to reinitialize OpenAI client: {e}')\n",
" _use_new = True\n",
" except Exception:\n",
" _openai_client = None\n",
" _use_new = False\n",
"\n",
"def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n",
" _ensure_key(); _init_client()\n",
" if _use_new and _openai_client is not None:\n",
" resp = _openai_client.embeddings.create(model=model, input=text_list)\n",
" return [d.embedding for d in resp.data]\n",
" else:\n",
" resp = openai.Embedding.create(model=model, input=text_list)\n",
" return [d['embedding'] for d in resp['data']]\n",
"\n",
"def embed_text(text, model=DEFAULT_EMBED_MODEL):\n",
" return embed_texts([text], model=model)[0]\n",
"\n",
"def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n",
" _ensure_key(); _init_client()\n",
" if _use_new and _openai_client is not None:\n",
" resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n",
" return resp.choices[0].message.content\n",
" else:\n",
" resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n",
" return resp['choices'][0]['message']['content']"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "00b7c77b",
"metadata": {},
"outputs": [],
"source": [
"import json, time\n",
"\n",
"EMBED_MODEL = \"text-embedding-3-small\"\n",
"BATCH_SIZE = 10\n",
"MODEL = EMBED_MODEL\n",
"MAX_RETRIES = 3\n",
"\n",
"# Fetch rows needing embeddings\n",
"s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding = '';\")\n",
"rows = s2_cur.fetchall()\n",
"print(f\"Rows needing embeddings: {len(rows)}\")\n",
"\n",
"use_new = _use_new\n",
"\n",
"def embed_batch(text_list):\n",
" if use_new and _openai_client is not None:\n",
" resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n",
" return [item.embedding for item in resp.data]\n",
" else:\n",
" resp = openai.Embedding.create(model=MODEL, input=text_list)\n",
" return [item['embedding'] for item in resp['data']]\n",
"\n",
"for i in range(0, len(rows), BATCH_SIZE):\n",
" batch = rows[i:i+BATCH_SIZE]\n",
" texts = [t for _, t in batch]\n",
" attempt = 0\n",
" while True:\n",
" try:\n",
" embeddings = embed_batch(texts)\n",
" embeddings = embed_texts(texts, model=EMBED_MODEL)\n",
" break\n",
"\n",
" except Exception as e:\n",
" attempt += 1\n",
" if attempt >= MAX_RETRIES:\n",
Expand Down Expand Up @@ -440,7 +468,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 15,
"id": "35e10fa7",
"metadata": {},
"outputs": [],
Expand All @@ -452,7 +480,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 16,
"id": "876a636b",
"metadata": {},
"outputs": [],
Expand Down Expand Up @@ -491,7 +519,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 17,
"id": "8a57d965",
"metadata": {},
"outputs": [],
Expand Down