From 85cb773c58e8bc4bceb68874abf0fcb2a4814d0b Mon Sep 17 00:00:00 2001 From: lsingh4634426 Date: Wed, 19 Nov 2025 11:20:19 +0530 Subject: [PATCH 1/4] fix two cells --- .../notebook.ipynb | 116 +++++++++++------- 1 file changed, 72 insertions(+), 44 deletions(-) diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb index a6b6d68..8c59b87 100644 --- a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb @@ -53,12 +53,12 @@ { "cell_type": "code", "execution_count": 2, + "id": "87c6c286", "metadata": {}, "outputs": [], "source": [ "!pip install \"openai\"" - ], - "id": "87c6c286" + ] }, { "cell_type": "code", @@ -94,14 +94,14 @@ { "cell_type": "code", "execution_count": 4, + "id": "9dbe989a", "metadata": {}, "outputs": [], "source": [ "import os\n", "from getpass import getpass\n", "os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key: \")" - ], - "id": "9dbe989a" + ] }, { "cell_type": "code", @@ -135,12 +135,13 @@ "\n", "\n", "References:\n", - "- pdfplumber: https://github.com/jsvine/pdfplumber\n", - "- PyMuPDF (optional alternative): https://pymupdf.readthedocs.io/en/latest/" + "- pdfplumber: [https://github.com/jsvine/pdfplumber]\n", + "- PyMuPDF (optional alternative): [https://pymupdf.readthedocs.io/en/latest/]" ] }, { "cell_type": "markdown", + "id": "050463a1", "metadata": {}, "source": [ "## Uploading PDF File to Stage\n", @@ -149,19 +150,18 @@ "\n", "References:\n", "- [Stage documentation](https://docs.singlestore.com/cloud/load-data/load-data-from-files/stage/)" - ], - "id": "050463a1" + ] }, { "cell_type": "code", "execution_count": 6, + "id": "91b47930", "metadata": {}, "outputs": [], "source": [ "%%sql\n", "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE" - ], - "id": "91b47930" + ] }, { "cell_type": "code", @@ -176,6 +176,7 @@ { "cell_type": "code", "execution_count": 8, + "id": "53fc1109", 
"metadata": {}, "outputs": [], "source": [ @@ -229,8 +230,7 @@ " })\n", "\n", "print(f\"Chunking produced {len(chunks)} chunks.\")" - ], - "id": "53fc1109" + ] }, { "attachments": {}, @@ -358,57 +358,85 @@ { "cell_type": "code", "execution_count": 13, - "id": "00b7c77b", + "id": "2a82d48e", "metadata": {}, "outputs": [], "source": [ - "import time, os\n", - "\n", - "# Ensure API key is set (fallback to environment if not already assigned)\n", - "if not getattr(openai, 'api_key', None):\n", - " env_key = os.getenv('OPENAI_API_KEY')\n", - " if env_key:\n", - " openai.api_key = env_key.strip()\n", - " print('Hydrated openai.api_key from environment variable.')\n", - " else:\n", - " raise ValueError('OpenAI API key not set. Set OPENAI_API_KEY env or rerun key input cell.')\n", - "\n", - "# Re-initialize new SDK client if available and was None\n", - "if _use_new and _openai_client is None:\n", + "import os, time, json\n", + "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n", + "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n", + "\n", + "_openai_client = None\n", + "_use_new = False\n", + "\n", + "def _ensure_key():\n", + " key = os.getenv(\"OPENAI_API_KEY\")\n", + " if key and not getattr(openai, 'api_key', None):\n", + " openai.api_key = key.strip()\n", + " if not getattr(openai, 'api_key', None):\n", + " raise ValueError(\"OpenAI API key not set. 
Set OPENAI_API_KEY env or run the key input cell.\")\n", + "\n", + "def _init_client():\n", + " global _openai_client, _use_new\n", + " if _openai_client is not None:\n", + " return\n", " try:\n", " from openai import OpenAI\n", " _openai_client = OpenAI(api_key=openai.api_key)\n", - " print('Reinitialized OpenAI client.')\n", - " except Exception as e:\n", - " print(f'Failed to reinitialize OpenAI client: {e}')\n", + " _use_new = True\n", + " except Exception:\n", + " _openai_client = None\n", " _use_new = False\n", "\n", + "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.embeddings.create(model=model, input=text_list)\n", + " return [d.embedding for d in resp.data]\n", + " else:\n", + " resp = openai.Embedding.create(model=model, input=text_list)\n", + " return [d['embedding'] for d in resp['data']]\n", + "\n", + "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n", + " return embed_texts([text], model=model)[0]\n", + "\n", + "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n", + " return resp.choices[0].message.content\n", + " else:\n", + " resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n", + " return resp['choices'][0]['message']['content']" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "00b7c77b", + "metadata": {}, + "outputs": [], + "source": [ + "import json, time\n", + "\n", + "EMBED_MODEL = \"text-embedding-3-small\"\n", "BATCH_SIZE = 10\n", - "MODEL = EMBED_MODEL\n", "MAX_RETRIES = 3\n", "\n", + "# Fetch rows needing embeddings\n", "s2_cur.execute(\"SELECT element_id, text FROM unstructured_data WHERE text_embedding IS NULL OR text_embedding 
= '';\")\n", "rows = s2_cur.fetchall()\n", "print(f\"Rows needing embeddings: {len(rows)}\")\n", "\n", - "use_new = _use_new\n", - "\n", - "def embed_batch(text_list):\n", - " if use_new and _openai_client is not None:\n", - " resp = _openai_client.embeddings.create(model=MODEL, input=text_list)\n", - " return [item.embedding for item in resp.data]\n", - " else:\n", - " resp = openai.Embedding.create(model=MODEL, input=text_list)\n", - " return [item['embedding'] for item in resp['data']]\n", - "\n", "for i in range(0, len(rows), BATCH_SIZE):\n", " batch = rows[i:i+BATCH_SIZE]\n", " texts = [t for _, t in batch]\n", " attempt = 0\n", " while True:\n", " try:\n", - " embeddings = embed_batch(texts)\n", + " embeddings = embed_texts(texts, model=EMBED_MODEL)\n", " break\n", + "\n", " except Exception as e:\n", " attempt += 1\n", " if attempt >= MAX_RETRIES:\n", @@ -440,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "35e10fa7", "metadata": {}, "outputs": [], @@ -452,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "876a636b", "metadata": {}, "outputs": [], @@ -491,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "8a57d965", "metadata": {}, "outputs": [], From 9108de8efba74b9acff176b0abc16077b84a78bf Mon Sep 17 00:00:00 2001 From: lsingh4634426 Date: Wed, 19 Nov 2025 12:07:34 +0530 Subject: [PATCH 2/4] add cell --- .../notebook.ipynb | 79 ++++++++++++++++--- 1 file changed, 67 insertions(+), 12 deletions(-) diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb index 8c59b87..e3425e1 100644 --- a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb @@ -119,6 +119,61 @@ " raise RuntimeError(f\"SingleStore connection failed: {e}\")" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e5065e46", + 
"metadata": {}, + "outputs": [], + "source": [ + "import os, time, json\n", + "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n", + "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n", + "\n", + "_openai_client = None\n", + "_use_new = False\n", + "\n", + "def _ensure_key():\n", + " key = os.getenv(\"OPENAI_API_KEY\")\n", + " if key and not getattr(openai, 'api_key', None):\n", + " openai.api_key = key.strip()\n", + " if not getattr(openai, 'api_key', None):\n", + " raise ValueError(\"OpenAI API key not set. Set OPENAI_API_KEY env or run the key input cell.\")\n", + "\n", + "def _init_client():\n", + " global _openai_client, _use_new\n", + " if _openai_client is not None:\n", + " return\n", + " try:\n", + " from openai import OpenAI\n", + " _openai_client = OpenAI(api_key=openai.api_key)\n", + " _use_new = True\n", + " except Exception:\n", + " _openai_client = None\n", + " _use_new = False\n", + "\n", + "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.embeddings.create(model=model, input=text_list)\n", + " return [d.embedding for d in resp.data]\n", + " else:\n", + " resp = openai.Embedding.create(model=model, input=text_list)\n", + " return [d['embedding'] for d in resp['data']]\n", + "\n", + "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n", + " return embed_texts([text], model=model)[0]\n", + "\n", + "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n", + " _ensure_key(); _init_client()\n", + " if _use_new and _openai_client is not None:\n", + " resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n", + " return resp.choices[0].message.content\n", + " else:\n", + " resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n", + " return resp['choices'][0]['message']['content']" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ 
-154,7 +209,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "91b47930", "metadata": {}, "outputs": [], @@ -165,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "5f4be9dc", "metadata": {}, "outputs": [], @@ -175,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "53fc1109", "metadata": {}, "outputs": [], @@ -247,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "b4f19b22", "metadata": {}, "outputs": [], @@ -293,7 +348,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "7a9d094a", "metadata": {}, "outputs": [], @@ -315,7 +370,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "ba220cc1", "metadata": {}, "outputs": [], @@ -346,7 +401,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "c95bc511", "metadata": {}, "outputs": [], @@ -357,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 14, "id": "2a82d48e", "metadata": {}, "outputs": [], @@ -412,7 +467,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "00b7c77b", "metadata": {}, "outputs": [], @@ -468,7 +523,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "35e10fa7", "metadata": {}, "outputs": [], @@ -480,7 +535,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "876a636b", "metadata": {}, "outputs": [], @@ -519,7 +574,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "8a57d965", "metadata": {}, "outputs": [], From f3657124a6b3b4a0236db91e47acc84629b86a7e Mon Sep 17 00:00:00 2001 From: lsingh4634426 Date: Wed, 19 Nov 2025 14:50:34 +0530 Subject: [PATCH 3/4] address comments --- notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb index e3425e1..e3b6a6a 100644 --- a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb @@ -215,7 +215,7 @@ "outputs": [], "source": [ "%%sql\n", - "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf'OVERWRITE" + "DOWNLOAD STAGE FILE 'Employee-Handbook.pdf' TO 'Employee-Handbook.pdf' OVERWRITE" ] }, { From a99d801f4563a67009e03933cdb380134c8ca0e3 Mon Sep 17 00:00:00 2001 From: lsingh4634426 Date: Wed, 19 Nov 2025 21:19:06 +0530 Subject: [PATCH 4/4] address comments --- .../notebook.ipynb | 83 ++++--------------- 1 file changed, 14 insertions(+), 69 deletions(-) diff --git a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb index e3b6a6a..9b0f054 100644 --- a/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb +++ b/notebooks/ingest-pdfs-with-pdfplumber/notebook.ipynb @@ -36,7 +36,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install pdfplumber" + "!pip install pdfplumber==0.11.0" ] }, { @@ -57,7 +57,7 @@ "metadata": {}, "outputs": [], "source": [ - "!pip install \"openai\"" + "!pip install \"openai==0.28.1\"" ] }, { @@ -119,61 +119,6 @@ " raise RuntimeError(f\"SingleStore connection failed: {e}\")" ] }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e5065e46", - "metadata": {}, - "outputs": [], - "source": [ - "import os, time, json\n", - "DEFAULT_EMBED_MODEL = \"text-embedding-3-small\"\n", - "DEFAULT_CHAT_MODEL = \"gpt-4o-mini\"\n", - "\n", - "_openai_client = None\n", - "_use_new = False\n", - "\n", - "def _ensure_key():\n", - " key = os.getenv(\"OPENAI_API_KEY\")\n", - " if key and not getattr(openai, 'api_key', None):\n", - " openai.api_key = key.strip()\n", - " if not getattr(openai, 'api_key', None):\n", - " raise ValueError(\"OpenAI API key not set. 
Set OPENAI_API_KEY env or run the key input cell.\")\n", - "\n", - "def _init_client():\n", - " global _openai_client, _use_new\n", - " if _openai_client is not None:\n", - " return\n", - " try:\n", - " from openai import OpenAI\n", - " _openai_client = OpenAI(api_key=openai.api_key)\n", - " _use_new = True\n", - " except Exception:\n", - " _openai_client = None\n", - " _use_new = False\n", - "\n", - "def embed_texts(text_list, model=DEFAULT_EMBED_MODEL):\n", - " _ensure_key(); _init_client()\n", - " if _use_new and _openai_client is not None:\n", - " resp = _openai_client.embeddings.create(model=model, input=text_list)\n", - " return [d.embedding for d in resp.data]\n", - " else:\n", - " resp = openai.Embedding.create(model=model, input=text_list)\n", - " return [d['embedding'] for d in resp['data']]\n", - "\n", - "def embed_text(text, model=DEFAULT_EMBED_MODEL):\n", - " return embed_texts([text], model=model)[0]\n", - "\n", - "def chat_completion(messages, model=DEFAULT_CHAT_MODEL, temperature=0):\n", - " _ensure_key(); _init_client()\n", - " if _use_new and _openai_client is not None:\n", - " resp = _openai_client.chat.completions.create(model=model, messages=messages, temperature=temperature)\n", - " return resp.choices[0].message.content\n", - " else:\n", - " resp = openai.ChatCompletion.create(model=model, messages=messages, temperature=temperature)\n", - " return resp['choices'][0]['message']['content']" - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -209,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "91b47930", "metadata": {}, "outputs": [], @@ -220,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "5f4be9dc", "metadata": {}, "outputs": [], @@ -230,7 +175,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "53fc1109", "metadata": {}, "outputs": [], @@ -302,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 10, + 
"execution_count": 9, "id": "b4f19b22", "metadata": {}, "outputs": [], @@ -348,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "7a9d094a", "metadata": {}, "outputs": [], @@ -370,7 +315,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "ba220cc1", "metadata": {}, "outputs": [], @@ -401,7 +346,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "c95bc511", "metadata": {}, "outputs": [], @@ -412,7 +357,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "2a82d48e", "metadata": {}, "outputs": [], @@ -467,7 +412,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "id": "00b7c77b", "metadata": {}, "outputs": [], @@ -523,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "35e10fa7", "metadata": {}, "outputs": [], @@ -535,7 +480,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "876a636b", "metadata": {}, "outputs": [], @@ -574,7 +519,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "8a57d965", "metadata": {}, "outputs": [],