## VectorSearch.ipynb

### Author: Taiob Ali

- Contact: taiob@sqlworldwide.com
- https://bsky.app/profile/sqlworldwide.bsky.social
- https://sqlworldwide.com/
- https://www.linkedin.com/in/sqlworldwide/

Last Modified
September 10, 2025

In [None]:
# Prerequisite: jupysql must be installed first.
#pip install jupysql
import os
import urllib.parse
from getpass import getpass

from sqlalchemy import create_engine

# Connection settings. Server, database and user keep their previous values as
# defaults but can be overridden through environment variables.
server = os.environ.get("AZURE_SQL_SERVER", "ta-aidemosqlserver.database.windows.net")
database = os.environ.get("AZURE_SQL_DATABASE", "testdata")
username = os.environ.get("AZURE_SQL_USER", "taiob")

# The password must never be stored in the notebook: take it from the
# environment, or prompt for it when the variable is not set.
password = os.environ.get("AZURE_SQL_PASSWORD") or getpass("Azure SQL password: ")

# Build the ODBC connection string and URL-encode it so it can be embedded in
# the SQLAlchemy URL below.
params = urllib.parse.quote_plus(
    "Driver={ODBC Driver 17 for SQL Server};"
    f"Server=tcp:{server},1433;"
    f"Database={database};"
    f"Uid={username};"
    f"Pwd={password};"
    "Encrypt=yes;"
    "TrustServerCertificate=yes;" #I had this as yes locally otherwise it didn't work
    "Connection Timeout=240;"
)

connection_string = f"mssql+pyodbc:///?odbc_connect={params}"
engine = create_engine(connection_string)

In [None]:
# Load the "sql" IPython extension (installed with jupysql, see the setup cell)
# so the %sql / %%sql magics are available.
%load_ext sql
# Pass the SQLAlchemy engine created in the setup cell to the %sql magic.
%sql engine

Reference: [Azure OpenAI Embeddings](https://github.com/AzureSQLDB/GenAILab/blob/main/docs/2-creating-embedding-and-storing-in-SQL-database.md)

Create a stored procedure to create embeddings. You will need to change the URL and api-key values.

An embedding is a special format of data representation that machine learning models and algorithms can easily use. The embedding is an information dense representation of the semantic meaning of a piece of text. Each embedding is a vector of floating-point numbers, such that the distance between two embeddings in the vector space is correlated with semantic similarity between two inputs in the original format. For example, if two texts are similar, then their vector representations should also be similar.

In [None]:
%%sql

/*
  dbo.create_embeddings
  Sends @inputText to the Azure OpenAI embeddings REST endpoint and returns the
  resulting vector through @embedding.

  Parameters:
    @inputText  - text to embed.
    @embedding  - OUTPUT; vector(1536) read from $.result.data[0].embedding
                  of the REST response.
  Returns:
    The return code of sp_invoke_external_rest_endpoint (0 on the success path).
  Throws:
    Error 50000, carrying the HTTP status code and the service error message,
    when the REST call returns a non-zero code.

  You will need to change the @url and api-key values for your own deployment.
*/
CREATE OR ALTER PROCEDURE dbo.create_embeddings
@inputText nvarchar(max),
@embedding vector(1536) OUT
AS
DECLARE @url nvarchar(4000) = N'https://ta-openai2.openai.azure.com/openai/deployments/ta-model-text-embedding-ada-002/embeddings?api-version=2023-05-15';

DECLARE @headers nvarchar(300) = N'{"api-key": "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"}';

-- Build the payload with JSON_OBJECT so that quotes, backslashes and control
-- characters inside @inputText are escaped instead of producing invalid JSON.
DECLARE @payload nvarchar(max) = JSON_OBJECT('input': @inputText);
DECLARE @retval int, @response nvarchar(max);

EXEC @retval = sp_invoke_external_rest_endpoint
    @url = @url,
    @method = 'POST',
    @headers = @headers,
    @payload = @payload,
    @timeout = 230,
    @response = @response OUTPUT;

IF (@retval <> 0)
BEGIN
    -- CONCAT treats NULL as an empty string, so the message stays readable
    -- even when the response lacks the status code or the error text.
    DECLARE @msg nvarchar(max) = CONCAT(
        'Error calling OpenAI API', CHAR(13), CHAR(10),
        '[HTTP Status: ', JSON_VALUE(@response, '$.response.status.http.code'), '] ',
        JSON_VALUE(@response, '$.result.error.message'));
    THROW 50000, @msg, 1;
END

SET @embedding = CAST(JSON_QUERY(@response, '$.result.data[0].embedding') AS vector(1536));
RETURN @retval;


Do not execute this cell during live demo.

In [None]:
%%sql
/*
A function to clean up your data (My colleague Howard Dunn wrote this)

Walks @str one character at a time and keeps only the characters matched by
the pattern [a-zA-Z0-9 .,!?]; every other character is dropped.
Returns the filtered string (an empty string when nothing is kept).
*/
CREATE OR ALTER FUNCTION [dbo].[cleanString] (@str NVARCHAR(MAX))
RETURNS NVARCHAR(MAX)
AS
BEGIN
  DECLARE @pos INT = 1;
  DECLARE @length INT = LEN(@str);
  DECLARE @result NVARCHAR(MAX) = '';

  WHILE @pos <= @length
  BEGIN
    -- Append the current character only when it is on the allow-list.
    IF SUBSTRING(@str, @pos, 1) LIKE '[a-zA-Z0-9 .,!?]'
    BEGIN
      SET @result += SUBSTRING(@str, @pos, 1);
    END

    -- Always advance, whether or not the character was kept.
    SET @pos += 1;
  END

  RETURN @result;
END

Do not execute this cell during live demo.

In [None]:
%%sql
/*
This is a simple example of how to use the function above to clean up your data.

Rebuilds dbo.walmartProductsNew from [dbo].[walmart-products], passing the
description column through dbo.cleanString and trimming the result.
The DROP is schema-qualified so it always targets the same table that the
SELECT ... INTO below creates, whatever the caller's default schema is.
*/
DROP TABLE IF EXISTS dbo.walmartProductsNew;

SELECT
  sku,
  brand,
  review_count,
  TRIM(dbo.cleanString(description)) AS description,
  product_id,
  product_name,
  root_category_name,
  unit_price,
  unit,
  aisle,
  free_returns,
  discount,
  id
INTO dbo.walmartProductsNew
FROM [dbo].[walmart-products];

In [None]:
%%sql

/*
Creating a table to store the embeddings.

Copies the first 250 rows (ordered by ID, skipping IDs 2 and 7) into
dbo.vectortable. The DROP uses the same schema and spelling as the
SELECT ... INTO target so that both statements always refer to one table.

NOTE(review): the cleanup cell in this notebook writes dbo.walmartProductsNew,
while this query reads dbo.walmartProducts - confirm which table is intended.
NOTE(review): the reason for excluding IDs 2 and 7 is not stated - confirm.
*/
DROP TABLE IF EXISTS dbo.vectortable;

SELECT TOP (250)
  ID,
  product_name,
  sku,
  brand,
  review_count,
  description
INTO dbo.vectortable
FROM [dbo].[walmartProducts]
WHERE ID NOT IN (2, 7)
ORDER BY [ID];

In [None]:
%%sql

/*
Creating a vector column to store the embeddings.

The table is referenced as dbo.vectortable, the exact name it is created
with, so the statement also resolves on a case-sensitive collation and does
not depend on the caller's default schema.
*/
ALTER TABLE dbo.vectortable
ADD [description_vector] vector(1536);

<br>Run this cell in SQL Server Management Studio (SSMS).<br>In a Jupyter notebook I get NULL in most records, but it works in SSMS.<br>This works in a SQL notebook, but I could not get it to work in Visual Studio Code using a Jupyter notebook.

In [None]:
%%sql

/*
  Generate an embedding for each row of dbo.vectortable and store it in
  description_vector. The text sent to dbo.create_embeddings is
  "product_name: brand: description" (NULL parts become empty strings).
  A failure on one row is reported as a result set and the loop continues.
*/
DECLARE @id INT;
DECLARE @text NVARCHAR(MAX);
DECLARE @vector VECTOR(1536);

-- LOCAL: the cursor is released with the batch, so a failed run does not
--        leave a cursor named row_cursor behind for the next run.
-- STATIC: iterate over a snapshot, because the loop updates the same table.
-- The WHERE clause skips rows with no text at all; the concatenated string
-- itself is never empty because it always contains the ': ' separators.
DECLARE row_cursor CURSOR LOCAL FORWARD_ONLY STATIC READ_ONLY FOR
SELECT
  ID,
  ISNULL(product_name, '') + ': ' + ISNULL(brand, '') + ': ' + ISNULL(description, '') AS text
FROM dbo.vectortable
WHERE ISNULL(product_name, '') <> ''
   OR ISNULL(brand, '') <> ''
   OR ISNULL(description, '') <> '';

OPEN row_cursor;
FETCH NEXT FROM row_cursor INTO @id, @text;

WHILE @@FETCH_STATUS = 0
BEGIN
  -- Start each row from a clean slate so a vector from the previous row can
  -- never be written to this one.
  SET @vector = NULL;

  BEGIN TRY
    EXEC dbo.create_embeddings @text, @vector OUTPUT;

    UPDATE dbo.vectortable
    SET description_vector = @vector
    WHERE ID = @id;
  END TRY
  BEGIN CATCH
    -- Report which row failed and why, then carry on with the next row.
    SELECT
      @id AS FailedID,
      ERROR_NUMBER() AS ErrorNumber,
      ERROR_MESSAGE() AS ErrorMessage;
  END CATCH

  FETCH NEXT FROM row_cursor INTO @id, @text;
END

CLOSE row_cursor;
DEALLOCATE row_cursor;

In [None]:
%%sql
/*
Cleaning up the vector table: delete every row whose description_vector is
still NULL, then show the remaining row count and a sample of ten rows.
*/
DELETE FROM dbo.vectortable
WHERE description_vector IS NULL;

SELECT COUNT(*)
FROM dbo.vectortable;

SELECT TOP 10 *
FROM dbo.vectortable;

In [None]:
%%sql
/*
  DECLARE the search text
  DECLARE a variable to hold the search vector
*/
DECLARE @search_text NVARCHAR(MAX) = 'help me plan a high school graduation party';
DECLARE @search_vector VECTOR(1536);

/*
  GENERATE the search vector using the 'create_embeddings' stored procedure
*/
EXEC dbo.create_embeddings @search_text, @search_vector OUTPUT;

/*
  PERFORM the search query
  CALCULATE the cosine distance between the search vector and product description vectors
  ORDER BY the closest distance

  The distance expression is written once in the CTE and reused for both the
  filter and the ordering. Table and column names use the same spelling as
  the statements that create them, so the query also resolves on a
  case-sensitive collation.
*/
WITH scored AS
(
  SELECT
    product_name,
    brand,
    description,
    vector_distance('cosine', @search_vector, description_vector) AS distance
  FROM dbo.vectortable
)
SELECT TOP (10)
  product_name,
  brand,
  description,
  distance
FROM scored
WHERE distance IS NOT NULL
ORDER BY distance;

### Filtered Semantic Search with SQL

[](https://github.com/AzureSQLDB/GenAILab/blob/main/docs/4-filtered-semantic-search.md#filtered-semantic-search-with-sql)

This section explains how to implement a Filtered Search query in SQL. Hybrid Search combines traditional SQL queries with vector-based search capabilities to enhance search results.

### SQL Query for Hybrid Search

[](https://github.com/AzureSQLDB/GenAILab/blob/main/docs/4-filtered-semantic-search.md#sql-query-for-hybrid-search)

The following SQL script demonstrates a hybrid search in a SQL database. It uses vector embeddings to find the most relevant products based on a textual description and combines that with a filter on the availability of free returns.

In [None]:
%%sql
/*
  DECLARE the search text
  DECLARE a variable to hold the search vector
*/
DECLARE @search_text NVARCHAR(MAX) = 'help me plan a high school graduation party';
DECLARE @search_vector VECTOR(1536);

/*
  GENERATE the search vector using the 'create_embeddings' stored procedure
*/
EXEC dbo.create_embeddings @search_text, @search_vector OUTPUT;

/*
  PERFORM the search query
  CALCULATE the cosine distance between the search vector and product description vectors
  ORDER BY the closest distance plus filter for free returns

  description_vector is qualified with its table alias so the reference stays
  unambiguous in the join, and the distance expression is written once in the
  CTE. Table and column names use the same spelling as the statements that
  create them.

  NOTE(review): the cleanup cell in this notebook writes dbo.walmartProductsNew,
  while this query joins dbo.walmartProducts - confirm which table is intended.
*/
WITH scored AS
(
  SELECT
    vt.product_name,
    vt.brand,
    vt.description,
    vector_distance('cosine', @search_vector, vt.description_vector) AS distance
  FROM dbo.vectortable AS vt
  INNER JOIN dbo.walmartProducts AS wpn
    ON vt.id = wpn.id
  WHERE wpn.free_returns = 'Free 30-day returns'
)
SELECT TOP (10)
  product_name,
  brand,
  description,
  distance
FROM scored
WHERE distance IS NOT NULL
ORDER BY distance;

### Azure OpenAI Recommendations

Copied and edited from [here](https://github.com/AzureSQLDB/GenAILab/blob/main/docs/5-azure-openai-recommendation.md).

<br>Run this cell in SQL Server Management Studio (SSMS).<br>In a Jupyter notebook I get an error about declaring variables, but it works in SSMS.<br>This works in a SQL notebook, but I could not get it to work in Visual Studio Code using a Jupyter notebook.

In [None]:
%%sql

/*
  DECLARE the search text
  DECLARE a variable to hold the search vector
*/
DECLARE @search_text NVARCHAR(MAX) = 'help me plan a high school graduation party';
DECLARE @search_vector VECTOR(1536);

/*
  GENERATE the search vector using the 'create_embeddings' stored procedure
*/
EXEC dbo.create_embeddings @search_text, @search_vector OUTPUT;

/*
  GET the top 25 products that are closest to the search vector.
  Rows sharing a product_name are reduced to the one with the lowest id
  before ranking. Despite its name, tempTable is a regular table; it is
  dropped and rebuilt on every run.
*/
DROP TABLE IF EXISTS tempTable;
WITH cte AS 
(
  SELECT         
    id, 
    product_name, 
    [description], 
    description_vector,        
    ROW_NUMBER() OVER (PARTITION BY product_name ORDER BY id) AS rn
  FROM dbo.vectortable
  WHERE vector_distance('cosine', @search_vector, description_vector) IS NOT NULL
), 
cte2 AS 
(
  SELECT 
    *
  FROM cte 
  WHERE rn = 1
)
SELECT TOP(25)
  id, 
  product_name, 
  [description],
  vector_distance('cosine', @search_vector, description_vector) AS distance
INTO tempTable
FROM cte2
ORDER BY distance;

/*
  AGGREGATE the search results to make them easily consumable by the LLM.
  One line per product, in the form Id=>Product=>Description.
*/
DECLARE @search_output NVARCHAR(MAX);
SELECT 
  @search_output = STRING_AGG(CAST(t.[id] AS VARCHAR(10)) + '=>' + t.product_name + '=>' + t.description, CHAR(13) + CHAR(10))
FROM tempTable AS t;

/*
  GENERATE the payload for the LLM.
  Three messages: the system prompt, the aggregated product list together
  with the required answer format, and finally the user's search text.
*/
DECLARE @llm_payload NVARCHAR(MAX);
SET @llm_payload = 
JSON_OBJECT(
  'messages': JSON_ARRAY(
    JSON_OBJECT(
      'role': 'system',
      'content': '
        You are an awesome AI shopping assistant tasked with helping users find appropriate items they are looking for the occasion. 
        You have access to a list of products, each with an ID, product name, and description, provided to you in the format of "Id=>Product=>Description". 
        When users ask for products for specific occasions, you can leverage this information to provide creative and personalized suggestions. 
        Your goal is to assist users in planning memorable celebrations using the available products.
      '
    ),
    JSON_OBJECT(
      'role': 'user',
      'content': '## Source ##
        ' + @search_output + '
        ## End ##

        Your answer needs to be a JSON object with the following format.
        {
          "answer": // the answer to the question, add a source reference to the end of each sentence. Source reference is the product Id.
          "products": // a comma-separated list of product ids that you used to come up with the answer.
          "thoughts": // brief thoughts on how you came up with the answer, e.g. what sources you used, what you thought about, etc.
        }'
    ),
    JSON_OBJECT(
      'role': 'user',
      'content': @search_text
    )
  ),
  'max_tokens': 800,
  'temperature': 0.3,
  'frequency_penalty': 0,
  'presence_penalty': 0,
  'top_p': 0.95,
  'stop': NULL
);

/*
  INVOKE the LLM to get the response.
  You will need to change the @url and api-key values for your own deployment.
  Do not store a real key in the notebook: a key that has been committed
  should be treated as leaked and rotated.
*/
DECLARE @retval INT, @response NVARCHAR(MAX);
DECLARE @headers NVARCHAR(300) = N'{"api-key": "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX", "content-type": "application/json"}';
EXEC @retval = sp_invoke_external_rest_endpoint
  @url = N'https://ta-openai2.openai.azure.com/openai/deployments/ta-model-gpt-4/chat/completions?api-version=2025-01-01-preview',
  @headers = @headers,
  @method = 'POST',    
  @timeout = 120,
  @payload = @llm_payload,
  @response = @response OUTPUT;
SELECT @retval AS 'Return Code', @response AS 'Response';

/*
  GET the answer from the response.
  The message content of the returned choice is itself a JSON document; the
  outer OPENJSON splits it into key/value rows.
*/
SELECT 
  [key], 
  [value] 
FROM OPENJSON(( 
  SELECT 
    t.value 
  FROM OPENJSON(@response, '$.result.choices') c 
  CROSS APPLY OPENJSON(c.value, '$.message') t
  WHERE t.[key] = 'content'
));