In [None]:
import snowflake.snowpark as snowpark

from snowflake.snowpark.context import get_active_session
# Grab the currently active Snowpark session; later Python cells issue their
# stage uploads and queries through this handle.
session = get_active_session()

In [None]:
# Names of the Snowflake objects used throughout the notebook.
# SQL cells reference these through {{...}} templating; Python cells use them directly.

# Database that holds the demo objects.
DB_NAME = "SUMMIT_AI_OBS_DEMO"
# Schema inside DB_NAME where the tables, stage and search service live.
SCHEMA_NAME = "DATA"
# Stage the PDF files are uploaded to and parsed from.
STAGE_NAME = "DOCS"
# Warehouse name (same value as the warehouse used by the Cortex Search service).
WH_NAME = "AI_OBS_WAREHOUSE"

In [None]:
-- Show what is currently in the stage so the PDF files can be identified
LIST @{{DB_NAME}}.{{SCHEMA_NAME}}.{{STAGE_NAME}}

In [None]:
import os


def upload_file_to_stage(filename: str, target_stage: str):
    """Upload one local file from the ``pdfs/`` directory to a Snowflake stage.

    Args:
        filename: Path of the file relative to ``pdfs/``; may contain
            sub-directories (e.g. ``"reports/q1.pdf"``).
        target_stage: Name of the stage. It is wrapped in double quotes and
            prefixed with ``@`` to build the stage location.
            NOTE(review): unlike the LS query below, this location is not
            qualified with DB_NAME/SCHEMA_NAME, so it presumably relies on the
            session's current database and schema -- confirm.

    Returns:
        The result of ``session.file.put`` for this upload.
    """
    print(f"Adding file {filename} to {target_stage} stage...")
    put_result = session.file.put(
        local_file_name=f"pdfs/{filename}",
        stage_location=f'@"{target_stage}"',
        auto_compress=False,
        source_compression='AUTO_DETECT',
        overwrite=True)
    return put_result


# Collect every PDF below pdfs/ as a path relative to that directory.
# Keeping the relative path (not just the bare file name) matters because
# os.walk descends into sub-directories: a file at pdfs/sub/a.pdf has to be
# uploaded from "pdfs/sub/a.pdf", not "pdfs/a.pdf".
# The extension check is case-insensitive to match the ILIKE '%.pdf' filter
# used when the staged files are parsed.
local_pdfs = []
for root, dirs, files in os.walk('pdfs/'):
    for file in files:
        if file.lower().endswith('.pdf'):
            rel_path = os.path.relpath(os.path.join(root, file), 'pdfs/')
            local_pdfs.append(rel_path.replace(os.sep, '/'))


# Get list of filenames already in stage. LS returns names prefixed with the
# stage name ("docs/file.pdf"); drop everything up to the first '/'.
stage_listing = session.sql(f'LS @{DB_NAME}.{SCHEMA_NAME}.{STAGE_NAME}').select('"name"').collect()
stage_pdfs = [row[0][row[0].find('/')+1:] for row in stage_listing]

for pdf in local_pdfs:
    # The upload targets the stage location itself with no sub-path, so the
    # staged name to compare against is the file's base name.
    if os.path.basename(pdf) in stage_pdfs:
        print(f"File {pdf} already in {STAGE_NAME} stage!")
    else:
        upload_file_to_stage(pdf, STAGE_NAME)

## Step 1: Parse and Chunk Text from PDFs
We begin by parsing the content of the uploaded PDFs and chunking the text using Snowflake's [PARSE_DOCUMENT](https://docs.snowflake.com/sql-reference/functions/parse_document-snowflake-cortex) and [SPLIT_TEXT_RECURSIVE_CHARACTER](https://docs.snowflake.com/sql-reference/functions/split_text_recursive_character-snowflake-cortex) functions. These steps structure the text into manageable segments optimized for retrieval. To verify that the PDFs were parsed and chunked correctly, we then query the parsed and chunked tables and inspect their contents.

Objective: **Transform unstructured content into indexed chunks for efficient search and retrieval.**

Key Outputs:
- SUMMIT_AI_OBS_DEMO.DATA.PARSED_TEXT: Table containing the parsed output of each PDF.
- SUMMIT_AI_OBS_DEMO.DATA.DOC_CHUNKS: Chunked, searchable content.

In [1]:
-- Table that receives the text extracted from the PDF files:
-- the file's path plus the parsed output for that file
CREATE OR REPLACE TABLE {{DB_NAME}}.{{SCHEMA_NAME}}.PARSED_TEXT (
    relative_path VARCHAR(500),
    raw_text      VARIANT
);

SyntaxError: invalid syntax (579301450.py, line 1)

In [None]:
-- Parse every staged PDF that is not yet in PARSED_TEXT and store the result.
-- All object references are fully qualified and the stage comes from STAGE_NAME,
-- so the statement does not depend on the session's current database/schema.
INSERT INTO {{DB_NAME}}.{{SCHEMA_NAME}}.PARSED_TEXT (relative_path, raw_text)
WITH pdf_files AS (
    SELECT DISTINCT
        METADATA$FILENAME AS relative_path
    FROM @{{DB_NAME}}.{{SCHEMA_NAME}}.{{STAGE_NAME}}
    WHERE METADATA$FILENAME ILIKE '%.pdf'
      -- Exclude files that have already been parsed
      AND METADATA$FILENAME NOT IN (
          SELECT relative_path
          FROM {{DB_NAME}}.{{SCHEMA_NAME}}.PARSED_TEXT
      )
)
SELECT
    relative_path,
    SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
        '@{{DB_NAME}}.{{SCHEMA_NAME}}.{{STAGE_NAME}}',  -- Stage holding the PDFs
        relative_path,  -- File path within the stage
        {'mode': 'layout'}  -- Adjust mode as needed ('layout', 'ocr')
    ) AS raw_text
FROM pdf_files;

In [None]:
-- Review the parsed documents together with a per-document token count
SELECT
    parsed.*,
    SNOWFLAKE.CORTEX.COUNT_TOKENS('mistral-7b', parsed.RAW_TEXT) AS token_count
FROM {{DB_NAME}}.{{SCHEMA_NAME}}.PARSED_TEXT AS parsed;

In [None]:
-- Split each parsed document into overlapping chunks and write one row per chunk to DOC_CHUNKS

CREATE OR REPLACE TABLE {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS AS
WITH split_docs AS (
    SELECT
        relative_path,
        SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
            raw_text:content::STRING,  -- the text sits under the 'content' key of the parsed JSON
            'markdown',                -- format argument passed to the splitter
            2000,                      -- chunk size
            100,                       -- overlap between consecutive chunks
            ['\n\n', '\n']             -- separators to split on
        ) AS chunks
    FROM {{DB_NAME}}.{{SCHEMA_NAME}}.PARSED_TEXT
)
SELECT
    split_docs.relative_path,
    piece.value AS chunk  -- one output row per element of the chunks array
FROM split_docs,
    LATERAL FLATTEN(INPUT => split_docs.chunks) AS piece;

In [None]:
-- Review the chunks together with a per-chunk token count

SELECT
    chunks.*,
    SNOWFLAKE.CORTEX.COUNT_TOKENS('mistral-7b', chunks.CHUNK) AS token_count
FROM {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS AS chunks;

In [None]:
-- Use Cortex to classify each text chunk into one of a fixed set of categories.
-- DOC_CHUNKS is fully qualified in every statement so the cell does not depend
-- on the session's current database/schema.

ALTER TABLE {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS ADD COLUMN IF NOT EXISTS CHUNK_TOPIC VARCHAR(100);

-- The explicit '\n\nText:\n' delimiter keeps the instruction from running
-- straight into the first words of the chunk.
UPDATE {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS
    SET CHUNK_TOPIC = SNOWFLAKE.CORTEX.COMPLETE('llama4-maverick', 
    concat('Categorize the following text as one of the following categories 
    [Customer Reference, Code Example, Benchmark, Technical Blog] 
    and only return the name of the category. No additional text.', '\n\nText:\n', CHUNK));

SELECT * FROM {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS;

In [None]:
-- Preview the text the search index will be built on:
-- document title, document type and chunk text merged into a single column
SELECT
    CONCAT(
        'DOCUMENT_TITLE: ', RELATIVE_PATH,
        '\nDOCUMENT_TYPE: ', CHUNK_TOPIC,
        '\nDOCUMENT_TEXT:\n', CHUNK
    ) AS SEARCH_COL
FROM {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS;

In [None]:
-- Create a search service over the chunked PDF table.
-- SEARCH_COL is the indexed text; CHUNK_TOPIC is declared as an attribute.
-- The warehouse comes from WH_NAME so it stays in sync with the notebook's configuration cell.

CREATE OR REPLACE CORTEX SEARCH SERVICE {{DB_NAME}}.{{SCHEMA_NAME}}.SNOWFLAKE_BLOG_RETRIEVAL
    ON SEARCH_COL
    ATTRIBUTES CHUNK_TOPIC
    WAREHOUSE = {{WH_NAME}}
    TARGET_LAG = '1 hour'
    AS SELECT 
        RELATIVE_PATH,
        CHUNK_TOPIC,
        ('DOCUMENT_TITLE: ' || RELATIVE_PATH || 
        '\nDOCUMENT_TYPE: ' || CHUNK_TOPIC || 
        '\nDOCUMENT_TEXT:\n' || CHUNK) AS SEARCH_COL
    FROM {{DB_NAME}}.{{SCHEMA_NAME}}.DOC_CHUNKS;

In [None]:
# Query the Cortex Search service from Python and assemble the top hits into a
# single context string.

from snowflake.snowpark import Session
from snowflake.core import Root
root = Root(session)

# Handle to the search service created in the SQL cell above.
transcript_search_service = (root
  .databases[DB_NAME]
  .schemas[SCHEMA_NAME]
  .cortex_search_services['SNOWFLAKE_BLOG_RETRIEVAL']
)

resp = transcript_search_service.search(
  query="""Who has used cortex search?""",
  columns=['SEARCH_COL'],
  limit=3
)
results = resp.results

# Append each hit as a numbered context document, separated by a row of asterisks.
context_str = ""
for i, r in enumerate(results):
    context_str += f"Context document {i+1}: {r['SEARCH_COL']}\n****************\n"

# Print once, after the loop: printing inside the loop would repeat the earlier
# documents on every iteration because context_str is cumulative.
print(context_str)

In [None]:
# Run the search again with confidence scores enabled, then keep only the hits
# whose score reaches the threshold and print their text.
resp = transcript_search_service.search(
    query="""Who has used cortex search?""",
    columns=['SEARCH_COL'],
    limit=10,
    experimental={"returnConfidenceScores": True},
)

# Minimum confidence score a hit needs in order to be kept.
confidence_score_threshold = 2

filtered_results = [
    hit
    for hit in resp.results
    if int(hit['@CONFIDENCE_SCORE']) >= confidence_score_threshold
]
context_chunks = [hit['SEARCH_COL'] for hit in filtered_results]

print(f'Found {len(context_chunks)} results!')

# Print each retained chunk followed by a separator line.
for chunk in context_chunks:
    print(chunk)
    print('------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n')