In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# Grab the Snowpark session via get_active_session() so Python cells can
# query Snowflake alongside the SQL cells.
# NOTE(review): `session` is not referenced by any cell visible in this
# section — confirm a later cell actually uses it.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
-- STEP 1: PARSE EVERY FILE ON THE @documents STAGE INTO PLAIN TEXT
--
-- One row per staged file, taken from the stage's directory table.
-- PARSE_DOCUMENT returns an OBJECT (extracted text under "content", plus
-- metadata), so the text is pulled out with :content before casting.
-- Casting the whole object would store serialized JSON (wrapper keys and
-- escaped newlines) in PARSED_CONTENT, which would then leak into the LLM
-- prompt and the text splitter downstream.
--
-- To parse a single file while debugging, append e.g.
--     WHERE RELATIVE_PATH = '<file name>'
CREATE OR REPLACE TABLE DOCUMENTS_RAW_PARSED AS
SELECT
    RELATIVE_PATH AS FILENAME,
    FILE_URL,
    SIZE,
    LAST_MODIFIED,
    TO_VARCHAR(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            '@documents',
            RELATIVE_PATH,
            {'mode': 'OCR'}
        ):content
    ) AS PARSED_CONTENT
FROM
    DIRECTORY(@documents);

    

In [None]:
-- Inspect the parsed documents produced by the previous cell.
SELECT
    *
FROM
    DOCUMENTS_RAW_PARSED;

In [None]:
-- Generate a short "key: value" summary (CONTENT_METADATA) for each parsed
-- document by prompting an LLM with the start of the document.
--
-- The prompt literal below spans several lines; its line breaks and leading
-- spaces are part of the string sent to the model, so it is kept flush-left
-- and must not be re-indented.
-- NOTE(review): instruction 3 reads "at most the most important keys" — a
-- count appears to be missing after "at most"; confirm the intended wording.
CREATE OR REPLACE TABLE DOCUMENTS_RAW_PARSED_METADATA AS (
    SELECT
        FILENAME,
        FILE_URL,
        PARSED_CONTENT,
        SNOWFLAKE.CORTEX.COMPLETE(
            'claude-4-sonnet',
'I am going to provide a document which will be indexed by a retrieval system containing many similar documents. I want you to provide key information associated with this document that can help differentiate this document in the index. Follow these instructions:
    
        1. Do not dwell on low level details. Only provide key high level information that a human might be expected to provide when searching for this doc.
    
        2. Do not use any formatting, just provide keys and values using a colon to separate key and value. Have each key and value be on a new line.
    
        3. Only extract at most the most important keys and values that could be relevant for this document and used in retrieval'
            || '\n\nDoc starts here:\n'
            -- Only the first 4000 characters are sent to the model.
            -- SUBSTR positions are 1-based, so start at 1 rather than 0.
            || SUBSTR(PARSED_CONTENT, 1, 4000)
            || '\nDoc ends here\n\n'
        ) AS CONTENT_METADATA
    FROM
        DOCUMENTS_RAW_PARSED
);

In [None]:
-- Spot check: rows whose generated metadata mentions "Exhibit 10.33"
-- (ILIKE makes the match case-insensitive).
SELECT
    *
FROM
    DOCUMENTS_RAW_PARSED_METADATA
WHERE
    CONTENT_METADATA ILIKE '%Exhibit 10.33%';

In [None]:
-- Cortex Search service over the per-document metadata summaries.
-- The search column is CONTENT_METADATA; FILENAME is declared as an
-- attribute. Every column of DOCUMENTS_RAW_PARSED_METADATA is fed to the
-- service through the SELECT * source query.
CREATE OR REPLACE CORTEX SEARCH SERVICE CS_DOCUMENTS_METADATA
    ON CONTENT_METADATA
    ATTRIBUTES FILENAME
    WAREHOUSE = DEMO_WH
    TARGET_LAG = '1 minute'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT
        *
    FROM
        DOCUMENTS_RAW_PARSED_METADATA
);

In [None]:
-- STEP 2: GENERATE CHUNKS AND PREPEND CONTEXT TO EACH CHUNK
--
-- Splits every document's PARSED_CONTENT into overlapping chunks and
-- prefixes each chunk with that document's CONTENT_METADATA, separated by a
-- blank line. Output is one row per chunk:
-- FILENAME, FILE_URL, CONTEXTUALIZED_CHUNK.
--
-- The metadata table is scanned once and its columns are carried alongside
-- the flattened chunks, so no join back on FILENAME is needed (such a join
-- would multiply rows if a FILENAME ever appeared more than once).
CREATE OR REPLACE TABLE CHUNKS_CONTEXTUALIZED AS (
    SELECT
        DOCS.FILENAME,
        DOCS.FILE_URL,
        CONCAT(
            -- If metadata is NULL, keep the bare chunk instead of letting
            -- CONCAT turn the whole value into NULL.
            COALESCE(DOCS.CONTENT_METADATA || '\n\n', ''),
            -- FLATTEN yields VARIANT values; cast explicitly to text.
            CHUNKS.VALUE::VARCHAR
        ) AS CONTEXTUALIZED_CHUNK
    FROM
        DOCUMENTS_RAW_PARSED_METADATA DOCS,
        LATERAL FLATTEN(
            input => SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
                DOCS.PARSED_CONTENT,
                'none',
                1800, -- SET CHUNK SIZE
                300   -- SET CHUNK OVERLAP
            )
        ) CHUNKS
);

In [None]:
-- Inspect the contextualized chunks produced by the previous cell.
SELECT
    *
FROM
    CHUNKS_CONTEXTUALIZED;

In [None]:
-- Cortex Search service over the contextualized chunks.
-- The search column is CONTEXTUALIZED_CHUNK; FILENAME is declared as an
-- attribute. Every column of CHUNKS_CONTEXTUALIZED is fed to the service
-- through the SELECT * source query.
CREATE OR REPLACE CORTEX SEARCH SERVICE CS_DOCUMENTS_CHUNKS
    ON CONTEXTUALIZED_CHUNK
    ATTRIBUTES FILENAME
    WAREHOUSE = DEMO_WH
    TARGET_LAG = '1 minute'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT
        *
    FROM
        CHUNKS_CONTEXTUALIZED
);