In [None]:
# Python packages used by this notebook
import streamlit as st
import pandas as pd

# Snowpark entry point: fetch the currently active session and keep it in `session`
from snowflake.snowpark.context import get_active_session
session = get_active_session()


## Create Database, Schema, and Warehouse
Run the next cell only if these objects do not already exist.

In [None]:
-- Optional one-time setup. Every statement is commented out on purpose;
-- uncomment only the ones you need.
-- CREATE DATABASE IF NOT EXISTS MASS_SEARCH;
-- USE DATABASE MASS_SEARCH;
-- CREATE SCHEMA IF NOT EXISTS DATA;
-- USE SCHEMA DATA;

-- CREATE OR REPLACE WAREHOUSE CHAT_WH
-- WITH WAREHOUSE_SIZE = 'MEDIUM'
-- AUTO_SUSPEND = 120
-- AUTO_RESUME = TRUE
-- INITIALLY_SUSPENDED = TRUE;

-- Switch to the warehouse created above (the search service cell also uses CHAT_WH).
-- USE WAREHOUSE CHAT_WH;

## SPLIT FUNCTION - UDF

In [None]:
-- CREATE FUNCTION TO SPLIT DOCUMENT TEXT INTO OVERLAPPING CHUNKS
-- Python table function: for one input string it returns one row per chunk,
-- (chunk_order, chunk), where chunk_order is the 0-based position of the chunk
-- within the input text. NULL or empty input returns no rows.
create or replace function <DB_NAME>.<SCHEMA_NAME>.text_chunker(pdf_text string)
returns table (chunk_order integer, chunk varchar)
language python
runtime_version = '3.9'
handler = 'text_chunker'
packages = ('snowflake-snowpark-python', 'langchain')
as
$$
from langchain.text_splitter import RecursiveCharacterTextSplitter


class text_chunker:
    """UDTF handler that splits a text into overlapping character chunks."""

    # Sizes are measured in characters (length_function = len). Adjust as you see fit.
    CHUNK_SIZE = 1512
    # Overlap between consecutive chunks; helps keep each chunk contextual.
    CHUNK_OVERLAP = 256

    def __init__(self):
        # Build the splitter once per handler instance instead of once per input row.
        self._splitter = RecursiveCharacterTextSplitter(
            chunk_size = self.CHUNK_SIZE,
            chunk_overlap = self.CHUNK_OVERLAP,
            length_function = len
        )

    def process(self, pdf_text: str):
        """Yield (chunk_order, chunk) tuples for pdf_text, in document order."""
        # A NULL or empty document has nothing to split; emit no rows instead of failing.
        if not pdf_text:
            return

        # enumerate() supplies the 0-based chunk position directly,
        # so no pandas DataFrame is needed just to number the chunks.
        yield from enumerate(self._splitter.split_text(pdf_text))
$$;

## CREATE STAGE

In [None]:
-- Stage that holds the source documents.
-- Snowflake server-side encryption and the directory table are both enabled.
-- NOTE(review): OR REPLACE recreates the stage; re-running this cell after the upload
-- presumably discards files that were already staged. Confirm before re-running.
CREATE OR REPLACE STAGE <DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')
    DIRECTORY = (ENABLE = TRUE);

## UPLOAD DOCS
Before moving on to the next step, upload the documents to the stage created above.

In [None]:
-- Verify the upload: list the files currently in the stage
LIST @<DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>;

## CREATE TABLE FOR THE CHUNKS TO BE STORED FOR EACH DOC

In [None]:
-- Chunk store: one row per text chunk, carrying the file details of its source document

CREATE OR REPLACE TABLE <DB_NAME>.<SCHEMA_NAME>.<TABLE_NAME> (
    RELATIVE_PATH    VARCHAR(16777216),  -- path of the file relative to the stage
    SIZE             NUMBER(38,0),       -- file size
    FILE_URL         VARCHAR(16777216),  -- file URL
    SCOPED_FILE_URL  VARCHAR(16777216),  -- scoped URL (keep whichever URL fits your use case)
    CHUNK_ORDER      INTEGER,            -- position of the chunk in the original document
    CHUNK            VARCHAR(16777216)   -- chunk text
);

## PARSE & CHUNK

In [None]:
-- USE CORTEX PARSE_DOCUMENT TO READ EACH STAGED FILE AND text_chunker TO CHUNK IT
-- One row is inserted per chunk. Each file listed in the stage's directory table is
-- parsed in LAYOUT mode and its text is passed to the chunking table function.
insert into <DB_NAME>.<SCHEMA_NAME>.<TABLE_NAME> (relative_path, size, file_url,
                            scoped_file_url, chunk_order, chunk)

    select relative_path, 
            size,
            file_url, 
            build_scoped_file_url(@<DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>, relative_path) as scoped_file_url,
            func.chunk_order as chunk_order,
            func.chunk as chunk
    from 
        directory(@<DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>),
        -- The function is fully qualified so the call resolves regardless of the
        -- session's current database/schema (it was created under this name).
        TABLE(<DB_NAME>.<SCHEMA_NAME>.text_chunker(
            TO_VARCHAR(
                -- PARSE_DOCUMENT returns an object; only its "content" field holds the
                -- extracted text. Casting the whole object would chunk the JSON wrapper
                -- (metadata and escaped newlines) instead of the document text.
                SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
                    @<DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>,
                    relative_path,
                    {'mode': 'LAYOUT'}
                ):content
            )
        )) as func;

In [None]:
-- CHECK CHUNKS TABLE
-- No "@" prefix here: "@" denotes a stage reference, and this is a regular table.
select *
from <DB_NAME>.<SCHEMA_NAME>.<TABLE_NAME>;

## CREATE CORTEX SEARCH SERVICE

In [None]:
-- CREATE CORTEX SEARCH SERVICE
-- Searches over the CHUNK column; RELATIVE_PATH and CHUNK_ORDER are declared as attributes.
-- Object names take no "@" prefix: "@" is only for stage references.
create or replace CORTEX SEARCH SERVICE <DB_NAME>.<SCHEMA_NAME>.<SEARCH_SERVICE_NAME>
ON chunk
ATTRIBUTES RELATIVE_PATH, CHUNK_ORDER
warehouse = CHAT_WH
TARGET_LAG = '365 DAYS'
as (
    select chunk,
        relative_path,
        chunk_order,
        file_url
    from <DB_NAME>.<SCHEMA_NAME>.<TABLE_NAME>
);

In [None]:
-- Query the service using SEARCH_PREVIEW
-- First argument: fully qualified name of the search service, passed as a string.
-- Second argument: JSON request. Replace <QUERY> with the search text; the request
-- asks for the "chunk" column and at most 5 results.
SELECT SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
    '<DB_NAME>.<SCHEMA_NAME>.<SEARCH_SERVICE_NAME>',
    '{
        "query": "<QUERY>",
        "columns": ["chunk"],
        "limit": 5
    }'
);


In [None]:
-- Optional cleanup. Every statement is commented out on purpose;
-- uncomment only the objects you want to remove.

-- Drop the Cortex Search service
-- DROP CORTEX SEARCH SERVICE IF EXISTS <DB_NAME>.<SCHEMA_NAME>.<SEARCH_SERVICE_NAME>;

-- Drop the document chunks table
-- DROP TABLE IF EXISTS <DB_NAME>.<SCHEMA_NAME>.<TABLE_NAME>;

-- Drop the chunking function
-- DROP FUNCTION IF EXISTS <DB_NAME>.<SCHEMA_NAME>.text_chunker(string);

-- Drop the stage with the PDF files
-- DROP STAGE IF EXISTS <DB_NAME>.<SCHEMA_NAME>.<STAGE_NAME>;


-- Drop the schema and database
-- DROP SCHEMA IF EXISTS <DB_NAME>.<SCHEMA_NAME>;
-- DROP DATABASE IF EXISTS <DB_NAME>;