In [None]:
import warnings
warnings.filterwarnings("ignore")
import streamlit as st
import pandas as pd
from snowflake.snowpark.context import get_active_session
from snowflake.snowpark.functions import col
from snowflake.snowpark import types as T
from snowflake.core import Root
from snowflake.cortex import Complete
# Active notebook session; the same handle backs both the Snowpark calls
# (session.*) and the Snowflake Python API entry point (root.*).
session = get_active_session()
root = Root(session)

# Cortex Agents
In this notebook, you will set up multiple Cortex Search and Cortex Analyst services, which Cortex Agents will use to answer user queries on unstructured and structured data.
![text](https://github.com/michaelgorkow/snowflake_cortex_agents_demo/blob/main/resources/cortex_agents_notebook_small.png?raw=true)

# Set Up the Cortex Search Service [Unstructured Data]

We have some PDF documents in our stage **DOCUMENTS** that we want business users to be able to ask questions about.  
To achieve this, we need to extract the contents of the PDF files and make them searchable.

## Extracting Content from PDF Files

### [`PARSE_DOCUMENT`](https://docs.snowflake.com/en/sql-reference/functions/parse_document-snowflake-cortex)  
This function returns the extracted content from a document on a Snowflake stage as an **OBJECT** that contains JSON-encoded objects as strings.  

It supports two types of extractions:  
- **Optical Character Recognition (OCR)**  
- **Layout Extraction**  

### [`SPLIT_TEXT_RECURSIVE_CHARACTER`](https://docs.snowflake.com/en/sql-reference/functions/split_text_recursive_character-snowflake-cortex)  
The `SPLIT_TEXT_RECURSIVE_CHARACTER` function splits a string into shorter strings recursively. It is useful for preprocessing text to be used with text embedding or search indexing functions.

In [None]:
-- Inspect the files currently listed in the DOCUMENTS stage
SELECT *
FROM DIRECTORY('@DOCUMENTS');

In [None]:
-- Run PARSE_DOCUMENT in LAYOUT mode on every file listed in the DOCUMENTS stage
-- and store the ":content" field of the result as text.
-- IF NOT EXISTS leaves an already existing RAW_TEXT table untouched.
CREATE TABLE IF NOT EXISTS RAW_TEXT AS
SELECT
    RELATIVE_PATH,
    TO_VARCHAR(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT('@DOCUMENTS', RELATIVE_PATH, {'mode': 'LAYOUT'}):content
    ) AS EXTRACTED_LAYOUT
FROM DIRECTORY('@DOCUMENTS');

-- Show the extracted text per document
SELECT *
FROM RAW_TEXT;

In [None]:
-- Split each document's extracted layout into chunks, one output row per chunk.
-- IF NOT EXISTS leaves an already existing CHUNKED_TEXT table untouched.
CREATE TABLE IF NOT EXISTS CHUNKED_TEXT AS
SELECT
    RELATIVE_PATH,
    chunk.INDEX::INTEGER AS CHUNK_INDEX,
    chunk.VALUE::TEXT AS CHUNK_TEXT
FROM
    RAW_TEXT,
    LATERAL FLATTEN(
        INPUT => SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
            EXTRACTED_LAYOUT,
            'markdown',               -- format
            4000,                     -- chunk size
            0,                        -- overlap between chunks
            ['\n\n', '\n', ' ', '']   -- separators
        )
    ) chunk;

-- Show the resulting chunks
SELECT *
FROM CHUNKED_TEXT;

In [None]:
-- Cortex Search Service over the annual-report chunks.
-- CHUNK_TEXT is the search column; RELATIVE_PATH and CHUNK_INDEX are declared as attributes.
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS ANNUAL_REPORTS_SEARCH
    ON CHUNK_TEXT
    ATTRIBUTES RELATIVE_PATH, CHUNK_INDEX
    WAREHOUSE = COMPUTE_WH
    TARGET_LAG = '1 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    -- Only chunks whose file path starts with ANNUAL_REPORT are indexed
    SELECT CHUNK_TEXT, RELATIVE_PATH, CHUNK_INDEX
    FROM CHUNKED_TEXT
    WHERE STARTSWITH(RELATIVE_PATH, 'ANNUAL_REPORT')
);

### [Optional] Test Your Service in a Simple RAG Pipeline  

In this small example, we **combine Cortex Search with Cortex LLMs** to generate a response from context—also known as **Retrieval-Augmented Generation (RAG)**.  
This approach enhances responses by retrieving relevant data before generating an answer, improving accuracy and contextual relevance. 🚀  

In [None]:
question = 'What was the Cloud Revenue for Googol in 2024?'

# Look up the annual-reports search service
my_service = (
    root
    .databases["CORTEX_AGENTS_DEMO"]
    .schemas["MAIN"]
    .cortex_search_services["ANNUAL_REPORTS_SEARCH"]
)

# Retrieve the single best-matching chunk for the question
resp = my_service.search(
    query=question,
    columns=["CHUNK_INDEX", "CHUNK_TEXT", "RELATIVE_PATH"],
    limit=1,
)
hits = resp.results

if not hits:
    # Indexing an empty result list would raise IndexError; report it instead
    st.warning('Cortex Search returned no results for this question.')
else:
    top_hit = hits[0]
    st.info(f'**File:** {top_hit["RELATIVE_PATH"]}\n\n {top_hit["CHUNK_TEXT"]}')

    # Generate the answer from the question plus the retrieved chunk
    model = 'mistral-large2'
    prompt = f"{question} Answer based on the provided context: {top_hit['CHUNK_TEXT']}"
    response = Complete(model, prompt).strip()

    st.info(f'**LLM Response:**\n\n**{response}**')

# Set Up the Cortex Analyst Service [Structured Data]  

We generate some **random sales orders** that users will be able to **query in natural language**.  

## Dataset Overview  

The dataset consists of **three tables**:  

- **`CUSTOMER_ORDERS`**  
  - Stores **customers** and their **orders**  

- **`ORDERS`**  
  - Stores **order details**, including:  
    - `PRODUCT`  
    - `QUANTITY`  
    - `COUNTRY`  
    - `STATUS`  

- **`PRODUCTS`**  
  - Stores **product details**, such as:  
    - `PRICE`  

This structured dataset will allow **Cortex Analyst** to process user queries efficiently and return meaningful results. 🚀  


In [None]:
from snowflake.snowpark import functions as F
import pandas as pd
import random
from datetime import datetime, timedelta

# Define parameters
num_orders = 10000  # Number of orders to generate
start_date = datetime(2024, 1, 1)

# Seed the generator so re-running the notebook produces the same orders
random.seed(42)

# Sample data
customers = ["Alpha Corp", "Beta Ltd", "Gamma Inc", "Delta LLC", "Epsilon SA"]
product_names = ["Steel Rods", "Copper Wires", "Aluminum Sheets", "Brass Fittings", "Iron Pipes"]
countries = ["Germany", "USA", "UK", "France", "Italy"]
statuses = ["Pending", "Shipped", "Delivered", "Cancelled"]

# Generate random sales orders (one dict per order, keyed by column name)
order_rows = []
for i in range(num_orders):
    quantity = random.randint(10, 500)
    unit_price = round(random.uniform(5.0, 50.0), 2)
    order_rows.append({
        "ORDER_ID": f"ORD{1000 + i}",
        "CUSTOMER": random.choice(customers),
        "PRODUCT": random.choice(product_names),
        "QUANTITY": quantity,
        "UNIT_PRICE": unit_price,
        "TOTAL_PRICE": round(quantity * unit_price, 2),
        # Offsets 0..365 cover every day of 2024 (a leap year)
        "ORDER_DATE": start_date + timedelta(days=random.randint(0, 365)),
        "COUNTRY": random.choice(countries),
        "STATUS": random.choice(statuses),
    })

# Create DataFrame
df_orders = pd.DataFrame(order_rows, columns=[
    "ORDER_ID", "CUSTOMER", "PRODUCT", "QUANTITY", "UNIT_PRICE", "TOTAL_PRICE", "ORDER_DATE", "COUNTRY", "STATUS"
])

# CUSTOMER_ORDERS: which customer placed which order
customer_orders = session.write_pandas(
    df_orders[['CUSTOMER', 'ORDER_ID']].drop_duplicates(),
    table_name='CUSTOMER_ORDERS',
    overwrite=True,
)

# ORDERS: order details; ORDER_DATE is cast to a DATE column before the table is written
orders = session.create_dataframe(
    df_orders[['ORDER_ID', 'ORDER_DATE', 'PRODUCT', 'QUANTITY', 'COUNTRY', 'STATUS']].drop_duplicates()
)
orders = orders.with_column("ORDER_DATE", col("ORDER_DATE").cast(T.DateType()))
orders.write.save_as_table('ORDERS', mode='overwrite')

# PRODUCTS: one row per product, carrying the highest unit price generated for it
product_prices = df_orders[['PRODUCT', 'UNIT_PRICE']].groupby('PRODUCT').max().reset_index()
products = session.write_pandas(product_prices, table_name='PRODUCTS', overwrite=True)

In [None]:
# Preview the first rows of each generated table.
# Each label is paired with its own table so the "Orders" section shows ORDERS.
for label, table_name in (
    ('Customer Orders', 'CUSTOMER_ORDERS'),
    ('Orders', 'ORDERS'),
    ('Products', 'PRODUCTS'),
):
    print(f'{label}:')
    session.table(table_name).show(n=5)

# Dynamic Literal Retrieval with Cortex Analyst

Business users may not have detailed knowledge of how data is stored in Snowflake.  
Instead of ingesting all possible values of a column into **Cortex Analyst**, we will use **dynamic literal retrieval** via the [Cortex Search Integration](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-analyst/cortex-analyst-search-integration).

## How It Works  
When a user asks a question about their **sales orders** that involves values from the `CUSTOMER`, `PRODUCT`, or `COUNTRY` columns, **Cortex Analyst** will:  
1. Retrieve the relevant literal dynamically from **Cortex Search**  
2. Use it for **SQL generation**  

This approach ensures efficient and accurate query generation without preloading all possible values into Cortex Analyst.  


In [None]:
-- Search service over the distinct product names (literal retrieval for PRODUCT)
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS _ANALYST_PRODUCT_SEARCH
    ON PRODUCT
    WAREHOUSE = COMPUTE_WH
    TARGET_LAG = '1 hour'
    EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
    SELECT DISTINCT PRODUCT
    FROM PRODUCTS
);

In [None]:
-- Search service over the distinct country values (literal retrieval for COUNTRY)
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS _ANALYST_COUNTRY_SEARCH
  ON COUNTRY
  WAREHOUSE = COMPUTE_WH
  TARGET_LAG = '1 hour'
  EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
  SELECT
      DISTINCT COUNTRY
  FROM ORDERS
);

In [None]:
-- Search service over the distinct customer names (literal retrieval for CUSTOMER)
CREATE CORTEX SEARCH SERVICE IF NOT EXISTS _ANALYST_CUSTOMER_SEARCH
  ON CUSTOMER
  WAREHOUSE = COMPUTE_WH
  TARGET_LAG = '1 hour'
  EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
  SELECT
      DISTINCT CUSTOMER
  FROM CUSTOMER_ORDERS
);

### [Optional] Test Literal Retrievals

In [None]:
question = 'What was the total order quantity per month for United Kingdom with status shipped?'

# Look up the country literal search service
my_service = (
    root
    .databases["CORTEX_AGENTS_DEMO"]
    .schemas["MAIN"]
    .cortex_search_services["_ANALYST_COUNTRY_SEARCH"]
)

# Ask for the single closest COUNTRY literal for the question
resp = my_service.search(
    query=question,
    columns=["COUNTRY"],
    limit=1,
)
hits = resp.results

if hits:
    st.info(f'**Search Results: {hits[0]["COUNTRY"]}**')
else:
    # Indexing an empty result list would raise IndexError; report it instead
    st.warning('Cortex Search returned no results for this question.')

In [None]:
question = 'What was the order revenue per month for steel for my customer Delta?'

# Look up the product literal search service
my_service = (
    root
    .databases["CORTEX_AGENTS_DEMO"]
    .schemas["MAIN"]
    .cortex_search_services["_ANALYST_PRODUCT_SEARCH"]
)

# Ask for the two closest PRODUCT literals for the question
resp = my_service.search(query=question, columns=["PRODUCT"], limit=2)

# Show each returned product name in its own info box
for hit in resp.results:
    st.info(hit['PRODUCT'])

# Explore the Semantic Model in the Snowflake UI  

With our data available in **Snowflake** and the **Search Services** set up, it's time to explore the **native Semantic Model Generator** in **Snowsight**.  
![text](https://github.com/michaelgorkow/snowflake_cortex_agents_demo/blob/main/resources/semantic_model_ui.png?raw=true)

## Why Do You Need a Semantic Model?  

**Cortex Analyst** allows users to query **Snowflake** data using **natural language**. However, business users often use terminology that does not align with the database schema.  

### The Problem  
- Users specify **domain-specific business terms** in their questions  
- Underlying data is stored using **technical abbreviations**  
- Example:  
  - **"CUST"** is used for **customers**  
  - **Schema lacks semantic context**, making queries harder to interpret  

### The Solution: Semantic Models  
Semantic models **map business terminology to database schemas** and provide **contextual meaning**.  

#### Example  
When a user asks:  
🗣️ *"Total revenue last month"*  

The **semantic model** can:  
✅ Define **"revenue"** as **net revenue**  
✅ Define **"last month"** as **the previous calendar month**  

This mapping helps **Cortex Analyst** understand **user intent** and generate **accurate answers**.  

🔗 More details can be found in the [Semantic Model Specification](https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-analyst/semantic-model-spec).  
