In [None]:
# Import Python packages used by the Python cells in this notebook.
import streamlit as st
import pandas as pd

# Snowpark handle for the notebook's active session; later Python cells run
# SQL through it via session.sql(...).
from snowflake.snowpark.context import get_active_session
session = get_active_session()


# Invoice Processing 

In this notebook we will be running the extraction on existing historical invoice documents. 



### Upload PDFs to a Snowflake (internal/external) stage with SSE 

To start this demo, we created a Snowflake stage called Invoice_Documents and we uploaded ~200 historical invoice PDF examples to the stage. 

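Documents need to be stored in a stage that uses server-side encryption (SSE). The stage also needs a directory table, because a later cell lists the files through `DIRECTORY(@INVOICE_DOCUMENTS)`.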

In [None]:
-- Show the files currently held in the INVOICE_DOCUMENTS stage.
LIST @INVOICE_DOCUMENTS;

In [None]:
# List the files in the INVOICE_DOCUMENTS stage.
# LIST is not a SELECT statement, and Snowpark documents DataFrame.to_pandas()
# on a session.sql() query as SELECT-only, so collect the rows and build the
# pandas frame explicitly instead.
stage_rows = session.sql("LIST @INVOICE_DOCUMENTS").collect()
list_result = pd.DataFrame([row.as_dict() for row in stage_rows])
print(f"Found {len(list_result)} files")

# Leave the frame as the cell's last expression so the notebook renders it as
# a table rather than printing its plain-text repr.
list_result.head()

## Processing Documents with Snowflake 

Next, we'll look at how these documents can be processed in Snowflake. We will start by pulling the text and layout out of the documents, which is one way teams may begin processing them.

Then, we will look at AI_Extract, which can work on the text that AI_PARSE_DOCUMENT extracts from these documents. We will also look at how AI_Extract works multimodally, directly on the underlying PDF.

These functions can be run with either SQL or Python.

In [None]:
-- Parse one sample invoice in LAYOUT mode with page_split enabled and
-- return the result in a single column named text_example.
SELECT
    AI_PARSE_DOCUMENT(
        TO_FILE('@INVOICE_DOCUMENTS', 'invoice_10248.pdf'),
        {'mode': 'LAYOUT', 'page_split': true}
    ) AS text_example;


Here we can see the layout format is able to take in some contextual information about the document. This includes things like the table of line items. 

We can see how well we can extract based on this text information 

### AI_Extract 

Next, we are going to tell the model what we want to extract. 

In [None]:
-- Step 1: parse the sample invoice (LAYOUT mode, page_split enabled).
WITH parsed_invoice AS (
    SELECT
        AI_PARSE_DOCUMENT(
            TO_FILE('@INVOICE_DOCUMENTS', 'invoice_10248.pdf'),
            {'mode': 'LAYOUT', 'page_split': true}
        ) AS parsed
),

-- Step 2: one row per element of the parsed result's "pages" array,
-- keeping each element's "content" and "index" fields.
invoice_pages AS (
    SELECT
        pg.value:content::STRING AS page_content,
        pg.value:index::INT      AS page_index
    FROM parsed_invoice AS src,
         LATERAL FLATTEN(INPUT => src.parsed:pages) AS pg
)

-- Step 3: run AI_EXTRACT over each page's text. The text is passed through
-- the named argument `text`; every responseFormat entry is written as
-- 'label: question'.
SELECT
    AI_EXTRACT(
        text => page_content,
        responseFormat => [
            'contact_name: Who is receiving this invoice (first and last name)',
            'Order_ID: What is the order number',
            'customer_id: What is the customer number',
            'customer_city: What is the customer city',
            'customer_country: What is the customer country',
            'order_date: When was this ordered',
            'product_name_description: Product or service name for each line item and any description about the product',
            'product_id: Product ID or SKU for each line item',
            'quantity: Quantity for each line item',
            'rate: Unit price or rate for each line item',
            'amount: Total amount for each line item',
            'total_amount: Total invoice amount'
        ]
    ) AS extracted_data,
    page_index
FROM invoice_pages;

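We can see that overall it does well. However, this approach only sees the parsed text of each page, so information that is conveyed by the document's visual layout or tables may not come through as reliably.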

## AI_EXTRACT on PDFs 

Beyond extracting from text, we can use AI_Extract multimodally, directly on our PDF documents. This is often the ideal solution for invoices, because different providers may present information differently, for example visually or in tables.

In [None]:
-- Run AI_EXTRACT directly against the staged PDF (no separate parse step).
-- Every responseFormat entry is written as 'label: question'.
SELECT
    AI_EXTRACT(
        file => TO_FILE('@INVOICE_DOCUMENTS', 'invoice_10248.pdf'),
        responseFormat => [
            'contact_name: Who is receiving this invoice (first and last name)',
            'Order_ID: What is the order number',
            'customer_id: What is the customer number',
            'customer_city: What is the customer city',
            'customer_country: What is the customer country',
            'order_date: When was this ordered',
            'product_name_description: Product or service name for each line item and any description about the product',
            'product_id: Product ID or SKU for each line item',
            'quantity: Quantity for each line item',
            'rate: Unit price or rate for each line item',
            'amount: Total amount for each line item',
            'total_amount: Total invoice amount'
        ]
    ) AS extracted_data;

Writing this into a readable table 

In [None]:
-- Extract the invoice fields from a single PDF, then reshape the response
-- into one row per line item.
WITH invoice_extract AS (
    SELECT
        AI_EXTRACT(
            file => TO_FILE('@INVOICE_DOCUMENTS', 'invoice_10250.pdf'),
            responseFormat => [
                'contact_name: Who is receiving this invoice (first and last name)',
                'Order_ID: What is the order number',
                'customer_id: What is the customer number',
                'customer_city: What is the customer city',
                'customer_country: What is the customer country',
                'order_date: When was this ordered',
                'product_name_description: Product or service name for each line item and any description about the product',
                'product_id: Product ID or SKU for each line item',
                'quantity: Quantity for each line item',
                'rate: Unit price or rate for each line item',
                'amount: Total amount for each line item',
                'total_amount: Total invoice amount'
            ]
        ) AS extracted_data
),

-- Pull out the "response" element once so the paths below stay short.
invoice_response AS (
    SELECT extracted_data:response AS resp
    FROM invoice_extract
)

SELECT
    -- Invoice-level fields (repeated on every line-item row)
    r.resp:contact_name::STRING  AS customer_name,
    r.resp:Order_ID::STRING      AS invoice_number,
    r.resp:customer_id::STRING   AS customer_id,
    r.resp:customer_city::STRING AS customer_city,
    r.resp:total_amount::FLOAT   AS total_amount,

    -- Position of the line item within the flattened array
    item.index AS line_item_index,

    -- The flattened element itself (product name / description)
    item.value::STRING AS product_name_description,

    -- Same position looked up in the other per-line-item fields
    r.resp:product_id[item.index]::STRING AS product_id,
    r.resp:quantity[item.index]::INT      AS quantity,
    r.resp:rate[item.index]::FLOAT        AS rate,
    r.resp:amount[item.index]::FLOAT      AS line_item_amount
FROM invoice_response AS r,
     -- product_name_description drives row creation: one row per element
     LATERAL FLATTEN(INPUT => r.resp:product_name_description) AS item
ORDER BY invoice_number, line_item_index;

## Running This On All Existing Data 

In [None]:
-- Run AI_EXTRACT over every file listed in the stage's directory table and
-- keep the raw response next to the file's relative path.
-- The table is TEMPORARY, so cells that read it must run in the same session.
CREATE OR REPLACE TEMPORARY TABLE Invoice_Data AS
SELECT
    doc.RELATIVE_PATH AS file_path,  -- path of the source file, kept for reference
    AI_EXTRACT(
        file => TO_FILE('@INVOICE_DOCUMENTS', doc.RELATIVE_PATH),
        responseFormat => [
            'contact_name: Who is receiving this invoice (first and last name)',
            'Order_ID: What is the order number',
            'customer_id: What is the customer number',
            'customer_city: What is the customer city',
            'customer_country: What is the customer country',
            'order_date: When was this ordered',
            'product_name_description: Product or service name for each line item and any description about the product',
            'product_id: Product ID or SKU for each line item',
            'quantity: Quantity for each line item',
            'rate: Unit price or rate for each line item',
            'amount: Total amount for each line item',
            'total_amount: Total invoice amount'
        ]
    ) AS extracted_data
FROM DIRECTORY(@INVOICE_DOCUMENTS) AS doc;

In [None]:
-- Build the line-item table from the bulk extraction results in Invoice_Data:
-- one output row per element of each invoice's product_name_description array.
--
-- The numeric fields are model-extracted values and are not guaranteed to be
-- clean numbers. A strict ::FLOAT / ::INT cast raises on the first value it
-- cannot convert, which would abort the CTAS for every invoice. The TRY_*
-- conversions return NULL for such values instead; source_file identifies the
-- document to re-check when a numeric column is NULL.
CREATE OR REPLACE TABLE Invoice_Line_Items AS (

SELECT
    -- 1. HEADER FIELDS (repeated on every line-item row of the invoice)
    t.extracted_data:response:contact_name::STRING AS customer_name,
    t.extracted_data:response:Order_ID::STRING AS invoice_number,
    t.extracted_data:response:customer_id::STRING AS customer_id,
    t.extracted_data:response:customer_city::STRING AS customer_city,
    TRY_TO_DOUBLE(t.extracted_data:response:total_amount::STRING) AS total_amount,

    -- 2. LINE ITEM FIELDS (looked up via the index from the FLATTEN operation)
    f_line.index AS line_item_index,

    -- Value of the array that was flattened (product name / description)
    f_line.value::STRING AS product_name_description,

    -- Corresponding values from the other parallel arrays, by the same index
    t.extracted_data:response:product_id[f_line.index]::STRING AS product_id,
    TRY_TO_NUMBER(t.extracted_data:response:quantity[f_line.index]::STRING) AS quantity,
    TRY_TO_DOUBLE(t.extracted_data:response:rate[f_line.index]::STRING) AS rate,
    TRY_TO_DOUBLE(t.extracted_data:response:amount[f_line.index]::STRING) AS line_item_amount,

    -- 3. PROVENANCE (appended last so existing column positions are unchanged)
    t.file_path AS source_file

FROM
    invoice_data AS t,

    -- FLATTEN ONE line-item array to drive the row creation
    LATERAL FLATTEN(
        INPUT => t.extracted_data:response:product_name_description
    ) AS f_line
ORDER BY
    invoice_number, line_item_index
);

In [None]:
-- Preview the first 10 rows of the line-item table.
SELECT *
FROM INVOICE_LINE_ITEMS
LIMIT 10;

## Ongoing Maintenance (Optional)

We just ran through running the AI Extract over all of the historical invoices available to the team. However, what happens as new invoices come in? 

We can easily set this system up to take in new invoices by building a document ingestion pipeline with a stream and a task in Snowflake. For this, we will use the stage's underlying directory table.

1. Store Files in a Stage with a Directory Table 
2. Create a Stream to Read in any new documents to the Directory table 
3. At some batch cadence, run a task to process the new documents since the last task 
4. Repeat 

For greater insight, this quickstart is a good place to get started with streams and tasks: https://quickstarts.snowflake.com/guide/getting_started_with_streams_and_tasks/#0

# Part Two: Create a Search Service 

Next, we can create a search service to help teams find the most relevant line items similar to their invoices.

This search service can be built through the UI or with SQL, as we are doing below.

There are a few things to keep in mind when looking at this search service:
* **ON Column**: This is the text column we are going to be searching over. In this example, we are going to be searching for similar product_name_descriptions to find the most similar example invoice line items. 
* **Attribute Columns**: These are columns that we are not using for search specifically, but can be used in filters or ways that teams may choose to interact with what values they'd like to get back from search. For example, we may make Customer_ID an attribute to search over, because maybe we only want to look for invoices that came from a specific customer. Or line item amount, because we may only look for products that are similar below a specific price. 
* **Warehouse**: This is the warehouse that will be used to re-index the search service 
* **Target Lag**: This is how often the search service will look for new records. Similar to the stream and task described above for bringing in new invoices, it controls how often the service checks the table for new product line items and adds them to the search service.
* **Embedding Model**: This is the model that will transform our text into vector embeddings. This will allow us to find semantically similar terms. 
* **Select columns:** These are the columns (including our search column and attribute columns) that our service can return in a select statement. We may want to add in additional values that would be helpful for the end user that we may not necessarily want to search or filter over. For example, invoice ID will be impactful for our end user to know -- however, they may not actually need this information to perform our search. 




In [None]:
-- Cortex Search service over the extracted invoice line items.
--   ON         : the text column that is searched
--   ATTRIBUTES : columns made available for filtering search results
--   TARGET_LAG : '365 days' means new rows are picked up very rarely; shorten
--                it if the table is expected to receive new invoices
-- NOTE(review): WAREHOUSE is environment-specific; replace DATASCIENCECOLLEGE
-- with a warehouse available in your account.
CREATE OR REPLACE CORTEX SEARCH SERVICE Invoice_Product_Search
  ON Product_Name_Description
  ATTRIBUTES Customer_ID,
             Customer_City,
             line_item_amount
  WAREHOUSE = DATASCIENCECOLLEGE
  TARGET_LAG = '365 days'
  EMBEDDING_MODEL = 'snowflake-arctic-embed-l-v2.0'
AS (
  SELECT
      Product_Name_Description,
      Customer_ID,
      Customer_city,
      Invoice_number,
      product_id,
      line_item_amount
  -- Unqualified on purpose: this notebook creates Invoice_Line_Items in the
  -- current database/schema, so the service must read it from there rather
  -- than from a hard-coded database and schema.
  FROM INVOICE_LINE_ITEMS
);

In [None]:
-- Query the Invoice_Product_Search service and keep only the "results"
-- element of the JSON text returned by SEARCH_PREVIEW.
WITH preview AS (
  SELECT PARSE_JSON(
    SNOWFLAKE.CORTEX.SEARCH_PREVIEW(
      'Invoice_Product_Search',
      '{
         "query": "Haileys Penne Vodka",
         "columns":[
            "line_item_amount",
            "customer_id",
            "product_name_description"
         ],
         "limit":10
      }'
    )
  ) AS response
)
SELECT response['results'] AS results
FROM preview;