# Extracting Structured Insights from Apple's Supplier List with Snowflake Cortex AI
## PDF to Structured JSON from Apple's Supplier List using Snowflake Cortex AI
- Discover the power of Snowflake Cortex AI for intelligent document processing!
- This demonstration highlights how to use the SNOWFLAKE.CORTEX.COMPLETE function with 'claude-3-5-sonnet' LLM to parse an Apple Supplier List PDF and extract specific entities into a structured JSON format.
- We'll demonstrate how to define a response schema to reliably extract supplier information, location count, and lists of locations and countries, showcasing Cortex AI's ability to deliver structured output from complex documents.

## Create a Snowflake Managed Stage to Store Apple's Supplier List PDF File. 

In [None]:
-- Internal (Snowflake-managed) stage that holds the Apple Supplier List PDF.
-- IF NOT EXISTS keeps this cell re-runnable: the stage, and any file already
-- uploaded to it, is left untouched on a second run instead of raising an error.
-- The directory table is enabled and Snowflake server-side encryption is used.
-- NOTE(review): PARSE_DOCUMENT is documented to need a server-side-encrypted
-- stage; confirm before changing the ENCRYPTION setting.
CREATE STAGE IF NOT EXISTS APPLE_SUPPLIER_LIST_STG
    DIRECTORY = (ENABLE = TRUE)
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE');

## List the File in the Stage.  

In [None]:
-- Show the files currently present in the supplier-list stage.
LIST @APPLE_SUPPLIER_LIST_STG;

## Create a table to store the parsed content from the Apple Supplier File PDF.  

In [None]:
-- Holds the full text parsed out of the supplier-list PDF (one row per parse run).
-- CREATE OR REPLACE matches the other tables in this notebook and makes the cell
-- re-runnable; it also resets the table so that re-running the INSERT that
-- follows does not accumulate duplicate copies of the document.
CREATE OR REPLACE TRANSIENT TABLE APPLE_SUPPLIER_LIST_TBL (
    APPLE_SUPPLIER_LIST_CONTENT VARCHAR
);

## Parse and Insert Content from the Supplier List PDF file into Table.  
### Parsing of the document is accomplished using Snowflake Cortex [PARSE_DOCUMENT](https://docs.snowflake.com/en/sql-reference/functions/parse_document-snowflake-cortex)

In [None]:
-- Parse the staged PDF in LAYOUT mode and store the "content" field of the
-- PARSE_DOCUMENT result as plain text.
INSERT INTO APPLE_SUPPLIER_LIST_TBL (APPLE_SUPPLIER_LIST_CONTENT)
SELECT
    TO_VARCHAR(
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT(
            '@Apple_supplier_list_stg',
            'Apple-Supplier-List.pdf',
            {'mode': 'LAYOUT'}
        ):content
    );

## Apple's Supplier List runs to **27 Pages**. So, we chunk the text to ensure we fit within the context and response window of the LLM.  
### We use Snowflake Cortex [SPLIT_TEXT_RECURSIVE_CHARACTER](https://docs.snowflake.com/en/sql-reference/functions/split_text_recursive_character-snowflake-cortex) function to chunk the text from the file.  

In [None]:
-- Split the parsed document text into chunks, one row per chunk.
-- SPLIT_TEXT_RECURSIVE_CHARACTER is called with format 'none', a chunk size
-- of 600 and an overlap of 75; FLATTEN turns the returned array into rows.
CREATE OR REPLACE TRANSIENT TABLE APPLE_SUPPLIER_LIST_CHUNKS_TBL AS
SELECT
    TO_VARCHAR(text_chunk.VALUE) AS APPLE_SUPPLIER_LIST_CHUNK
FROM
    APPLE_SUPPLIER_LIST_TBL,
    LATERAL FLATTEN(
        INPUT => SNOWFLAKE.CORTEX.SPLIT_TEXT_RECURSIVE_CHARACTER(
            APPLE_SUPPLIER_LIST_CONTENT,
            'none',
            600,
            75
        )
    ) AS text_chunk;

## Create a Table to Store the Structured Output From the Snowflake Cortex [COMPLETE](https://docs.snowflake.com/en/user-guide/snowflake-cortex/complete-structured-outputs) function

In [None]:
-- Landing table for the raw COMPLETE response of each chunk.
-- The column name spelling ("SUPPLER") is kept as-is because the later
-- queries in this notebook reference it by this exact name.
CREATE OR REPLACE TRANSIENT TABLE APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL (
    APPLE_SUPPLER_LIST_CHUNK_JSON_COL VARIANT
);

## Apply the COMPLETE Structured Output on Each Chunk and Insert the output JSON into APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL

In [None]:
-- Run COMPLETE once per chunk and store the raw response.
-- The chunk text is appended to the instruction prompt; response_format
-- supplies a JSON schema so the model returns an "apple_supplier_list" array
-- of objects with supplier_name, number_of_locations, locations and countries.
-- The prompt refers to the arrays by the same key names the schema uses
-- ('locations', 'countries') so the instructions and the schema agree.
-- temperature 0 is used to keep the extraction as repeatable as possible.
INSERT INTO APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL
    (APPLE_SUPPLER_LIST_CHUNK_JSON_COL)
SELECT SNOWFLAKE.CORTEX.COMPLETE('claude-3-5-sonnet', [
        {
        'role': 'user',
        'content': 
        'Extract the following from the Apple Supplier List document: 
        1. Extract the Supplier Name. 
        2.a. For each Supplier count the number of locations. Remember, the locations can be comma separated.
        2.b. For example: This Primary location list - \'Alabama, Indiana, Iowa, Minnesota, South Carolina, Wisconsin\', has six locations. Extract and capture each location individually.   
        3. Extract the location, if it is comma-separated, put each location in the \'locations\' array.
        4. Extract the country name, which is the last column in the document, and put it in the \'countries\' array. 
' || APPLE_SUPPLIER_LIST_CHUNK
            }
    ],
    {
        'temperature': 0,
        'max_tokens': 8192,
        'response_format':{
            'type':'json',
            'schema': {
                'type': 'object',
                'properties': {
                    'apple_supplier_list': {
                        'type': 'array',
                        'items': {
                            'type': 'object',
                            'properties': {
                                'supplier_name': {'type': 'string'},
                                'number_of_locations': {'type': 'string'},
                                'locations':{'type': 'array'},
                                'countries':{'type': 'array'}
                            },
                            'required': []
                        }
                    }
                }
            }
            }
    }
) AS Apple_Supplier_List
FROM 
    APPLE_SUPPLIER_LIST_CHUNKS_TBL;


## List Each Supplier's Name and Number of Locations for the Suppliers that Supply Apple.

In [None]:
-- One row per supplier entry extracted from any chunk, before de-duplication.
-- The first FLATTEN walks the "structured_output" array of each response; the
-- second walks the "apple_supplier_list" array inside its "raw_message".
-- rn ranks repeated entries of the same supplier so that rn = 1 is the entry
-- with the highest location count.
SELECT
    apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
    -- The schema declares this field as a string, so a non-numeric value is
    -- possible; TRY_TO_NUMBER yields NULL instead of failing the whole query.
    TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
    apple_supplier_list_output.VALUE,
    ROW_NUMBER() OVER (
        PARTITION BY SUPPLIER_NAME
        -- NULLS LAST: by default NULLs sort first under DESC, which would
        -- rank an entry with no usable count ahead of one with a real count.
        ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
    ) AS rn
FROM
    APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
    LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
    LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
WHERE
    -- Drops placeholder/non-supplier names; rows with a NULL name are also
    -- removed because NOT IN evaluates to UNKNOWN for NULL.
    SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '');

## Here's the query to extract the locations of each supplier. 
### Some suppliers have multiple locations from which they support Apple.  
### Since we created overlapping chunks, we need to handle the duplicate supplier names.  

In [None]:
-- De-duplicated supplier list: one row per supplier with its location count.
-- Overlapping chunks can yield the same supplier more than once, so entries
-- are ranked per supplier and only the top-ranked one (rn = 1) is kept.
WITH RankedSuppliers AS (
    SELECT
        apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
        -- Schema declares a string; TRY_TO_NUMBER returns NULL rather than
        -- aborting the query when the value is not numeric.
        TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
        apple_supplier_list_output.VALUE,
        ROW_NUMBER() OVER (
            PARTITION BY SUPPLIER_NAME
            -- NULLS LAST keeps entries without a usable count from being
            -- chosen over entries that have one.
            ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
        ) AS rn
    FROM
        APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
        LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
        LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
    WHERE
        -- Also removes NULL names (NOT IN is UNKNOWN for NULL).
        SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '')
)
SELECT
    RankedSuppliers.SUPPLIER_NAME,
    RankedSuppliers.NUMBER_OF_LOCATIONS
FROM
    RankedSuppliers
WHERE
    rn = 1
ORDER BY
    RankedSuppliers.SUPPLIER_NAME ASC;

In [None]:
-- One row per (supplier, location): the "locations" array of each supplier's
-- top-ranked entry is exploded with FLATTEN. LOCATION_ARRAY shows the whole
-- array as text, LOCATION the individual element (same naming as the
-- distinct-location and country queries in this notebook).
WITH RankedSuppliers AS (
    SELECT
        apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
        -- Schema declares a string; TRY_TO_NUMBER returns NULL rather than
        -- aborting the query when the value is not numeric.
        TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
        apple_supplier_list_output.VALUE,
        ROW_NUMBER() OVER (
            PARTITION BY SUPPLIER_NAME
            -- NULLS LAST keeps entries without a usable count from being
            -- chosen over entries that have one.
            ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
        ) AS rn
    FROM
        APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
        LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
        LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
    WHERE
        -- Also removes NULL names (NOT IN is UNKNOWN for NULL).
        SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '')
)
SELECT
    RankedSuppliers.SUPPLIER_NAME,
    RankedSuppliers.NUMBER_OF_LOCATIONS,
    RankedSuppliers.VALUE:"locations"::STRING AS LOCATION_ARRAY,
    locations_array.VALUE::STRING AS LOCATION
FROM
    RankedSuppliers,
    LATERAL FLATTEN(INPUT => RankedSuppliers.VALUE:"locations") locations_array
WHERE
    rn = 1
ORDER BY
    RankedSuppliers.SUPPLIER_NAME ASC;

In [None]:
-- Distinct list of locations across all suppliers.
-- Uses each supplier's top-ranked entry (rn = 1), explodes its "locations"
-- array and de-duplicates the individual values. The CTE sits at the top
-- level and the ordering is applied to the final result, where it actually
-- takes effect.
WITH RankedSuppliers AS (
    SELECT
        apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
        -- Schema declares a string; TRY_TO_NUMBER returns NULL rather than
        -- aborting the query when the value is not numeric.
        TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
        apple_supplier_list_output.VALUE,
        ROW_NUMBER() OVER (
            PARTITION BY SUPPLIER_NAME
            -- NULLS LAST keeps entries without a usable count from being
            -- chosen over entries that have one.
            ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
        ) AS rn
    FROM
        APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
        LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
        LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
    WHERE
        -- Also removes NULL names (NOT IN is UNKNOWN for NULL).
        SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '')
)
SELECT DISTINCT
    locations_array.VALUE::STRING AS LOCATION
FROM
    RankedSuppliers,
    LATERAL FLATTEN(INPUT => RankedSuppliers.VALUE:"locations") locations_array
WHERE
    rn = 1
ORDER BY
    LOCATION ASC;

In [None]:
-- One row per (supplier, country): the "countries" array of each supplier's
-- top-ranked entry is exploded with FLATTEN. COUNTRY_ARRAY shows the whole
-- array as text, COUNTRY the individual element.
WITH RankedSuppliers AS (
    SELECT
        apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
        -- Schema declares a string; TRY_TO_NUMBER returns NULL rather than
        -- aborting the query when the value is not numeric.
        TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
        apple_supplier_list_output.VALUE,
        ROW_NUMBER() OVER (
            PARTITION BY SUPPLIER_NAME
            -- NULLS LAST keeps entries without a usable count from being
            -- chosen over entries that have one.
            ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
        ) AS rn
    FROM
        APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
        LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
        LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
    WHERE
        -- Also removes NULL names (NOT IN is UNKNOWN for NULL).
        SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '')
)
SELECT
    RankedSuppliers.SUPPLIER_NAME,
    RankedSuppliers.NUMBER_OF_LOCATIONS,
    RankedSuppliers.VALUE:"countries"::STRING AS COUNTRY_ARRAY,
    countries_array.VALUE::STRING AS COUNTRY
FROM
    RankedSuppliers,
    LATERAL FLATTEN(INPUT => RankedSuppliers.VALUE:"countries") countries_array
WHERE
    rn = 1
ORDER BY
    RankedSuppliers.SUPPLIER_NAME ASC;

In [None]:
-- Distinct list of countries across all suppliers.
-- Uses each supplier's top-ranked entry (rn = 1), explodes its "countries"
-- array and de-duplicates the individual values. The CTE sits at the top
-- level and the ordering is applied to the final result, where it actually
-- takes effect.
WITH RankedSuppliers AS (
    SELECT
        apple_supplier_list_output.VALUE:"supplier_name"::STRING AS SUPPLIER_NAME,
        -- Schema declares a string; TRY_TO_NUMBER returns NULL rather than
        -- aborting the query when the value is not numeric.
        TRY_TO_NUMBER(apple_supplier_list_output.VALUE:"number_of_locations"::STRING) AS NUMBER_OF_LOCATIONS,
        apple_supplier_list_output.VALUE,
        ROW_NUMBER() OVER (
            PARTITION BY SUPPLIER_NAME
            -- NULLS LAST keeps entries without a usable count from being
            -- chosen over entries that have one.
            ORDER BY NUMBER_OF_LOCATIONS DESC NULLS LAST
        ) AS rn
    FROM
        APPLE_SUPPLIER_LIST_CHUNKS_RAW_JSON_TBL,
        LATERAL FLATTEN(INPUT => APPLE_SUPPLER_LIST_CHUNK_JSON_COL:"structured_output") struct_output,
        LATERAL FLATTEN(INPUT => struct_output.VALUE:"raw_message":"apple_supplier_list") apple_supplier_list_output
    WHERE
        -- Also removes NULL names (NOT IN is UNKNOWN for NULL).
        SUPPLIER_NAME NOT IN ('<UNKNOWN>', 'CLEAN ENERGY', '')
)
SELECT DISTINCT
    countries_array.VALUE::STRING AS COUNTRY
FROM
    RankedSuppliers,
    LATERAL FLATTEN(INPUT => RankedSuppliers.VALUE:"countries") countries_array
WHERE
    rn = 1
ORDER BY
    COUNTRY ASC;