In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Reuse the session that is already active in this environment; a later cell
# calls `session.file.put(...)` on it to upload documents to a stage.
session = get_active_session()


In [None]:
-- Stage that holds the example documents. The directory table is enabled
-- (with auto-refresh) because a later query lists the staged files through
-- DIRECTORY(@EXAMPLE_DOCS).
CREATE STAGE IF NOT EXISTS EXAMPLE_DOCS
    DIRECTORY = (ENABLE = TRUE AUTO_REFRESH = TRUE)
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE');

Load documents into a Snowflake stage

In [None]:
# Destination (stage + folder) and local glob of the PDFs to upload.
MY_STAGE = 'EXAMPLE_DOCS/machine_reports'
MY_FILE_NAME = "data/docs/machine_reports/*.pdf"


# Upload the files to the stage.
# auto_compress=False asks for the PDFs to be stored without compression;
# overwrite=True lets this cell be re-run without leaving stale copies.
put_result = session.file.put(
    MY_FILE_NAME,
    MY_STAGE,
    auto_compress=False,
    overwrite=True,
)

# Show the per-file upload result as the cell output instead of discarding it,
# so the reader can confirm which files were actually uploaded.
put_result

In [None]:
-- Refresh the stage's directory table so the files uploaded above are listed.
ALTER STAGE EXAMPLE_DOCS REFRESH;

AI_EXTRACT leverages Snowflake's vision model, Arctic-extract. Because it is a vision model, we do not need to run OCR first and then ask questions; we can ask questions directly on the documents. Below we ask 2 questions about 1 document.

In [None]:
# Render the example image from the local docs folder inline, so the reader
# can look at a document before any extraction is run.
st.image('data/docs/machine_reports/example_doc.png')

In [None]:
-- Table of labelled examples used to build the fine-tuning dataset.
CREATE OR REPLACE TABLE my_data_table (
    f FILE,     -- the source document
    p VARCHAR,  -- exported later under the name "prompt"
    r VARCHAR   -- exported later under the name "response"
);

In [None]:
-- Add one labelled training example: the document (f), the questions to ask
-- as a JSON object (p), and the expected answers as a JSON object (r).
-- The file is addressed as (stage, path within the stage) -- the same form the
-- inference query uses -- rather than folding the folder into the stage name.
INSERT INTO my_data_table (f, p, r)
SELECT
TO_FILE('@EXAMPLE_DOCS', 'machine_reports/Manual_2022-02-01.pdf'),
'{"Name": "Who inspected the machine?",
  "Date": "What was the date of the inspection?",
  "Serial Number": "What is the Serial Number of the machine"
    }', 
'{"Name": "Emily Johnson",
  "Date": "2022-02-01",
  "Serial Number": "SGMM-12345"}';

In [None]:
-- Inspect the training examples that were just inserted.
SELECT *
FROM my_data_table;

In [None]:
-- (Re)create the dataset that will hold the fine-tuning examples; versions are
-- added to it by a separate ALTER DATASET statement.
CREATE OR REPLACE DATASET my_dataset;

In [None]:
-- Publish the rows of my_data_table as version 'v1' of my_dataset.
-- Each row is exported as "file" (stage and relative path joined with '/'),
-- "prompt" and "response"; the quoted aliases keep these names lowercase.
ALTER DATASET my_dataset ADD VERSION 'v1' FROM (
    SELECT
        FL_GET_STAGE(f) || '/' || FL_GET_RELATIVE_PATH(f) AS "file",
        p AS "prompt",
        r AS "response"
    FROM my_data_table
);

In [None]:
-- Start a fine-tuning job: 'CREATE' action, tuned model name
-- 'machine_docs_fine_tuned', base model 'arctic-extract', and version v1 of
-- my_dataset as the training data.
-- NOTE(review): the dataset URI hard-codes database DEMO and schema PUBLIC --
-- confirm they match the context in which my_dataset was created.
-- NOTE(review): fine-tuning presumably runs asynchronously; verify the job has
-- finished (e.g. via FINETUNE('DESCRIBE', <job_id>)) before querying the model.
SELECT SNOWFLAKE.CORTEX.FINETUNE(
  'CREATE',
  'machine_docs_fine_tuned',
  'arctic-extract',
  'snow://dataset/demo.public.my_dataset/versions/v1'
);

In [None]:
-- Run the fine-tuned model over every staged file under machine_reports/ and
-- flatten the answers into one column per question.
WITH extracted AS (
    -- One AI_EXTRACT call per file listed in the stage's directory table.
    SELECT
        relative_path,
        AI_EXTRACT(
            model => 'DEMO.PUBLIC.MACHINE_DOCS_FINE_TUNED',
            file => TO_FILE('@EXAMPLE_DOCS', relative_path),
            responseFormat => [
                ['name', 'Who inspected the machine?'],
                ['date', 'What was the date of the inspection?'],
                ['grade', 'What was the grade of the inspection?'],
                ['machine', 'What machine was inspected?']
            ]
        ) AS json_data
    FROM DIRECTORY(@EXAMPLE_DOCS)
    WHERE relative_path LIKE 'machine_reports/%'
)
-- Pull each answer out of the "response" object as a string.
SELECT
    relative_path,
    json_data:response.date::STRING AS response_date,
    json_data:response.name::STRING AS inspector,
    json_data:response.grade::STRING AS grade,
    json_data:response.machine::STRING AS machine
FROM extracted;