In [None]:
# Import python packages
import streamlit as st
import pandas as pd
import time

# We can also use Snowpark for our analyses!
# `session` is the handle the Python cells use to talk to Snowflake
# (for example, session.file.put to upload local files to a stage).
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
-- Stage that holds the example PDFs. Its directory table is enabled so the
-- staged files can be listed later with DIRECTORY(@EXAMPLE_DOCS).
CREATE OR REPLACE STAGE EXAMPLE_DOCS
    DIRECTORY = (ENABLE = TRUE AUTO_REFRESH = TRUE)
    ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE');

Load documents into a Snowflake stage

In [None]:
# Sub-folders of data/docs whose PDFs are uploaded to the same-named folder
# of the EXAMPLE_DOCS stage.
DOC_FOLDERS = ['machine_reports', 'resumes']

# (folder, PutResult) pairs for every uploaded file, across all folders.
put_results = []
for folder in DOC_FOLDERS:
    MY_STAGE = f'EXAMPLE_DOCS/{folder}'
    MY_FILE_NAME = f"data/docs/{folder}/*.pdf"

    # Upload the files to the stage. auto_compress=False stores the PDFs
    # uncompressed; overwrite=True lets this cell be re-run safely.
    put_result = session.file.put(MY_FILE_NAME, MY_STAGE, auto_compress=False, overwrite=True)
    put_results.extend((folder, file_result) for file_result in put_result)

# Show the upload status of every file from every folder, instead of only the
# first file of the last folder.
pd.DataFrame(
    [
        {'folder': folder, 'source': file_result.source, 'status': file_result.status}
        for folder, file_result in put_results
    ]
)

In [None]:
-- Refresh the stage's directory table so newly uploaded files are listed.
ALTER STAGE EXAMPLE_DOCS REFRESH;

AI_EXTRACT leverages Snowflake's vision model, Arctic-extract. Because it is a vision model, we do not need to run OCR first and then ask questions; we can ask questions directly on the documents. Below, we ask two questions about a single document.

In [None]:
# Render the example document image in the notebook.
st.image(image='data/docs/example_doc.png')

In [None]:
-- Ask two questions about a single machine report. Each inner array is a
-- [label, question] pair.
SELECT
    AI_EXTRACT(
        file => TO_FILE('@EXAMPLE_DOCS/machine_reports', 'Manual_2022-02-01.pdf'),
        responseFormat => [
            ['name', 'Who inspected the machine?'],
            ['date', 'What was the date of the inspection?']
        ]
    ) AS json_data

We can even use it for classification

In [None]:
-- Use a yes/no question to turn AI_EXTRACT into a simple classifier.
SELECT
    AI_EXTRACT(
        file => TO_FILE('@EXAMPLE_DOCS/machine_reports', 'Manual_2022-02-01.pdf'),
        responseFormat => [
            ['name', 'does this doc have a name? Answer Yes or No']
        ]
    ) AS json_data

Here we can run AI_EXTRACT on multiple files

In [None]:
-- Run AI_EXTRACT on every staged file under machine_reports/, then pull each
-- answer out of the JSON result into its own string column.
WITH extracted AS (
    SELECT
        relative_path,
        AI_EXTRACT(
            file => TO_FILE('@EXAMPLE_DOCS', relative_path),
            responseFormat => [
                ['name', 'Who inspected the machine?'],
                ['date', 'What was the date of the inspection?'],
                ['grade', 'What was the grade of the inspection?'],
                ['machine', 'What machine was inspected?']
            ]
        ) AS json_data
    FROM DIRECTORY(@EXAMPLE_DOCS)
    WHERE relative_path LIKE 'machine_reports/%'
)
SELECT
    relative_path,
    json_data:response.date::STRING    AS response_date,
    json_data:response.name::STRING    AS inspector,
    json_data:response.grade::STRING   AS grade,
    json_data:response.machine::STRING AS machine
FROM extracted;

Extract key information from a resume using AI_EXTRACT

In [None]:
-- Extract contact details from one resume and flatten the JSON answers into
-- string columns.
WITH extracted AS (
    SELECT
        AI_EXTRACT(
            file => TO_FILE('@EXAMPLE_DOCS/resumes', 'resume-sample-2-13-1.pdf'),
            responseFormat => [
                ['name', 'What is the name of the resume applicant?'],
                ['email', 'What is the email of the resume applicant?'],
                ['phone', 'What is the phone number of the resume applicant?']
            ]
        ) AS json_data
)
SELECT
    json_data:response.name::STRING  AS name,
    json_data:response.email::STRING AS email,
    json_data:response.phone::STRING AS phone_number
FROM extracted;

Parse text from a resume using PARSE_DOCUMENT

In [None]:
-- Preview the text that PARSE_DOCUMENT returns for up to five staged resumes.
SELECT
    relative_path,
    SNOWFLAKE.CORTEX.PARSE_DOCUMENT('@EXAMPLE_DOCS', relative_path):content::STRING AS resume_text
FROM DIRECTORY(@EXAMPLE_DOCS)
WHERE relative_path LIKE 'resumes/%'
LIMIT 5;

Combine PARSE_DOCUMENT and AI_EXTRACT to create candidates table

In [None]:
-- Build the candidates table: one row per staged resume, combining the
-- structured fields from AI_EXTRACT with the full text from PARSE_DOCUMENT.
CREATE OR REPLACE TABLE candidates AS
WITH processed_resumes AS (
    SELECT
        relative_path,
        AI_EXTRACT(
            file => TO_FILE('@EXAMPLE_DOCS', relative_path),
            responseFormat => [
                ['name', 'What is the name of the resume applicant?'],
                ['email', 'What is the email of the resume applicant?'],
                ['phone', 'What is the phone number of the resume applicant?']
            ]
        ) AS json_data,
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT('@EXAMPLE_DOCS', relative_path):content::STRING AS resume_text
    FROM DIRECTORY(@EXAMPLE_DOCS)
    WHERE relative_path LIKE 'resumes/%'
)
SELECT
    json_data:response.name::STRING  AS name,
    json_data:response.email::STRING AS email,
    json_data:response.phone::STRING AS phone_number,
    resume_text,
    relative_path                    AS resume_file_path
FROM processed_resumes;

-- Inspect the result.
SELECT * FROM candidates;

Create Jobs table

In [None]:
-- Job postings that candidates are matched against.
-- job_id is auto-generated; created_at defaults to the insert time.
CREATE OR REPLACE TABLE JOBS (
    job_id          INTEGER AUTOINCREMENT PRIMARY KEY,
    job_title       VARCHAR,
    job_category    VARCHAR,
    job_description VARCHAR,
    created_at      TIMESTAMP_NTZ DEFAULT CURRENT_TIMESTAMP
);

-- Seed data (10 rows): each row is (job_title, job_category, job_description).
-- job_id and created_at are omitted from the column list, so they are filled
-- from the AUTOINCREMENT and DEFAULT definitions on the table.
INSERT INTO JOBS (job_title, job_category, job_description) VALUES
  ('Network Engineer', 'computer networking',
   'Design, implement, and troubleshoot enterprise LAN/WAN and wireless networks. Configure routers, switches, and firewalls with high availability. Monitor performance using SNMP/NetFlow and packet analysis. Collaborate on capacity planning and network upgrades.'),
  ('Network Security Analyst', 'computer networking',
   'Monitor and analyze network traffic for threats and anomalies. Manage firewalls, IDS/IPS, and VPNs following zero-trust principles. Investigate incidents and produce actionable remediation plans. Maintain security baselines and compliance documentation.'),
  ('Backend Software Engineer', 'software development',
   'Build scalable APIs and services with robust domain models and clean interfaces. Optimize data access patterns and background processing. Write comprehensive tests and instrumentation for reliability. Participate in code reviews and architecture discussions.'),
  ('Software Engineer', 'software development',
   'Develop responsive, accessible web interfaces with modern frameworks. Integrate APIs and manage application state efficiently. Optimize performance, bundle size, and rendering. Maintain a high-quality component library and UI tests.'),
  ('DevOps Engineer', 'software development',
   'Automate CI/CD pipelines and infrastructure as code. Improve observability with logs, metrics, and tracing. Harden deployments with blue/green and canary strategies. Optimize cost and reliability across environments.'),
  ('Data Engineer', 'software development',
   'Design and maintain reliable ETL/ELT pipelines and data models. Orchestrate workflows and ensure data quality and lineage. Tune warehouses and storage for performance. Partner with analytics and ML teams on scalable datasets.'),
  ('Marketing Manager', 'marketing',
   'Own go-to-market plans and integrated campaigns across channels. Define positioning, messaging, and audience segmentation. Track funnel metrics and ROI to optimize spend. Coordinate launches with sales and product teams.'),
  ('Digital Marketing Specialist', 'marketing',
   'Execute SEO, SEM, and paid social campaigns end-to-end. Create and test creatives and landing pages for conversion. Analyze performance and run A/B tests to improve CAC. Maintain accurate tracking and attribution.'),
  ('Corporate Trainer', 'personnel training',
   'Deliver engaging instructor-led and virtual training programs. Assess skill gaps and tailor curricula to business goals. Measure learning outcomes and iterate content. Coach subject matter experts to scale delivery.'),
  ('Instructional Designer', 'personnel training',
   'Design learner-centered courses using modern instructional frameworks. Develop eLearning modules, labs, and assessments. Align objectives with measurable outcomes. Maintain content libraries and update materials regularly.');

In [None]:
-- Join candidates to jobs, using AI_FILTER on a resume/job-description prompt
-- as the join condition.
SELECT
    j.job_title,
    j.job_description,
    c.resume_text,
    c.name,
    c.email,
    c.phone_number
FROM candidates AS c
INNER JOIN JOBS AS j
    ON AI_FILTER(PROMPT('Does the following resume {0} fit this job description {1}?', c.resume_text, j.job_description));

Classify candidates as entry level, management level, or executive

In [None]:
-- Label each candidate with one of three seniority levels, taking the first
-- label from the AI_CLASSIFY result.
SELECT
    *,
    AI_CLASSIFY(
        resume_text,
        ['entry level', 'management level', 'executive level'],
        {'task_description': 'Categorize the candidate into one of the given levels of expertise'}
    ):labels[0]::STRING AS expertise
FROM candidates;

Use AI_AGG to look across all of the machine reports

In [None]:
-- Parse every machine report to text, then ask a single question across all
-- of the reports with AI_AGG.
SELECT
    AI_AGG(report_text, 'What were the main reasons for machines not passing inspection?')
FROM (
    SELECT
        SNOWFLAKE.CORTEX.PARSE_DOCUMENT('@EXAMPLE_DOCS', relative_path):content::STRING AS report_text
    FROM DIRECTORY(@EXAMPLE_DOCS)
    WHERE relative_path LIKE 'machine_reports/%'
) AS parsed_reports;

Use AI_AGG to extract common skills across expertise levels

In [None]:
-- Temporary table pairing each resume's text with its classified level of
-- expertise (first label from the AI_CLASSIFY result).
CREATE OR REPLACE TEMPORARY TABLE expertise AS
SELECT
    resume_text,
    AI_CLASSIFY(
        resume_text,
        ['entry level', 'management level', 'executive level'],
        {'task_description': 'Categorize the candidate into one of the given levels of expertise'}
    ):labels[0]::STRING AS expertise
FROM candidates;

In [None]:
-- For each expertise level, ask AI_AGG one question across all resumes in
-- that group.
SELECT
    expertise,
    AI_AGG(
        resume_text,
        'What are some common skills across these resumes?  List them in an array []'
    ) AS summarized_resumes
FROM expertise
GROUP BY expertise;