# Covid-19 Vaccine Consent forms :scientist:
Building a Data Engineering Pipeline using Document AI :snowflake:

Now that our business user has used the low-code UI of Document AI to define which insights to extract, and has confirmed that the extracted insights are correct, let's build our data engineering pipeline.


# Extracting insights from multiple documents in stage

## 🪣 What documents do we need to process? 🪣

In [None]:
-- List the files currently registered in the COVID_VACCINATION stage's directory table
SELECT *
FROM DIRECTORY(@COVID_VACCINATION);

### ✨ Creating a raw table for extracted data via the **predict** model (**COVID_VACCINATION_CONSENT_FORM**) ✨

In [None]:
-- Build the raw landing table for the Covid-19 vaccination consent forms:
-- one row per file in the stage directory, carrying the file metadata plus
-- the JSON returned by the COVID_VACCINATION_CONSENT_FORM model.
CREATE OR REPLACE TABLE RAW_COVID_VACCINE_CONSENT AS
SELECT
    GET_PRESIGNED_URL(@COVID_VACCINATION, RELATIVE_PATH) AS DOC_URL,
    SIZE,
    LAST_MODIFIED,
    -- DOC_URL is the alias defined above; 3 is PREDICT's second argument
    -- (presumably the model build version -- confirm against the model in use).
    HEALTH_DB.PUBLIC.COVID_VACCINATION_CONSENT_FORM!PREDICT(DOC_URL, 3) AS DOC_META,
    RELATIVE_PATH
FROM DIRECTORY(@COVID_VACCINATION);


In [None]:
-- Inspect the rows loaded into the raw table
SELECT *
FROM RAW_COVID_VACCINE_CONSENT;

### Flattening out the data and creating a structured table

In [None]:
-- Create a view that flattens the model output into reader-friendly columns.
-- Each field is read from element [0] of its array in DOC_META and its
-- "value" attribute is cast to text.
-- NOTE(review): the alias " Date of Birth" starts with a space. It is kept
-- exactly as-is because it is the view's published column name; confirm with
-- consumers before renaming.
CREATE OR REPLACE VIEW HEALTH_DB.PUBLIC.COVID_VACCINE_CONSENTS AS
SELECT
    DOC_META,
    DOC_META:CHILDBEARING_F[0]:value::TEXT AS "Female Patient Childbearing Age (Y/N)",
    DOC_META:CONSENT[0]:value::TEXT        AS "CONSENT (Y/N)",
    DOC_META:DATE_BIRTH[0]:value::TEXT     AS " Date of Birth",
    DOC_META:GENDER[0]:value::TEXT         AS "Gender",
    DOC_META:GP_NAME[0]:value::TEXT        AS "Name of Surgery",
    DOC_META:INJECTION[0]:value::TEXT      AS "Injection Arm",
    DOC_META:NHS_NUMBER[0]:value::TEXT     AS "NHS Number",
    DOC_META:PATIENT_NAME[0]:value::TEXT   AS "Patient Name",
    DOC_META:REASON[0]:value::TEXT         AS "Reason for no vaccine",
    RELATIVE_PATH
FROM RAW_COVID_VACCINE_CONSENT;

-- Preview the view
SELECT *
FROM COVID_VACCINE_CONSENTS;

### 🔄🔄 Use a task to automatically update the table with new rows as more documents come in 🔄🔄

In [None]:
-- Task: every minute, when the stage stream reports new data, run the
-- Document AI model on the newly added files and append the results to the
-- raw table that the downstream views read from.
--
-- The SELECT reads from the stream rather than DIRECTORY(@COVID_VACCINATION):
--   * METADATA$ACTION is a stream column; a directory table does not have it.
--   * Only a DML statement that reads the stream advances its offset. Without
--     that, SYSTEM$STREAM_HAS_DATA stays TRUE and every run would reprocess
--     the whole stage.
--
-- Assumes my_pdf_stream is a stream on the COVID_VACCINATION stage's directory
-- table, created outside this section -- TODO confirm.
-- NOTE: a newly created task is suspended; start it with
--   ALTER TASK load_new_file_data RESUME;
CREATE OR REPLACE TASK load_new_file_data
  WAREHOUSE = DOC_AI_WH
  SCHEDULE = '1 minute'
  COMMENT = 'Process new files in the stage and insert data into the RAW_COVID_VACCINE_CONSENT table.'
WHEN SYSTEM$STREAM_HAS_DATA('my_pdf_stream')
AS
INSERT INTO RAW_COVID_VACCINE_CONSENT (DOC_URL, SIZE, LAST_MODIFIED, DOC_META, RELATIVE_PATH)
  SELECT
    GET_PRESIGNED_URL(@COVID_VACCINATION, RELATIVE_PATH) AS DOC_URL,
    SIZE,
    LAST_MODIFIED,
    -- Same model call and second argument as the initial load of the raw table
    HEALTH_DB.PUBLIC.COVID_VACCINATION_CONSENT_FORM!PREDICT(DOC_URL, 3) AS DOC_META,
    RELATIVE_PATH
  FROM my_pdf_stream
  WHERE METADATA$ACTION = 'INSERT';

# Preparing data for Streamlit App :page_with_curl:

The purpose of the Streamlit application is to let the business user review any documents whose extracted values have a confidence score below the defined tolerance levels.

In [None]:
-- Create a view with every extracted value next to its score, one row per
-- document, for the Streamlit review app.
CREATE OR REPLACE VIEW health_db.public.covid_ocr_score2 AS
WITH raw_docs AS (
    -- Model output (JSON) plus file metadata, as stored in the raw table
    SELECT * FROM RAW_COVID_VACCINE_CONSENT
)
-- Pull each field's value and score out of the JSON into its own column
SELECT
    RELATIVE_PATH                                        AS file_name,
    SIZE                                                 AS file_size,
    last_modified,
    GET_PRESIGNED_URL(@COVID_VACCINATION, RELATIVE_PATH) AS snowflake_file_url,
    DOC_META                                             AS json,
    DOC_META:__documentMetadata.ocrScore::FLOAT          AS ocrScore,
    DOC_META:CHILDBEARING_F[0]:value::STRING             AS Female_Patient_Childbearing,
    DOC_META:CHILDBEARING_F[0]:score::FLOAT              AS CHILDBEARING_F_score,
    DOC_META:CONSENT[0]:value::STRING                    AS CONSENT,
    DOC_META:CONSENT[0]:score::FLOAT                     AS CONSENT_score,
    DOC_META:DATE_BIRTH[0]:value::STRING                 AS DATE_BIRTH,
    DOC_META:DATE_BIRTH[0]:score::FLOAT                  AS DATE_BIRTH_score,
    DOC_META:GENDER[0]:value::STRING                     AS GENDER,
    DOC_META:GENDER[0]:score::FLOAT                      AS GENDER_score,
    DOC_META:GP_NAME[0]:value::STRING                    AS GP_NAME,
    DOC_META:GP_NAME[0]:score::FLOAT                     AS GP_NAME_score,
    DOC_META:INJECTION[0]:value::STRING                  AS INJECTION,
    DOC_META:INJECTION[0]:score::FLOAT                   AS INJECTION_score,
    DOC_META:NHS_NUMBER[0]:value::STRING                 AS NHS_NUMBER,
    DOC_META:NHS_NUMBER[0]:score::FLOAT                  AS NHS_NUMBER_score,
    DOC_META:PATIENT_NAME[0]:value::STRING               AS PATIENT_NAME,
    DOC_META:PATIENT_NAME[0]:score::FLOAT                AS PATIENT_NAME_score,
    DOC_META:REASON[0]:value::STRING                     AS REASON,
    DOC_META:REASON[0]:score::FLOAT                      AS REASON_score
FROM raw_docs;

In [None]:
from snowflake.snowpark.context import get_active_session
import streamlit as st
# Read the scored view through the notebook's active Snowpark session.
session = get_active_session()
table = session.table('health_db.public.covid_ocr_score2')

st.markdown('### DATA QUALITY SCORES')

# Each inner list is one row of four charts: (heading, score column, bar colour).
chart_rows = [
    [
        ('INJECTION', 'INJECTION_SCORE', '#fc8702'),
        ('GP NAME', 'GP_NAME_SCORE', '#7bcbff'),
        ('GENDER', 'GENDER_SCORE', '#ba78e5'),
        ('REASON', 'REASON_SCORE', '#fc8702'),
    ],
    [
        ('NHS NUMBER', 'NHS_NUMBER_SCORE', '#7bcbff'),
        ('PATIENT NAME', 'PATIENT_NAME_SCORE', '#ba78e5'),
        ('DATE OF BIRTH', 'DATE_BIRTH_SCORE', '#7bcbff'),
        ('CONSENT', 'CONSENT_SCORE', '#ba78e5'),
    ],
]

# Lay out a fresh set of four columns per row and draw one bar chart of
# score-by-file in each column.
for chart_row in chart_rows:
    for column, (heading, score_column, bar_color) in zip(st.columns(4), chart_row):
        with column:
            st.markdown(heading)
            st.bar_chart(table, y=score_column, x='FILE_NAME', color=bar_color)