# SIMULATE DATA WITH CORTEX

You will have already processed unstructured data with document AI.  We will use this plus some additional structured synthetic patient data to simulate further patient data.

## 1. GATHER EXISTING SYNTHETIC DATA
The table below contains some high-level information about synthetic people.  Some of these people also have linked documents about them.

In [None]:
-- Preview every column of the synthetic people table used throughout this notebook.
SELECT *
FROM DEFAULT_DATABASE.DEFAULT_SCHEMA.SAMPLE_PEOPLE_FOR_LAB

## GENERATE OTHER HISTORIC DATA
You will remember from the Cortex playground that LLMs can be used to generate synthetic data.  Let's now use the same approach to get some more patient history.  We will assume that all this information is stored electronically.

In [None]:
-- Generate one synthetic consultant-to-GP letter per row of SAMPLE_PEOPLE_FOR_LAB.
-- The prompt is assembled from three parts: the instruction, the person's
-- PERSON_INFO cast to text, and a closing instruction. Blank-line separators are
-- inserted between the parts so the instruction text does not run directly into
-- the data (previously "...their morbidity{data}Fill out any unknown details...").
-- IF NOT EXISTS means an existing table is left untouched and the LLM is not re-run.
CREATE TABLE IF NOT EXISTS DEFAULT_SCHEMA.CONSULTANT_LETTERS AS
SELECT
    NHS_NUMBER,
    SNOWFLAKE.CORTEX.COMPLETE(
        'claude-3-5-sonnet',
        CONCAT(
            'write a recent letter using markdown for formatting purposes which includes bold, bullet points and paragraphs within the last three months from the patients consultant to their GP about a recent visit to the the consultant about their morbidity',
            '\n\n',
            PERSON_INFO::text,
            '\n\n',
            'Fill out any unknown details with synthetic ones.  Do not use placeholders'
        )
    ) AS LETTER
FROM DEFAULT_DATABASE.DEFAULT_SCHEMA.SAMPLE_PEOPLE_FOR_LAB;

-- Inspect the generated letters.
SELECT * FROM DEFAULT_SCHEMA.CONSULTANT_LETTERS;

## USE PYTHON TO CREATE A DOCUMENT BROWSER
The documents here are written in markdown for formatting and can be easily visualised using python.

In [None]:
import streamlit as st
import pandas as pd
from snowflake.snowpark.functions import *
from snowflake.snowpark.types import *

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
session = get_active_session()

# Table of generated letters (columns NHS_NUMBER and LETTER).
letters = session.table('DEFAULT_SCHEMA.CONSULTANT_LETTERS')

# Options offered in the selectbox below.
NHS_NUMBER = letters.select('NHS_NUMBER')

with st.container(height=1000):
    st.markdown('### CONSULTANT LETTER')
    S_NHS_NUMBER = st.selectbox('Choose NHS Number:', NHS_NUMBER)

    # Fetch the letter by column name rather than by position so the lookup
    # does not depend on the table's column order, and only pull one row.
    matching_letters = (
        letters
        .filter(col('NHS_NUMBER') == S_NHS_NUMBER)
        .select('LETTER')
        .limit(1)
        .collect()
    )

    # An empty table (or no selection) yields no rows; show a message instead
    # of failing with an IndexError.
    if matching_letters:
        st.markdown(matching_letters[0]['LETTER'])
    else:
        st.info('No consultant letter found for the selected NHS number.')

## Generate Historic Notes

The Cortex functions work both with SQL and with Snowpark dataframes for Python.  We will now use the same prompt which we tried in the Cortex playground to get synthetic FHIR messages from an electronic system.

In [None]:
# Model name and instruction text used to generate a synthetic longitudinal
# record for each person.
model = 'claude-3-5-sonnet'
prompt = '''based on the following dataset, write a detailed synthetic longitudinal medical record which includes medications prescribed, 
when they were prescribed,any key events that have happened during the lifetime of the patient and please add made up detailed notes from the practitioner.  
Use the data provided in the prompt as a guide for building the dataset, but make up any data that doesnt exist.  
Use the dataset as a guide to make up the record. Dont include anything before 2004.  
Return the results using FHIR standards in a nested json object.  ONLY INCLUDE JSON WITHOUT NOTES'''

# Column expression that calls Cortex COMPLETE with the instruction followed by
# the row's PERSON_INFO rendered as a string. A blank line is inserted between
# the two so the instruction does not run straight into the data
# (previously "...WITHOUT NOTES{data}").
call_llm = call_function(
    'snowflake.cortex.complete',
    model,
    concat(
        lit(prompt),
        lit('\n\n'),
        col('PERSON_INFO').astype(StringType()),
    ),
)

# Lazy dataframe: every sample person plus the generated HISTORIC_NOTES column,
# without the PERSON_INFO input column.
historic_electronic_data = (
    session.table('DEFAULT_DATABASE.DEFAULT_SCHEMA.SAMPLE_PEOPLE_FOR_LAB')
    .with_column('HISTORIC_NOTES', call_llm)
    .drop('PERSON_INFO')
)

# NOTE(review): the write is left disabled as in the original; presumably
# HISTORIC_RECORD is pre-provisioned for the lab - confirm before enabling.
#historic_electronic_data.write.mode('overwrite').save_as_table("DEFAULT_SCHEMA.HISTORIC_RECORD")

In [None]:
# Read the stored historic records and replace HISTORIC_NOTES with the result
# of PARSE_JSON on that column so it can be navigated by key further down.
historic_record = session.table('DEFAULT_DATABASE.DEFAULT_SCHEMA.HISTORIC_RECORD')
notes = historic_record.with_column(
    'HISTORIC_NOTES',
    parse_json(col('HISTORIC_NOTES')),
)
notes

## Create detailed patient activity tables
Below you will create a new view of data for each Resource Type.

In [None]:
# One row per element of HISTORIC_NOTES['entry'], keeping the owning NHS_NUMBER
# and the flattened VALUE.
flattened_notes = (
    notes
    .join_table_function('flatten', col('HISTORIC_NOTES')['entry'])
    .select('NHS_NUMBER', col('VALUE'))
)


def _publish_resource_view(resource_type, column_name, view_name):
    """Build, publish and preview the rows for a single resourceType.

    Keeps the rows of ``flattened_notes`` whose VALUE['resourceType'] equals
    ``resource_type``, renames VALUE to ``column_name``, creates or replaces
    ``view_name`` over the result, writes the first two rows to the page and
    returns the dataframe.
    """
    resource_rows = (
        flattened_notes
        .filter(col('VALUE')['resourceType'] == resource_type)
        .with_column_renamed('VALUE', column_name)
    )
    resource_rows.create_or_replace_view(view_name)
    st.write(resource_rows.limit(2))
    return resource_rows


patient_details = _publish_resource_view(
    'Patient', 'PATIENT', "DEFAULT_SCHEMA.V_PATIENT_DETAILS"
)

Condition = _publish_resource_view(
    'Condition', 'CONDITION', "DEFAULT_SCHEMA.V_PATIENT_CONDITIONS"
)

MedicationStatement = _publish_resource_view(
    'MedicationStatement', 'MEDICATION', "DEFAULT_SCHEMA.V_PATIENT_MEDICATION"
)

Encounter = _publish_resource_view(
    'Encounter', 'ENCOUNTER', "DEFAULT_SCHEMA.V_ENCOUNTER"
)

## USING SUMMARIZE TO CREATE A SUMMARY OF THE MEDICAL INFORMATION

In [None]:
# Column expression that runs Cortex SUMMARIZE over HISTORIC_NOTES.
summary_call = call_function('SNOWFLAKE.CORTEX.SUMMARIZE', col('HISTORIC_NOTES'))

# Lazy dataframe with the extra SUMMARY column; nothing runs until it is
# collected or written.
summarized_notes = notes.with_column('SUMMARY', summary_call)
#summarized_notes.drop('HISTORIC_NOTES').write.mode('overwrite').save_as_table("DEFAULT_SCHEMA.ELECTRONIC_NOTES_SUMMARY")

# Display the stored summary table as the cell output.
session.table("DEFAULT_SCHEMA.ELECTRONIC_NOTES_SUMMARY")

## CREATE DAILY OBSERVATIONS

Here, we are going to create some time series data for medical observations.  We will do this for blood pressure.

In [None]:
-- One row per (person, day): every row of SAMPLE_PEOPLE_FOR_LAB paired with
-- each of 30 consecutive dates starting on 2025-04-01.
CREATE OR REPLACE VIEW DEFAULT_SCHEMA.V_GENERATE_OBSERVATION_DAYS AS
SELECT
    B.*,
    A.*
FROM (
    SELECT
        -- SEQ4() is not guaranteed to be gap-free, so the day offset is taken
        -- from ROW_NUMBER() (0..29) to guarantee consecutive dates.
        DATEADD(DAY, ROW_NUMBER() OVER (ORDER BY SEQ4()) - 1, DATE '2025-04-01') AS date
    FROM TABLE(GENERATOR(ROWCOUNT => 30))
) A
-- Every date is paired with every person; stated as an explicit CROSS JOIN
-- instead of an INNER JOIN with no join condition.
CROSS JOIN DEFAULT_DATABASE.DEFAULT_SCHEMA.SAMPLE_PEOPLE_FOR_LAB B;

-- Inspect the generated person/day combinations.
SELECT * FROM DEFAULT_SCHEMA.V_GENERATE_OBSERVATION_DAYS;

Now blood pressure readings are generated for each person in the sample.  This can take around 20 minutes to run.  Here we are being a lot more specific with the prompt engineering, such as being specific about the FHIR standard to ensure consistency across all returned values.  You will note that I am helping the prompt by specifying the exact format we would like the results to be in.  This is based on a **FHIR** standard.  Once the data is retrieved, a Streamlit browser is created to browse through the documents per day and per patient.

In [None]:
# Model and prompt fragments used to generate blood pressure readings for each
# (person, day) row of V_GENERATE_OBSERVATION_DAYS.
model2 = 'claude-3-5-sonnet'

# Placed before the person's data.
prompt2 = '''based on the following synthetic patient trial dataset,'''

# Placed before the observation date.
prompt3 = '''write 6 synthetic random but appropiate blood pressure readings, with variances during the day.  
Return the results by inserting each reading into an array using the provided json template. All data based on this date'''

# Closing constraints, placed last.
prompt4 = '''Include a blood pressure reading every 4 hours.
ONLY INCLUDE JSON WITHOUT NOTES. DO NOT TRUNCATE THE RESULTS'''

# Template for a single reading: a systolic and a diastolic component, each
# with a LOINC coding and a value in mmHg.
prompt5 = {"component": [
          {
            "code": {
              "coding": [
                {
                  "system": "http://loinc.org",
                  "code": "8480-6",
                  "display": "Systolic blood pressure"
                }
              ]
            },
            "valueQuantity": {
              "value": 138,
              "unit": "mmHg"
            }
          },
          {
            "code": {
              "coding": [
                {
                  "system": "http://loinc.org",
                  "code": "8462-4",
                  "display": "Diastolic blood pressure"
                }
              ]
            },
            "valueQuantity": {
              "value": 88,
              "unit": "mmHg"
            }
          }
        ]
      }

# Newline inserted between prompt fragments. Without it the pieces were glued
# together (e.g. "...this date2025-04-01use the json provided as a template{...}Include...").
fragment_break = lit('\n')

# Column expression calling Cortex COMPLETE with the fully assembled prompt:
# intro, person data, instructions, date, template hint, template, constraints.
call_llm = call_function(
    'snowflake.cortex.complete',
    model2,
    concat(
        lit(prompt2), fragment_break,
        col('PERSON_INFO').astype(StringType()), fragment_break,
        lit(prompt3), fragment_break,
        col('DATE').astype(StringType()), fragment_break,
        lit('use the json provided as a template'), fragment_break,
        lit(prompt5).astype(StringType()), fragment_break,
        lit(prompt4),
    ),
)

# cache_result() stores the view's rows before the BPRESSURE column is added;
# the PERSON_INFO input column is dropped from the result.
blood_pressure = (
    session.table('DEFAULT_DATABASE.DEFAULT_SCHEMA.V_GENERATE_OBSERVATION_DAYS')
    .cache_result()
    .with_column('BPRESSURE', call_llm)
    .drop('PERSON_INFO')
)

# NOTE(review): the write is left disabled as in the original; presumably
# BLOOD_PRESSURE is pre-provisioned for the lab - confirm before enabling.
#blood_pressure.write.mode('overwrite').save_as_table("DEFAULT_SCHEMA.BLOOD_PRESSURE")
# Browser over the stored BLOOD_PRESSURE table: pick a patient, then a date,
# and show the raw BPRESSURE payload for that patient and day.
with st.container(height=500):
    results = session.table('DEFAULT_SCHEMA.BLOOD_PRESSURE')
    s_patient = st.selectbox('Choose Patient:', results.select('NHS_NUMBER').distinct())
    results = results.filter(col('NHS_NUMBER') == s_patient)
    s_date = st.select_slider('choose_date:', results.select('DATE').distinct().order_by('DATE'))

    # Filter on DATE *before* projecting down to BPRESSURE: once only BPRESSURE
    # is selected, DATE is no longer part of the dataframe to filter on.
    readings = (
        results
        .filter(col('DATE') == s_date)
        .select('BPRESSURE')
        .limit(1)
        .collect()
    )

    # Guard against an empty result instead of failing with an IndexError.
    if readings:
        st.code(readings[0][0])
    else:
        st.info('No blood pressure readings found for the selected patient and date.')

### USING SEMI STRUCTURED NOTATION TO FORMAT THE RESULTS IN A TABLE

In [None]:
-- One row per blood pressure reading: the BPRESSURE JSON array is flattened and
-- the timestamp, systolic and diastolic values are extracted from each element.
CREATE OR REPLACE VIEW DEFAULT_SCHEMA.V_BLOOD_PRESSURE_VALUES AS
SELECT
    A.* EXCLUDE BPRESSURE,
    -- An hour of "24" is rewritten to "00" so the text can be cast to DATETIME.
    -- NOTE(review): this keeps the original calendar day rather than rolling
    -- over to the next day - confirm that is the intended reading time.
    REPLACE(B.VALUE:effectiveDateTime::text, 'T24', 'T00')::DATETIME AS "DATE_TIME",
    -- component[0] / component[1] follow the order of the prompt template
    -- (systolic first, then diastolic).
    B.VALUE:component[0]:valueQuantity:value::integer AS Systolic_blood_pressure,
    B.VALUE:component[1]:valueQuantity:value::integer AS Diastolic_blood_pressure
FROM DEFAULT_SCHEMA.BLOOD_PRESSURE A,
-- BPRESSURE is LLM output, so TRY_PARSE_JSON is used: a response that is not
-- valid JSON yields NULL and contributes no rows, instead of making every
-- query against the view fail.
LATERAL FLATTEN(INPUT => TRY_PARSE_JSON(A.BPRESSURE)) B;

-- Inspect the extracted readings.
SELECT * FROM DEFAULT_SCHEMA.V_BLOOD_PRESSURE_VALUES;

### VISUALISING THE BLOOD PRESSURE OBSERVATIONS USING STREAMLIT

In [None]:
# Readings for a single, hard-coded patient.
bpressure = session.table('DEFAULT_DATABASE.DEFAULT_SCHEMA.V_BLOOD_PRESSURE_VALUES')
bpressure1 = bpressure.filter(col('NHS_NUMBER') == '0053634643')

# Daily highest and lowest systolic value. max/min are the Snowpark column
# functions from the star import of snowflake.snowpark.functions, not the
# Python builtins.
bpressure_S = bpressure1.group_by('DATE').agg(
    max('Systolic_blood_pressure').alias('MAXSBP'),
    min('Systolic_blood_pressure').alias('MINSBP'),
)

# One chart per daily measure, then the individual readings over time.
for daily_measure in ('MAXSBP', 'MINSBP'):
    st.line_chart(bpressure_S, x='DATE', y=daily_measure)
st.line_chart(bpressure1, x='DATE_TIME', y='SYSTOLIC_BLOOD_PRESSURE')

### CREATE ADDITIONAL SUPPORTING DATA

Let's now focus on the two patients that we used in the document AI GP notes example

For the two patients we have documents for from Document AI, we are also going to generate additional supporting data which can be used for a patient dashboard.

In [None]:
-- Build one row per NHS_NUMBER from GP_NOTES_CLEANED with three LLM-generated
-- reports (ECG, MRI, lung function) based on that patient's collected rows.
-- IF NOT EXISTS means an existing table is left untouched and the LLM is not re-run.
CREATE TABLE IF NOT EXISTS DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS AS
WITH patient_notes AS (
    -- Collect every GP_NOTES_CLEANED row of a patient into one array of objects.
    -- Aggregating in its own step avoids referencing an aggregate through a
    -- select-list alias under GROUP BY ALL, and makes DISTINCT unnecessary.
    SELECT
        NHS_NUMBER,
        ARRAY_AGG(OBJECT_CONSTRUCT(*)) AS PATIENT_DATA
    FROM DEFAULT_SCHEMA.GP_NOTES_CLEANED
    GROUP BY NHS_NUMBER
)
SELECT
    NHS_NUMBER,
    PATIENT_DATA,
    -- A blank line separates each instruction from the data that follows it.
    SNOWFLAKE.CORTEX.COMPLETE(
        'mistral-large2',
        CONCAT('''Create a fake ecg report over a 24 hour period taking this data into account''', '\n\n', PATIENT_DATA::TEXT)
    ) AS ECG,
    SNOWFLAKE.CORTEX.COMPLETE(
        'mistral-large2',
        CONCAT('''Create a fake mri brain scan report taking this data into account''', '\n\n', PATIENT_DATA::TEXT)
    ) AS MRI,
    SNOWFLAKE.CORTEX.COMPLETE(
        'mistral-large2',
        CONCAT('''Create a fake lung function test report taking this data into account''', '\n\n', PATIENT_DATA::TEXT)
    ) AS LUNG_FUNCTION
FROM patient_notes;

-- Inspect the generated diagnostics.
SELECT * FROM DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS;

Let's now use Cortex Complete to explain the blood pressure readings of the same two patients

In [None]:
-- Combine each patient's notes summary with their generated diagnostics.
-- The join key is named explicitly instead of relying on NATURAL JOIN, which
-- silently changes meaning whenever the two tables gain another column with
-- the same name.
-- NOTE(review): assumes NHS_NUMBER is the only intended join column - confirm.
SELECT *
FROM DEFAULT_DATABASE.DEFAULT_SCHEMA.ELECTRONIC_NOTES_SUMMARY A
INNER JOIN DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS B
    USING (NHS_NUMBER)


## Combine all unstructured data and create a Patient Search Service.  
As the data is relatively short, we will not chunk it this time, we will create a search service on these patients '**as-is**'.

In [None]:
-- Stack every piece of patient text into one table with three columns:
-- NHS_NUMBER, INFORMATION_TYPE (which source the text came from) and TEXT.
-- UNION is kept so identical rows are still de-duplicated as before.
-- NOTE(review): LUNG_FUNCTION from COVID_TRIAL_DIAGNOSTICS is not included
-- here although ECG and MRI are - confirm whether that is intentional.
CREATE OR REPLACE TABLE DEFAULT_SCHEMA.PATIENT_DATA_SEARCH_SERVICE AS
SELECT
    NHS_NUMBER,
    'SUMMARY' AS INFORMATION_TYPE,
    SUMMARY AS TEXT
FROM DEFAULT_SCHEMA.ELECTRONIC_NOTES_SUMMARY

UNION

-- Previously labelled 'SUMMARY' as well, which made these rows
-- indistinguishable from the real summaries when filtering on INFORMATION_TYPE.
SELECT
    NHS_NUMBER,
    'PATIENT_DATA' AS INFORMATION_TYPE,
    PATIENT_DATA::TEXT AS TEXT
FROM DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS

UNION

SELECT
    NHS_NUMBER,
    'ECG' AS INFORMATION_TYPE,
    ECG AS TEXT
FROM DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS

UNION

SELECT
    NHS_NUMBER,
    'MRI' AS INFORMATION_TYPE,
    MRI AS TEXT
FROM DEFAULT_SCHEMA.COVID_TRIAL_DIAGNOSTICS

UNION

-- All GP appointments of a patient collapsed into a single JSON array.
SELECT
    NHS_NUMBER,
    'GP_NOTES' AS INFORMATION_TYPE,
    ARRAY_AGG(OBJECT_CONSTRUCT('APPOINTMENT DATE',APPOINTMENT_DATE,'GP NOTES',GP_NOTES,'PRESCRIPTION',PRESCRIPTION))::TEXT AS TEXT
FROM DEFAULT_SCHEMA.GP_NOTES_CLEANED
GROUP BY NHS_NUMBER;


In [None]:
-- Cortex Search service over the combined patient text table: TEXT is the
-- searchable column, NHS_NUMBER and INFORMATION_TYPE are exposed as attributes.
CREATE OR REPLACE CORTEX SEARCH SERVICE DEFAULT_SCHEMA.PATIENT_DATA
    ON TEXT
    ATTRIBUTES NHS_NUMBER, INFORMATION_TYPE
    WAREHOUSE = DEFAULT_WH
    TARGET_LAG = '1 hour'
    COMMENT = 'SEARCH SERVICE FOR PATIENT DATA'
AS
    SELECT
        NHS_NUMBER,
        INFORMATION_TYPE,
        TEXT
    FROM DEFAULT_SCHEMA.PATIENT_DATA_SEARCH_SERVICE;