In [None]:
import json
import os
import sys
import time
from typing import List

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

import snowflake.core
import snowflake.snowpark as snowpark
from snowflake.core import Root
from snowflake.cortex import complete
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session

# Environment flag read by TruLens; presumably switches it to
# OpenTelemetry-based tracing -- confirm against the TruLens docs.
os.environ["TRULENS_OTEL_TRACING"] = "1"

# Reuse the Snowpark session that is already active for this notebook and
# wrap it in the snowflake.core Root object used for resource lookups below.
session = get_active_session()
root = Root(session)

In [None]:
# Names of the Snowflake objects this demo works against.
DB_NAME, SCHEMA_NAME = "SUMMIT_25_AI_OBS_DEMO", "DATA"
STAGE_NAME, WH_NAME = "DOCS", "COMPUTE_WH"

In [None]:
# Access the Cortex Search service built in the 1st notebook, run one sample
# query against it, and report how long the lookup + search took.
test_query = "What is the performance of Cortex Search?"

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() is wall-clock time and can jump if the clock is adjusted.
t1 = time.perf_counter()

cortex_search_service = (
    root
    .databases[DB_NAME]
    .schemas[SCHEMA_NAME]
    .cortex_search_services["SNOWFLAKE_BLOG_RETRIEVAL"]
)
resp = cortex_search_service.search(
    query=test_query,
    columns=["SEARCH_COL"],
    limit=10,
    # Request confidence scores; later cells read them from each result's
    # "@CONFIDENCE_SCORE" key.
    experimental={"returnConfidenceScores": True},
)
t2 = time.perf_counter()

# Elapsed time converted from seconds to milliseconds.
f_time = 1000 * (t2 - t1)
print(f"Execution time: {f_time:.2f} milliseconds")

# Keep only the text column of each hit; an empty/None result set becomes [].
search_results = [row["SEARCH_COL"] for row in resp.results] if resp.results else []

search_results

In [None]:
# Filter out results with a confidence score below confidence_score_threshold
confidence_score_threshold = 1

# resp.results can be empty/None when nothing matched (the search cell guards
# for the same case), so fall back to an empty list instead of raising.
filtered_results = [
    hit
    for hit in (resp.results or [])
    if int(hit['@CONFIDENCE_SCORE']) >= confidence_score_threshold
]

# Text of the surviving hits, in the order the service returned them.
context_chunks = [hit['SEARCH_COL'] for hit in filtered_results]

context_chunks

In [None]:
# Time the single-comprehension form of the confidence-score filter.
# It uses the same threshold as the filter/map cell so the two timings compare
# like with like. A one-shot measurement this small is noisy; use %timeit for a
# trustworthy comparison.
t1 = time.perf_counter()

comprehension_chunks = [
    d['SEARCH_COL']
    for d in (resp.results or [])  # empty/None result set -> no chunks
    if int(d['@CONFIDENCE_SCORE']) >= confidence_score_threshold
]

t2 = time.perf_counter()

# Elapsed time converted from seconds to milliseconds.
f_time = 1000 * (t2 - t1)

print(f"Execution time: {f_time:.2f} milliseconds")

In [None]:
# Download one Snowflake documentation page and split its visible text into
# sections. `requests` and `BeautifulSoup` are imported in the notebook's
# first (imports) cell so this cell works on a fresh kernel.
page = requests.get(
    'https://docs.snowflake.com/en/user-guide/snowflake-cortex/cortex-analyst/custom-instructions',
    timeout=30,  # never hang the notebook on an unresponsive server
)
page.raise_for_status()  # fail loudly instead of parsing an HTTP error page
respo = page.text
soupo = BeautifulSoup(respo, 'html.parser')

# Get all text content, one text node per line with surrounding whitespace stripped
text_content = soupo.get_text(separator='\n', strip=True)

# Split on the pilcrow character that appears in the extracted page text.
text_content.split('¶')

In [None]:
import requests
from bs4 import BeautifulSoup

def search_snow_docs(query):
    """Search docs.snowflake.com and return the links found on the results page.

    Args:
        query: Free-text search phrase. It is sent as the ``q`` query-string
            parameter and URL-encoded by ``requests``.

    Returns:
        list[str]: Every absolute ``https://`` link on the results page, in
        page order, excluding a fixed set of site-navigation links. An empty
        list is returned when the HTTP request fails.
    """
    # Site-navigation links that are not search hits. Filtering against a set
    # drops each one independently, whether or not the others are present.
    navigation_links = {
        'https://docs.snowflake.com',
        'https://status.snowflake.com',
        'https://other-docs.snowflake.com/en/opencatalog/overview',
    }

    try:
        response = requests.get(
            "https://docs.snowflake.com/search",
            params={"q": query},
            timeout=30,  # never hang the notebook on an unresponsive server
        )
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return []

    # Parse the result page and keep absolute https links only.
    soup = BeautifulSoup(response.text, 'html.parser')
    return [
        anchor['href']
        for anchor in soup.find_all('a', href=True)
        if anchor['href'].startswith("https://")
        and anchor['href'] not in navigation_links
    ]
    

# Run a sample search and print each returned item on its own line.
all_links = search_snow_docs("Cortex Analyst Custom instructions")

if not all_links:
    print("No links found or an error occurred.")
else:
    for link in all_links:
        print(link)

In [None]:
# Create the RAGWithObservability class to structure the RAG pipeline
from snowflake.cortex import complete
from trulens.core.otel.instrument import instrument
from trulens.otel.semconv.trace import SpanAttributes


class RAGWithObservability():
    """Retrieve / augment / generate pipeline with TruLens-instrumented stages.

    Each stage is a separate method wrapped in ``@instrument`` so it is
    recorded as its own span; ``query_app`` chains them and is the record root.
    """

    def __init__(self, llm_model, retriever):
        """Store the model identifier and the retriever used by the pipeline.

        Args:
            llm_model: Value passed as the first argument to ``complete``.
            retriever: Object whose ``retrieve(query)`` method supplies contexts.
        """
        self.retriever = retriever
        self.llm_model = llm_model

    # The @instrument decorator traces the individual stages of the RAG application.
    @instrument(
        span_type=SpanAttributes.SpanType.RETRIEVAL,
        attributes={
            SpanAttributes.RETRIEVAL.QUERY_TEXT: "query",
            SpanAttributes.RETRIEVAL.RETRIEVED_CONTEXTS: "return",
        },
    )
    def retrieve_context(self, query: str) -> List[str]:
        """Return whatever ``self.retriever.retrieve`` yields for ``query``."""
        retrieved = self.retriever.retrieve(query)
        return retrieved

    @instrument()
    def augment_prompt(self, query: str, contexts: list) -> str:
        """Build the LLM prompt from fixed instructions, the contexts and the question.

        Args:
            query: The user's question.
            contexts: Strings to place in the prompt; they are joined with a
                single space.

        Returns:
            The complete prompt text.
        """
        joined_contexts = ' '.join(contexts)
        return f"""
        You are an expert assistant extracting information from context provided.
        Answer the question based on the context. Be concise and do not hallucinate.
        If you don't have the information, just say so.
        Context: {joined_contexts}
        Question: {query}
        Answer:
        """

    @instrument(span_type=SpanAttributes.SpanType.GENERATION)
    def generate_completion(self, query: str):
        """Call ``complete`` with this instance's model and ``query``.

        ``query_app`` passes the fully augmented prompt as ``query``.
        """
        return complete(self.llm_model, query)

    @instrument(
        span_type=SpanAttributes.SpanType.RECORD_ROOT,
        attributes={
            SpanAttributes.RECORD_ROOT.INPUT: "query",
            SpanAttributes.RECORD_ROOT.OUTPUT: "return",
        },
    )
    def query_app(self, query: str) -> str:
        """Run the full pipeline for ``query``: retrieve, build the prompt, generate."""
        retrieved_contexts = self.retrieve_context(query)
        augmented_prompt = self.augment_prompt(query, retrieved_contexts)
        return self.generate_completion(augmented_prompt)

In [None]:
import streamlit as st

class CortexSearchRetriever:
    """Adapter that exposes ``retrieve(query)`` on top of a Cortex Search service.

    ``RAGWithObservability.retrieve_context`` calls ``retriever.retrieve(query)``
    and expects a list of context strings back.
    """

    def __init__(self, search_service, limit: int = 10):
        """
        Args:
            search_service: Cortex Search service object exposing ``search(...)``.
            limit: Maximum number of hits requested per query.
        """
        self._search_service = search_service
        self._limit = limit

    def retrieve(self, query: str) -> List[str]:
        """Return the SEARCH_COL text of the hits for ``query`` ([] when there are none)."""
        response = self._search_service.search(
            query=query,
            columns=["SEARCH_COL"],
            limit=self._limit,
        )
        return [row["SEARCH_COL"] for row in response.results] if response.results else []


# `retriever` was referenced here without being defined anywhere in the
# notebook, which raises NameError on a fresh kernel. Build it explicitly from
# the search service created in the search cell.
retriever = CortexSearchRetriever(cortex_search_service)

#Define LLM classes: one RAG pipeline per model, all sharing the same retriever
llama_rag = RAGWithObservability('llama3.1-8b', retriever)
mistral7b_rag = RAGWithObservability('mistral-7b', retriever)
claude_rag = RAGWithObservability('claude-3-5-sonnet', retriever)

#print Query
print(f"Query: {test_query}")

#Get and print responses (same question to every model for side-by-side reading)
llama_response = llama_rag.query_app(test_query)
st.write(f"**Llama response** -  {llama_response} \n")

mistral_response = mistral7b_rag.query_app(test_query)
st.write(f"**Mistral-7b response** - {mistral_response} \n")

claude_response = claude_rag.query_app(test_query)
st.write(f"**Claude response** -  {claude_response} \n")

## Step 6: Observe and Evaluate LLM Performance with AI Observability (powered by TruLens)

**Adding Observability and Evaluation to our RAG application**

Here, we enhance the Retrieval-Augmented Generation (RAG) process by introducing observability. Observability ensures that LLM responses can be measured and evaluated based on various feedback metrics, providing insights into the model's performance and areas for improvement.

**How This Works**

We will use a feature called AI Observability to register our recently created applications in Snowflake. This will allow users to pass in prompts to these applications, and trace each step the application takes to Retrieve appropriate context, Augment a system prompt with additional context and Generate a complete answer for the given prompt. 

From there we will use LLM-as-a-Judge based evaluations to measure LLM performance based on **feedback metrics** including:
- **Answer Relevance** - Evaluates how directly the LLM's response addresses the user's prompt.
- **Context Relevance** - Assesses the relevance of the retrieved context to the user's prompt.
- **Groundedness**  - Measures how well the LLM's response is anchored in the retrieved context.
- **Coherence** - Evaluates how logically structured and easy to follow the LLM's response is.

In [None]:
# Read the raw bytes of the diagram from its stage location (no decompression)
image = session.file.get_stream(
    "@SKO_SKORAGHOP_LIVE_PROD.HOP.RAG/AIObservability.jpg",
    decompress=False,
).read()

# Render the bytes as an image, 800 wide
st.image(image, width=800)

In [None]:
# from trulens.core import TruSession
from trulens.apps.app import TruApp
from trulens.connectors.snowflake import SnowflakeConnector

# Connector that binds TruLens to the notebook's Snowpark session.
tru_snowflake_connector = SnowflakeConnector(snowpark_session=session)

# All three model variants are registered under one app name and are told
# apart by their app_version string.
app_name = "test_sko_app_update"
version_num = 'v0'


def _register_tru_app(rag_app, model_prefix):
    """Wrap ``rag_app`` in a TruApp versioned as ``<model_prefix>_test_<version_num>``."""
    return TruApp(
        rag_app,
        app_name=app_name,
        app_version=f"{model_prefix}_test_{version_num}",
        connector=tru_snowflake_connector,
    )


tru_rag_mistral = _register_tru_app(mistral7b_rag, "mistral")
tru_rag_llama = _register_tru_app(llama_rag, "llama")
tru_rag_claude = _register_tru_app(claude_rag, "claude")

In [None]:
import pandas as pd

# Questions sent to every model during the evaluation runs.
prompts = [
    "What are some metrics to measure the quality of a retrieval system?",
    "Can I have a back-and-forth conversation with Cortex?",
    "Does Snowflake support text-to-sql? What services would support this?",
    "What year was the war of 1812?",
    "Tell me a story about Snowflake Cortex",
]

# Single-column frame with one prompt per row under the QUERY column.
batch_data = pd.DataFrame(prompts, columns=["QUERY"])
batch_data

In [None]:
from trulens.core.run import Run
from trulens.core.run import RunConfig

# Judge model used for LLM-as-a-Judge metrics. It is named explicitly for
# every run so the three models are scored by the same judge and their metric
# values are comparable.
LLM_JUDGE_NAME = "llama3.1-70b"


def _build_run_config(model_key: str) -> RunConfig:
    """Build the evaluation RunConfig for one model.

    Args:
        model_key: Lower-case model key (e.g. ``"mistral"``). It becomes the
            run-name prefix and, upper-cased, the run label.

    Returns:
        A RunConfig named ``<model_key>_exp_<version_num>`` that reads its
        inputs from the QUERY column of a DataFrame dataset.
    """
    return RunConfig(
        run_name=f"{model_key}_exp_{version_num}",
        description="questions about snowflake AI cababilities",
        dataset_name="SNOW_RAG_DF1",
        source_type="DATAFRAME",
        label=model_key.upper(),
        llm_judge_name=LLM_JUDGE_NAME,
        # Map the record-root input of the app to the QUERY column.
        dataset_spec={
            "RECORD_ROOT.INPUT": "QUERY",
        },
    )


mistral_run_config = _build_run_config("mistral")
llama_run_config = _build_run_config("llama")
claude_run_config = _build_run_config("claude")

In [None]:
# Attach each run configuration to its registered app, in the order
# Mistral, Llama, Claude, and keep the returned run handles.
mistral_run, llama_run, claude_run = [
    tru_app.add_run(run_config=config)
    for tru_app, config in (
        (tru_rag_mistral, mistral_run_config),
        (tru_rag_llama, llama_run_config),
        (tru_rag_claude, claude_run_config),
    )
]

In [None]:
# Start the Mistral run with batch_data as its input DataFrame.
# NOTE(review): the message below assumes start() returns only after the run
# has finished -- confirm against the TruLens Run documentation.
mistral_run.start(input_df=batch_data)
print("Finished mistral run")

In [None]:
# Start the Llama run with batch_data as its input DataFrame.
# NOTE(review): the message below assumes start() returns only after the run
# has finished -- confirm against the TruLens Run documentation.
llama_run.start(input_df=batch_data)
print("Finished Llama run")

In [None]:
# Start the Claude run with batch_data as its input DataFrame.
# NOTE(review): the message below assumes start() returns only after the run
# has finished -- confirm against the TruLens Run documentation.
claude_run.start(input_df=batch_data)
print("Finished Claude run")

In [None]:
# Report the current status of every run, one line per model.
for model_label, model_run in (
    ("Mistral", mistral_run),
    ("Llama", llama_run),
    ("Claude", claude_run),
):
    print(f"{model_label}: {model_run.get_status()}")

In [None]:
# Kick off LLM-as-a-Judge evaluation of the Mistral run for these metrics.
mistral_metrics = [
    "coherence",
    "answer_relevance",
    "context_relevance",
    "groundedness",
]

mistral_run.compute_metrics(mistral_metrics)

In [None]:
# Kick off LLM-as-a-Judge evaluation of the Llama run for these metrics.
llama_metrics = [
    "coherence",
    "answer_relevance",
    "context_relevance",
    "groundedness",
]

llama_run.compute_metrics(llama_metrics)

In [None]:
# Kick off LLM-as-a-Judge evaluation of the Claude run for these metrics.
claude_metrics = [
    "coherence",
    "answer_relevance",
    "context_relevance",
    "groundedness",
]

claude_run.compute_metrics(claude_metrics)

In [None]:
# Report the current status of every run, one line per model.
for model_label, model_run in (
    ("Mistral", mistral_run),
    ("Llama", llama_run),
    ("Claude", claude_run),
):
    print(f"{model_label}: {model_run.get_status()}")

In [None]:
import streamlit as st

def _first_value(sql_text):
    """Run ``sql_text`` on the session and return the first column of its first row."""
    return session.sql(sql_text).collect()[0][0]


# Look up the identifiers that make up the evaluations page URL.
org_name = _first_value('SELECT CURRENT_ORGANIZATION_NAME()')
account_name = _first_value('SELECT CURRENT_ACCOUNT_NAME()')
db_name = _first_value('SELECT CURRENT_DATABASE()')
schema_name = _first_value('SELECT CURRENT_SCHEMA()')

# Link to the registered application's evaluations page (app name upper-cased).
evaluations_url = f'https://app.snowflake.com/{org_name}/{account_name}/#/ai-evaluations/databases/{db_name}/schemas/{schema_name}/applications/{app_name.upper()}'
st.write(evaluations_url)

In [None]:
# Read the raw bytes of the screenshot from its stage location (no decompression)
image = session.file.get_stream(
    "@SKO_SKORAGHOP_LIVE_PROD.HOP.RAG/AIObsApp.jpg",
    decompress=False,
).read()

# Render the bytes as an image, 800 wide
st.image(image, width=800)

In [None]:
# First closing image: read its bytes from the stage, then render it.
image1 = session.file.get_stream(
    "@SKO_SKORAGHOP_LIVE_PROD.HOP.RAG/Anthropic.jpg",
    decompress=False,
).read()
st.image(image1, width=800)

# Second closing image, handled the same way.
image2 = session.file.get_stream(
    "@SKO_SKORAGHOP_LIVE_PROD.HOP.RAG/Summary2.jpg",
    decompress=False,
).read()
st.image(image2, width=800)