AI Engineering Bootcamp Cohort 4 Midterm

In [1]:
import nest_asyncio

# Applied once, before any other library is imported. Presumably needed so that
# async library code can run inside the notebook's already-running event loop —
# confirm against the nest_asyncio documentation.
nest_asyncio.apply()

In [2]:
import os
# Set before any embedding model is loaded. Presumably disables Hugging Face
# tokenizers parallelism (and its fork warning) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

#### Install our key components for RAG etc

In [3]:
!pip install -q langchain
!pip install -q langchain-core==0.2.27 langchain-community==0.2.10
!pip install -q langchain-experimental==0.0.64 langgraph-checkpoint==1.0.6 langgraph==0.2.16 langchain-qdrant==0.1.3
!pip install -q langchain-openai==0.1.9
!pip install -q ragas==0.1.16

#### Install our vector store - Qdrant

In [4]:
!pip install -qU qdrant-client==1.11.2

#### Install supporting utilities

In [5]:
!pip install -qU tiktoken==0.7.0 pymupdf==1.24.10

Environment Variables

- Get an OpenAI API key; some of the OpenAI models are used below.

In [35]:
import os
import getpass

# Use the key already present in the environment when there is one; an unset or
# empty variable falls through to a hidden interactive prompt.
openai_api_key = os.environ.get("OPENAI_API_KEY") or getpass.getpass("OpenAI API Key: ")

# Publish the key through the process environment for the rest of the notebook.
os.environ["OPENAI_API_KEY"] = openai_api_key

#### Set up our starting inputs and state and read in the documents

This allows us to do a lot of flexible testing to identify better decisions.


In [7]:
from classes.app_state import AppState
from utilities.doc_utilities import get_documents
# Source PDFs for the corpus: the AI Bill of Rights blueprint and NIST AI 600-1.
document_urls = [
    "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
     "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
]

# Shared application state that is passed to every helper in this notebook.
app_state = AppState()
app_state.set_debug(False)

app_state.set_document_urls(document_urls)

# Presumably fetches and loads the PDFs onto app_state (later cells read
# app_state.documents) — confirm in utilities.doc_utilities.
get_documents(app_state)


Set up our first model run

In [8]:
from classes.model_run_state import ModelRunState
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_openai import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from utilities.vector_utilities import create_vector_store

# Baseline run: gpt-4o-mini for answering, OpenAI text-embedding-3-small for
# retrieval, chunk size 1000 with overlap 100.
model_1000_100_state = ModelRunState()
model_1000_100_state.name = "TE3/1000/100"
model_1000_100_state.chunk_size = 1000
model_1000_100_state.chunk_overlap = 100

model_1000_100_state.qa_model_name = "gpt-4o-mini"
model_1000_100_state.qa_model = ChatOpenAI(model=model_1000_100_state.qa_model_name)

# the openai embedding model
model_1000_100_state.embedding_model_name = "text-embedding-3-small"
model_1000_100_state.embedding_model = OpenAIEmbeddings(model=model_1000_100_state.embedding_model_name)

# Presumably chunks, embeds and indexes the documents and sets `.retriever` on the
# run state (the next cells read model_1000_100_state.retriever) — confirm in
# utilities.vector_utilities.
create_vector_store(app_state, model_1000_100_state)


Vector store created


Test the retriever

In [9]:
# Smoke test: show the top-ranked chunk for a question the corpus should answer.
query = "How should you be protected from abusive data practices "
# Retrievers are Runnables; invoke() replaces the deprecated
# get_relevant_documents() and returns the same list of documents.
results = model_1000_100_state.retriever.invoke(query)

print(results[0].page_content)
print(results[0].metadata)
print("---")



You should be protected from abusive data practices via built-in 
protections and you should have agency over how data about 
you is used. You should be protected from violations of privacy through 
design choices that ensure such protections are included by default, including 
ensuring that data collection conforms to reasonable expectations and that 
only data strictly necessary for the specific context is collected. Designers, de­
velopers, and deployers of automated systems should seek your permission 
and respect your decisions regarding collection, use, access, transfer, and de­
letion of your data in appropriate ways and to the greatest extent possible; 
where not possible, alternative privacy by design safeguards should be used. 
Systems should not employ user experience and design decisions that obfus­
cate user choice or burden users with defaults that are privacy invasive. Con­
sent should only be used to justify collection of data in cases where it can be 
appropriately and

  warn_deprecated(


In [21]:
# Off-topic query: list every chunk the retriever returns for it.
query = "tell me about Karen Hao"
# Retrievers are Runnables; invoke() replaces the deprecated
# get_relevant_documents() and returns the same list of documents.
results = model_1000_100_state.retriever.invoke(query)

for result in results:
    print(result.page_content)
    print(result.metadata)
    print("---")

BLUEPRINT FOR AN 
AI BILL OF 
RIGHTS 
MAKING AUTOMATED 
SYSTEMS WORK FOR 
THE AMERICAN PEOPLE 
OCTOBER 2022
{'source': 'Blueprint for an AI Bill of Rights', 'document_id': '0c225ced-207c-4a0a-9925-28a6ad81a2ac', 'chunk_number': 1, '_id': '6913fb47c5384456bc64cc97986fc37b', '_collection_name': 'e1243742212d4c6eb16a99b4e28e318a'}
---
Table of Contents 
1. 
Introduction ..............................................................................................................................................1 
2. 
Overview of Risks Unique to or Exacerbated by GAI .....................................................................2 
3. 
Suggested Actions to Manage GAI Risks ......................................................................................... 12 
Appendix A. Primary GAI Considerations ............................................................................................... 47 
Appendix B. References .............................................................

In [12]:
from utilities.templates import get_qa_prompt
from langchain_openai import ChatOpenAI
from operator import itemgetter
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from utilities.debugger import dprint

def create_rag_chain(app_state, model_run_state):
    """Assemble the retrieval-augmented QA chain and store it on ``model_run_state``.

    The chain takes ``{"question": ...}``, retrieves context for the question with
    ``model_run_state.retriever``, answers through the prompt from
    ``get_qa_prompt()`` and ``model_run_state.qa_model``, and yields a dict with
    the keys ``"response"`` and ``"context"``.

    A prompt-only chain and the full chain are each invoked once as a smoke test;
    their results are handed to ``dprint``.

    Side effects: sets ``model_run_state.rag_qa_chain`` and prints a confirmation.
    Returns ``None``.
    """
    prompt = get_qa_prompt()
    llm = model_run_state.qa_model

    # Smoke test 1: prompt and model only, with an empty context.
    prompt_only_chain = prompt | llm
    warmup_inputs = {"question": "Can you give me a summary of the 2 documents", "context": ""}
    dprint(app_state, prompt_only_chain.invoke(warmup_inputs))

    # Stage 1: retrieve context for the question and carry the question forward.
    retrieve_stage = {
        "context": itemgetter("question") | model_run_state.retriever,
        "question": itemgetter("question"),
    }
    # Stage 2: re-assigns "context" to itself; kept so the pipeline is unchanged.
    carry_stage = RunnablePassthrough.assign(context=itemgetter("context"))
    # Stage 3: generate the answer and return the retrieved context next to it.
    answer_stage = {
        "response": prompt | llm,
        "context": itemgetter("context"),
    }
    chain = retrieve_stage | carry_stage | answer_stage

    # Smoke test 2: run the full chain once and report what came back.
    smoke = chain.invoke({"question": "What is the AI Bill of Rights "})
    dprint(app_state, smoke)
    dprint(app_state, smoke["response"].content)
    dprint(app_state, f"Number of found context: {len(smoke['context'])}")

    model_run_state.rag_qa_chain = chain
    print("RAG Chain Created")

create_rag_chain(app_state, model_1000_100_state)

Let's get the synthetic data generation (SDG) done for evaluation

In [13]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from classes.ragas_state import RagasState
from ragas.testset.generator import TestsetGenerator
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# create document chunks
def Create_chunks_for_ragas(app_state, ragas_state):
    """Split every loaded document into chunks for test-set generation.

    Each entry of ``app_state.documents`` is split using ``ragas_state.chunk_size``
    and ``ragas_state.chunk_overlap`` (lengths measured with ``len``). The chunks of
    all documents are concatenated in document order and stored on
    ``ragas_state.chunks``. Per-document and total chunk counts are printed.

    Returns ``None``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=ragas_state.chunk_size,
        chunk_overlap=ragas_state.chunk_overlap,
        length_function=len,
    )

    # One flat list across all documents so the test set can draw from each of them.
    all_chunks = []
    for entry in app_state.documents:
        chunks = splitter.split_documents(entry["loaded_document"])
        print(f"Num chumks: {len(chunks)}")
        all_chunks.extend(chunks)

    print(f"Total chunks: {len(all_chunks)}")
    ragas_state.chunks = all_chunks
    print()

# create the questions
def create_questions_for_ragas(app_state, ragas_state):
    """Generate the synthetic Ragas test set and store it as a DataFrame.

    Uses ``ragas_state.generator_llm`` and ``ragas_state.critic_llm`` as the chat
    model names, default ``OpenAIEmbeddings()``, and generates
    ``ragas_state.num_questions`` questions from ``ragas_state.chunks`` with the
    question-type mix in ``ragas_state.distributions``.

    Side effects: sets ``ragas_state.testset_df`` and prints a confirmation.
    ``app_state`` is accepted for signature parity with the other helpers and is
    not read. Returns ``None``.
    """
    generator_llm = ChatOpenAI(model=ragas_state.generator_llm)
    critic_llm = ChatOpenAI(model=ragas_state.critic_llm)
    embeddings = OpenAIEmbeddings()

    generator = TestsetGenerator.from_langchain(
        generator_llm,
        critic_llm,
        embeddings
    )

    testset = generator.generate_with_langchain_docs(
        ragas_state.chunks,
        ragas_state.num_questions,
        ragas_state.distributions)

    # Convert once; bare expressions inside a function are not displayed, so the
    # former no-op statements (including an index into test_data that raised on an
    # empty test set) are gone.
    ragas_state.testset_df = testset.to_pandas()
    print("Ragas questions created")

# RagasState supplies the settings read by the two helpers: chunk_size,
# chunk_overlap, generator_llm, critic_llm, num_questions and distributions.
ragas_state = RagasState()
# Order matters: question generation reads ragas_state.chunks set by the first call.
Create_chunks_for_ragas(app_state, ragas_state)
create_questions_for_ragas(app_state, ragas_state)


Num chumks: 439
Num chumks: 322
Total chunks: 761



Filename and doc_id are the same for all nodes.                     
Generating: 100%|██████████| 3/3 [00:12<00:00,  4.24s/it]


Ragas questions created


Generate answers based on the pipeline we have created

In [14]:
from datasets import Dataset
def create_answers(app_state, model_run_state, ragas_state):
    """Answer every test-set question with the run's RAG chain.

    Reads the ``"question"`` and ``"ground_truth"`` columns of
    ``ragas_state.testset_df``, invokes ``model_run_state.rag_qa_chain`` once per
    question, and collects the answer text plus the ``page_content`` of each
    retrieved context document.

    Side effects: sets ``model_run_state.response_dataset`` to a Hugging Face
    ``Dataset`` with the columns ``question``, ``answer``, ``contexts`` and
    ``ground_truth``, and prints a confirmation. ``app_state`` is not read.
    Returns ``None``.
    """
    test_questions = ragas_state.testset_df["question"].values.tolist()
    test_groundtruths = ragas_state.testset_df["ground_truth"].values.tolist()

    answers = []
    contexts = []
    for question in test_questions:
        response = model_run_state.rag_qa_chain.invoke({"question": question})
        answers.append(response["response"].content)
        contexts.append([context.page_content for context in response["context"]])

    # Wrap it in a huggingface dataset. No row is indexed afterwards, so an empty
    # test set no longer raises here.
    model_run_state.response_dataset = Dataset.from_dict({
        "question": test_questions,
        "answer": answers,
        "contexts": contexts,
        "ground_truth": test_groundtruths,
    })
    print("Answers created - ready for Ragas evaluation")

create_answers(app_state, model_1000_100_state, ragas_state)

Answers created - ready for Ragas evaluation


Evaluation

In [15]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    answer_correctness,
    context_recall,
    context_precision,
)
def run_ragas_evaluation(app_state, model_run_state):
    """Score ``model_run_state.response_dataset`` with Ragas.

    Evaluates faithfulness, answer relevancy, context recall, context precision
    and answer correctness, in that order.

    Side effects: sets ``model_run_state.ragas_results`` and prints a confirmation.
    ``app_state`` is not read. Returns ``None``.
    """
    model_run_state.ragas_results = evaluate(
        model_run_state.response_dataset,
        [
            faithfulness,
            answer_relevancy,
            context_recall,
            context_precision,
            answer_correctness,
        ],
    )
    print("Ragas evaluation complete")
run_ragas_evaluation(app_state, model_1000_100_state)

Evaluating: 100%|██████████| 15/15 [00:13<00:00,  1.12it/s]


Ragas evaluation complete


In [29]:
# Report the baseline (TE3) run: configuration, aggregate scores, then the
# per-question score table.
# NOTE(review): the saved output of this cell lists the Snowflake embedding model
# for this TE3 run, and execution counts are out of order — re-run from a fresh
# kernel and confirm that parameters() reports per-run state.
model_1000_100_state.parameters()
#model_1000_100_state.results_summary()
model_1000_100_state.results()

print(model_1000_100_state.ragas_results)
results_df = model_1000_100_state.ragas_results.to_pandas()
results_df

Base model: gpt-4o-mini
Embedding model: Snowflake/snowflake-arctic-embed-m
Chink size: 1000
Chink overlap: 100
{'faithfulness': 0.6359, 'answer_relevancy': 0.9478, 'context_recall': 0.8333, 'context_precision': 1.0000, 'answer_correctness': 0.6633}


Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,"How can new GAI policies, procedures, and proc...","The new policies, procedures, and processes fo...","[19 \nGV-4.1-003 \nEstablish policies, procedu...","New GAI policies, procedures, and processes ca...",0.416667,0.94098,1.0,1.0,0.549834
1,How does confirmation bias contribute to poten...,Confirmation bias can significantly contribute...,[Algorithmic \nDiscrimination \nProtections \n...,Confirmation bias contributes to potentially i...,0.5625,0.966214,0.5,1.0,0.844938
2,What resources on AI risk management are avail...,The National Institute of Standards and Techno...,[NIST Trustworthy and Responsible AI \nNIST A...,The National Institute of Standards and Techno...,0.928571,0.936206,1.0,1.0,0.595151


In [67]:
# Second run: same QA model and chunking as the baseline, with the
# Snowflake arctic-embed-m model swapped in for the embeddings.
snowflake_base_state = ModelRunState()
snowflake_base_state.name = "Snowflake_Base/1000/100"
snowflake_base_state.qa_model_name = "gpt-4o-mini"
snowflake_base_state.qa_model = ChatOpenAI(model=snowflake_base_state.qa_model_name)

# snowflake embedding model
snowflake_base_state.embedding_model_name = "Snowflake/snowflake-arctic-embed-m"
snowflake_base_state.embedding_model = HuggingFaceEmbeddings(model_name=snowflake_base_state.embedding_model_name)

# use same chunk size as before
snowflake_base_state.chunk_size = 1000
snowflake_base_state.chunk_overlap = 100
create_vector_store(app_state, snowflake_base_state)

# Full pipeline for this run, scored on the same ragas_state test set as the baseline.
create_rag_chain(app_state, snowflake_base_state)
create_answers(app_state, snowflake_base_state, ragas_state)
run_ragas_evaluation(app_state, snowflake_base_state)

Vector store created
Answers created - ready for Ragas evaluation


Evaluating: 100%|██████████| 15/15 [00:14<00:00,  1.04it/s]


Ragas evaluation complete


In [30]:

# Report the Snowflake base run: configuration, aggregate scores, then the
# per-question score table.
snowflake_base_state.parameters()
print(snowflake_base_state.ragas_results)
results_df = snowflake_base_state.ragas_results.to_pandas()
results_df


Base model: gpt-4o-mini
Embedding model: Snowflake/snowflake-arctic-embed-m
Chink size: 1000
Chink overlap: 100
{'faithfulness': 0.4478, 'answer_relevancy': 0.6039, 'context_recall': 0.3333, 'context_precision': 0.5833, 'answer_correctness': 0.3635}


Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,"How can new GAI policies, procedures, and proc...",The connection between new Generative AI (GAI)...,[Table of Contents \n1. \nIntroduction ..........,"New GAI policies, procedures, and processes ca...",0.454545,0.892835,0.0,0.833333,0.506942
1,How does confirmation bias contribute to poten...,"I don't have enough information, sorry. Howeve...",[BLUEPRINT FOR AN \nAI BILL OF \nRIGHTS \nMAKI...,Confirmation bias contributes to potentially i...,0.0,0.0,0.0,0.0,0.17908
2,What resources on AI risk management are avail...,The National Institute of Standards and Techno...,[57 \nNational Institute of Standards and Tech...,The National Institute of Standards and Techno...,0.888889,0.9188,1.0,0.916667,0.404474


Let's compare the Snowflake base run with the TE3 run

In [34]:
import pandas as pd
def compare_results(run_model_1, run_model_2):
    """Build a side-by-side metric comparison of two evaluated runs.

    Args:
        run_model_1: Run with ``name`` and a mapping ``ragas_results`` of
            metric name -> score. Its metric order defines the row order.
        run_model_2: Second run; must contain every metric of ``run_model_1``.

    Returns:
        pandas.DataFrame with the columns ``Metric``, one column per run name,
        and ``Difference`` (run 2 minus run 1).

    Raises:
        KeyError: if ``run_model_2`` lacks a metric present in ``run_model_1``.

    Note:
        Both runs should have distinct names; identical names collapse into a
        single column.
    """
    results_1 = run_model_1.ragas_results
    results_2 = run_model_2.ragas_results

    # Index both runs with one metric list so every column lines up with the
    # 'Metric' column even when the two mappings use different key orders.
    metrics = list(results_1.keys())

    comparison_data = {
        'Metric': metrics,
        run_model_1.name: [results_1[metric] for metric in metrics],
        run_model_2.name: [results_2[metric] for metric in metrics],
        'Difference': [results_2[metric] - results_1[metric] for metric in metrics]
    }
    return pd.DataFrame(comparison_data)

# Re-assert the run names used as column headers, then compare the two runs.
# The 'Difference' column is the second argument minus the first (TE3 minus
# Snowflake base).
snowflake_base_state.name = "Snowflake_Base/1000/100"
model_1000_100_state.name = "TE3/1000/100"
df = compare_results(snowflake_base_state, model_1000_100_state )
df

Unnamed: 0,Metric,Snowflake_Base/1000/100,TE3/1000/100,Difference
0,faithfulness,0.447811,0.635913,0.188101
1,answer_relevancy,0.603878,0.9478,0.343922
2,context_recall,0.333333,0.833333,0.5
3,context_precision,0.583333,1.0,0.416667
4,answer_correctness,0.363499,0.663308,0.299809


Let's take the fine-tuned model for a run and test it

In [38]:
from classes.app_state import AppState
from classes.model_run_state import ModelRunState
from classes.ragas_state import RagasState
from utilities.doc_utilities import get_documents
from utilities.vector_utilities import create_vector_store

# document_urls = [
#     "https://www.whitehouse.gov/wp-content/uploads/2022/10/Blueprint-for-an-AI-Bill-of-Rights.pdf",
#      "https://nvlpubs.nist.gov/nistpubs/ai/NIST.AI.600-1.pdf",
# ]

# app_state_2 = AppState()
# app_state_2.set_debug(False)
# app_state_2.set_document_urls(document_urls)

# get_documents(app_state)

In [41]:
from langchain.embeddings import HuggingFaceEmbeddings
# Third run: same QA model and chunking, with the fine-tuned arctic embedding
# model loaded from the Hugging Face Hub.
snowflake_finetune_state = ModelRunState()
snowflake_finetune_state.name = "Snowflake_Fine/1000/100"
snowflake_finetune_state.qa_model_name = "gpt-4o-mini"
snowflake_finetune_state.qa_model = ChatOpenAI(model=snowflake_finetune_state.qa_model_name)

# finetune snowflake embedding model

hf_username = "rchrdgwr"
hf_repo_name = "finetuned-arctic-model"

# Load the fine-tuned model from Hugging Face
snowflake_finetune_state.embedding_model_name = f"{hf_username}/{hf_repo_name}"
snowflake_finetune_state.embedding_model = HuggingFaceEmbeddings(model_name=snowflake_finetune_state.embedding_model_name)

# use same chunk size as before
snowflake_finetune_state.chunk_size = 1000
snowflake_finetune_state.chunk_overlap = 100
create_vector_store(app_state, snowflake_finetune_state)

# Full pipeline for this run, scored on the same ragas_state test set.
create_rag_chain(app_state, snowflake_finetune_state)
create_answers(app_state, snowflake_finetune_state, ragas_state)
run_ragas_evaluation(app_state, snowflake_finetune_state)

Some weights of BertModel were not initialized from the model checkpoint at rchrdgwr/finetuned-arctic-model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Vector store created
Answers created - ready for Ragas evaluation


Evaluating: 100%|██████████| 15/15 [00:19<00:00,  1.32s/it]


Ragas evaluation complete


In [42]:
# Report the fine-tuned Snowflake run: configuration, aggregate scores, then
# the per-question score table.
snowflake_finetune_state.parameters()
print(snowflake_finetune_state.ragas_results)
results_df = snowflake_finetune_state.ragas_results.to_pandas()
results_df

Base model: gpt-4o-mini
Embedding model: rchrdgwr/finetuned-arctic-model
Chink size: 1000
Chink overlap: 100
{'faithfulness': 0.9103, 'answer_relevancy': 0.9455, 'context_recall': 0.8889, 'context_precision': 1.0000, 'answer_correctness': 0.4178}


Unnamed: 0,question,answer,contexts,ground_truth,faithfulness,answer_relevancy,context_recall,context_precision,answer_correctness
0,"How can new GAI policies, procedures, and proc...",New GAI (Generative Artificial Intelligence) p...,"[19 \nGV-4.1-003 \nEstablish policies, procedu...","New GAI policies, procedures, and processes ca...",0.730769,0.934013,1.0,1.0,0.527957
1,How does confirmation bias contribute to poten...,Confirmation bias can contribute to potentiall...,[Algorithmic \nDiscrimination \nProtections \n...,Confirmation bias contributes to potentially i...,1.0,0.966214,0.666667,1.0,0.33686
2,What resources on AI risk management are avail...,The National Institute of Standards and Techno...,[NIST Trustworthy and Responsible AI \nNIST A...,The National Institute of Standards and Techno...,1.0,0.936206,1.0,1.0,0.388704


In [52]:
import pandas as pd
def compare_results_3(run_model_1, run_model_2, run_model_3):
    """Build a side-by-side metric comparison of three evaluated runs.

    Args:
        run_model_1: Run with ``name`` and a mapping ``ragas_results`` of
            metric name -> score. Its metric order defines the row order.
        run_model_2: Second run; must contain every metric of ``run_model_1``.
        run_model_3: Third run; must contain every metric of ``run_model_1``.

    Returns:
        pandas.DataFrame with the columns ``Metric``, one column per run name,
        and ``1v2``/``1v3``/``2v3 Difference`` (later run minus earlier run).

    Raises:
        KeyError: if a later run lacks a metric present in ``run_model_1``.

    Note:
        Runs should have distinct names; identical names collapse into a
        single column.
    """
    # Extract results for each model
    results_1 = run_model_1.ragas_results
    results_2 = run_model_2.ragas_results
    results_3 = run_model_3.ragas_results

    # Index every run with one metric list so each column lines up with the
    # 'Metric' column even when the mappings use different key orders.
    metrics = list(results_1.keys())

    # Create comparison data
    comparison_data = {
        'Metric': metrics,
        run_model_1.name: [results_1[metric] for metric in metrics],
        run_model_2.name: [results_2[metric] for metric in metrics],
        run_model_3.name: [results_3[metric] for metric in metrics],
        '1v2 Difference': [results_2[metric] - results_1[metric] for metric in metrics],
        '1v3 Difference': [results_3[metric] - results_1[metric] for metric in metrics],
        '2v3 Difference': [results_3[metric] - results_2[metric] for metric in metrics]
    }

    # Return the dataframe
    return pd.DataFrame(comparison_data)

# Re-assert the run names used as column headers, then compare TE3 (1),
# Snowflake base (2) and fine-tuned Snowflake (3). Each 'NvM Difference'
# column is run M minus run N.
snowflake_base_state.name = "Snowflake_Base/1000/100"
model_1000_100_state.name = "TE3/1000/100"
df = compare_results_3(model_1000_100_state , snowflake_base_state,  snowflake_finetune_state)
df

Unnamed: 0,Metric,TE3/1000/100,Snowflake_Base/1000/100,Snowflake_Fine/1000/100,1v2 Difference,1v3 Difference,2v3 Difference
0,faithfulness,0.635913,0.447811,0.910256,-0.188101,0.274344,0.462445
1,answer_relevancy,0.9478,0.603878,0.945477,-0.343922,-0.002323,0.341599
2,context_recall,0.833333,0.333333,0.888889,-0.5,0.055556,0.555556
3,context_precision,1.0,0.583333,1.0,-0.416667,0.0,0.416667
4,answer_correctness,0.663308,0.363499,0.41784,-0.299809,-0.245467,0.054342


In [62]:
# Load the fine-tuned embedding model once; the chunking-strategy runs below
# all assign this same instance to their run state.
hf_username = "rchrdgwr"
hf_repo_name = "finetuned-arctic-model"

snowflake_finetune_model_name = f"{hf_username}/{hf_repo_name}"
snowflake_finetune_model = HuggingFaceEmbeddings(model_name=snowflake_finetune_model_name)

Some weights of BertModel were not initialized from the model checkpoint at rchrdgwr/finetuned-arctic-model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [60]:
from utilities.constants import (
    CHUNKING_STRATEGY_TABLE_AWARE,
    CHUNKING_STRATEGY_SECTION_BASED,
    CHUNKING_STRATEGY_SEMANTIC
)

# Fine-tuned embeddings with the section-based chunking strategy; everything
# else matches the earlier fine-tuned run.
snowflake_finetune_section_state = ModelRunState()
snowflake_finetune_section_state.name = "Snowflake_FineSection/1000/100"
snowflake_finetune_section_state.qa_model_name = "gpt-4o-mini"
snowflake_finetune_section_state.qa_model = ChatOpenAI(model=snowflake_finetune_section_state.qa_model_name)

# Uses the embedding model loaded in the cell that defines snowflake_finetune_model.
snowflake_finetune_section_state.embedding_model_name = snowflake_finetune_model_name
snowflake_finetune_section_state.embedding_model = snowflake_finetune_model

# use same chunk size as before
snowflake_finetune_section_state.chunking_strategy = CHUNKING_STRATEGY_SECTION_BASED
snowflake_finetune_section_state.chunk_size = 1000
snowflake_finetune_section_state.chunk_overlap = 100
create_vector_store(app_state, snowflake_finetune_section_state)

# Full pipeline for this run, scored on the same ragas_state test set.
create_rag_chain(app_state, snowflake_finetune_section_state)
create_answers(app_state, snowflake_finetune_section_state, ragas_state)
run_ragas_evaluation(app_state, snowflake_finetune_section_state)
print(snowflake_finetune_section_state.ragas_results)

Some weights of BertModel were not initialized from the model checkpoint at rchrdgwr/finetuned-arctic-model and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Vector store created
Answers created - ready for Ragas evaluation


Evaluating: 100%|██████████| 15/15 [00:13<00:00,  1.10it/s]


Ragas evaluation complete
{'faithfulness': 0.9010, 'answer_relevancy': 0.9697, 'context_recall': 0.8889, 'context_precision': 1.0000, 'answer_correctness': 0.3700}


In [63]:
# Fine-tuned embeddings with the table-aware chunking strategy; everything
# else matches the earlier fine-tuned run.
snowflake_finetune_table_state = ModelRunState()
snowflake_finetune_table_state.name = "Snowflake_FineTable/1000/100"
snowflake_finetune_table_state.qa_model_name = "gpt-4o-mini"
snowflake_finetune_table_state.qa_model = ChatOpenAI(model=snowflake_finetune_table_state.qa_model_name)

# Uses the embedding model loaded in the cell that defines snowflake_finetune_model.
snowflake_finetune_table_state.embedding_model_name = snowflake_finetune_model_name
snowflake_finetune_table_state.embedding_model = snowflake_finetune_model

# use same chunk size as before
snowflake_finetune_table_state.chunking_strategy = CHUNKING_STRATEGY_TABLE_AWARE
snowflake_finetune_table_state.chunk_size = 1000
snowflake_finetune_table_state.chunk_overlap = 100
create_vector_store(app_state, snowflake_finetune_table_state)

# Full pipeline for this run, scored on the same ragas_state test set.
create_rag_chain(app_state, snowflake_finetune_table_state)
create_answers(app_state, snowflake_finetune_table_state, ragas_state)
run_ragas_evaluation(app_state, snowflake_finetune_table_state)
print(snowflake_finetune_table_state.ragas_results)

Vector store created
Answers created - ready for Ragas evaluation


Evaluating: 100%|██████████| 15/15 [00:17<00:00,  1.13s/it]


Ragas evaluation complete
{'faithfulness': 0.6922, 'answer_relevancy': 0.9457, 'context_recall': 0.8889, 'context_precision': 1.0000, 'answer_correctness': 0.4848}


In [64]:

# Fine-tuned embeddings with the semantic chunking strategy; everything else
# matches the earlier fine-tuned run.
snowflake_finetune_semantic_state = ModelRunState()
snowflake_finetune_semantic_state.name = "Snowflake_FineSemantic/1000/100"
snowflake_finetune_semantic_state.qa_model_name = "gpt-4o-mini"
snowflake_finetune_semantic_state.qa_model = ChatOpenAI(model=snowflake_finetune_semantic_state.qa_model_name)

# Uses the embedding model loaded in the cell that defines snowflake_finetune_model.
snowflake_finetune_semantic_state.embedding_model_name = snowflake_finetune_model_name
snowflake_finetune_semantic_state.embedding_model = snowflake_finetune_model

# use same chunk size as before
snowflake_finetune_semantic_state.chunking_strategy = CHUNKING_STRATEGY_SEMANTIC
snowflake_finetune_semantic_state.chunk_size = 1000
snowflake_finetune_semantic_state.chunk_overlap = 100
create_vector_store(app_state, snowflake_finetune_semantic_state)

# Full pipeline for this run, scored on the same ragas_state test set.
create_rag_chain(app_state, snowflake_finetune_semantic_state)
create_answers(app_state, snowflake_finetune_semantic_state, ragas_state)
run_ragas_evaluation(app_state, snowflake_finetune_semantic_state)
print(snowflake_finetune_semantic_state.ragas_results)

Vector store created
Answers created - ready for Ragas evaluation


Evaluating: 100%|██████████| 15/15 [00:12<00:00,  1.16it/s]


Ragas evaluation complete
{'faithfulness': 0.8889, 'answer_relevancy': 0.9592, 'context_recall': 0.8889, 'context_precision': 1.0000, 'answer_correctness': 0.6295}


In [66]:
def compare_results_4(run_model_1, run_model_2, run_model_3, run_model_4):
    """Build a side-by-side metric comparison of four evaluated runs.

    Args:
        run_model_1: Run with ``name`` and a mapping ``ragas_results`` of
            metric name -> score. Its metric order defines the row order.
        run_model_2, run_model_3, run_model_4: Further runs; each must contain
            every metric of ``run_model_1``.

    Returns:
        pandas.DataFrame with the columns ``Metric``, one column per run name,
        then ``1v2``, ``1v3``, ``1v4``, ``2v3``, ``2v4`` and ``3v4 Difference``
        (later run minus earlier run).

    Raises:
        KeyError: if a later run lacks a metric present in ``run_model_1``.

    Note:
        Runs should have distinct names; identical names collapse into a
        single column.
    """
    runs = [run_model_1, run_model_2, run_model_3, run_model_4]

    # Index every run with one metric list so each column lines up with the
    # 'Metric' column even when the mappings use different key orders.
    metrics = list(run_model_1.ragas_results.keys())

    # Create comparison data: one score column per run ...
    comparison_data = {'Metric': metrics}
    for run in runs:
        comparison_data[run.name] = [run.ragas_results[metric] for metric in metrics]

    # ... then one difference column per pair, ordered 1v2, 1v3, 1v4, 2v3, 2v4, 3v4.
    for first in range(len(runs)):
        for second in range(first + 1, len(runs)):
            earlier = runs[first].ragas_results
            later = runs[second].ragas_results
            comparison_data[f'{first + 1}v{second + 1} Difference'] = [
                later[metric] - earlier[metric] for metric in metrics
            ]

    # Return the dataframe
    return pd.DataFrame(comparison_data)

# Compare the four fine-tuned-embedding runs: original chunking (1), section-based (2),
# table-aware (3) and semantic (4). Each 'NvM Difference' is run M minus run N.
df = compare_results_4(snowflake_finetune_state , snowflake_finetune_section_state, snowflake_finetune_table_state, snowflake_finetune_semantic_state)
df

Unnamed: 0,Metric,Snowflake_Fine/1000/100,Snowflake_FineSection/1000/100,Snowflake_FineTable/1000/100,Snowflake_FineSemantic/1000/100,1v2 Difference,1v3 Difference,1v4 Difference,2v3 Difference,2v4 Difference,3v4 Difference
0,faithfulness,0.910256,0.900966,0.69216,0.888889,-0.00929,-0.218097,-0.021368,-0.208806,-0.012077,0.196729
1,answer_relevancy,0.945477,0.969683,0.945677,0.959232,0.024206,0.0002,0.013755,-0.024006,-0.010451,0.013555
2,context_recall,0.888889,0.888889,0.888889,0.888889,0.0,0.0,0.0,0.0,0.0,0.0
3,context_precision,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,answer_correctness,0.41784,0.370029,0.4848,0.629459,-0.047811,0.066959,0.211619,0.114771,0.25943,0.14466


In [68]:
# NOTE(review): same call with the same arguments as the preceding cell; this
# cell looks redundant — confirm and remove.
df = compare_results_4(snowflake_finetune_state , snowflake_finetune_section_state, snowflake_finetune_table_state, snowflake_finetune_semantic_state)
df

Unnamed: 0,Metric,Snowflake_Fine/1000/100,Snowflake_FineSection/1000/100,Snowflake_FineTable/1000/100,Snowflake_FineSemantic/1000/100,1v2 Difference,1v3 Difference,1v4 Difference,2v3 Difference,2v4 Difference,3v4 Difference
0,faithfulness,0.910256,0.900966,0.69216,0.888889,-0.00929,-0.218097,-0.021368,-0.208806,-0.012077,0.196729
1,answer_relevancy,0.945477,0.969683,0.945677,0.959232,0.024206,0.0002,0.013755,-0.024006,-0.010451,0.013555
2,context_recall,0.888889,0.888889,0.888889,0.888889,0.0,0.0,0.0,0.0,0.0,0.0
3,context_precision,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,answer_correctness,0.41784,0.370029,0.4848,0.629459,-0.047811,0.066959,0.211619,0.114771,0.25943,0.14466
