In [1]:
import requests
from langchain.document_loaders import TextLoader

In [27]:

# One-off download that was used to create the local text file; left
# commented out, so the file must already exist next to the notebook.
# NOTE(review): the URL is a GitHub "blob" page for a notebook and the target
# filename is "state_of_the_union.txt" - confirm this is the intended source.
#url = "https://github.com/evidentlyai/evidently/blob/main/examples/cookbook/prompt_optimization_bookings_example.ipynb"
#res = requests.get(url)
#with open("state_of_the_union.txt", "w") as f:
#    f.write(res.text)

# Read the local text file into LangChain documents.
source_path = './state_of_the_union.txt'
loader = TextLoader(source_path)
documents = loader.load()

In [28]:
from langchain.text_splitter import CharacterTextSplitter
# Split the loaded documents into overlapping chunks for retrieval.
# The recorded output below shows that a chunk can come out larger than
# chunk_size (633 > 500), so the limit is a target rather than a hard cap
# (presumably because the splitter only cuts at its separator - confirm).
text_splitter = CharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
chunks = text_splitter.split_documents(documents)

Created a chunk of size 633, which is longer than the specified 500


In [29]:
from dotenv import load_dotenv, find_dotenv 
from langchain.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings

# Load environment variables from the nearest .env file
# (presumably the API key needed by the OpenAI models in later cells).
_ = load_dotenv(find_dotenv())

# Local sentence-transformers model used to embed the text, run on CPU.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

# Index the split `chunks`, not the raw `documents`: indexing whole documents
# makes the splitting step above a no-op and hands the retriever entire files
# instead of focused passages.
vectordb = FAISS.from_documents(chunks, embeddings)
retriever = vectordb.as_retriever()

In [30]:
from langchain.prompts import ChatPromptTemplate

# RAG prompt: the retrieved context and the user question are substituted
# into the {context} and {question} placeholders.
# The template is written as explicit literals so the trailing spaces before
# some newlines (part of the original text) stay visible and intact.
template = (
    "You are an assistant for question-answering tasks. \n"
    "Use the following pieces of retrieved context to answer the question. \n"
    "If you don't know the answer, just say that you don't know. \n"
    "Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:\n"
)
prompt = ChatPromptTemplate.from_template(template)

print(prompt)

input_variables=['context', 'question'] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. \nUse the following pieces of retrieved context to answer the question. \nIf you don't know the answer, just say that you don't know. \nUse three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:\n"), additional_kwargs={})]


In [36]:
from langchain.chat_models import ChatOpenAI
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Chat model that writes the final answer.
llm = ChatOpenAI(model_name="gpt-4o", temperature=0.5)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt's {context} slot receives the string form of
    a list of Document objects (metadata and escaped newlines included)
    instead of the plain passage text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# LCEL pipeline: retrieve -> format context -> fill prompt -> LLM -> plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = "What is the code implementation function to detect data drift for tabular data?"
rag_chain.invoke(query)

'The code implementation function to detect data drift for tabular data uses the `evidently` library with the `DataDriftPreset`. You create a `Report` object with `DataDriftPreset()` and then run it using `report.run(eval_data_1, eval_data_2)` where `eval_data_1` and `eval_data_2` are datasets created from pandas dataframes.'

# RAG Evaluation

In [None]:
# Import necessary libraries
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Instantiate the models used by ragas to generate and critique test questions.
generator_llm = ChatOpenAI(model="gpt-4o-mini")
critic_llm = ChatOpenAI(model="gpt-4o-mini")
# Use a dedicated name so the HuggingFace `embeddings` object that backs
# `vectordb` is not overwritten when this cell runs.
generator_embeddings = OpenAIEmbeddings()

# Create the TestsetGenerator
generator = TestsetGenerator.from_langchain(
    generator_llm,
    critic_llm,
    generator_embeddings
)

# `data_transformed` was not defined anywhere in the notebook, so build it
# here from the loaded documents. Each document is copied (the originals are
# left untouched) and given a `filename` metadata entry taken from `source`.
# NOTE(review): the ragas 0.1.x docs describe the generator as relying on a
# `filename` metadata key - confirm against the installed ragas version.
data_transformed = [
    type(doc)(
        page_content=doc.page_content,
        metadata={"filename": doc.metadata.get("source", ""), **doc.metadata},
    )
    for doc in documents
]

# Generate the synthetic test set: 20 questions, half simple and a quarter
# each of reasoning and multi-context questions.
testset = generator.generate_with_langchain_docs(
    data_transformed,
    test_size=20,
    distributions={
        simple: 0.5,
        reasoning: 0.25,
        multi_context: 0.25,
    },
)