In [None]:
!pip install -U langchain langchain_openai ragas sentence_transformers chromadb datasets lighthouz

## Import necessary libraries

In [1]:
import os
import pandas as pd
from datasets import load_dataset, Dataset
from langchain.document_loaders import WebBaseLoader
from dotenv import load_dotenv, find_dotenv
from langchain.indexes import VectorstoreIndexCreator
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
import requests
from langchain import LLMChain
import seaborn as sns
from bs4 import BeautifulSoup
from langchain.chat_models import ChatOpenAI
from langchain.chains import QAGenerationChain
from langchain.output_parsers.json import parse_json_markdown
from langchain.llms import OpenAI
from langchain import PromptTemplate, LLMChain
from sentence_transformers import SentenceTransformer
from sentence_transformers import util
# from ragas.langchain.evalchain import RagasEvaluatorChain

from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

  from .autonotebook import tqdm as notebook_tqdm


### OpenAI api key

1. Open (or create) the environment file by running `nano ~/.env`.
2. Add the line `OPENAI_API_KEY="your OpenAI API key"` and save the file.

In [2]:
# Read the local .env file: find_dotenv() locates it and load_dotenv() loads it.
# The return value of load_dotenv is not needed, so it is assigned to `_`.

_ = load_dotenv(find_dotenv()) 

### Step 1: Generate chunks of data

1.   Load the Data Science Dojo blog page.
2.   Select a chunk size and a chunk overlap size.
3.   Split the entire page using `TokenTextSplitter`.
4.   Use OpenAI to create embeddings for each split.


In [3]:
# Load the Data Science Dojo blog post (a web page, not a Wikipedia article)
loader = WebBaseLoader("https://datasciencedojo.com/blog/open-source-llms-for-enterprises-benefits/")

data = loader.load()
# Splitter settings passed to TokenTextSplitter below
chunk_size = 4000
chunk_overlap = 200
text_splitter = TokenTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)

# Split the loaded document according to chunk_size and chunk_overlap
splitdocument = text_splitter.split_documents(data)

# Create the OpenAI embeddings client
embeddings = OpenAIEmbeddings()

### Step 2: Generate Questions and Ground truth answers

The prompt below instructs the LLM to generate a question for each given chunk, together with an answer to that question. The question and answer are returned in JSON format.

In [4]:
# Few-shot prompt for question/answer generation. `{paragraph}` is the only template
# variable; literal JSON braces are doubled (`{{` / `}}`) so str.format-style templating
# leaves them intact. Every example must be valid JSON, since the model imitates it.
qa_template = """ Please generate a meaningful question whose answer is present in the given paragraph.
Also generate the answer to the question using the information in the given paragraph. Don't generate extra information not present in the given paragraph.


Example 1:

Paragraph: The Moon is in geophysical terms a planetary-mass object or satellite planet. It has a mass that amounts to 1.2% of Earth's, and a diameter that is roughly one-quarter of Earth's or with 3,474 km (2,159 mi) about as wide as Australia.[17] Within the Solar System it is the most massive and largest satellite in relation to its parent planet, the fifth most massive and largest moon overall, and more massive and larger than all known dwarf planets.
Output:
{{
    "Q": "How does the Moon's mass and size compare to Earth's?",
    "A": "The Moon, in geophysical context, is considered a planetary-mass object, possessing about 1.2% of the Earth's mass. Its diameter is roughly one-quarter that of Earth's, measuring approximately 3,474 kilometers, which is nearly equivalent to the width of Australia. "
}}

Example 2:

Paragraph: Mars is the fourth planet from the Sun. The surface of Mars is orange-red because it is covered in iron(III) oxide dust, giving it the nickname "the Red Planet".[21][22] Mars is among the brightest objects in Earth's sky and its high-contrast albedo features have made it a common subject for telescope viewing. It is classified as a terrestrial planet and is the second smallest of the Solar System's planets with a diameter of 6,779 km (4,212 mi). In terms of orbital motion, a Martian solar day (sol) is equal to 24.5 hours and a Martian solar year is equal to 1.88 Earth years (687 Earth days). Mars has two natural satellites that are small and irregular in shape: Phobos and Deimos.
Output:
{{
    "Q": "What are the two natural satellites of Mars?",
    "A": "The two natural satellites of Mars are Phobos and Deimos."
}}

## PARAGRAPH:
{paragraph}


## OUTPUT FORMAT:
{{
    "Q": "$Question",
    "A": "$Answer"
}}

"""

In [5]:
## Chain for generating the QA pairs: `qa_prompt` fills `qa_template` with a paragraph,
## and `llm_chain` sends it to the chat model (temperature=0, 'gpt-4-turbo-preview').
qa_prompt = PromptTemplate(template=qa_template, input_variables=["paragraph"])
llm_chain = LLMChain(prompt=qa_prompt, llm = ChatOpenAI(temperature=0, model_name='gpt-4-turbo-preview'))

  warn_deprecated(


In [6]:
## By default only the top 2 splits are used, to keep the number of LLM calls small
def generate_qa(splitdocument, num_chunks=2):
    """Generate one question / ground-truth answer pair per document chunk.

    Args:
        splitdocument: Sequence of chunk objects exposing a ``page_content`` string.
        num_chunks: Maximum number of leading chunks to process (default 2).

    Returns:
        dict of parallel lists under the keys 'chunk' (index of the chunk),
        'questions' and 'ground_truths'.
    """
    qa_generated = {'chunk': [], 'questions': [], 'ground_truths': []}
    # Clamp to the number of available chunks so a short document does not raise IndexError.
    for i in range(min(num_chunks, len(splitdocument))):
        output = llm_chain.run(paragraph=splitdocument[i].page_content)
        parsed_output = parse_json_markdown(output.strip())
        qa_generated['chunk'].append(i)
        qa_generated['questions'].append(parsed_output["Q"])
        qa_generated['ground_truths'].append(parsed_output["A"])
    return qa_generated

In [7]:
# Generate the question / ground-truth pairs from the document splits
get_qa_generated = generate_qa(splitdocument)

  warn_deprecated(


In [8]:
# Tabulate the generated pairs (columns: chunk, questions, ground_truths)
df = pd.DataFrame(get_qa_generated)

In [9]:
df

Unnamed: 0,chunk,questions,ground_truths
0,0,What are the benefits and challenges of using ...,The benefits of using open-source LLMs include...
1,1,How many individuals follow the LinkedIn newsl...,"Over 95,000 individuals trust the LinkedIn new..."


### Step 3: Build a RAG Application

In [None]:
collection_name = "openai_collection"
local_directory = "openai_vect_embedding"
persist_directory = os.path.join(os.getcwd(), local_directory)
# NOTE(review): re-running this cell appears to add the same chunks to the persisted
# collection again — confirm, and delete the directory before a re-run if so.
vectorstore = Chroma.from_documents(
    documents=splitdocument,
    embedding=OpenAIEmbeddings(),
    collection_name=collection_name,
    persist_directory=persist_directory,
)
vectorstore.persist()
# `return_source_document` is not a retriever option; source documents are requested
# from the chain itself via `return_source_documents=True` below.
retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=ChatOpenAI(temperature=0, model='gpt-4-turbo-preview'),
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)

# Collect results in lists and assign whole columns once, instead of growing the
# frame cell by cell.
answers, contexts = [], []
for question in df['questions']:
    retrieved_answers = qa({"query": question})
    answers.append(retrieved_answers['result'])
    # The "stuff" chain answers from ALL retrieved documents, so record all of them as
    # the context (joined into one string); this also tolerates an empty retrieval.
    contexts.append(
        "\n\n".join(doc.page_content for doc in retrieved_answers['source_documents'])
    )
df['answers'] = answers
df['contexts'] = contexts
    

In [None]:

df

### Step 4: Evaluate using similarity metrics

In [None]:
# Loaded SentenceTransformer models keyed by model name, so repeated calls reuse them.
_SIMILARITY_MODELS = {}


def _get_similarity_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it only on first use."""
    if model_name not in _SIMILARITY_MODELS:
        _SIMILARITY_MODELS[model_name] = SentenceTransformer(model_name)
    return _SIMILARITY_MODELS[model_name]


def similarity_eval(str1, str2, model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Return the cosine similarity between the sentence embeddings of two strings.

    Args:
        str1: First text.
        str2: Second text.
        model_name: SentenceTransformer model used to embed both texts.

    Returns:
        The similarity score as a Python number (``.item()`` of the first score entry).
    """
    # Loading the model is by far the most expensive step; do it once per model name
    # rather than on every call.
    model = _get_similarity_model(model_name)
    embeddings1 = model.encode(str1, convert_to_tensor=True)
    embeddings2 = model.encode(str2, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)

    return cosine_scores[0][0].item()

In [None]:
# Score each row: similarity between the ground-truth answer and the RAG answer
for row_label, truth, answer in zip(df.index, df['ground_truths'], df['answers']):
    df.loc[row_label, 'similarity_score'] = similarity_eval(truth, answer)

In [None]:
df

### Step 5: Evaluate using an LLM

In [None]:
# Grading prompt for the LLM-based evaluation. It has three template variables
# ({question}, {correct_answer}, {given_answer}) and asks for a one-word label.
evaluation_prompt = """ You are an evaluator. You are given a question, a correct answer, and a given answer. 
Your goal is to compare the correct answer and the given answer to output one of the two labels: `Correct` or `Incorrect`. 

Use the following definitions to output the labels:
** Correct: The given answer matches the correct answer. 

** Incorrect: The given answer does not match the correct answer. 

You are given the following information. 

Question: {question}

Correct answer: {correct_answer}

Given answer: {given_answer}

Output only one word either `Correct` or `Incorrect`. You must provide an output. 
"""

In [None]:
# Evaluator LLM. The API key is read from the process environment (populated from the
# .env file by load_dotenv earlier in the notebook); no OPENAI_API_KEY Python variable
# is defined anywhere, so referencing one would raise NameError on a fresh kernel.
eval_llm = ChatOpenAI(
            model_name="gpt-4-turbo-preview",
            temperature=0,
            request_timeout=120,
            openai_api_key=os.environ["OPENAI_API_KEY"]
        )

In [None]:
def run_llm_eval(question, correct_answer, given_answer):
    """Ask the evaluator LLM to grade a given answer against the correct answer.

    Fills ``evaluation_prompt`` with the three arguments, runs it through ``eval_llm``
    and returns the chain's output unchanged.
    """
    grading_chain = LLMChain(
        prompt=PromptTemplate(
            template=evaluation_prompt,
            input_variables=["question", "correct_answer", "given_answer"],
        ),
        llm=eval_llm,
    )
    return grading_chain.run(
        question=question,
        correct_answer=correct_answer,
        given_answer=given_answer,
    )

In [None]:
# Grade every row with the evaluator LLM and store the verdict as a boolean.
for i in range(len(df)):
    output = run_llm_eval(question=df.loc[i, 'questions'],
                         correct_answer=df.loc[i, 'ground_truths'],
                         given_answer=df.loc[i, 'answers'])
    # The model may wrap the label in whitespace, backticks, quotes or a trailing period
    # (the prompt itself shows the labels in backticks), so normalize before comparing.
    label = output.strip(" \t\r\n`'\".").lower()
    df.loc[i, 'llm_eval_score'] = label == 'correct'

In [None]:
df

### Step 6: Run eval with RAGAS 

![image.png](attachment:image.png)

In [None]:
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas import evaluate

# Build the evaluation dataset from the DataFrame columns.
data = {
    "question": df["questions"].tolist(),
    "answer": df["answers"].tolist(),
    # each row's context string is wrapped in a one-element list
    "contexts": [[i] for i in df["contexts"].tolist()],
    "ground_truth": df["ground_truths"].tolist()
}
dataset = Dataset.from_dict(data)

# Run the four RAGAS metrics and convert the result to a pandas DataFrame.
result = evaluate(
    dataset,
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
    ],
).to_pandas()

In [None]:
result

### Step 7: Run eval with Lighthouz

In [None]:
import os
import requests

from lighthouz import Lighthouz
from lighthouz.benchmark import Benchmark
from lighthouz.app import App
from lighthouz.evaluation import Evaluation

In [None]:
# Read the Lighthouz API key from the LIGHTHOUZ_API_KEY environment variable so the real
# key never has to be written into the notebook; falls back to the original placeholder.
# To obtain a Lighthouz API key contact srijan@lighthouz.ai
LH = Lighthouz(os.environ.get("LIGHTHOUZ_API_KEY", "LIGHTHOUZ-API-KEY"))


In [None]:
!mkdir EXAMPLE-DATA
!wget https://d18rn0p25nwr6d.cloudfront.net/CIK-0000320193/b4266e40-1de6-4a34-9dfb-8632b8bd57e0.pdf -O ./EXAMPLE-DATA/apple-10K-2022.pdf
RAG_DOCUMENT = "./EXAMPLE-DATA/apple-10K-2022.pdf"  # you can provide any pdf file or folder with pdf files to create the RAG benchmark
RAG_DIRECTORY = "./EXAMPLE-DATA/"

#### Step 7.1: Generate a RAG benchmark with Lighthouz AutoBench

In [None]:
# The benchmark id is available on the Lighthouz dashboard.
# It is passed as `benchmark_id` to the evaluation calls later in this notebook.
benchmark_id = "659b66198e4cc1f4af4e2373" # this is the pre-loaded finance benchmark on apple's 10-K report.

In [None]:
# benchmark_categories = ["rag_benchmark"]
# benchmark_generator = Benchmark(LH)
# benchmark_data = benchmark_generator.generate_benchmark(file_path=RAG_DOCUMENT, benchmark_categories=benchmark_categories)
# benchmark_id = benchmark_data["benchmark_id"]

#### Step 7.2: Connect your RAG app on Lighthouz

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.llms import HuggingFaceHub, HuggingFaceEndpoint
from langchain import HuggingFacePipeline
from langchain.prompts import PromptTemplate

def _load_pdf_documents(path):
    """Load a single PDF file, or every ``.pdf`` file directly inside a folder."""
    if path.endswith(".pdf"):
        return list(PyPDFLoader(path).load())
    documents = []
    for file in os.listdir(path):
        if file.endswith(".pdf"):
            documents.extend(PyPDFLoader(os.path.join(path, file)).load())
    return documents


def langchain_rag_model(llm="gpt-3.5-turbo"):
    """
    This is a RAG model built with Langchain, OpenAI, and Chroma

    The PDF(s) at the global ``RAG_DOCUMENT`` path are split, embedded and stored in a
    Chroma collection persisted under ``./data-test_vect_embedding``. When that
    directory already exists and is non-empty, the stored collection is reused instead
    of re-indexing.

    Args:
        llm: Name of the OpenAI chat model that answers the questions
            (e.g. "gpt-3.5-turbo" or "gpt-4").

    Returns:
        A ``RetrievalQA`` "stuff" chain, called as ``chain({"query": ...})``.

    Raises:
        ValueError: If ``llm`` is not a non-empty string.
    """
    # Validate before any indexing work: previously an unrecognized model name only
    # failed at the very end with UnboundLocalError.
    if not isinstance(llm, str) or not llm.strip():
        raise ValueError(f"llm must be a non-empty model name, got {llm!r}")

    print("Initializing LangChain RAG OpenAI Agent")

    chunk_size = 2000
    chunk_overlap = 150
    collection_name = "data-test_vect_embedding"
    local_directory = "data-test_vect_embedding"
    persist_directory = os.path.join(os.getcwd(), local_directory)
    embeddings = OpenAIEmbeddings()
    if not os.path.exists(persist_directory) or not os.listdir(persist_directory):
        # No persisted store yet: load, split and embed the document(s).
        documents = _load_pdf_documents(RAG_DOCUMENT)
        text_splitter = TokenTextSplitter(
            chunk_size=chunk_size, chunk_overlap=chunk_overlap
        )
        splitdocument = text_splitter.split_documents(documents)
        vectDB = Chroma.from_documents(
            splitdocument,
            embeddings,
            collection_name=collection_name,
            persist_directory=persist_directory,
        )
        vectDB.persist()
    else:
        # Load the existing vector store
        vectDB = Chroma(
            collection_name=collection_name, persist_directory=persist_directory, embedding_function=embeddings
        )

    # LLM used in RAG: the two former branches differed only in the model name.
    llm_model = ChatOpenAI(
        model_name=llm,
        temperature=0,
        request_timeout=120,
    )

    # `return_source_document` is not a retriever option, so it is no longer passed.
    retriever = vectDB.as_retriever()

    # prepare stuff prompt template
    prompt_template = """You are a helpful assistant. Your job is to provide the answer for the question based on the given context.
    ## CONTEXT: {context}
    ## QUESTION: {question}
    ## ANSWER: """.strip()

    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template=prompt_template
    )

    rag_model = RetrievalQA.from_chain_type(
        llm=llm_model,
        retriever=retriever,
        chain_type="stuff",
        return_source_documents=False,
        chain_type_kwargs={"prompt":prompt}
    )
    print("Langchain RAG OpenAI agent has been initialized.")
    return rag_model

# Build the default RAG chain, answering with gpt-3.5-turbo
rag_model = langchain_rag_model(llm="gpt-3.5-turbo")

In [None]:
def langchain_rag_query_function(query: str) -> str:
    """
    Forward `query` to the global `rag_model` chain and return its "result" field
    """
    return rag_model({"query": query})["result"]

In [None]:
# Register the gpt-3.5-turbo app through the Lighthouz client and keep the "app_id"
# from the response; it is passed to the evaluation calls below.
app = App(LH)
app_data = app.register(name="gpt-3.5-turbo", model="gpt-3.5-turbo")
app_id = app_data["app_id"]

#### Step 7.3: Evaluate the RAG app on the benchmark with Lighthouz AutoEval

In [None]:
# Evaluate a single app: the query function is passed as the response function,
# together with the benchmark id and the registered app id.
evaluation = Evaluation(LH)
e_single = evaluation.evaluate_rag_model(
    response_function=langchain_rag_query_function,
    benchmark_id=benchmark_id,
    app_id=app_id,
)

#### Step 7.4: Compare multiple RAG apps on the benchmark with Lighthouz Arena

In [None]:
# Build a second RAG chain that answers with gpt-4
rag_model_gpt4 = langchain_rag_model(llm="gpt-4")

def langchain_rag_query_function_gpt4(query: str) -> str:
    """
    Forward `query` to the global `rag_model_gpt4` chain and return its "result" field
    """
    gpt4_output = rag_model_gpt4({"query": query})
    return gpt4_output["result"]

# Register the gpt-4 app and keep its id separately from the gpt-3.5-turbo `app_id`.
# Note that `app` and `app_data` are rebound here.
app = App(LH)
app_data = app.register(name="gpt-4", model="gpt-4")
app_id_gpt4 = app_data["app_id"]

In [None]:
# Evaluate both apps on the same benchmark. The two lists are written in the same
# order: gpt-3.5-turbo first, gpt-4 second.
e_multiple = evaluation.evaluate_multiple_rag_models(
    response_functions=[langchain_rag_query_function, langchain_rag_query_function_gpt4],
    benchmark_id=benchmark_id,
    app_ids=[app_id, app_id_gpt4],
)
print(e_multiple)