In [None]:
%%capture
!pip install llama-index==0.10.37 datasets llama-index-embeddings-openai llama-index-llms-openai

Note: you should install the following packages to your environment:

`pip install datasets`

`pip install llama-index-embeddings-fastembed`

`pip install llama-index-llms-mistralai`

In [None]:
import os
import sys
from getpass import getpass
import nest_asyncio

from dotenv import load_dotenv

# Put the sibling `helpers` directory on the import path; the `utils` and
# `prompts` modules imported in later cells are presumably located there.
sys.path.append('../helpers')

# NOTE(review): presumably patches asyncio so event loops can be nested inside
# the notebook's already-running loop — confirm against the nest_asyncio docs.
nest_asyncio.apply()

# Load variables from a local .env file (if any) into the process environment,
# so that OPENAI_API_KEY can be picked up from os.environ in the next cell.
load_dotenv()

In [None]:
# Prefer a key that is already in the environment (e.g. loaded from .env);
# otherwise prompt for it without echoing. `.get` is required here: indexing
# os.environ with a missing name raises KeyError, so the getpass fallback
# would never be reached.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")

# Export the key so a value typed at the prompt is also visible to clients
# that are constructed later without an explicit api_key argument.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

I'm using OpenAI here because Cohere has rate limits for its free tier. You don't need to run this code yourself if you don't want to incur costs from OpenAI. I'll upload the dataset to the Hugging Face Hub and I'll show you how to download it from there when we need it.

In [None]:
from llama_index.llms.openai import OpenAI

# LLM used in the cells below to generate both the questions and the answers
# of the evaluation set.
llm = OpenAI(model="gpt-3.5-turbo")

We've already cleaned up our data before. Recall that we've persisted the `Document` objects to disk using a Docstore in such a way that each Document object represents cleaned text from a page of a book.

In [None]:
from utils import get_documents_from_docstore

# Load the previously persisted, already-cleaned Document objects from the
# docstore directory on disk.
documents = get_documents_from_docstore("../data/words-of-the-senpais")

# Create a set of `Documents` for the evaluation set

- 📚 **`group_documents_by_author`**: A utility function that sorts a collection of documents into groups based on who wrote them.

- 🗂️ **How It Works**: It creates a dictionary where each author's name is linked to all the documents they've written.
  - Starts with an empty dictionary ready to be filled with author-document pairs.
  - Goes through each document, checking the author's name and adding the document under the appropriate author in the dictionary.
  - If a document doesn't list an author, it skips adding that document with a warning note.

- 📝 **Input**: Takes a list of `Document` objects, each with metadata that includes the `author` field (the name of its author).

- 🔖 **Output**: Outputs a dictionary that groups all the documents by their respective authors.
  

In [None]:
import random
from utils import group_documents_by_author

# Seed Python's global `random` generator so the sampling in the next step is
# repeatable (assumes sample_documents draws from this generator — TODO confirm).
random.seed(42)

# Dictionary keyed by author name; each value is the list of that author's documents.
documents_by_author = group_documents_by_author(documents)

- 📚 **`sample_documents`**: Picks a set number of documents randomly from each author's collection within a grouped dictionary.

- 🎲 **Sampling Logic**: It tries to get a specific number of documents for each author. If an author doesn't have enough documents, it alerts you.
  - Begins with an empty list for storing selected samples.
  - Loops through each author, considers only docs with >500 characters, checking if there are enough documents to fulfill the sampling requirement.
  - Randomly selects the desired number of documents from those available, adding them to the overall sample list.
  - Issues a warning if the documents under an author are too few to meet the sampling number.

- 📝 **Input**: Receives a dictionary where authors are keys and values are lists of their documents, along with an optional number of documents to sample per author.

- 🔖 **Output**: Outputs a list of randomly chosen documents from across all authors, sticking to the specified number per author when possible.

In [None]:
from utils import sample_documents

# Randomly draw 25 documents per author (where available) to form the pool
# from which the evaluation set is generated.
docs_for_eval_set = sample_documents(documents_by_author, num_samples=25)

# Perform a sanity check

In [None]:
from collections import Counter

def count_documents_by_author(documents):
    """
    Tally how many documents belong to each author.

    Documents whose metadata has no 'author' entry are ignored.

    :param documents: Iterable of objects exposing a ``metadata`` mapping.
    :return: A Counter mapping each author to the number of their documents.
    """
    tally = Counter()
    for document in documents:
        if 'author' not in document.metadata:
            # Nothing to attribute this document to; leave it out of the tally.
            continue
        tally[document.metadata['author']] += 1
    return tally

# Sanity check: report how many sampled documents each author contributed.
author_counts = count_documents_by_author(docs_for_eval_set)
for author, count in author_counts.items():
    print(f"Author '{author}' has {count} documents.")

In [None]:
# Total number of sampled documents, before they are split into chunks.
len(docs_for_eval_set)

In [None]:
from utils import ingest
from llama_index.core.node_parser import TokenTextSplitter

# Token-based splitter: chunks of 256 tokens, with 32 tokens shared between
# neighbouring chunks.
splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=32)
transformations = [splitter]

# NOTE: the result is bound to the same name that is passed in, so re-running
# this cell feeds the already-ingested output back into `ingest`.
docs_for_eval_set = ingest(
    documents=docs_for_eval_set,
    transformations=transformations,
)

In [None]:
# Number of items returned by `ingest` after the token-based splitting above.
len(docs_for_eval_set)

## Let's create an evaluation set using custom prompts

In [None]:
from llama_index.core.prompts.base import PromptTemplate
from prompts import QUESTION_GEN_PROMPT
# Show the raw prompt text that drives question generation.
print(QUESTION_GEN_PROMPT)

In [None]:
# Wrap the raw prompt string in a template; later cells fill it in with
# .format(context_str=...).
QUESTION_GEN_PROMPT_TEMPLATE = PromptTemplate(QUESTION_GEN_PROMPT)

In [None]:
from llama_index.core import PromptTemplate

# Smoke test: generate a question for one arbitrarily chosen chunk before
# spending API calls on the whole set.
sample_context = docs_for_eval_set[10].get_content()
prompt = QUESTION_GEN_PROMPT_TEMPLATE.format(context_str=sample_context)
response = llm.complete(prompt)

print(response)

# 🤖 + ❓Generate questions from context

We'll use GPT-3.5-Turbo to generate questions from our `Documents`

Here's what the function below is doing:

- Initialize an empty list, `questions`, to hold one record per document.

- Iterate through each document in `docs_for_eval_set`.

- For each document, we generate the prompt using `QUESTION_GEN_PROMPT_TEMPLATE` and the document's content.

- Get the response from the LLM using `llm.complete(...)`.

- Store the response text under the key `"question"` and the document's content under the key `"context"` in a dictionary, then append that dictionary to `questions`.


In [None]:
from llama_index.core import PromptTemplate

# One record per chunk: the generated question plus the chunk it came from.
questions = []

for eval_doc in docs_for_eval_set:
    chunk_text = eval_doc.get_content()
    question_prompt = QUESTION_GEN_PROMPT_TEMPLATE.format(context_str=chunk_text)
    generated = llm.complete(question_prompt)
    questions.append({'question': generated.text, "context": chunk_text})

In [None]:
# Number of generated question records (one per item in docs_for_eval_set).
len(questions)

In [None]:
# Peek at the first ten question/context records.
questions[:10]

# 🤖 + 💬 Create answers using generated question and context

Using GPT-3.5-Turbo (to keep costs down, you can of course use GPT-4-Turbo), we'll generate answers using the questions we just created plus the context.

In [None]:
from prompts import ANSWER_GEN_PROMPT

# Show the raw prompt text that drives answer generation.
print(ANSWER_GEN_PROMPT)

In [None]:
# Wrap the raw prompt string in a template; later cells fill it in with
# .format(query_str=..., context_str=...).
ANSWER_GEN_PROMPT_TEMPLATE = PromptTemplate(ANSWER_GEN_PROMPT)

In [None]:
# Smoke test: answer one arbitrarily chosen question before looping over all of them.
sample_record = questions[42]
prompt = ANSWER_GEN_PROMPT_TEMPLATE.format(
    query_str=sample_record['question'],
    context_str=sample_record['context'],
)
response = llm.complete(prompt)

print(response)

In [None]:
# Add an 'answer' field to every record, generated from its own question and context.
for record in questions:
    answer_prompt = ANSWER_GEN_PROMPT_TEMPLATE.format(
        query_str=record['question'],
        context_str=record['context'],
    )
    record['answer'] = llm.complete(answer_prompt).text

In [None]:
# First ten records again, now carrying the generated 'answer' field as well.
questions[:10]

# 🧐 How good are our questions?

I suppose you could do this part before generating answers, if you wanted to...But we'll do it now.

Here we're going to use GPT-4o to judge how good each question is based on its context. We'll write a prompt that does this and score each question on a scale of 1-5.



In [None]:
# Separate model (gpt-4o) used as the critic that grades the generated questions.
critic_llm = OpenAI(model="gpt-4o")

In [None]:
from prompts import GROUNDEDNESS_PROMPT

# Show the raw prompt text the critic uses to grade each question.
print(GROUNDEDNESS_PROMPT)

In [None]:
# Wrap the raw prompt string in a template; later cells fill it in with
# .format(query_str=..., context_str=...).
GROUNDEDNESS_PROMPT_TEMPLATE = PromptTemplate(GROUNDEDNESS_PROMPT)

In [None]:
# Smoke test: have the critic grade one arbitrarily chosen question first.
sample_record = questions[42]
prompt = GROUNDEDNESS_PROMPT_TEMPLATE.format(
    query_str=sample_record['question'],
    context_str=sample_record['context'],
)
response = critic_llm.complete(prompt)

print(response)

In [None]:
import re


def _parse_groundedness(response_text):
    """Extract ``(score, rationale)`` from one critic response.

    The parsing assumes the critic answers in the layout
    ``Evaluation: <rationale> ... Total rating: <integer>``. The last
    "Total rating:" marker in the text is the one that counts.

    :param response_text: Raw text returned by the critic LLM.
    :return: Tuple ``(score, rationale)``. ``score`` is an ``int``, or ``None``
        when no rating can be found. ``rationale`` is the stripped text between
        "Evaluation:" and the rating, or ``None`` when either marker is missing.
    """
    ratings = list(re.finditer(r"Total rating:\s*(\d+)", response_text))
    if not ratings:
        return None, None
    final_rating = ratings[-1]
    score = int(final_rating.group(1))
    # Only the text before the rating can be its rationale.
    _, marker, rationale = response_text[:final_rating.start()].partition("Evaluation:")
    return score, (rationale.strip() if marker else None)


# Grade every question with the critic LLM. A response that cannot be parsed is
# recorded as None (rather than raising) and counted, so failures are visible.
unparsed_count = 0
for question in questions:
    prompt = GROUNDEDNESS_PROMPT_TEMPLATE.format(query_str=question['question'], context_str=question['context'])
    response = critic_llm.complete(prompt)
    # Guard against an empty/None completion so parsing never raises.
    score_as_int, score_rational = _parse_groundedness(response.text or "")
    question['question_groundedness_score'] = score_as_int
    question['question_groundedness_score_rationale'] = score_rational
    if score_as_int is None:
        unparsed_count += 1

if unparsed_count:
    print(f"Could not parse a rating from {unparsed_count} of {len(questions)} critic responses.")

In [None]:
# Inspect the last ten records, including the groundedness score fields.
questions[-10:]

In [None]:
from datasets import Dataset

# Build a Hugging Face Dataset from the list of per-question dictionaries.
rag_eval_set = Dataset.from_list(questions)

In [None]:
# Upload the evaluation set to the Hugging Face Hub.
# NOTE(review): this targets a fixed repo id; pushing presumably requires being
# logged in with write access to that namespace — confirm before running.
rag_eval_set.push_to_hub("harpreetsahota/LI_Learning_RAG_Eval_Set")

# You can find the dataset on Hugging Face

You don't have to run the examples here if you don't want to incur costs from OpenAI. 

[Here's the dataset](https://huggingface.co/datasets/harpreetsahota/LI_Learning_RAG_Eval_Set). You can click around and explore using the dataset viewer. If you sign up for an account on Hugging Face, feel free to [follow me](https://huggingface.co/harpreetsahota)!



In [None]:
from datasets import load_dataset

# Download the published evaluation set from the Hugging Face Hub.
# NOTE(review): this rebinds `rag_eval_set`, which earlier held the result of
# Dataset.from_list; without a `split=` argument load_dataset presumably returns
# a dict of splits rather than a single Dataset — confirm before indexing rows.
rag_eval_set = load_dataset("harpreetsahota/LI_Learning_RAG_Eval_Set")