# Building a simple chatbot with Retrieval Augmented Generation (RAG) using LlamaIndex

## Installing the required packages needed

In [1]:
# !pip install llama-index
# !pip install openai
# !pip install transformers
# !pip install accelerate
# !pip install pypdf
# !pip install optimum[exporters]
# !pip install InstructorEmbedding
# !pip install sentence_transformers
# !pip install python-dotenv
# !pip install ragas

## Import libraries, API and set filepath

In [2]:
from llama_index import Document, GPTVectorStoreIndex, ServiceContext
from llama_index.readers import BeautifulSoupWebReader, SimpleDirectoryReader
from llama_index.llms import OpenAI
from llama_index.evaluation import DatasetGenerator
from llama_index import download_loader # For CSV
from llama_index import (VectorStoreIndex,
                         SimpleDirectoryReader,
                         StorageContext,
                         load_index_from_storage,)
from llama_index import (ServiceContext, 
                         LLMPredictor, 
                         OpenAIEmbedding, 
                         PromptHelper)
from llama_index.text_splitter import SentenceSplitter
from llama_index import set_global_service_context
# from llama_index.llama_dataset.generator import RagDatasetGenerator
# from llama_index.embeddings import OpenAIEmbedding

import openai
import os
import nest_asyncio
import random
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import toml

In [3]:
# set filepath to my data directory 

current_dir = os.getcwd()
data_dir = os.path.join(current_dir, "./data")

## Load the data

According to [LlamaIndex's documentation](https://gpt-index.readthedocs.io/en/latest/examples/data_connectors/simple_directory_reader.html), the `SimpleDirectoryReader` is the most commonly used data connector that just works. Simply pass in a input directory or a list of files. It will select the best file reader based on the file extensions. 

In this use case here, there are PDFs and html pages from the latest release from MUIS and other islamic wesbites, which are not included in gpt-3.5-turbo's.

In [4]:
filename_fn = lambda filename: {'file_name': filename}
pdfhtml_docs = SimpleDirectoryReader(input_dir=data_dir, exclude_hidden=True, file_metadata=filename_fn).load_data()
print([x.doc_id for x in pdfhtml_docs])

['39949edf-badb-4231-a3cb-1079dcd35711', '3992c16a-8594-46a6-afec-46a17a6813d4', '69004272-5d69-4996-bafb-85b22aca724b', 'e00ebc10-99f3-4687-9757-421144c6dde1', '0af3b1bf-a347-4737-90ca-e19e2a274bc2', '2d66a4e8-a38b-4fea-9081-0737c8cdbbc7', '0ac4f1ba-8b84-4405-be31-6b546661388a', '62561b66-a68c-4d7d-8f52-11dfca87dc94', 'e144b2c0-4c06-4c4f-b6f5-21819d37c408', '365cacc4-9625-4c83-8b99-1b73a6524272', '0f8551b3-f582-4eec-b169-1e267c22718e', '68ff4887-a1c0-46dd-a3e6-8ea664408445', '544520a7-3e02-4a74-934f-fca2ac4e661c', '9f404292-9df6-41c6-8dd3-4342a9e9f9f4', '64804a22-ef2a-47fb-b312-eb5be89b00d5', 'ed15b57a-1527-4f44-8cfe-053d28b0b52a', 'cb91d52e-ef7d-4942-a13a-63e0cd1cc433', '7c4e3c5d-069d-4d94-bfbb-937af8028334', '680cba60-87c1-4808-b711-a4c67e58e7fd', '72fffb9b-98ba-4fe7-b995-1a6b0484ec97', '9db8985c-9960-4145-bbca-956c5fda6077', '6f123891-87f9-4f2d-8f6b-6493b4868bac', '3ee59216-3db1-4f7e-b85a-701bd370301b', '93563a7a-32fa-4273-be84-76f8b072f138', 'dd48a8f6-8b67-4a12-b8b6-b9402fc9ffb0',

In [5]:
# Check if documents are loaded
if not pdfhtml_docs:
    print("No documents loaded. Check your data directory path.")
else:
    print(f"Loaded {len(pdfhtml_docs)} documents.")

Loaded 51 documents.


In [6]:
# #Use this code if your data set is in CSV format
# PagedCSVReader = download_loader("PagedCSVReader")

# loader = PagedCSVReader(encoding="utf-8")
# csv_docs = loader.load_data(file=Path('data/halal_non_halal_ingred.csv'))

# # print([x.doc_id for x in docs])
# print(f"Loaded {len(csv_docs)} docs")

## Creating an Index

Once all the data is loaded, we can proceed to build an index for the chatbot. There are four types of indexing methods available: Summary Index, VectorStore Index, Tree Index, and Keyword Table Index. In this context, we will be utilizing the `VectorStore Index`, which happens to be one of the most widely used indexing techniques.

### Step 1: Set Up OpenAI Service Context for NLP

Next, set up the ServiceContext with OpenAI GPT-3.5-turbo for processing and understanding user queries.
-  for more info on service context, refer to https://docs.llamaindex.ai/en/stable/module_guides/supporting_modules/service_context.html#setting-global-configuration

In [7]:
with open('secrets.toml', 'r') as f:
    config = toml.load(f)


In [8]:
# Load environment variables from .env file
dotenv_path = './.env'
load_dotenv(dotenv_path)

# Fetch and set the OpenAI API key
try:
    openai.api_key = os.environ['OPENAI_API_KEY']
except KeyError:
    print("OpenAI API key not found in environment variables.")

In [9]:
llm = OpenAI(model="gpt-3.5-turbo", temperature=0, max_tokens=256)

#configure service context
gpt_context = ServiceContext.from_defaults(
    llm=llm,
)


### Step 2: Set Up Document Indexing - Storing your index

First, create and store the document index if it doesn't already exist, or load it if it does. This index will be used to retrieve information about ingredients.
- for more information on storing your index, refer to: https://docs.llamaindex.ai/en/stable/getting_started/starter_example.html#

In [10]:
# Directory to store the indexed data
storage_dir = "./storage"

# Ensure the storage directory exists
if not os.path.exists(storage_dir):
    os.makedirs(storage_dir, exist_ok=True)
    
# Create a StorageContext
storage_context = StorageContext.from_defaults(persist_dir=storage_dir)

# Create the VectorStoreIndex with service_context
index = GPTVectorStoreIndex.from_documents(pdfhtml_docs, service_context=gpt_context, show_progress=True)

# Persist the index
index.storage_context.persist()
print("Index created and persisted successfully.")

Parsing nodes:   0%|          | 0/51 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/238 [00:00<?, ?it/s]

Index created and persisted successfully.


### Technically speaking, we are already able to proceed with a chat engine and end off this notebook. But, I am curious to know how to fine tune, so here it goes. 

The next step is generating a training and eval dataset.

We will generate 40 questions on different sections of the docs we just ingested.

Then, we will use GPT-3.5 on the eval questions to get our baseline performance, followed by using GPT-4 on the train questions to generate our training data. The training data will be collected with out `OpenAIFineTuningHandler`.

---

more info here: https://gpt-index.readthedocs.io/en/v0.8.49/examples/finetuning/openai_fine_tuning.html (this is where I got most of my code from too) 

Note: considerable amount of money and time is spent here!

## Train generation

In [11]:
# Shuffle the documents
random.seed(42)
random.shuffle(pdfhtml_docs)

In [12]:
# Define the question generation query
# This query will guide the generation of questions for each document
# It's focused on evaluating the halal status of ingredients in food products

question_gen_query = (
    "Given your expertise in Halal food certification, I need you to analyze the ingredients of a specific food product for Halal compliance. Please provide a clear and detailed assessment for each ingredient based on the information available in the provided documents. Consider factors such as the source and processing methods of each ingredient, as these can impact its Halal status. If any ingredient is non-Halal, doubtful, or lacks sufficient information for assessment, please categorize the entire food product accordingly. Your goal is to determine the overall Halal status of the food product. Please provide concise and factually accurate responses.""
)

# find out more about question generation from 
# https://gpt-index.readthedocs.io/en/latest/examples/evaluation/QuestionGeneration.html

# Create the dataset generator using the combined and shuffled documents
# This will use the defined service context and question generation query
# to create a dataset where each document is paired with a generated question

dataset_generator = DatasetGenerator.from_documents(
    pdfhtml_docs[6:10],   # Limit to the first 40 documents
    question_gen_query=question_gen_query,
    service_context=gpt_context,  # Use the previously defined service_context
)


  return cls(


In [13]:
# To avoid RuntimeError: asyncio.run() cannot be called from a running event loop
# The below code is to unblock: nest the event loops
# Apply asyncio patch to enable asynchronous operations in a Jupyter environment
nest_asyncio.apply()

In [14]:
# Set the timeout (in seconds)
# openai.api_timeout = 80 

questions = dataset_generator.generate_questions_from_nodes(num=40)
print("Generated ", len(questions), " questions")


Generated  40  questions


  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [15]:
print(questions)

['Is Curcumin (C.I. 75300) considered halal?', 'What is the halal status of Riboflavin/Lactofavin/Vitamin B2?', 'Is Tartrazine/FD&C Yellow 5 (C.I. 19140) halal?', 'Can Quinoline Yellow (C.I. 47005) be considered halal?', 'Is Yellow 2G (C.I. 18965) halal?', 'What is the halal status of Sunset Yellow FCF/FD&C Yellow 6 (C.I. 15985)?', 'Is Cochineal/Carmines (C.I. 75470) considered halal?', 'Can Carmoisine/Azorubine (C.I. 14720) be considered halal?', 'Is Amaranth/FD&C Red 2 (C.I. 16185) halal?', 'What is the halal status of Ponceau 4R/Cochineal Red A (C.I. 16255)?', 'Is Erythrosine/FD&C Red 3 (C.I. 45430) considered halal?', 'Can Red', 'Is Colouring considered halal or non-halal based on the available information?', 'What is the halal status of Beet Red/Betanin/Betanidin?', 'Are Anthocyanins halal or non-halal?', 'Is Calcium Carbonate/Chalk considered halal or non-halal?', 'What is the halal status of Titanium Dioxide?', 'Are Iron Oxides halal or non-halal?', 'Is Aluminium halal or non-ha

In [16]:
# Open a file named 'train_questions.txt' in write mode
# 'with open' is used for safe handling of file operations
with open("train_questions.txt", "w") as f:
    # Iterate over each question in the 'questions' list
    for question in questions:
        # Write each question to the file, followed by a newline character
        # This ensures each question is on a new line in the file
        f.write(question + "\n")
        # The newline character '\n' is important for separating the questions

# Note: The 'with open' statement automatically handles the closing of the file
# once the block of code under it is executed. This is a good practice to prevent
# file handling errors and ensure that data is properly written to the file.

### Generate Evaluation Dataset

This dataset is for subsequent evaluation step to measure the performance of the models.
<br> Questions are generated from a different set of documents.

In [17]:
# Create a DatasetGenerator from a subset of 'docs' starting from the 4th document (index 3)
dataset_generator = DatasetGenerator.from_documents(
    pdfhtml_docs[
        9:11],  # since we generated question for the first 40 documents, we can skip the first 40 
    question_gen_query=question_gen_query,  # Specify the question generation query
    service_context=gpt_context,  # Provide the GPT service context
)

  return cls(


In [18]:
# Generate questions from a dataset using the dataset generator
questions = dataset_generator.generate_questions_from_nodes(num=20)

# Print the number of generated questions
print("Generated ", len(questions), " questions")

Generated  20  questions


  return QueryResponseDataset(queries=queries, responses=responses_dict)


In [19]:
# Write the generated questions to a file for evaluation purpose
with open("eval_questions.txt", "w") as f:
    for question in questions:
        f.write(question + "\n")

In [20]:
print("Total number of documents:", len(pdfhtml_docs))

Total number of documents: 51


### GPT-3.5 Turbo to Generate Training Data

This code is used to set up a fine-tuning process for a language model, specifically GPT-3.5 Turbo. Here's a breakdown of what it does:

- Create an instance of the `OpenAIFineTuningHandler`. This handler is used for fine-tuning the language model.

- Create a `CallbackManager` to manage callbacks during model interactions. The fine-tuning handler is added to this manager. Callbacks are functions that can be executed at various points during model operations.

- Configure the GPT-3.5 Turbo model with a specific context. This context includes the following settings:
    - The language model used is GPT-3.5 Turbo.
    - The temperature parameter is set to 0, which means deterministic output (no randomness).
    - The context window is limited to 2048 tokens. This artificially limits the amount of context that the model can consider, possibly for testing or optimization purposes.
    - The callback_manager is set to the previously created callback_manager.

In [21]:
from llama_index.callbacks import OpenAIFineTuningHandler
from llama_index.callbacks import CallbackManager

# Create an instance of the OpenAIFineTuningHandler for fine-tuning
finetuning_handler = OpenAIFineTuningHandler()

# Create a CallbackManager and add the fine-tuning handler to it
callback_manager = CallbackManager([finetuning_handler])

# Configure the GPT-3.5 Turbo model with a specific context
gpt_3_5_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-3.5-turbo", temperature=0),
    context_window=2048,  # limit the context window artifically to test refine process
    callback_manager=callback_manager,
)

In [22]:
# Read questions from a file and store them in a list

questions = []
with open("train_questions.txt", "r") as f:
    for line in f:
        questions.append(line.strip())

In [23]:
questions

['Is Curcumin (C.I. 75300) considered halal?',
 'What is the halal status of Riboflavin/Lactofavin/Vitamin B2?',
 'Is Tartrazine/FD&C Yellow 5 (C.I. 19140) halal?',
 'Can Quinoline Yellow (C.I. 47005) be considered halal?',
 'Is Yellow 2G (C.I. 18965) halal?',
 'What is the halal status of Sunset Yellow FCF/FD&C Yellow 6 (C.I. 15985)?',
 'Is Cochineal/Carmines (C.I. 75470) considered halal?',
 'Can Carmoisine/Azorubine (C.I. 14720) be considered halal?',
 'Is Amaranth/FD&C Red 2 (C.I. 16185) halal?',
 'What is the halal status of Ponceau 4R/Cochineal Red A (C.I. 16255)?',
 'Is Erythrosine/FD&C Red 3 (C.I. 45430) considered halal?',
 'Can Red',
 'Is Colouring considered halal or non-halal based on the available information?',
 'What is the halal status of Beet Red/Betanin/Betanidin?',
 'Are Anthocyanins halal or non-halal?',
 'Is Calcium Carbonate/Chalk considered halal or non-halal?',
 'What is the halal status of Titanium Dioxide?',
 'Are Iron Oxides halal or non-halal?',
 'Is Alumini

In [24]:
# Create a VectorStoreIndex from a list of documents using the gpt_4_context
index = VectorStoreIndex.from_documents(pdfhtml_docs, service_context=gpt_3_5_context)

# Create a query engine based on the index with a specified similarity threshold
query_engine = index.as_query_engine(similarity_top_k=2)

In [25]:
# Iterate through a list of questions and query the query engine for each question
for question in questions:
    response = query_engine.query(question)

### Create `OpenAIFinetuneEngine`

`OpenAIFinetuneEngine` is a finetune engine that will take care of launching a finetuning job, and returning an LLM model that can be directly plugged in to the rest of LlamaIndex workflows.

In [26]:
# Save fine-tuning events to a JSONL file
finetuning_handler.save_finetuning_events("finetuning_events.jsonl")

Wrote 76 examples to finetuning_events.jsonl


## Evaluation

To measure the performance of the pipeline, whether it is able to generate relevant and accurate responses given the external data source and a set of queries, we use 2 evaluation metrics from [`ragas` evaluation library](https://github.com/explodinggradients/ragas/tree/main/docs/concepts/metrics). Ragas uses LLMs under the hood to compute the evaluations.

The performance of the base model, gpt-3.5-turbo, will be compared with the fine-tuned model.

Computation of evaluation metrics require 3 components: 
1) `Question`: A list of questions that could be asked about my external data/documents, generated using .generate_questions_from_nodes in above fine-tuning step<br>
2) `Context`: Retrieved contexts corresponding to each question. The context represents (chunks of) documents that are relevant to the question, i.e. the source from where the answer will be generated.<br>
3) `Answer`: Answer generated corresponding to each question from baseline and fine-tuned model.

The two metrics are as follow:

- `answer_relevancy` - Measures how relevant the generated answer is to the question, where an answer is considered relevant when it <u>directly</u> and <u>appropriately</u> addresses the orginal question, i.e. answers that are complete and do not include unnecessary or duplicated information. The metric does not consider factuality. It is computed using `question` and `answer`, with score ranging between 0 and 1, the higher the score, the better the performance in terms of providing relevant answers. To calculate this score, the LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured. The underlying idea is that if the generated answer accurately addresses the initial question, the LLM should be able to generate questions from the answer that align with the original question, i.e. high mean cosine similarity, translating to high score.


- `faithfulness` - Measures how factually accurate is the generated answer, i.e. if the response was hallucinated, or based on factuality (from the context). It is computed from `answer` and `context`, with score ranging between 0 and 1, the higher the score, the better the performance in terms of providing contextually accurate information. To calculate this score, the LLM identifies statements within the generated answer and verifies if each statement is supported by the retrieved context. The process then counts the number of statements within the generated answer that can be logically inferred from the context, and dvide by the total number of statements in the answer. 

Additional note: Cosine similarity is a metric used to measure how similar two items are. Mathematically, it measures the cosine of the angle between two vectors projected in a multi-dimensional space. The output value ranges from 0–1 where 0 means no similarity, whereas 1 means that both the items are 100% similar.
<br>Hallucinations refer to instances where the language model produces information or claims that are not accurate or supported by the input context.

Resources:
<br>https://cobusgreyling.medium.com/rag-evaluation-9813a931b3d4
<br>https://blog.langchain.dev/evaluating-rag-pipelines-with-ragas-langsmith/
<br>https://medium.aiplanet.com/evaluate-rag-pipeline-using-ragas-fbdd8dd466c1

In [27]:
questions = []
with open("eval_questions.txt", "r") as f:
    for line in f:
        questions.append(line.strip())

In [30]:
from llama_index import VectorStoreIndex

# limit the context window to 2048 tokens so that refine is used
gpt_context = ServiceContext.from_defaults(
    # If finetuning on openai website, replace the model name accordingly
    llm=OpenAI(model="ft:gpt-3.5-turbo-0613:personal::8TEKU4A7", temperature=0), context_window=2048
    
    # If finetuning on localhost, uncomment this code
    # llm=OpenAI(model="gpt-3.5-turbo", temperature=0), context_window=2048
)

index = VectorStoreIndex.from_documents(pdfhtml_docs, service_context=gpt_context)

# as_query_engine builds a default retriever and query engine on top of the index
# We configure the retriever to return the top 2 most similar documents, which is also the default setting
query_engine = index.as_query_engine(similarity_top_k=2)

In [31]:
# Initialize empty lists to store contexts and answers
contexts = []
answers = []

# Iterate through a list of questions
for question in questions:
    # Query the query_engine with the current question
    response = query_engine.query(question)
    
    # Extract and store the content of source nodes as contexts
    # This assumes that response.source_nodes is a list of nodes
    # and each node has a get_content() method
    context_content = [x.node.get_content() for x in response.source_nodes]
    contexts.append(context_content)
    
    # Convert the response to a string and store it as an answer
    answer_str = str(response)
    answers.append(answer_str)

# At the end of this loop, 'contexts' will contain lists of context content,
# and 'answers' will contain the responses generated by the query engine.

In [32]:
# Import necessary libraries
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, faithfulness

# Create a Dataset from a dictionary containing questions, answers, and contexts
ds = Dataset.from_dict(
    {
        "question": questions,  # List of questions
        "answer": answers,      # List of answers
        "contexts": contexts,  # List of contexts
    }
)

# Evaluate the dataset using specified metrics (faithfulness and answer_relevancy)
result = evaluate(ds, [answer_relevancy, faithfulness])

# Print the evaluation result
print(result)


evaluating with [answer_relevancy]


100%|██████████| 2/2 [00:20<00:00, 10.24s/it]


evaluating with [faithfulness]


100%|██████████| 2/2 [00:17<00:00,  8.68s/it]


{'answer_relevancy': 0.9697, 'faithfulness': 0.5167}


In [33]:
df_gpt_35 = result.to_pandas()

# Export cleaned dataframe as .csv
df_gpt_35.to_csv("df_gpt_35.csv",index=False)