### RAG Process For BARBAH GAMES Chatbot

https://www.kaggle.com/ryanmutiga
This notebook was run on Kaggle because compute was available there.

In [None]:
# Notebook dependencies, installed with IPython shell magics.
!pip install numpy
!pip install transformers torch
!pip install sentence-transformers
!pip install accelerate
# NOTE(review): torch is already requested by the "transformers torch" line above;
# this line requests it again (with torchvision/torchaudio) and adds the CUDA 11.3
# wheel index as an extra source - confirm that index still matches the runtime.
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
!pip install requests
!pip install huggingface_hub


In [None]:
# Log in to Hugging Face
# notebook_login() is called without arguments, so no credential is written
# into this cell.
from huggingface_hub import notebook_login

notebook_login()



In [None]:
import numpy as np
from typing import List, Dict, Any
from sentence_transformers import SentenceTransformer, util
import json
import os
import torch
from torch.nn import DataParallel
import logging
import time
from huggingface_hub import InferenceClient

# Configure logging at INFO level and create the module-level logger that the
# classes and functions in this cell write to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class Document:
    """A piece of retrievable text together with descriptive metadata.

    Attributes set by the constructor:
        content: the text exactly as given.
        metadata: the supplied mapping, or a fresh empty dict when the
            argument is ``None`` or otherwise falsy (e.g. an empty dict).
    """

    def __init__(self, content: str, metadata: Dict[str, Any] = None):
        self.content = content
        # A falsy argument is replaced by a new dict, so instances never
        # share one default mapping.
        self.metadata = metadata if metadata else {}

class EmbeddingModel:
    """Wrapper around a SentenceTransformer that turns texts into embeddings.

    ``self.model`` holds the loaded SentenceTransformer, wrapped in
    ``torch.nn.DataParallel`` when more than one CUDA device is visible.
    """

    def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-small-en"):
        self.model = SentenceTransformer(model_name, trust_remote_code=True)
        self.model.max_seq_length = 1024  # Adjust max sequence length as needed

        # The wrapper is kept so ``self.model`` has the same type as before on
        # multi-GPU hosts. encode() below always talks to the underlying
        # SentenceTransformer, so the wrapper is never used for a forward pass.
        if torch.cuda.device_count() > 1:
            self.model = DataParallel(self.model)

    def _encoder(self):
        """Return the object that exposes ``encode`` (unwraps DataParallel)."""
        if isinstance(self.model, DataParallel):
            return self.model.module
        return self.model

    def encode(self, texts: List[str], batch_size: int = 16, device: str = 'cuda:0') -> np.ndarray:
        """Embed ``texts`` and return one L2-normalised row per input text.

        Args:
            texts: strings to embed; may be empty.
            batch_size: number of texts handed to the encoder per call (>= 1).
            device: torch device string. A CUDA device is replaced by ``"cpu"``
                (with a warning) when CUDA is not available.

        Returns:
            Array of shape ``(len(texts), dim)``. For empty input the array has
            zero rows.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size}")

        # Previously ``self.model.module`` was used unconditionally, which only
        # exists on the DataParallel wrapper and failed on 0/1-GPU machines.
        encoder = self._encoder()

        if not texts:
            # np.vstack([]) raises; return an empty matrix instead.
            dim_getter = getattr(encoder, "get_sentence_embedding_dimension", None)
            dim = (dim_getter() if callable(dim_getter) else None) or 0
            return np.empty((0, dim), dtype=np.float32)

        if str(device).startswith("cuda") and not torch.cuda.is_available():
            logging.getLogger(__name__).warning(
                "CUDA is not available; encoding on CPU instead of %s", device
            )
            device = "cpu"

        self.model.to(device)
        embeddings = []
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            # Pass the device explicitly: encode() moves the model to its own
            # target device when none is given, which could undo the .to() above.
            batch_embeddings = encoder.encode(
                batch_texts,
                batch_size=batch_size,
                device=device,
                normalize_embeddings=True,
            )
            embeddings.append(np.asarray(batch_embeddings))
        return np.vstack(embeddings)

class DocumentRetriever:
    """Cosine-similarity retriever over a fixed list of documents.

    All document contents are embedded once in the constructor; ``retrieve``
    embeds the query and returns the best-scoring documents.
    """

    def __init__(self, documents: List[Document], embedding_model: EmbeddingModel, device: str = 'cuda:0'):
        """Store the corpus and embed it.

        Args:
            documents: corpus to search; may be empty.
            embedding_model: object whose ``encode(texts, device=...)`` returns
                one embedding row per text.
            device: device string forwarded to ``embedding_model.encode``.
        """
        self.documents = documents
        self.embedding_model = embedding_model
        self.device = device
        # An empty corpus has nothing to embed; skipping the encoder call keeps
        # construction from failing and lets retrieve() simply return [].
        if documents:
            self.document_embeddings = self.embedding_model.encode(
                [doc.content for doc in documents], device=self.device
            )
        else:
            self.document_embeddings = None

    def retrieve(self, query: str, k: int = 5) -> List[Document]:
        """Return up to ``k`` documents, most similar to ``query`` first.

        ``k`` is clamped to the range 1..corpus size. Any error while
        embedding or scoring is logged with its traceback and results in an
        empty list, so callers can fall back gracefully.
        """
        if not self.documents or self.document_embeddings is None:
            return []

        try:
            query_embedding = self.embedding_model.encode([query], device=self.device)
            similarities = util.cos_sim(query_embedding, self.document_embeddings)

            logger.info("Query: %s", query)
            logger.info("Similarities shape: %s", similarities.shape)

            if len(similarities) == 0 or len(similarities[0]) == 0:
                logger.warning("No similarities found")
                return []

            scores = np.asarray(similarities[0])
            # Bound k by both the corpus and the score row so an index can
            # never point past either of them.
            k = max(1, min(k, len(self.documents), len(scores)))
            # argsort is ascending: take the last k and reverse for best-first.
            top_k_indices = np.argsort(scores)[-k:][::-1]

            logger.info("Top %s indices: %s", k, top_k_indices)

            return [self.documents[int(i)] for i in top_k_indices]
        except Exception as e:
            # Best-effort by design; logger.exception keeps the traceback.
            logger.exception("Error in retrieve method: %s", e)
            return []

class PromptConstructor:
    """Builds the two-message chat payload (system + user) for the LLM."""

    def __init__(self, system_prompt: str, template: str):
        self.template = template
        self.system_prompt = system_prompt

    def construct(self, query: str, retrieved_docs: List[Document]) -> List[Dict[str, str]]:
        """Return ``[system_message, user_message]`` for ``query``.

        The contents of ``retrieved_docs`` are joined with a blank line and
        substituted for ``{context}`` in the template; ``query`` fills
        ``{query}``.
        """
        context = "\n\n".join(doc.content for doc in retrieved_docs)
        system_message = {"role": "system", "content": self.system_prompt}
        user_message = {
            "role": "user",
            "content": self.template.format(context=context, query=query),
        }
        return [system_message, user_message]

class LanguageModel:
    """Chat-completion client for a model served through Hugging Face."""

    # Reply returned whenever the API call fails or yields no text.
    _ERROR_REPLY = "I apologize, but I encountered an error while processing your request. Please try again later."

    def __init__(self, model_name: str, api_token: str):
        self.client = InferenceClient(model=model_name, token=api_token)

    def generate(self, messages: List[Dict[str, str]], max_tokens: int = 500) -> str:
        """Send ``messages`` to the chat-completion API and return the reply text.

        Args:
            messages: chat messages as ``{"role": ..., "content": ...}`` dicts.
            max_tokens: upper bound on generated tokens (default 500, the
                value that used to be hard-coded).

        Returns:
            The text of the first choice, or a fixed apology string when the
            request fails or the reply carries no text. This method does not
            raise for API errors; they are logged with their traceback.
        """
        try:
            logger.info("Sending request to API with messages: %s...", messages[:2])  # Log first two messages
            response = self.client.chat_completion(messages, max_tokens=max_tokens, stream=False)
            logger.info("Received response from API")
            content = response.choices[0].message.content
        except Exception as e:
            logger.exception("Error in API request: %s", e)
            return self._ERROR_REPLY

        if content is None:
            # The declared return type is str; do not leak None to callers.
            logger.warning("API response contained no message content")
            return self._ERROR_REPLY
        return content

class RAG:
    """Retrieval-augmented generation pipeline.

    Holds a growing document list, a retriever over it, a prompt constructor
    and a language model. ``query`` answers from retrieved context when
    possible and otherwise falls back to the language model alone.
    """

    def __init__(self, embedding_model: EmbeddingModel, language_model: LanguageModel,
                 system_prompt: str, prompt_template: str, device: str = 'cuda:0'):
        self.embedding_model = embedding_model
        self.language_model = language_model
        self.documents = []
        self.document_retriever = None
        self.prompt_constructor = PromptConstructor(system_prompt, prompt_template)
        self.device = device
        self.system_prompt = system_prompt

    def add_documents(self, documents: List[Document]):
        """Append ``documents`` to the corpus and make them retrievable.

        Only the new documents are embedded; their rows are appended to the
        retriever's existing embedding matrix. Rebuilding the retriever on
        every call would re-embed the whole corpus each time. An empty
        ``documents`` list is a no-op.
        """
        if not documents:
            return

        retriever = self.document_retriever
        existing = None if retriever is None else getattr(retriever, "document_embeddings", None)

        # First batch, or retriever state out of step with the corpus:
        # build everything from scratch.
        if existing is None or len(existing) != len(self.documents):
            self.documents.extend(documents)
            self.document_retriever = DocumentRetriever(self.documents, self.embedding_model, self.device)
            return

        # Encode before mutating any state so a failed encode leaves the
        # corpus and the embedding matrix consistent.
        new_embeddings = self.embedding_model.encode(
            [doc.content for doc in documents], device=self.device
        )
        self.documents.extend(documents)
        retriever.documents = self.documents
        retriever.document_embeddings = np.vstack([existing, new_embeddings])

    def query(self, query: str, k: int = 5) -> str:
        """Answer ``query`` using up to ``k`` retrieved documents as context.

        Falls back to ``fallback_query`` when no retriever exists yet, when
        retrieval returns nothing, or when any step raises.
        """
        try:
            if self.document_retriever is None:
                logger.warning("Document retriever not initialized. Falling back to language model.")
                return self.fallback_query(query)

            retrieved_docs = self.document_retriever.retrieve(query, k)
            if not retrieved_docs:
                logger.info("No relevant documents found. Falling back to language model.")
                return self.fallback_query(query)

            messages = self.prompt_constructor.construct(query, retrieved_docs)
            return self.language_model.generate(messages)
        except Exception as e:
            logger.exception("Error in RAG query: %s", e)
            return self.fallback_query(query)

    def fallback_query(self, query: str) -> str:
        """Ask the language model directly, without retrieved context."""
        messages = [
            {"role": "system", "content": self.system_prompt},
            {"role": "user", "content": f"User Query: {query}\n\nPlease provide a response based on your general knowledge about BARBAH Games, esports, and SDGs."}
        ]
        return self.language_model.generate(messages)

# Load and process JSON files with error handling
def load_json_documents(file_paths, batch_size=10):
    """Yield lists of ``Document`` objects read from JSON files.

    Each file is expected to hold a list whose first element has a ``docs``
    list; every entry there needs a ``content`` value and may carry a
    ``metadata`` mapping with ``title`` / ``description``.

    Args:
        file_paths: iterable of JSON file paths. Missing files are logged and
            skipped.
        batch_size: a batch is yielded as soon as it holds this many documents;
            batches may span several files. A final, smaller batch is yielded
            if any documents remain.

    Yields:
        Non-empty lists of ``Document``.

    Unreadable or malformed files are logged and skipped. A malformed entry
    is logged and skipped on its own, so it no longer discards the remaining
    entries of the same file; an entry without ``metadata`` gets empty
    title/description.
    """
    documents = []
    for file_path in file_paths:
        if not os.path.exists(file_path):
            logger.warning(f"File not found: {file_path}")
            continue

        try:
            # The file is read and closed before anything is yielded, so no
            # handle stays open while the consumer processes a batch.
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
            raw_docs = data[0]['docs']
        except (OSError, ValueError, KeyError, IndexError, TypeError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"Error loading {file_path}: {e}")
            continue

        if not isinstance(raw_docs, list):
            logger.error(f"Error loading {file_path}: 'docs' is not a list")
            continue

        for position, doc in enumerate(raw_docs):
            try:
                content = doc['content']
                doc_metadata = doc.get('metadata') or {}
                metadata = {
                    'source': file_path,
                    'title': doc_metadata.get('title', ''),
                    'description': doc_metadata.get('description', '')
                }
            except (KeyError, TypeError, AttributeError) as e:
                logger.error(f"Error loading {file_path}: skipping entry {position}: {e!r}")
                continue

            documents.append(Document(content, metadata))
            if len(documents) >= batch_size:
                yield documents
                documents = []

    if documents:
        yield documents

# Usage example
MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# SECURITY: a real Hugging Face token used to be hard-coded on this line. Any
# token that was committed must be treated as leaked and revoked. The token is
# now read from the environment; when neither variable is set this is None and
# the client falls back to the credentials saved by the login step.
API_TOKEN = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACEHUB_API_TOKEN")

embedding_model = EmbeddingModel()
language_model = LanguageModel(MODEL_NAME, API_TOKEN)

# System message sent with every request (retrieval-backed and fallback alike).
# As a triple-quoted literal it starts and ends with a newline.
system_prompt = """
You are an AI assistant for BARBAH Games, an organization focused on promoting esports in East Africa and integrating Sustainable Development Goals (SDGs) into gaming. Your name is Astro and you have expertise in esports, SDGs, and BARBAH Games' initiatives. Provide informative and engaging responses to user queries, always emphasizing the connection between gaming, education, and sustainable development.
"""

# User-message template filled with str.format: {context} receives the joined
# contents of the retrieved documents and {query} the user's question.
prompt_template = """
Context:
{context}

User Query: {query}

Please provide a response based on the context, your expertise in esports and SDGs, and BARBAH Games' initiatives. If relevant, suggest ways the user can contribute to SDGs through gaming or esports activities.
"""

# Split documents between two GPUs
# Device names are derived from the hardware that is actually present: the
# literal 'cuda:0' / 'cuda:1' fail on hosts with fewer than two GPUs. With one
# GPU both systems share it; with none both run on the CPU.
_gpu_count = torch.cuda.device_count()
_device_0 = 'cuda:0' if _gpu_count >= 1 else 'cpu'
_device_1 = 'cuda:1' if _gpu_count >= 2 else _device_0
rag_system_0 = RAG(embedding_model, language_model, system_prompt, prompt_template, device=_device_0)
rag_system_1 = RAG(embedding_model, language_model, system_prompt, prompt_template, device=_device_1)

# Add existing documents to the RAG system
# Each entry pairs a content string with the path it is attributed to
# (stored under metadata["source"]).
# NOTE(review): every content string ends in "..." and looks like a short
# excerpt rather than the full text of the file named in "source" - confirm
# whether the files themselves were meant to be read and embedded.
existing_documents = [
    Document("BARBAH GAMES TOKENIZING GAMING FOR REAL-WORLD CAREERS IN WEB3...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/BARBAHGAMES Deck018 2.txt"}),
    Document("SUSTAINABLE DEVELOPMENT GOALS Progress Chart 2023...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/SUSTAINABLE DEVELOPMENT GOALS Progress-Chart-2023.txt"}),
    Document("BARBAH GAMES ESPORTS : A CATALYST FOR GLOBAL ESG INITIATIVES...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/VISION DECK.txt"}),
    Document("FOR THE NEXT GENERATION FY22 NIKE, Inc. Impact Report...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/FY22-NIKE,-Inc.-Impact-Report.txt"}),
    Document("Driving Engagement by gamifying SDG Quests At BARBAH Games, we are committed to more than just gaming...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/SDG QUESTS.txt"}),
    Document("Skills for a Sustainable Future: How Green and Digital Skills Intersect...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/Skills for a Sustainable Future How Green and Digital Skills Intersect.txt"}),
    Document("Corporate Sponsorship Model Overview...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/Corporate Sponsorship Tiers.txt"}),
    Document("BARBAH Games Essentials Copy Elevate Your Game with BARBAH Games Essentials!...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/BARBAH GAMERS ESSENTIALS.txt"}),
    Document("Promote Esports in East Africa July 12, 2024 www.barbahgames.com...", {"source": "/kaggle/input/BARBAH_GAMES_GPT_Conversation.txt"}),
    Document("Gaming Mentor EA Supreme Instructions Overview...", {"source": "/kaggle/input/BARBAH_GAMES_GPT_Instructions.txt"})
]

# Split documents into two halves
# Integer division: rag_system_0 gets the first len // 2 items and
# rag_system_1 the rest, so each system indexes a different part of the list.
half_index = len(existing_documents) // 2
rag_system_0.add_documents(existing_documents[:half_index])
rag_system_1.add_documents(existing_documents[half_index:])

# Add new documents from JSON files in batches
json_file_paths = [
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/barbah games.com.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/barbah games.gg.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/epic games.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/forwardfaster global compact.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/prophix esg report.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/un global compact.json",
    "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/un sdg learn.json"
]

# Each batch is split in two: the first len // 2 documents go to rag_system_0
# and the rest to rag_system_1.
for batch in load_json_documents(json_file_paths, batch_size=5):
    half_index = len(batch) // 2
    # A one-document batch has an empty first half; skip the call instead of
    # handing an empty list to add_documents.
    if half_index:
        rag_system_0.add_documents(batch[:half_index])
    if batch[half_index:]:
        rag_system_1.add_documents(batch[half_index:])

# Main loop
def main():
    """Interactive prompt: send each query to both RAG systems and print both answers.

    The loop ends when the user types 'quit' (any case, surrounding whitespace
    ignored), or when input ends (EOF) or is interrupted with Ctrl-C. Blank
    input is ignored without calling the API.
    """
    while True:
        try:
            query = input("Enter your query (or 'quit' to exit): ")
        except (EOFError, KeyboardInterrupt):
            # Non-interactive runs and Ctrl-C should end the loop cleanly
            # rather than surface a traceback.
            print()
            break

        query = query.strip()
        if query.lower() == 'quit':
            break
        if not query:
            continue

        try:
            response_0 = rag_system_0.query(query)
            response_1 = rag_system_1.query(query)

            # Combine responses from both systems
            combined_response = f"Response from GPU 0:\n{response_0}\n\nResponse from GPU 1:\n{response_1}"
            print(combined_response)
        except Exception as e:
            logger.exception("Error in main loop: %s", e)
            print("An error occurred while processing your query. Please try again.")

        # Add a delay to avoid rate limiting
        time.sleep(2)

if __name__ == "__main__":
    main()

In [None]:
# import numpy as np
# from typing import List, Dict, Any
# from sentence_transformers import SentenceTransformer, util
# import json
# import os
# import torch
# from torch.nn import DataParallel
# import logging
# import time
# from huggingface_hub import InferenceClient

# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# class Document:
#     def __init__(self, content: str, metadata: Dict[str, Any] = None):
#         self.content = content
#         self.metadata = metadata or {}

# class EmbeddingModel:
#     def __init__(self, model_name: str = "jinaai/jina-embeddings-v2-small-en"):
#         self.model = SentenceTransformer(model_name, trust_remote_code=True)
#         self.model.max_seq_length = 1024  # Adjust max sequence length as needed

#         if torch.cuda.device_count() > 1:
#             self.model = DataParallel(self.model)

#     def encode(self, texts: List[str], batch_size: int = 16, device: str = 'cuda:0') -> np.ndarray:
#         embeddings = []
#         self.model.to(device)
#         for i in range(0, len(texts), batch_size):
#             batch_texts = texts[i:i + batch_size]
#             batch_embeddings = self.model.module.encode(batch_texts, normalize_embeddings=True)
#             embeddings.append(batch_embeddings)
#         return np.vstack(embeddings)

# class DocumentRetriever:
#     def __init__(self, documents: List[Document], embedding_model: EmbeddingModel, device: str = 'cuda:0'):
#         self.documents = documents
#         self.embedding_model = embedding_model
#         self.device = device
#         self.document_embeddings = self.embedding_model.encode([doc.content for doc in documents], device=self.device)

#     def retrieve(self, query: str, k: int = 5) -> List[Document]:
#         try:
#             query_embedding = self.embedding_model.encode([query], device=self.device)
#             similarities = util.cos_sim(query_embedding, self.document_embeddings)
            
#             logger.info(f"Query: {query}")
#             logger.info(f"Similarities shape: {similarities.shape}")
            
#             if len(similarities) == 0 or len(similarities[0]) == 0:
#                 logger.warning("No similarities found")
#                 return []
            
#             k = max(1, min(k, len(self.documents)))  # Ensure k is between 1 and the number of documents
#             top_k_indices = np.argsort(similarities[0])[-k:][::-1]
            
#             logger.info(f"Top {k} indices: {top_k_indices}")
            
#             return [self.documents[i] for i in top_k_indices]
#         except Exception as e:
#             logger.error(f"Error in retrieve method: {str(e)}")
#             return []

# class PromptConstructor:
#     def __init__(self, system_prompt: str, template: str):
#         self.system_prompt = system_prompt
#         self.template = template

#     def construct(self, query: str, retrieved_docs: List[Document]) -> List[Dict[str, str]]:
#         context = "\n\n".join([doc.content for doc in retrieved_docs])
#         user_message = self.template.format(context=context, query=query)
#         return [
#             {"role": "system", "content": self.system_prompt},
#             {"role": "user", "content": user_message}
#         ]

# class LanguageModel:
#     def __init__(self, model_name: str, api_token: str):
#         self.client = InferenceClient(model=model_name, token=api_token)

#     def generate(self, messages: List[Dict[str, str]]) -> str:
#         try:
#             logger.info(f"Sending request to API with messages: {messages[:2]}...")  # Log first two messages
#             response = self.client.chat_completion(messages, max_tokens=500, stream=False)
#             logger.info(f"Received response from API")
#             return response.choices[0].message.content
#         except Exception as e:
#             logger.error(f"Error in API request: {str(e)}")
#             return "I apologize, but I encountered an error while processing your request. Please try again later."


# class RAG:
#     def __init__(self, embedding_model: EmbeddingModel, language_model: LanguageModel,
#                  system_prompt: str, prompt_template: str, device: str = 'cuda:0'):
#         self.embedding_model = embedding_model
#         self.language_model = language_model
#         self.documents = []
#         self.document_retriever = None
#         self.prompt_constructor = PromptConstructor(system_prompt, prompt_template)
#         self.device = device
#         self.system_prompt = system_prompt

#     def add_documents(self, documents: List[Document]):
#         self.documents.extend(documents)
#         self.document_retriever = DocumentRetriever(self.documents, self.embedding_model, self.device)

#     def query(self, query: str, k: int = 10, similarity_threshold: float = 0.5) -> str:
#         try:
#             if not self.document_retriever:
#                 logger.warning("Document retriever not initialized. Falling back to language model.")
#                 return self.fallback_query(query)
            
#             retrieved_docs = self.document_retriever.retrieve(query, k)
            
#             # Filter documents based on similarity threshold
#             filtered_docs = self.filter_documents(query, retrieved_docs, similarity_threshold)
            
#             if filtered_docs:
#                 messages = self.prompt_constructor.construct(query, filtered_docs)
#                 response = self.language_model.generate(messages)
                
#                 # Fact-checking step
#                 if self.fact_check(response, filtered_docs):
#                     return response
#                 else:
#                     logger.warning("Response failed fact-checking. Falling back to a more conservative response.")
#                     return self.generate_conservative_response(query, filtered_docs)
#             else:
#                 logger.info("No relevant documents found after filtering. Falling back to language model.")
#                 return self.fallback_query(query)
        
#         except Exception as e:
#             logger.error(f"Error in RAG query: {str(e)}")
#             return self.fallback_query(query)

#     def filter_documents(self, query: str, documents: List[Document], threshold: float) -> List[Document]:
#         query_embedding = self.embedding_model.encode([query], device=self.device)
#         doc_embeddings = self.embedding_model.encode([doc.content for doc in documents], device=self.device)
#         similarities = util.cos_sim(query_embedding, doc_embeddings)[0]
        
#         return [doc for doc, sim in zip(documents, similarities) if sim >= threshold]

#     def fact_check(self, response: str, documents: List[Document]) -> bool:
#         # Implement a basic fact-checking mechanism
#         # This is a simplified version and can be expanded for more robust checking
#         response_lower = response.lower()
#         for doc in documents:
#             if any(fact.lower() in response_lower for fact in doc.content.split('.')):
#                 return True
#         return False

#     def generate_conservative_response(self, query: str, documents: List[Document]) -> str:
#         context = "\n".join([doc.content for doc in documents])
#         conservative_prompt = f"""
#         Based strictly on the following context, provide a concise and factual response to the query. 
#         If the information is not explicitly stated in the context, respond with "I don't have enough information to answer that question accurately."

#         Context: {context}

#         Query: {query}

#         Response:
#         """
#         return self.language_model.generate([{"role": "user", "content": conservative_prompt}])

#     def fallback_query(self, query: str) -> str:
#         messages = [
#             {"role": "system", "content": self.system_prompt},
#             {"role": "user", "content": f"User Query: {query}\n\nPlease provide a response based on your general knowledge about BARBAH Games, esports, and SDGs. If you're not certain about any information, please state that clearly."}
#         ]
#         return self.language_model.generate(messages)


# # Load and process JSON files with error handling
# def load_json_documents(file_paths, batch_size=10):
#     documents = []
#     for file_path in file_paths:
#         if os.path.exists(file_path):
#             try:
#                 with open(file_path, 'r', encoding='utf-8') as file:
#                     data = json.load(file)
#                     for doc in data[0]['docs']:
#                         content = doc['content']
#                         metadata = {
#                             'source': file_path,
#                             'title': doc['metadata'].get('title', ''),
#                             'description': doc['metadata'].get('description', '')
#                         }
#                         documents.append(Document(content, metadata))
#                         if len(documents) >= batch_size:
#                             yield documents
#                             documents = []
#             except Exception as e:
#                 logger.error(f"Error loading {file_path}: {e}")
#         else:
#             logger.warning(f"File not found: {file_path}")
#     if documents:
#         yield documents

# # Usage example
# MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
# API_TOKEN = os.environ.get("HF_TOKEN")  # token redacted: never commit real credentials

# embedding_model = EmbeddingModel()
# language_model = LanguageModel(MODEL_NAME, API_TOKEN)

# system_prompt = """
# You are an AI assistant for BARBAH Games, an organization focused on promoting esports in East Africa and integrating Sustainable Development Goals (SDGs) into gaming. Your name is Astro and you have expertise in esports, SDGs, and BARBAH Games' initiatives. Provide informative and engaging responses to user queries, always emphasizing the connection between gaming, education, and sustainable development.
# """

# prompt_template = """
# Context:
# {context}

# User Query: {query}

# Please provide a response based on the context, your expertise in esports and SDGs, and BARBAH Games' initiatives. If relevant, suggest ways the user can contribute to SDGs through gaming or esports activities.
# """

# # Split documents between two GPUs
# rag_system_0 = RAG(embedding_model, language_model, system_prompt, prompt_template, device='cuda:0')
# rag_system_1 = RAG(embedding_model, language_model, system_prompt, prompt_template, device='cuda:1')

# # Add existing documents to the RAG system
# existing_documents = [
#     Document("BARBAH GAMES TOKENIZING GAMING FOR REAL-WORLD CAREERS IN WEB3...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/BARBAHGAMES Deck018 2.txt"}),
#     Document("SUSTAINABLE DEVELOPMENT GOALS Progress Chart 2023...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/SUSTAINABLE DEVELOPMENT GOALS Progress-Chart-2023.txt"}),
#     Document("BARBAH GAMES ESPORTS : A CATALYST FOR GLOBAL ESG INITIATIVES...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/VISION DECK.txt"}),
#     Document("FOR THE NEXT GENERATION FY22 NIKE, Inc. Impact Report...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/FY22-NIKE,-Inc.-Impact-Report.txt"}),
#     Document("Driving Engagement by gamifying SDG Quests At BARBAH Games, we are committed to more than just gaming...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/SDG QUESTS.txt"}),
#     Document("Skills for a Sustainable Future: How Green and Digital Skills Intersect...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/Skills for a Sustainable Future How Green and Digital Skills Intersect.txt"}),
#     Document("Corporate Sponsorship Model Overview...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/Corporate Sponsorship Tiers.txt"}),
#     Document("BARBAH Games Essentials Copy Elevate Your Game with BARBAH Games Essentials!...", {"source": "/kaggle/input/barbah-games-website-chatbot/Txt/Txt/BARBAH GAMERS ESSENTIALS.txt"}),
#     Document("Promote Esports in East Africa July 12, 2024 www.barbahgames.com...", {"source": "/kaggle/input/BARBAH_GAMES_GPT_Conversation.txt"}),
#     Document("Gaming Mentor EA Supreme Instructions Overview...", {"source": "/kaggle/input/BARBAH_GAMES_GPT_Instructions.txt"})
# ]

# # Split documents into two halves
# half_index = len(existing_documents) // 2
# rag_system_0.add_documents(existing_documents[:half_index])
# rag_system_1.add_documents(existing_documents[half_index:])

# # Add new documents from JSON files in batches
# json_file_paths = [
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/barbah games.com.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/barbah games.gg.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/epic games.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/forwardfaster global compact.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/prophix esg report.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/un global compact.json",
#     "/kaggle/input/barbah-games-website-chatbot/Websites Info/Websites Info/22-02-2024/un sdg learn.json"
# ]

# for batch in load_json_documents(json_file_paths, batch_size=5):
#     half_index = len(batch) // 2
#     rag_system_0.add_documents(batch[:half_index])
#     rag_system_1.add_documents(batch[half_index:])

# # Main loop
# def main():
#     while True:
#         query = input("Enter your query (or 'quit' to exit): ")
#         if query.lower() == 'quit':
#             break

#         try:
#             response_0 = rag_system_0.query(query)
#             response_1 = rag_system_1.query(query)

#             # Combine responses from both systems
#             combined_response = f"Response from GPU 0:\n{response_0}\n\nResponse from GPU 1:\n{response_1}"
#             print(combined_response)
#         except Exception as e:
#             logger.error(f"Error in main loop: {str(e)}")
#             print("An error occurred while processing your query. Please try again.")

#         # Add a delay to avoid rate limiting
#         time.sleep(2)

# if __name__ == "__main__":
#     main()

In [None]:
# import time
# import pandas as pd
# from typing import List, Dict
# import matplotlib.pyplot as plt

# def evaluate_chunk_size(rag_system, questions: List[str], chunk_sizes: List[int]) -> pd.DataFrame:
#     data = []
    
#     for chunk_size in chunk_sizes:
#         # Update the chunk size in the RAG system
#         rag_system.embedding_model.model.max_seq_length = chunk_size
        
#         total_response_time = 0
#         total_faithfulness = 0
#         total_relevancy = 0
        
#         for question in questions:
#             start_time = time.time()
#             response = rag_system.query(question)
#             elapsed_time = time.time() - start_time
            
#             # Here you would implement faithfulness and relevancy checks
#             # For now, we'll use dummy values
#             faithfulness = 1  # Placeholder
#             relevancy = 1  # Placeholder
            
#             total_response_time += elapsed_time
#             total_faithfulness += faithfulness
#             total_relevancy += relevancy
        
#         avg_response_time = total_response_time / len(questions)
#         avg_faithfulness = total_faithfulness / len(questions)
#         avg_relevancy = total_relevancy / len(questions)
        
#         data.append({
#             'Chunk Size': chunk_size,
#             'Average Response Time': avg_response_time,
#             'Average Faithfulness': avg_faithfulness,
#             'Average Relevancy': avg_relevancy
#         })
    
#     return pd.DataFrame(data)

# # Define the combined set of test questions
# test_questions = [
#     # Gamer Questions
#     "What is BARBAH Games and how does it's SDG goals help me as a gamer?",
#     "How can I participate in BARBAH Games' esports tournaments in East Africa?",
#     "What kind of skills can I develop through BARBAH Games' initiatives?",
#     "How does BARBAH Games combine gaming with education?",
#     "What are the SDG Quests in BARBAH Games and how do they work?",
#     "Can BARBAH Games help me build a career in esports or gaming?",
#     "What types of games or platforms does BARBAH Games focus on?",
#     "How does BARBAH Games' approach differ from traditional gaming experiences?",
#     "Are there any rewards or incentives for participating in BARBAH Games' SDG-focused activities?",
#     "How can I track my progress and impact through BARBAH Games' platform?",
    
#     # Corporate Questions
#     "What is BARBAH Games and how does it's SDG goals align with corporate social responsibility?",
#     "How can sponsoring BARBAH Games' initiatives benefit our company's ESG ratings?",
#     "What kind of partnership opportunities does BARBAH Games offer for corporations?",
#     "How does BARBAH Games measure and report the impact of corporate involvement in their initiatives?",
#     "Can BARBAH Games' platform be integrated into our corporate training or team-building programs?",
#     "What is the reach and demographic of BARBAH Games in East Africa?",
#     "How does BARBAH Games ensure data privacy and security for corporate partners?",
#     "What are the different tiers of corporate sponsorship available with BARBAH Games?",
#     "How can our company contribute to specific SDGs through BARBAH Games' platform?",
#     "What kind of brand visibility can we expect from partnering with BARBAH Games?"
# ]

# # Define chunk sizes to test
# chunk_sizes_to_test = [128, 256, 512, 1024]

# # Run the evaluation
# results_df = evaluate_chunk_size(rag_system_0, test_questions, chunk_sizes_to_test)

# # Display results
# print(results_df)

# # Plot the results
# plt.figure(figsize=(12, 6))
# plt.plot(results_df['Chunk Size'], results_df['Average Response Time'], marker='o', label='Avg Response Time')
# plt.plot(results_df['Chunk Size'], results_df['Average Faithfulness'], marker='s', label='Avg Faithfulness')
# plt.plot(results_df['Chunk Size'], results_df['Average Relevancy'], marker='^', label='Avg Relevancy')
# plt.xlabel('Chunk Size')
# plt.ylabel('Score')
# plt.title('RAG System Performance by Chunk Size')
# plt.legend()
# plt.grid(True)
# plt.show()