# RAG example #
Frameworks and tools used:
- LangChain
- HuggingFace
- OpenAI
- Chroma

In [None]:
import openai
import textwrap
import pandas as pd
from tabulate import tabulate
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from sentence_transformers import SentenceTransformer
from langchain.embeddings import HuggingFaceEmbeddings


# Load OpenAI credentials.
# Prefer the OPENAI_API_KEY environment variable so a real key never has to be
# written into (and committed with) the notebook. The placeholder string is
# only used as a fallback when the variable is not set.
import os

openai_key = os.environ.get("OPENAI_API_KEY", "your openai api key")
model = "gpt-4o"  # model name used for both the RAG chain and the direct call

In [11]:
# Load the documents that feed our knowledge base (for RAG)
document_loader = TextLoader("data.txt", encoding="utf-8")
documents = document_loader.load()

# Split the text into chunks to improve retrieval
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
split_docs = text_splitter.split_documents(documents)

# Load the model we are going to use to compute embeddings
# NOTE(review): `embedding_model` is not referenced again in the visible part
# of this notebook, and `HuggingFaceEmbeddings` below is built from the model
# name rather than from this object, so this load may be redundant. Confirm
# that no later cell uses it before removing.
embedding_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Create the embeddings wrapper handed to the vector store
embeddings = HuggingFaceEmbeddings(model_name="paraphrase-MiniLM-L6-v2")

# Store the embeddings in ChromaDB (persisted under ./chroma_db)
vector_store = Chroma.from_documents(
    split_docs,
    embeddings,
    persist_directory="./chroma_db",
)

# Similarity-search retriever configured with k=1
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

In [12]:
# Configure the LLM and the RAG chain
llm = ChatOpenAI(openai_api_key=openai_key, model_name=model)

# Retrieval QA chain that combines the LLM with the retriever built earlier
qa_chain = RetrievalQA.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=retriever,
)

In [None]:
# Get the answer from the RAG-powered assistant
def get_RAG_response(query):
    """Run ``query`` through the module-level ``qa_chain`` and return its answer.

    Args:
        query: Question passed to ``qa_chain.invoke``.

    Returns:
        The value stored under the ``"result"`` key of the chain output.
    """
    chain_output = qa_chain.invoke(query)
    return chain_output["result"]

# Alternative question to try instead of the one below:
#query = "What ways of work are available for employees of TechNova?"
query = "What is the name of the CEO of TechNova?"


# Ask the RAG chain and print its answer
response = get_RAG_response(query)
print("GPT-4o with RAG:", response)

{'query': 'What is the name of the CEO of TechNova?', 'result': "I don't know."}
GPT-4o with RAG: I don't know.


In [14]:
# Get the answer from ChatGPT directly
def get_gpt4o_response(query):
    """Send ``query`` straight to the OpenAI chat completions API (no retrieval).

    A new client is created on every call using the module-level
    ``openai_key``; the module-level ``model`` selects the model.

    Args:
        query: User question, sent as the ``user`` message.

    Returns:
        The message content of the first choice in the completion.
    """
    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant that provides very concise and accurate answers."},
        {"role": "user", "content": query},
    ]
    api_client = openai.OpenAI(api_key=openai_key)
    completion = api_client.chat.completions.create(
        model=model,
        messages=chat_messages,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

# Alternative question to try instead of the one below:
#user_query = "What ways of work are available for employees of TechNova?"
user_query = "What is the name of the CEO of TechNova?"

# Ask the model directly (without retrieval) and print its answer
gpt4o_response = get_gpt4o_response(user_query)

print("GPT-4o:", gpt4o_response)

GPT-4o: I'm sorry, but I don't have information about the CEO of TechNova. You might want to check the company's official website or recent news articles for the most up-to-date information.


In [15]:
# Questions sent to both assistants so their answers can be compared
queries = [
    # Working arrangements and time off
    "What ways of work are available for employees of TechNova?",
    "How many paid vacation days do employees at TechNova Inc. get per year?",
    "How many sick days are TechNova Inc. employees entitled to annually?",
    # Products: warranty and refunds
    "What does TechNova Inc.’s warranty cover, and how long is it valid?",
    "Can I get a refund for my TechNova Inc. product if I return it after 45 days?",
    # Security and IT
    "What security measures are required for employees to access company systems?",
    "What is the guest Wi-Fi password for TechNova Inc.?",
    # Company facts and remote work
    "Where is TechNova Inc. headquartered?",
    "Can employees work remotely at TechNova Inc., and if so, how often?",
]

In [16]:
# Collect the answers of both approaches for every query.
# Each entry of `results` is a dict holding the query and the two answers.
results = []
for query in queries:
    rag_response = get_RAG_response(query)
    gpt4o_response = get_gpt4o_response(query)
    results.append(
        {
            "Query": query,
            "RAG Response": rag_response,
            "GPT-4o Response": gpt4o_response,
        }
    )

In [17]:
# Convert the collected results (a list of dicts) to a DataFrame for display
df = pd.DataFrame(results)

# Function to wrap text in all string columns
def wrap_dataframe_text(df, width=40):
    """Return a copy of ``df`` whose string cells are wrapped to ``width``.

    Only columns selected by ``select_dtypes(include=['object'])`` are
    processed; within those columns, values that are not ``str`` are left
    untouched. The input frame itself is not modified.

    Args:
        df: DataFrame to format.
        width: Maximum line length handed to ``textwrap``.

    Returns:
        A new DataFrame in which each wrapped string has its lines joined
        by newline characters.
    """

    def _wrap_cell(value):
        # Non-string cells pass through unchanged.
        if not isinstance(value, str):
            return value
        return textwrap.fill(value, width)

    result = df.copy()
    object_columns = result.select_dtypes(include=['object']).columns
    for name in object_columns:
        result[name] = result[name].apply(_wrap_cell)
    return result

# Apply text wrapping to all string columns
# (width=30 overrides the helper's default of 40)
df_wrapped = wrap_dataframe_text(df, width=30)

# Display DataFrame as a formatted grid table, using the column names as headers
print(tabulate(df_wrapped, headers='keys', tablefmt='grid'))

+----+--------------------------------+--------------------------------+--------------------------------+
|    | Query                          | RAG Response                   | GPT-4o Response                |
|  0 | What ways of work are          | Employees of TechNova have the | TechNova offers three primary  |
|    | available for employees of     | option to work at the          | ways of working for its        |
|    | TechNova?                      | headquarters located at 123    | employees: remote work, hybrid |
|    |                                | Innovation Drive, Silicon      | work, and in-office work. This |
|    |                                | Valley, CA, during office      | flexible approach allows       |
|    |                                | hours from 9 AM to 5 PM,       | employees to choose the option |
|    |                                | Monday to Friday.              | that best suits their needs    |
|    |                                | Additi