# RAG Chat

In [3]:
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings
from langchain.vectorstores.chroma import Chroma
import os
import shutil

In [4]:
from dataclasses import dataclass
# Deprecated
# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [5]:
# Working directory for the notebook: the relative paths used later
# ("library", "chroma") are resolved against it.
# The author's local path stays as the fallback so existing runs behave the
# same; set TALLER_PYDATA_DIR to run the notebook from a different checkout.
PROJECT_DIR = os.environ.get(
    "TALLER_PYDATA_DIR",
    "/Users/rcruz2/Library/CloudStorage/OneDrive-MAPFRE/Trabajo/Cloud/PyData/202405009_tallerpydata",
)
os.chdir(PROJECT_DIR)

In [6]:
!pip install openai #Mínimo
!pip install ipyreact tiktoken numpy #Óptimo



In [7]:
from tools import *

In [8]:
from dotenv import load_dotenv
import os

# Load environment variables via python-dotenv before reading the key.
load_dotenv()
openai_api_key = os.environ.get("OPENAI_API_KEY")

import openai
# No key is passed explicitly; presumably the client picks up OPENAI_API_KEY
# from the environment populated above (confirm against the openai docs).
client = openai.OpenAI()

In [9]:
# check_openai is not defined in this notebook; presumably it comes from the
# star-import of `tools` and verifies the OpenAI setup (TODO confirm).
check_openai()

Todo está ok! :)


True

In [11]:
# Set the OpenAI API key on the openai module
openai.api_key = openai_api_key

### Prompt con RAG

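- Generación aumentada por recuperación (RAG, *Retrieval-Augmented Generation*): la respuesta se genera a partir de información recuperada de una base de conocimientos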
- Ejemplo: chatbot de dudas sobre la Python standard library

```python
def chatbot_rag(pregunta):
    prompt = f"""
Eres un chatbot experto en la biblioteca estándar de Python. Utiliza la siguiente base de conocimientos para responder a la pregunta del usuario:

Base de conocimientos: {recupera_info_relevante(pregunta)}

Usuario: {pregunta}
"""
    respuesta = genera_texto(prompt)
    return respuesta
```

In [13]:
# Directory where the Chroma vector store is persisted (relative path).
CHROMA_PATH = "chroma"
# Directory that is scanned for the source documents to index.
DATA_PATH = "library"

In [14]:
def save_to_chroma(chunks: list[Document]):
    """Rebuild the Chroma store under ``CHROMA_PATH`` from ``chunks``.

    Any existing store directory is deleted first, so every call starts from
    an empty directory. The chunks are then handed to
    ``Chroma.from_documents`` together with an ``OpenAIEmbeddings`` instance
    and the absolute store path as ``persist_directory``.

    Args:
        chunks: Document fragments to store.
    """
    store_dir = os.path.abspath(CHROMA_PATH)

    # Wipe a previous store, printing its absolute location for debugging.
    store_exists = os.path.exists(store_dir)
    if store_exists:
        print(f"Ruta absoluta de CHROMA_PATH: {store_dir}")
        shutil.rmtree(store_dir)

    # Make sure the (now missing or never created) directory is present.
    os.makedirs(store_dir, exist_ok=True)

    # Build a fresh DB from the documents.
    embeddings = OpenAIEmbeddings()
    _store = Chroma.from_documents(chunks, embeddings, persist_directory=store_dir)

    n_chunks = len(chunks)
    print(f"Guardado {n_chunks} fragmentos en {store_dir}.")

In [15]:
def split_text(documents: list[Document]):
    """Split ``documents`` into overlapping chunks and preview one of them.

    Uses a ``RecursiveCharacterTextSplitter`` configured with a chunk size of
    300 and an overlap of 100 (both measured with ``len``), recording each
    chunk's start index in its metadata.

    Args:
        documents: Documents to split.

    Returns:
        The list of chunks produced by the splitter (possibly empty).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=300,
        chunk_overlap=100,
        length_function=len,
        add_start_index=True,
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")

    # Print one sample chunk as a sanity check. Index 10 is used when it
    # exists; smaller corpora fall back to their last chunk, and an empty
    # result skips the preview instead of raising IndexError.
    if chunks:
        sample = chunks[min(10, len(chunks) - 1)]
        print(sample.page_content)
        print(sample.metadata)

    return chunks

In [16]:
def load_documents():
    """Load every ``*.rst`` file found by ``DirectoryLoader`` under ``DATA_PATH``.

    Prints how many documents were loaded.

    Returns:
        The documents returned by the loader.
    """
    rst_docs = DirectoryLoader(DATA_PATH, glob="*.rst").load()
    total = len(rst_docs)
    print(f"There are {total} documents to proccess")
    return rst_docs

In [17]:
def generate_data_store():
    """Run the indexing pipeline: load the documents, split them, save the chunks."""
    save_to_chroma(split_text(load_documents()))

In [18]:
# Build the vector store: loads the documents, splits them and saves the chunks.
generate_data_store()


























































There are 293 documents to proccess
Split 293 documents into 22917 chunks.
new_event_loop()

Create and return a new event loop object.

Note that the behaviour of get_event_loop, set_event_loop, and new_event_loop functions can be altered by setting a custom event loop policy <asyncio-policies>.

Contents

This documentation page contains the following sections:
{'source': 'library/asyncio-eventloop.rst', 'start_index': 1793}
Ruta absoluta de CHROMA_PATH: /Users/rcruz2/Library/CloudStorage/OneDrive-MAPFRE/Trabajo/Cloud/PyData/202405009_tallerpydata/chroma
Guardado 22917 fragmentos en /Users/rcruz2/Library/CloudStorage/OneDrive-MAPFRE/Trabajo/Cloud/PyData/202405009_tallerpydata/chroma.


In [19]:
# Prompt skeleton for the RAG chatbot; the {context} and {question}
# placeholders are filled in when the template is formatted.
PROMPT_TEMPLATE = """
Eres un chatbot experto en la biblioteca estándar de Python. Utiliza la siguiente base de conocimientos para responder a la pregunta del usuario:

Base de conocimientos: {context}

---

Usuario: {question}
"""

In [49]:
# Ask your question
query_text = "What is the use of dataclass from dataclasses"

In [55]:
def chatbot_rag_model(pregunta):
    """Answer ``pregunta`` with retrieval-augmented generation via ``ChatOpenAI``.

    The three chunks most similar to the question are fetched from the Chroma
    store at ``CHROMA_PATH`` and merged, together with the question, into
    ``PROMPT_TEMPLATE``. The resulting prompt is sent to the chat model. The
    prompt, the answer and the chunk sources are printed; nothing is returned.

    If no chunk is found, or the best relevance score is below 0.7, a notice
    is printed and the model is not called.

    Args:
        pregunta: The user's question.
    """
    # Prepare the DB.
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Retrieve with the question that was passed in. Searching with the
    # notebook-level `query_text` would build the context for a different
    # question than the one placed in the prompt.
    results = db.similarity_search_with_relevance_scores(pregunta, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=pregunta)
    print(prompt)

    model = ChatOpenAI()
    # invoke() returns a message object; only its text content belongs in the
    # printed answer, not the object's full representation.
    response_text = model.invoke(prompt).content

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [22]:
def genera_texto(prompt):
    """Generate text for ``prompt`` through the OpenAI completions endpoint.

    Sends the prompt to the notebook-level ``client`` using the
    ``gpt-3.5-turbo-instruct`` model and returns the text of the first choice
    with surrounding whitespace stripped.

    Args:
        prompt: Prompt text to complete.

    Returns:
        The stripped completion text.
    """
    request = {
        "model": "gpt-3.5-turbo-instruct",
        "prompt": prompt,
        "max_tokens": 1000,  # adjust as needed
        "n": 1,
        "stop": None,
        "temperature": 0.7,
    }
    completion = client.completions.create(**request)
    first_choice = completion.choices[0]
    return first_choice.text.strip()

In [23]:
def chatbot_rag(pregunta):
    """Answer ``pregunta`` with retrieval-augmented generation via ``genera_texto``.

    The three chunks most similar to the question are fetched from the Chroma
    store at ``CHROMA_PATH`` and merged, together with the question, into
    ``PROMPT_TEMPLATE``. The resulting prompt is passed to ``genera_texto``.
    The prompt, the answer and the chunk sources are printed; nothing is
    returned.

    If no chunk is found, or the best relevance score is below 0.7, a notice
    is printed and no text is generated.

    Args:
        pregunta: The user's question.
    """
    # Prepare the DB.
    embedding_function = OpenAIEmbeddings()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    # Retrieve with the question that was passed in. Searching with the
    # notebook-level `query_text` would build the context for a different
    # question than the one placed in the prompt.
    results = db.similarity_search_with_relevance_scores(pregunta, k=3)
    if len(results) == 0 or results[0][1] < 0.7:
        print("Unable to find matching results.")
        return

    context_text = "\n\n---\n\n".join([doc.page_content for doc, _score in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=pregunta)
    print(prompt)

    response_text = genera_texto(prompt)

    sources = [doc.metadata.get("source", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)

In [57]:
# Ask the question through the ChatOpenAI-backed variant.
chatbot_rag_model(query_text)

Human: 
Eres un chatbot experto en la biblioteca estándar de Python. Utiliza la siguiente base de conocimientos para responder a la pregunta del usuario:

Base de conocimientos: If @dataclass is used just as a simple decorator with no
parameters, it acts as if it has the default values documented in this
signature. That is, these three uses of @dataclass are
equivalent:

The parameters to @dataclass are:

---

This function is not strictly required, because any Python mechanism
for creating a new class with !__annotations__ can then apply the @dataclass <dataclass>
function to convert that class to a dataclass. This function is provided
as a convenience. For example:

Is equivalent to:

---

The @dataclass decorator will add various "dunder"
methods to the class, described below. If any of the added methods
already exist in the class, the behavior depends on the parameter, as
documented below. The decorator returns the same class that it is called
on; no new class is created.

---

Usu

In [53]:
# Ask the same question through the variant that generates text with genera_texto.
chatbot_rag(query_text)

Human: 
Eres un chatbot experto en la biblioteca estándar de Python. Utiliza la siguiente base de conocimientos para responder a la pregunta del usuario:

Base de conocimientos: If @dataclass is used just as a simple decorator with no
parameters, it acts as if it has the default values documented in this
signature. That is, these three uses of @dataclass are
equivalent:

The parameters to @dataclass are:

---

This function is not strictly required, because any Python mechanism
for creating a new class with !__annotations__ can then apply the @dataclass <dataclass>
function to convert that class to a dataclass. This function is provided
as a convenience. For example:

Is equivalent to:

---

The @dataclass decorator will add various "dunder"
methods to the class, described below. If any of the added methods
already exist in the class, the behavior depends on the parameter, as
documented below. The decorator returns the same class that it is called
on; no new class is created.

---

Usu