## Parte 1

- Se usan embeddings con los documentos y se cargan en una base de datos vectorial.

In [3]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader

In [4]:
# Load variables from a local .env file (if one exists) into the process
# environment. load_dotenv is imported by this notebook but was never called,
# so without this line only variables exported by the shell are visible.
load_dotenv()

# API keys (each is None when the variable is not set)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Optional Pinecone serverless spec overrides (None when not set)
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
PINECONE_REGION = os.getenv("PINECONE_REGION")

In [3]:
# Load the PDF document; the loader's result is kept in `documents`
pdf_path = "./data/CV_Simon.pdf"
loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings: chunks of at most 500 characters, overlapping by 50
splitter_options = {"chunk_size": 500, "chunk_overlap": 50}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into chunks
chunks = text_splitter.split_documents(documents)

# Print every generated chunk, numbered from 1
for i, chunk in enumerate(chunks):
    print("Chunk {}:\n{}\n".format(i + 1, chunk.page_content))

In [None]:
from transformers import AutoModel

# Load the embedding model, then embed each chunk's text and keep the result
# as a Python list (one entry per chunk, in chunk order).
EMBEDDING_MODEL_NAME = 'jinaai/jina-embeddings-v2-base-en'
embedding_model = AutoModel.from_pretrained(EMBEDDING_MODEL_NAME, trust_remote_code=True)
chunk_embeddings = []
for chunk in chunks:
    chunk_embeddings.append(embedding_model.encode(chunk.page_content).tolist())

In [6]:
# Embedding dimensionality, measured on the first chunk's vector
first_embedding = chunk_embeddings[0]
dim = len(first_embedding)
dim

768

In [7]:
import time
from pinecone import Pinecone, ServerlessSpec

# Index specs: use the environment overrides when set, otherwise the defaults
pc_cloud = PINECONE_CLOUD or "aws"
pc_region = PINECONE_REGION or "us-east-1"


# Open the connection to Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "cv-index"

# Create the index only if it does not exist yet
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=pc_cloud,
            region=pc_region
        )
    )

# Wait for the index to be ready, but give up after a deadline so that an
# index that never becomes ready cannot block the kernel forever.
INDEX_READY_TIMEOUT_S = 300
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

In [None]:
# Number of chunks produced by the splitter
len(chunks)

In [None]:
# One record per embedding: the position (as a string) is the id, the vector
# goes in "values", and the matching chunk's text is stored as metadata.
records = [
    {
        "id": str(idx),
        "values": emb,
        "metadata": {'text': chunks[idx].page_content},
    }
    for idx, emb in enumerate(chunk_embeddings)
]

In [None]:
# Connect to the index
pinecone_index = pc.Index(index_name)
time.sleep(1)

# Insert the chunks into the index (all records in a single request,
# under the "default" namespace)
pinecone_index.upsert(
    vectors=records,
    namespace="default"
)

# For batches
# NOTE(review): the disabled snippet below would not run as written:
# `records` is a plain list (it has no `iter_documents`) and no variable
# named `index` is defined in this notebook as shown.
# from tqdm.auto import tqdm
# for batch in tqdm(records.iter_documents(batch_size=500), total=160):
#     index.upsert(batch)

In [11]:
# Connect to the index
pinecone_index = pc.Index(index_name)
time.sleep(1)

# View index stats (displayed as the cell output)
pinecone_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 13}},
 'total_vector_count': 13}

In [12]:
# Query the vector database
def get_closest_answer(user_query):
    """Return the Pinecone query result for the chunk closest to ``user_query``.

    Uses the notebook-level ``embedding_model`` and ``pinecone_index``.

    Args:
        user_query: The question text to embed and search for.

    Returns:
        The object returned by ``pinecone_index.query`` (top 1 match, with
        metadata and without vector values).
    """
    # Encode the bare string (not a one-element list) so the result is a single
    # flat vector, the same way the chunk embeddings were produced, which is the
    # shape the `vector` argument of the query expects.
    query_embedding = embedding_model.encode(user_query).tolist()

    # Look up the closest vector in Pinecone
    result = pinecone_index.query(
        namespace="default",
        vector=query_embedding,
        top_k=1,
        include_metadata=True,
        include_values=False,
    )

    return result

In [13]:
# Build the LLM system prompt from the retrieved information
def get_system_prompt(user_query):
    """Return a system prompt whose context is the text retrieved for ``user_query``.

    The texts of all returned matches are joined with single spaces and
    prefixed with "Information: ".
    """
    matches = get_closest_answer(user_query)['matches']
    texts = [match['metadata']['text'] for match in matches]
    context = "Information: " + ' '.join(texts)
    sys_prompt = f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    Context: {context}
    """
    return sys_prompt

In [14]:
from groq import Groq

# Groq client. GROQ_API_KEY is the default and can be omitted if it is set
# in the environment variables; it is passed explicitly here.
client = Groq(api_key=GROQ_API_KEY)

def sent_query_to_groq(sys_prompt, user_query):
    """Send a system + user message pair to Groq and print the reply.

    Args:
        sys_prompt: Text sent with the "system" role.
        user_query: Text sent with the "user" role.

    Returns:
        None; the content of the first choice is printed.
    """
    conversation = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": user_query},
    ]
    chat_completion = client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    # Only the first returned choice is shown
    first_choice = chat_completion.choices[0]
    print(first_choice.message.content)

In [139]:
# Question to ask; the commented lines are alternative sample questions
user_query = "En que universidad estudió simon?"
#user_query = "What is Simon's experience?"
#user_query = "What are Simon's skills?"
#user_query = "What are Simon's interests?"

# Build the system prompt from the retrieved context for this question
sys_prompt = get_system_prompt(user_query)

# Send the prompt and the question to the LLM; the reply is printed
sent_query_to_groq(sys_prompt, user_query)

Simon estudió en la Universidad Nacional Experimental “Antonio José de Sucre” (VEN).


Equipo:
- Andres Malvestti
- Cristian Davico
- Simon Rodriguez

## Parte 2

- Se utilizan agentes para responder específicamente sobre cada uno de los documentos.
- Si no se presenta nombre, entonces traer uno por defecto.
- PLUS: Si se consulta por más de un CV, traer el contexto de cada uno de forma acorde.

### Pasos:

1. Esquematizar el diagrama de flujo que debe tener la aplicación.
2. Definir el "conditional Edge", tomador de la decisión. Consejo: utilizar la librería **re** y su método **match**.
3. Implementar cada uno de los pasos y compilar el diagrama de flujo.

#### Se preparan y cargan los CVs en índices separados

In [36]:
import re
import os
import time

from groq import Groq
from langchain.document_loaders import PyPDFLoader
from transformers import AutoModel
from pinecone import Pinecone, ServerlessSpec

In [20]:
# API keys read from the environment (each is None when the variable is unset)
PINECONE_API_KEY, GROQ_API_KEY, OPENAI_API_KEY = (
    os.getenv(key_name)
    for key_name in ("PINECONE_API_KEY", "GROQ_API_KEY", "OPENAI_API_KEY")
)

In [21]:
# Helpers to load a PDF, split it into chunks and compute the embeddings

def load_pdf(file_path):
    """Load the PDF at ``file_path`` with PyPDFLoader and return the loaded documents."""
    return PyPDFLoader(file_path).load()

def split_chunks(documents, chunk_size=200, chunk_overlap=50):
    """Split ``documents`` into chunks with RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split (as returned by ``load_pdf``).
        chunk_size: Maximum size of each chunk. Defaults to 200, the value
            this function previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks. Defaults to 50.

    Returns:
        The list of chunks produced by ``split_documents``.
    """
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

def print_chunks(chunks):
    """Print each chunk's text under a 1-based "Chunk N:" heading."""
    for number, chunk in enumerate(chunks, start=1):
        print("Chunk {}:\n{}\n".format(number, chunk.page_content))

def get_embeddings(chunks, embedding_model=None):
    """Embed the text of every chunk.

    Args:
        chunks: Non-empty sequence of objects exposing ``page_content``.
        embedding_model: Optional already-loaded model exposing ``encode``.
            When omitted, 'jinaai/jina-embeddings-v2-base-en' is loaded, as
            before. Passing a model avoids reloading it on every call.

    Returns:
        A ``(chunk_embeddings, dim)`` tuple: one list per chunk, and the
        length of the first embedding.

    Raises:
        ValueError: If ``chunks`` is empty (the dimension cannot be derived).
    """
    # Fail early and clearly instead of loading a model and then hitting an
    # IndexError on chunk_embeddings[0].
    if not chunks:
        raise ValueError("get_embeddings() needs at least one chunk to embed")

    if embedding_model is None:
        embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)

    chunk_embeddings = [embedding_model.encode(chunk.page_content).tolist() for chunk in chunks]
    dim = len(chunk_embeddings[0])
    return chunk_embeddings, dim

In [22]:
# Variables: input PDF path and Pinecone index name for each CV
simon_cv, jess_cv = "./data/CV_Simon.pdf", "./data/CV_Jess.pdf"
simon_index, jess_index = "simon-index", "jess-index"

In [None]:
# Simon: load the CV, split it into chunks, print the chunks and embed them
documents_s = load_pdf(simon_cv)
chunks_s = split_chunks(documents_s)
print_chunks(chunks_s)
chunk_embeddings_s, dim_s = get_embeddings(chunks_s)

In [None]:
# Jess: load the CV, split it into chunks, print the chunks and embed them
documents_j = load_pdf(jess_cv)
chunks_j = split_chunks(documents_j)
print_chunks(chunks_j)
chunk_embeddings_j, dim_j = get_embeddings(chunks_j)

In [242]:
# Index specs: honour the PINECONE_CLOUD / PINECONE_REGION environment
# overrides (as Part 1 does) and fall back to the previous hard-coded values.
pc_cloud = os.environ.get('PINECONE_CLOUD') or "aws"
pc_region = os.environ.get('PINECONE_REGION') or "us-east-1"

# Create a Pinecone index (if needed) and wait until it is ready
def create_index(index_name, dim, timeout=300, poll_interval=1):
    """Create the serverless index ``index_name`` if missing and wait for readiness.

    Args:
        index_name: Name of the Pinecone index.
        dim: Vector dimension used when the index has to be created.
        timeout: Maximum number of seconds to wait for the index to report
            ready. ``None`` waits indefinitely (the previous behaviour).
        poll_interval: Seconds to sleep between readiness checks.

    Raises:
        TimeoutError: If the index is still not ready when ``timeout`` expires.
    """
    pc = Pinecone(api_key=PINECONE_API_KEY)

    # Create the index only if it does not exist yet
    if not pc.has_index(index_name):
        pc.create_index(
            name=index_name,
            dimension=dim,
            metric="cosine",
            spec=ServerlessSpec(
                cloud=pc_cloud,
                region=pc_region
            )
        )

    # Poll until ready, bounded by a deadline so a stuck index cannot hang
    # the kernel forever.
    deadline = None if timeout is None else time.monotonic() + timeout
    while not pc.describe_index(index_name).status['ready']:
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Pinecone index '{index_name}' was not ready after {timeout} seconds"
            )
        time.sleep(poll_interval)

def insert_chunks(index_name, chunk_embeddings, chunks, batch_size=100):
    """Upsert one record per chunk into the "default" namespace of ``index_name``.

    Each record uses the chunk position (as a string) as id, the embedding as
    values and the chunk text as ``metadata['text']``.

    Args:
        index_name: Name of the target Pinecone index.
        chunk_embeddings: One embedding per chunk, in chunk order.
        chunks: Objects exposing ``page_content``; same length as
            ``chunk_embeddings``.
        batch_size: Maximum number of records sent per upsert request, so
            larger documents are not sent as one oversized request.

    Raises:
        ValueError: If the two sequences differ in length or ``batch_size`` < 1.
    """
    if len(chunk_embeddings) != len(chunks):
        raise ValueError(
            f"Got {len(chunk_embeddings)} embeddings for {len(chunks)} chunks; they must match"
        )
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    pc = Pinecone(api_key=PINECONE_API_KEY)
    records = [
        {
            "id": str(idx),
            "values": emb,
            "metadata": {'text': chunk.page_content},
        }
        for idx, (emb, chunk) in enumerate(zip(chunk_embeddings, chunks))
    ]

    # Connect to the index
    pinecone_index = pc.Index(index_name)
    time.sleep(1)

    # Insert the records batch by batch
    for start in range(0, len(records), batch_size):
        pinecone_index.upsert(
            vectors=records[start:start + batch_size],
            namespace="default"
        )

def index_describe(index_name):
    """Return the stats reported by ``describe_index_stats`` for ``index_name``."""
    connection = Pinecone(api_key=PINECONE_API_KEY)
    target_index = connection.Index(index_name)
    time.sleep(1)
    stats = target_index.describe_index_stats()
    return stats

In [243]:
# Create one index per CV, sized to that CV's embedding dimension
create_index(simon_index, dim_s)
create_index(jess_index, dim_j)

In [244]:
# Add each CV's chunks to its own index
insert_chunks(simon_index, chunk_embeddings_s, chunks_s)
insert_chunks(jess_index, chunk_embeddings_j, chunks_j)

In [247]:
# Show the stats of Simon's index
index_describe(simon_index)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 35}},
 'total_vector_count': 35}

In [248]:
# Show the stats of Jess's index
index_describe(jess_index)

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 28}},
 'total_vector_count': 28}

#### Se definen los agentes y funciones

In [37]:
from groq import Groq
# Groq client used by Agent.execute() to request chat completions
client = Groq(api_key=GROQ_API_KEY)

In [38]:
# Agent class used to interact with the LLM; this is the agent's structure
class Agent:
    """Minimal chat agent that keeps the running conversation history.

    Args:
        sys_prompt: Optional system prompt; when non-empty it becomes the
            first message of the history.
        model: Optional Groq model name. Defaults to ``DEFAULT_MODEL``, the
            model this class previously hard-coded.
    """

    DEFAULT_MODEL = "llama3-8b-8192"

    def __init__(self, sys_prompt="", model=None):
        self.system = sys_prompt
        self.model = model or self.DEFAULT_MODEL
        self.messages = []
        if self.system:
            self.messages.append({"role": "system", "content": sys_prompt})

    def __call__(self, user_query):
        """Append ``user_query`` as a user turn, run the model, record and return its reply."""
        self.messages.append({"role": "user", "content": user_query})
        result = self.execute()
        self.messages.append({"role": "assistant", "content": result})
        return result

    def execute(self):
        """Send the whole history to the module-level Groq ``client`` and return the first choice's text."""
        completion = client.chat.completions.create(
                        model=self.model,
                        messages=self.messages)
        return completion.choices[0].message.content

In [213]:
# Query the vector database
_EMBEDDING_MODEL_NAME = 'jinaai/jina-embeddings-v2-base-en'
_embedding_model_cache = {}


def _get_embedding_model():
    """Load the embedding model on first use and reuse it on later calls."""
    if _EMBEDDING_MODEL_NAME not in _embedding_model_cache:
        _embedding_model_cache[_EMBEDDING_MODEL_NAME] = AutoModel.from_pretrained(
            _EMBEDDING_MODEL_NAME, trust_remote_code=True
        )
    return _embedding_model_cache[_EMBEDDING_MODEL_NAME]


def get_closest_answer(user_query, index_name, top_k=1):
    """Return the Pinecone query result for ``user_query`` in ``index_name``.

    Args:
        user_query: The question text to embed and search for.
        index_name: Name of the Pinecone index to query.
        top_k: Number of closest matches to request. Defaults to 1, the
            value this function previously hard-coded.

    Returns:
        The object returned by the index ``query`` call (with metadata and
        without vector values), from the "default" namespace.
    """
    # The model is cached so it is not reloaded on every question. The bare
    # string (not a one-element list) is encoded so the result is a single flat
    # vector, the same way the chunk embeddings are produced in get_embeddings.
    embedding_model = _get_embedding_model()
    query_embedding = embedding_model.encode(user_query).tolist()

    # Open the connection to Pinecone and connect to the index
    pc = Pinecone(api_key=PINECONE_API_KEY)
    pinecone_index = pc.Index(index_name)

    # Look up the closest vectors in Pinecone
    result = pinecone_index.query(
        namespace="default",
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        include_values=False,
    )

    return result


# Adds the retrieved information to the system prompt to provide context
def get_system_prompt(user_query, index_name):
    """Return a system prompt whose context is the text retrieved from ``index_name``.

    The texts of all returned matches are joined with single spaces and
    prefixed with "Information: ". The index name is printed as a side effect.
    """
    matches = get_closest_answer(user_query, index_name)['matches']
    texts = [match['metadata']['text'] for match in matches]
    context = "Information: " + ' '.join(texts)
    print(index_name)
    sys_prompt = f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    Context: {context}
    """
    return sys_prompt

In [214]:
def agente_simon(user_query):
    """Answer ``user_query`` with a fresh Agent whose system prompt is built from "simon-index"."""
    return Agent(get_system_prompt(user_query, "simon-index"))(user_query)

def agente_jess(user_query):
    """Answer ``user_query`` with a fresh Agent whose system prompt is built from "jess-index"."""
    return Agent(get_system_prompt(user_query, "jess-index"))(user_query)

In [215]:
# Regular expression that captures the action name and its query from a line
# of the form: ACT: call "<action>" for query "<query>"
ACTION_PATTERN = r'^ACT: call "(\w+)" for query "(.*)"$'
action_re = re.compile(ACTION_PATTERN, flags=re.MULTILINE)

# Action name (as written in the ACT line) -> function that runs it
acciones_disponibles = dict(
    cv_simon=agente_simon,
    cv_jessica=agente_jess,
)

In [None]:
# System prompt for the agent created in query(): it describes the
# THINK / ACT / WAIT / RESULT / ANSWER cycle, the exact format of the ACT line
# and the two available actions.
# NOTE(review): the f-prefix is not needed here (the text has no placeholders).
INITIAL_PROMPT = f"""
Instructions: You run in a cycle of THINK, ACT, WAIT, and RESULT. At the end of the cycle, you give an ANSWER.

THINK: Read the user query and think about an action to answer it.
ACT: Reply with calling an Action to perform any of the actions available to you. This should be formatted as ACT: call "action" for query "user query".
WAIT: Wait until you receive the next prompt with the result of your action.
RESULT: you will recieve the result of your action in the next prompt.
ANSWER: finally, answer the user query with the result you received.

Available actions:
- cv_simon: use it if you want to ask about Simon's CV
- cv_jessica: use it if you want to ask about Jessica's CV

DO NOT FORGET TO WAIT FOR THE RESULT BEFORE ANSWERING THE USER QUERY. DO NOT SKIP ANY STEP.
"""

In [250]:
def query(user_query, max_turns=2):
    """Run the THINK/ACT loop for ``user_query`` for at most ``max_turns`` turns.

    On each turn the agent's reply is printed and scanned line by line for the
    first ACT line. When one is found, the named action is run with the
    captured query and its result is fed back as the next prompt. When none is
    found, the loop stops.

    Returns:
        None; all progress is printed.

    Raises:
        Exception: If the reply names an action missing from ``acciones_disponibles``.
    """
    bot = Agent(INITIAL_PROMPT)
    next_prompt = user_query
    turno = 0
    while turno < max_turns:
        turno += 1
        print("Turno", turno)
        result = bot(next_prompt)
        print(result)

        # Find the first line that matches the ACT pattern, if any
        accion_match = None
        for linea in result.split('\n'):
            accion_match = action_re.search(linea)
            if accion_match:
                break

        if not accion_match:
            print("No hay acciones detectadas")
            return

        # There is an action to run
        accion, accion_input = accion_match.groups()
        print("Acción detectada:", accion)
        if accion not in acciones_disponibles:
            raise Exception(f"Acción desconocida: {accion}: {accion_input}")
        print(f" -- corriendo {accion} {accion_input}")
        observacion = acciones_disponibles[accion](accion_input)
        print("RESULT:", observacion)
        next_prompt = f"RESULT: {observacion}"

In [266]:
# Ask a question about Jessica's CV; query() prints each turn
question = "What did Jessica study?"
query(question)

Turno 1
THINK: Ah, Jessica's CV, let me think for a moment...

ACT: call "cv_jessica" for query "What did Jessica study?"

WAIT: Please respond with the result of my action.

(Please respond with the result, and I'll proceed with answering the user query)
Acción detectada: cv_jessica
 -- corriendo cv_jessica What did Jessica study?
jess-index
RESULT: I don't have information about Jessica's education or academic background.
Turno 2
THINK: Okay, it seems I don't have enough information about Jessica's education. Let me think what to do next...

ACT: Sorry, I don't have the information to answer that query. I'll let the user know that Jessica's education or academic background is unknown.

ANSWER: Sorry, I don't have information about Jessica's education or academic background.
No hay acciones detectadas


In [267]:
# Ask a question about Simon's CV; query() prints each turn
question = "What did Simon study?"
query(question)

Turno 1
THINK: Hmm, the user is asking about Simon's background...

ACT: call "cv_simon" for query "What did Simon study?"

WAIT: (Waiting for the result...)
Acción detectada: cv_simon
 -- corriendo cv_simon What did Simon study?
simon-index
RESULT: Simon Rodriguez A. is a Mechanical Engineer.
Turno 2
THINK: Ah, I've got the result! Simon is a Mechanical Engineer...

ANSWER: Simon studied Mechanical Engineering.
No hay acciones detectadas
