## Parte 1

- Se generan embeddings de los documentos y se cargan en una base de datos vectorial.

In [3]:
import os
from dotenv import load_dotenv
from pinecone import Pinecone, ServerlessSpec
from langchain.document_loaders import PyPDFLoader

In [4]:
# Load variables from a local .env file (if there is one) into the process
# environment, so the lookups below also work on a freshly started kernel.
load_dotenv()

# API keys (each is None when its variable is not set)
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Pinecone index location (each is None when its variable is not set)
PINECONE_CLOUD = os.getenv("PINECONE_CLOUD")
PINECONE_REGION = os.getenv("PINECONE_REGION")

In [3]:
# Load the PDF document
cv_pdf_path = "./data/CV_Simon.pdf"
loader = PyPDFLoader(cv_pdf_path)
documents = loader.load()

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking settings: maximum size of each chunk and overlap between chunks
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

# Splitter that divides the text according to the settings above
text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

# Split the loaded documents into chunks
chunks = text_splitter.split_documents(documents)

# Print every generated chunk, numbered from 1
for i, chunk in enumerate(chunks):
    print(f"Chunk {i+1}:\n{chunk.page_content}\n")

Chunk 1:
RESUME 
 
 
 
Professional Experience 
 
02/2024 –  
today 
Eng. Digitization & PLM Specialist (Engineering Systems) – Accenture, Munich (DE) 
• Coordination of data migration activities and teams across multiple time zones.  
• Planning, design, and supervision of ETL tasks during PLM implementations. 
• Lead Engineering and R&D Transformation Programs to drive Innovation and Process Enablement 
for the Clients in cross-functional teams.

Chunk 2:
for the Clients in cross-functional teams. 
• Enable transformation in R&D utilizing the  PLM capabilities by creating business processes for 
Package/Product design, BOM Management, Recipe/Specification Management, Engineering 
Change Management, Simulations, Sustainability, and supplier integration.  
12/2022 –  
02/2024 
Management Consultant (Digital Engineering and R&D) – Accenture, CABA (ARG) 
• Participation in solution architecting proposals for clients.

Chunk 3:
• Design standardized assets and solutions for the PLM , Prod

In [None]:
from transformers import AutoModel

# Load the Jina embedding model (trust_remote_code=True is required by this call
# as written; it allows code shipped with the model repository to run)
embedding_model = AutoModel.from_pretrained(
    'jinaai/jina-embeddings-v2-base-en',
    trust_remote_code=True,
)

# Embed each chunk's text on its own and keep every vector as a plain Python list
chunk_texts = [chunk.page_content for chunk in chunks]
chunk_embeddings = [embedding_model.encode(text).tolist() for text in chunk_texts]

In [6]:
# Embedding size, taken from the first chunk's vector; shown as the cell output
first_embedding = chunk_embeddings[0]
dim = len(first_embedding)
dim

768

In [7]:
import time
from pinecone import Pinecone, ServerlessSpec

# Index specs: fall back to AWS / us-east-1 when the environment does not set them
pc_cloud = PINECONE_CLOUD or "aws"
pc_region = PINECONE_REGION or "us-east-1"


# Open the connection to Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "cv-index"

# Create the index only when it does not exist yet, so re-running this cell is safe.
# The index dimension is the embedding size stored in `dim`.
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=dim,
        metric="cosine",
        spec=ServerlessSpec(
            cloud=pc_cloud,
            region=pc_region
        )
    )

# Wait for the index to be ready, polling once per second, but give up after a
# bounded time instead of blocking the kernel forever if it never becomes ready.
INDEX_READY_TIMEOUT_S = 300
ready_deadline = time.monotonic() + INDEX_READY_TIMEOUT_S
while not pc.describe_index(index_name).status['ready']:
    if time.monotonic() >= ready_deadline:
        raise TimeoutError(
            f"Pinecone index '{index_name}' was not ready after {INDEX_READY_TIMEOUT_S} seconds"
        )
    time.sleep(1)

In [None]:
# Show how many chunks there are
len(chunks)

In [None]:
# One record per embedding: the position is used as the id, and the text of the
# chunk at that same position is stored as metadata
records = [
    {
        "id": str(position),
        "values": vector,
        "metadata": {'text': chunks[position].page_content},
    }
    for position, vector in enumerate(chunk_embeddings)
]

In [None]:
# Connect to the index
pinecone_index = pc.Index(index_name)
time.sleep(1)

# Insert the chunks into the index in fixed-size batches so that no single request
# carries every record; when there are fewer records than UPSERT_BATCH_SIZE this
# is one upsert with all of them.
UPSERT_BATCH_SIZE = 100
for batch_start in range(0, len(records), UPSERT_BATCH_SIZE):
    pinecone_index.upsert(
        vectors=records[batch_start:batch_start + UPSERT_BATCH_SIZE],
        namespace="default"
    )

In [11]:
# Connect to the index
pinecone_index = pc.Index(index_name)
time.sleep(1)

# Fetch the index stats and show them as the cell output
index_stats = pinecone_index.describe_index_stats()
index_stats

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'default': {'vector_count': 13}},
 'total_vector_count': 13}

In [12]:
# Query the vector database
def get_closest_answer(user_query, top_k=1):
    """Return the Pinecone matches closest to ``user_query``.

    Args:
        user_query: Question text to embed and search for.
        top_k: Number of nearest matches to request (defaults to 1).

    Returns:
        The response of ``pinecone_index.query`` for the "default" namespace,
        requested with metadata included and vector values omitted.
    """
    # Encode the bare string rather than a one-element list: the chunks were
    # embedded the same way, and it converts to a flat list of floats instead
    # of a nested [[...]] list.
    query_embedding = embedding_model.encode(user_query).tolist()

    # Search the index for the nearest vectors
    result = pinecone_index.query(
        namespace="default",
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        include_values=False,
    )

    return result

In [13]:
# Build the LLM system prompt from the retrieved information
def get_system_prompt(user_query):
    """Return a system prompt whose context is the text retrieved for ``user_query``.

    The ``text`` metadata of every match returned by ``get_closest_answer`` is
    joined with single spaces and placed after the "Context:" label.
    """
    matches = get_closest_answer(user_query)['matches']
    snippets = [match['metadata']['text'] for match in matches]
    context = "Information: " + ' '.join(snippets)
    return f"""
    Instructions:
    - Be helpful and answer questions concisely. If you don't know the answer, say 'I don't know'
    - Utilize the context provided for accurate and specific information.
    Context: {context}
    """

In [14]:
from groq import Groq

# Groq client. The key is passed explicitly here; per the original note,
# GROQ_API_KEY is the default and can be omitted if it is in the ENV variables.
client = Groq(api_key=GROQ_API_KEY)

def sent_query_to_groq(sys_prompt, user_query, model="llama3-8b-8192"):
    """Send a system prompt and a user question to Groq and print the answer.

    Args:
        sys_prompt: Text sent as the "system" message.
        user_query: Text sent as the "user" message.
        model: Model identifier passed to the chat completion call. Defaults to
            the model this notebook was written with; pass another id to switch
            models without editing the function.

    Returns:
        None. The content of the first returned choice is printed.
    """
    chat_completion = client.chat.completions.create(
        messages=[
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": user_query},
        ],
        model=model,
    )
    # Only the first choice of the completion is used
    response = chat_completion.choices[0].message.content
    print(response)

In [139]:
# Question to ask; the commented-out lines below are alternative sample questions
user_query = "En que universidad estudió simon?"
#user_query = "What is Simon's experience?"
#user_query = "What are Simon's skills?"
#user_query = "What are Simon's interests?"

# Build the system prompt with the context retrieved for this question
sys_prompt = get_system_prompt(user_query)

# Send the system prompt and the question to Groq; the answer is printed
sent_query_to_groq(sys_prompt, user_query)

Simon estudió en la Universidad Nacional Experimental “Antonio José de Sucre” (VEN).


Equipo:
- Andres Malvestti
- Cristian Davico
- Simon Rodriguez

## Parte 2

- Se utilizan agentes para responder específicamente sobre cada uno de los documentos.
- Si la consulta no indica ningún nombre, se trae el contexto de un CV por defecto.
- PLUS: Si se consulta por más de un CV, traer el contexto de cada uno de forma acorde.