In [1]:
# Utilidades del sistema
import os
import time

# Bibliotecas externas
from tqdm.auto import tqdm

# Procesamiento de documentos
from odf.opendocument import load
from odf.text import P

# LangChain y módulos relacionados
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain.chains import RetrievalQA

# Pinecone y módulos relacionados
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

# Read the API keys and tracing settings from environment variables
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
LANGCHAIN_TRACING_V2 = os.getenv("LANGCHAIN_TRACING_V2")
LANGCHAIN_API_KEY = os.getenv("LANGCHAIN_API_KEY")
# LangChain tracing is configured through environment variables (like the two
# settings above), so a plain Python variable has no effect on it. Export the
# project name; a value already present in the environment takes precedence.
LANGCHAIN_PROJECT = os.getenv("LANGCHAIN_PROJECT") or "pr-plaintive-radar-87"
os.environ["LANGCHAIN_PROJECT"] = LANGCHAIN_PROJECT
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Fail fast, with a clear message, when a required credential is missing
assert OPENAI_API_KEY is not None, "OPENAI_API_KEY is not set"
assert LANGCHAIN_TRACING_V2 is not None, "LANGCHAIN_TRACING_V2 is not set"
assert LANGCHAIN_API_KEY is not None, "LANGCHAIN_API_KEY is not set"
# The Pinecone key is used later to build the Pinecone client; validate it here
# with the other keys instead of failing deep inside that client.
assert PINECONE_API_KEY is not None, "PINECONE_API_KEY is not set"

# Build the OpenAI chat model shared by the rest of the notebook
chat = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,  # Authenticates the connection to OpenAI
    model='gpt-3.5-turbo',  # Language model to use
    temperature=0,  # 0 makes the output as deterministic as possible
    streaming=True  # Stream tokens back as they are generated
)

In [2]:
### EMBEDDINGS
# Build the OpenAI embeddings model
embed_model = OpenAIEmbeddings(model="text-embedding-3-large")

# Sanity check: embed the word 'hola' and print the length of the resulting vector
vector_hola = embed_model.embed_query('hola')
print(len(vector_hola))


3072


In [3]:
# Folder that holds the .odt documents to load
carpeta = "docs_odt"

In [4]:
# Read every .odt document found in a folder
def leer_documentos_odt(carpeta):
    """Load the text of every .odt file located directly inside ``carpeta``.

    Args:
        carpeta: Path of the folder to scan (not searched recursively).

    Returns:
        dict mapping each file name to its text, built by joining the string
        form of every paragraph element with newlines. Files are visited in
        sorted name order so the result is reproducible between runs.

    Raises:
        OSError: If ``carpeta`` cannot be listed (e.g. it does not exist).
    """
    documentos = {}
    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for archivo in sorted(os.listdir(carpeta)):
        ruta_archivo = os.path.join(carpeta, archivo)
        # Accept any letter case for the extension and skip directories that
        # merely happen to be named "*.odt".
        if not archivo.lower().endswith(".odt") or not os.path.isfile(ruta_archivo):
            continue
        # Parse the OpenDocument file
        documento = load(ruta_archivo)
        # Collect the text of every paragraph element
        parrafos = [str(elemento) for elemento in documento.getElementsByType(P)]
        documentos[archivo] = "\n".join(parrafos)
    return documentos

In [5]:
# Load the documents into a {file_name: content} dictionary
docs_dict = leer_documentos_odt(carpeta=carpeta)

In [6]:
# Show the whole {file_name: content} dictionary as the cell output.
# NOTE(review): this renders the complete CV text (phone numbers, e-mail
# addresses) into the saved notebook output — consider clearing it before sharing.
docs_dict

{'octavio_cv.odt': "\nOctavio Deshays\nMechatronics Engineer - National University of Cuyo\nMendoza, Argentina\n22/12/1997\n+54 9 2615538396\noctaviodeshays@gmail.com\nln: Octavio Deshays Moreno\nEXPERIENCE\nMARVIK, Uruguay — Machine Learning Engineer\nDecember 2022 - Present\nMarvik is a hands-on ML consulting firm. In my role, I am involved in the entire process of developing an AI solution, from identifying the customer's problem to implementing the solution.\nProjects: \nPhotoStudio Editor: an app to allow sellers from the largest E-Commerce in LatinAmerica to edit their products images using Stable Diffusion, generating attractive backgrounds for each product. Currently in production being used by thousands of users every hour.\nFashion Recommendation System: designed and built MVP for a Tinder like recsys for a fashion company. Involved building a feature extraction pipeline for garments using CLIP based classifiers and a Reinforcement Learning algorithm.\nVirtual Try On: for thi

In [7]:
# Wrap each file's text in a Document, recording the file name as its "source"
documents = [
    Document(page_content=texto, metadata={"source": nombre})
    for nombre, texto in docs_dict.items()
]

In [8]:
# Show the list of Document objects (one per loaded file) as the cell output.
documents

[Document(metadata={'source': 'octavio_cv.odt'}, page_content="\nOctavio Deshays\nMechatronics Engineer - National University of Cuyo\nMendoza, Argentina\n22/12/1997\n+54 9 2615538396\noctaviodeshays@gmail.com\nln: Octavio Deshays Moreno\nEXPERIENCE\nMARVIK, Uruguay — Machine Learning Engineer\nDecember 2022 - Present\nMarvik is a hands-on ML consulting firm. In my role, I am involved in the entire process of developing an AI solution, from identifying the customer's problem to implementing the solution.\nProjects: \nPhotoStudio Editor: an app to allow sellers from the largest E-Commerce in LatinAmerica to edit their products images using Stable Diffusion, generating attractive backgrounds for each product. Currently in production being used by thousands of users every hour.\nFashion Recommendation System: designed and built MVP for a Tinder like recsys for a fashion company. Involved building a feature extraction pipeline for garments using CLIP based classifiers and a Reinforcement L

In [9]:
# Print the first 50 characters of every loaded document
for idx, doc in enumerate(documents):
    print(
        f"Documento {idx + 1}:",
        f"Contenido (primeros 50 caracteres):\n{doc.page_content[:50]}",
        "-" * 40,
        sep="\n",
    )

Documento 1:
Contenido (primeros 50 caracteres):

Octavio Deshays
Mechatronics Engineer - National 
----------------------------------------
Documento 2:
Contenido (primeros 50 caracteres):
Santiago Francisco BELEN DEAS
Personal Information
----------------------------------------
Documento 3:
Contenido (primeros 50 caracteres):
Isaias Tenorio

Profesional de la docencia con una
----------------------------------------


In [10]:
# Helper that splits documents into chunks
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split ``docs`` into smaller chunks with a RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split.
        chunk_size: Value passed to the splitter as ``chunk_size``.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``split_documents`` returns for ``docs``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# Chunk the loaded documents, overriding the default chunk size
Cv = chunk_data(
    docs=documents,
    chunk_size=3000,
    chunk_overlap=50,
)

In [11]:
# Show the chunks returned by chunk_data as the cell output.
Cv

[Document(metadata={'source': 'octavio_cv.odt'}, page_content="Octavio Deshays\nMechatronics Engineer - National University of Cuyo\nMendoza, Argentina\n22/12/1997\n+54 9 2615538396\noctaviodeshays@gmail.com\nln: Octavio Deshays Moreno\nEXPERIENCE\nMARVIK, Uruguay — Machine Learning Engineer\nDecember 2022 - Present\nMarvik is a hands-on ML consulting firm. In my role, I am involved in the entire process of developing an AI solution, from identifying the customer's problem to implementing the solution.\nProjects: \nPhotoStudio Editor: an app to allow sellers from the largest E-Commerce in LatinAmerica to edit their products images using Stable Diffusion, generating attractive backgrounds for each product. Currently in production being used by thousands of users every hour.\nFashion Recommendation System: designed and built MVP for a Tinder like recsys for a fashion company. Involved building a feature extraction pipeline for garments using CLIP based classifiers and a Reinforcement Lea

In [12]:
# Check the chunking by printing the first 100 characters of the first five chunks
for chunk in Cv[:5]:
    print(chunk.page_content[:100], "----", sep="\n")

Octavio Deshays
Mechatronics Engineer - National University of Cuyo
Mendoza, Argentina
22/12/1997
+5
----
EDUCATION
University of Buenos Aires, Argentina — Post Graduate Specialization in Artificial Intelli
----
Santiago Francisco BELEN DEAS
Personal Information
Address: Av. Corrientes 16204piso Dto: “B” CP: 10
----
2015 – 2018
Teaching Physician in Clinical Examination and Internal Medicine 5th Chair of Internal M
----
Isaias Tenorio

Profesional de la docencia con una sólida trayectoria. Soy capaz de coordinar el pla
----


---


In [13]:
#PINECONE

# Connect to Pinecone; cloud and region fall back to defaults when the
# corresponding environment variables are unset or empty
pc = Pinecone(api_key=PINECONE_API_KEY)
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'llm-tp1'

# Drop the index if it already exists so every run starts from an empty one
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("index {} borrado".format(index_name))

# Create the index when it is absent
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=3072,  # Must equal the embedding length (3072 for text-embedding-3-large, as printed earlier)
        metric='cosine',
        spec=spec
    )
    # Report success only after create_index has returned, so a failed
    # creation is never announced as done.
    print("index creado con el nombre: {}".format(index_name))
else:
    print("el index con el nombre {} ya estaba creado".format(index_name))


index llm-tp1 borrado
index creado con el nombre: llm-tp1


---

In [14]:
## INSERT THE VECTORS INTO THE PINECONE DATABASE

# Namespace under which the vectors are stored
namespace = "Curriculums"

# Embed the chunks and upsert them into the Pinecone index
docsearch = PineconeVectorStore.from_documents(
    documents=Cv,
    embedding=embed_model,
    namespace=namespace,
    index_name=index_name,
)

print("upserted values to {} index under namespace {}".format(index_name, namespace))
# Short pause after the upsert
# NOTE(review): presumably gives the index time to serve the new vectors — confirm
time.sleep(1)

upserted values to llm-tp1 index under namespace Curriculums


---


In [15]:
# Open a vector store over the index that holds the chunks
vectorstore = PineconeVectorStore(
    namespace=namespace,  # Namespace the vectors were upserted under
    embedding=embed_model,  # Embeddings model to use
    index_name=index_name,  # Name of the Pinecone index
)

---


In [18]:
# Run a similarity search against the vector store
query = "in which companies did santiago used to work"  # Text to search for
documents_result = vectorstore.similarity_search(query=query, k=1)  # Keep only the single closest chunk (k=1)

documents_result


[Document(id='1e0716b6-4a8c-4934-8059-c9a701de8c89', metadata={'source': 'santiago_cv.odt'}, page_content='Santiago Francisco BELEN DEAS\nPersonal Information\nAddress: Av. Corrientes 16204piso Dto: “B” CP: 1042 Nationality: Argentine and Spanish.\nCity: Capital Federal, Buenos Aires, Argentina\nPhone Number: +54 9 11-5715-9118\nDate of Birth: 26/06/1984\nNational ID: 32.182.510\nE-mail: Sbelen@fmed.uba.ar\nProfessional Profile\nMedical Specialist in Internal Medicine and Clinical Research Professional with extensive knowledge in multiple therapeutic areas, currently pursuing a Specialization in Artificial Intelligence at the University of Buenos Aires.\nExperienced Teaching Physician in Clinical Examination and Internal Medicine, with a strong commitment to medical education and clinical supervision. Actively participated as a Sub-Investigator in Phase 2/3 clinical trials, contributing to pivotal studies, includ ing Pfizer’s RSV and Influenza vaccines (C3671013 / C4781004), Moderna’s 

---

In [20]:
# Question to answer from the indexed documents
#document_query = "does Santiago have any haircut experience"  
document_query = "who was the coordinator of a Undifferentiated Arthritis: Therapeutic Management conference?"

# Retriever that returns only the single most similar chunk
# NOTE(review): the recorded answer ("was the user") suggests the one retrieved
# chunk lacks the candidate's name — consider a larger k; confirm.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 1},
)

# Retrieval + answer chain: the retrieved chunks are "stuffed" into one prompt
qa = RetrievalQA.from_chain_type(llm=chat, chain_type="stuff", retriever=retriever)

# Ask the question
result = qa.invoke(document_query)

# Print the answer text, followed by the full result dictionary
print("Resultado de la consulta:", result['result'], result, sep="\n")


Resultado de la consulta:
The coordinator of the Undifferentiated Arthritis: Therapeutic Management conference was the user.
{'query': 'who was the coordinator of a Undifferentiated Arthritis: Therapeutic Management conference?', 'result': 'The coordinator of the Undifferentiated Arthritis: Therapeutic Management conference was the user.'}
