
# <span style="color:red">INSTANTIATION OF THE LLM MODEL AND THE EMBEDDING</span>

In [None]:
import langchain_community
import os

from langchain.chat_models import ChatOpenAI

# Configuration is taken from the process environment.
# OPENAI_API_KEY is optional at this point (None when unset); the two
# LANGCHAIN_* variables are mandatory and raise KeyError when missing.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
LANGCHAIN_TRACING_V2 = os.environ["LANGCHAIN_TRACING_V2"]
LANGCHAIN_API_KEY = os.environ["LANGCHAIN_API_KEY"]

# Chat model shared by the question-answering chains:
# temperature 0 and streaming enabled.
chat = ChatOpenAI(
    model="gpt-3.5-turbo",
    temperature=0,
    streaming=True,
    openai_api_key=OPENAI_API_KEY,
)


In [None]:
### EMBEDDINGS
from langchain.embeddings.openai import OpenAIEmbeddings
## OpenAI embedding technique
embed_model = OpenAIEmbeddings(model="text-embedding-3-large")
# Sanity check: embed a sample query and print the length of the returned vector
print(len(embed_model.embed_query("hola")))

In [11]:
## READ THE DIRECTORY AND LOAD THE FILE
from langchain.document_loaders import PyPDFDirectoryLoader
# read documents
def read_doc(directory):
    """Load `directory` through PyPDFDirectoryLoader and return its documents.

    Args:
        directory: Path of the folder handed to PyPDFDirectoryLoader.

    Returns:
        Whatever the loader's ``load()`` call yields for that folder.
    """
    return PyPDFDirectoryLoader(directory).load()
# Input folders (the folder names suggest thesis PDFs and CV PDFs).
# NOTE(review): `dir` shadows the built-in dir(); the name is kept so other
# cells that may reference it keep working -- consider renaming it.
dir = r'C:\Users\Aulamultimedia\Documents\practicas\eguins2\pdf_tesis/'
dir_cv = r'C:\Users\Aulamultimedia\Documents\practicas\eguins2\pdf_cv/'

# Load each folder separately ...
doc = read_doc(dir)
doc_cv = read_doc(dir_cv)

# ... and concatenate both results into a single collection.
total = doc + doc_cv



In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    """Split `docs` into chunks using RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 800).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        The result of the splitter's ``split_documents(docs)`` call.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(docs)

# Chunk the combined document list with chunk_size=3000 and chunk_overlap=50
# (overriding chunk_data's default chunk_size of 800).
documents = chunk_data(total, chunk_size=3000, chunk_overlap=50)
# documents_cv=chunk_data(docs=doc_cv,chunk_size=3000, chunk_overlap=50)

# Cell output: the container type returned by chunk_data
type(documents)


# <span style="color:red">LOAD THE DOCUMENTS AND VECTORS TO PINECONE DB</span>

In [None]:
## CONNECT WITH PINECONE DATABASE
from pinecone import Pinecone, ServerlessSpec
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# Connect to the Pinecone service
pc = Pinecone(api_key=PINECONE_API_KEY)
# Serverless target; the defaults apply when the env vars are unset or empty
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)
index_name = 'eguins'

# Start from a clean slate: drop the index if a previous run left one behind.
# WARNING: this discards every vector stored in that index.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)
    print("index {} borrado".format(index_name))

# Query the index list again (it changed if the delete above ran) and create
# the index when it is absent.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        dimension=3072,  # vector size of text-embedding-3-large, the model behind embed_model
        metric='cosine',
        spec=spec,
    )
    # Announce the creation only after create_index returned without raising,
    # so the message never appears for an index that failed to be created.
    print("index creado con el nombre: {}".format(index_name))
else:
    print("el index con el nombre {} ya estaba creado".format(index_name))

In [None]:
## UPSERT THE VECTORS INTO THE PINECONE DATABASE

import time
from langchain_pinecone import PineconeVectorStore
# Namespace inside the index that receives the vectors
namespace = "espacio"

# Embed the chunks with embed_model and store them in the index `index_name`
docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embed_model,
    index_name=index_name,
    namespace=namespace,
)
print("upserted values to {} index".format(index_name))

# Short pause after the upsert
# NOTE(review): presumably lets the index settle before it is queried -- confirm
time.sleep(1)




# <span style="color:red">RETRIEVE FROM AND SEARCH THE CREATED PINECONE INDEXES</span>

In [None]:
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
# Fresh Pinecone client for the retrieval section (API key read from the environment)
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Names of the two indexes queried below and the namespace used in both.
# NOTE(review): no visible cell creates or fills 'eguinscv'; it presumably
# exists from an earlier run -- verify before relying on it.
index_name, index_name_cv = 'eguins', 'eguinscv'
namespace = "espacio"


In [6]:
# Vector store (and a retriever on top of it) for the index named by index_name
vectorstore = PineconeVectorStore(
    embedding=embed_model,
    index_name=index_name,
    namespace=namespace,
)
retriever = vectorstore.as_retriever()

# Same wiring for the index named by index_name_cv
vectorstore_cv = PineconeVectorStore(
    embedding=embed_model,
    index_name=index_name_cv,
    namespace=namespace,
)
retriever_cv = vectorstore_cv.as_retriever()

In [None]:
# Ad-hoc check: fetch the single closest match (k=1) from the main vector store
query = "in which companies did ezequiel used to work"
vectorstore.similarity_search(query=query, k=1)

In [9]:
from langchain.chains import RetrievalQA  

query = "does Ezequiel have any hands-on experience"

# "stuff" QA chain over the main vector store.
# NOTE(review): `qa` is built here but not invoked in the visible cells.
qa = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=chat,
    retriever=vectorstore.as_retriever(),
)

# "stuff" QA chain over the CV vector store; this is the one that answers `query`.
qa_cv = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=chat,
    retriever=vectorstore_cv.as_retriever(),
)
result = qa_cv.invoke(query)

# Show only the entry stored under the 'result' key of the response
print(result["result"])

In [None]:
# Print the complete response object held in `result`, not just its 'result' entry
print(result)