In [58]:
from langchain.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain.vectorstores import Pinecone
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from pinecone import ServerlessSpec
from dotenv import load_dotenv
from pinecone import Pinecone
from openai import OpenAI
import pinecone 
import time
import os
from google.auth import compute_engine
from google.cloud import firestore
from langchain_google_firestore import FirestoreChatMessageHistory

In [56]:
# Firestore client for the "arpa-softtek" project, authenticated with the
# Compute Engine credentials of the machine running this notebook.
# NOTE(review): a later cell also assigns to `client`; whichever cell ran last
# determines what this name refers to.
client = firestore.Client(
    credentials=compute_engine.Credentials(),
    project="arpa-softtek",
    database="(default)",
)

In [30]:
# Pull API keys from a local .env file into the process environment.
load_dotenv()
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')

# Embedding model used for the vector store.
MODEL = "text-embedding-3-small"

# Chat model shared by every chain in this notebook.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

# Raw OpenAI SDK client.
# NOTE(review): this rebinds `client`, which the Firestore cell also assigns.
client = OpenAI(api_key=OPENAI_API_KEY)


In [31]:
def obtener_texto(url, chunk_size=1000, chunk_overlap=0):
    """Load a PDF and split it into chunks ready for embedding.

    Args:
        url: Location of the PDF, handed straight to ``PyPDFLoader``.
        chunk_size: Maximum characters per chunk.
        chunk_overlap: Characters shared between consecutive chunks.

    Returns:
        The list of document chunks produced by
        ``RecursiveCharacterTextSplitter.split_documents``.
    """
    # Previously the argument was ignored and a fixed file was always loaded.
    loader = PyPDFLoader(url)
    data = loader.load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(data)


# TODO: move to a configurable, relative data directory instead of an
# absolute path tied to one machine.
PDF_PATH = "/Users/luisbarajas/Documents/AGI/Papers/weak-to-strong-generalization.pdf"

# Pass the PDF itself; the old call passed a notebook path that was ignored.
docs = obtener_texto(PDF_PATH)

In [36]:
index_name = 'arpa2'
spec = ServerlessSpec(cloud="aws", region="us-east-1")
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it is missing (expected on the very first run).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,  # the client expects `name`; `index_name=` raises TypeError
        dimension=1536,  # dimensionality of text-embedding-3-small
        metric='cosine',
        spec=spec
    )
    # Poll until Pinecone reports the new index as ready.
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# Connect to the index.
index = pc.Index(index_name)
time.sleep(1)

In [49]:
# Attach to the Pinecone index that already exists.
vectorstore = PineconeVectorStore(
    embedding=OpenAIEmbeddings(model=MODEL),
    index_name=index_name,
)

# Use this instead when the index has not been populated yet:
# vectorstore_from_documents = PineconeVectorStore.from_documents(
#         docs,
#         index_name=index_name,
#         embedding=OpenAIEmbeddings(model=MODEL),
# )

# NOTE(review): `retreiver` is misspelled, but later cells reference this exact name.
retreiver = vectorstore.as_retriever()

In [50]:
### Contextualize question ###
# System instructions asking the model to rewrite the latest user question so
# it can be understood without the preceding chat history.
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

# Message layout: system instructions, prior turns, then the new user input.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever wrapper built from the LLM, the base retriever and the prompt above.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retreiver,
    prompt=contextualize_q_prompt,
)

### Answer question ###
# System instructions for the answering step; `{context}` is the template slot
# for the retrieved document text.
qa_system_prompt = """You are an assistant for question-answering tasks related to scientific papers, articles and investigations. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""

# Message layout: system instructions, prior turns, then the new user input.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", qa_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that feeds the retrieved documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Inspection output kept from development.
print(question_answer_chain)
print(qa_prompt)

# Full pipeline: history-aware retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), config={'run_name': 'format_inputs'})
| ChatPromptTemplate(input_variables=['chat_history', 'context', 'input'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], template="You are an assistant for question-answering tasks related to scientific papers, articles and investigations. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n{context}")), MessagesPlaceholder(variable_name='chat_history'), HumanMessage

In [51]:
### Statefully manage chat history ###

# TODO: import credentials using the same format as the functions `main` module
# TODO: call the Firebase functions from the client library
# TODO: (create an extra collection for testing)

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the Firestore-backed chat history for a session.

    Args:
        session_id: Identifier of the conversation; passed to
            ``FirestoreChatMessageHistory`` as its ``session_id``.

    Returns:
        A ``FirestoreChatMessageHistory`` bound to the "chat" collection.
    """
    # The previous version first tested membership in a `store` dict that is
    # never defined in this notebook (NameError on the first call), and both
    # branches built the same object, so the check is dropped.
    return FirestoreChatMessageHistory(session_id=session_id, collection="chat")

# Wrap the RAG chain so each call reads and writes the chat history returned
# by `get_session_history` for the configured session id.
conversational_rag_chain = RunnableWithMessageHistory(
    runnable=rag_chain,
    get_session_history=get_session_history,
    history_messages_key="chat_history",
    input_messages_key="input",
    output_messages_key="answer",
)

In [54]:
prompt = "what is the paper about?"
answer = conversational_rag_chain.invoke(
    {"input": prompt},
    config={
        "configurable": {"session_id": "abc124"}
    },  # selects the chat history stored under session id "abc124"
)["answer"]

# No manual `add_user_message` / `add_ai_message` calls here: `chat_history`
# is not defined at notebook scope, and RunnableWithMessageHistory already
# records the input and the "answer" output in the session's history.


ValueError: Missing keys ['session_id'] in config['configurable'] Expected keys are ['session_id'].When using via .invoke() or .stream(), pass in a config; e.g., chain.invoke({'input': 'foo'}, {'configurable': {'session_id': '[your-value-here]'}})