In [6]:
import os
from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings
from dotenv import load_dotenv
from langchain_community.document_loaders import UnstructuredPDFLoader, OnlinePDFLoader, PyPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_openai import ChatOpenAI
from openai import OpenAI
from langchain_chroma import Chroma
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory


In [7]:
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV = os.getenv('PINECONE_API_ENV')

In [8]:
llm = ChatOpenAI(model="gpt-3.5-turbo")
client = OpenAI(api_key=OPENAI_API_KEY)   
index_name = "arpa"
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# vectorstore = PineconeVectorStore(index_name=index_name, embedding=embeddings)


In [33]:
loader = PyPDFLoader("/Users/luisbarajas/Documents/AGI/Papers/weak-to-strong-generalization.pdf")
data = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(data)
vectorstore = Chroma.from_documents(documents=docs, embedding=embeddings)
retriever = vectorstore.as_retriever()
# print(docs)
# res = client.embeddings.create(input=docs, model=MODEL)
# print(res)


<langchain_chroma.vectorstores.Chroma object at 0x169cf9c50>


In [34]:
print(vectorstore.)

AttributeError: 'Chroma' object has no attribute 'get_vector'

In [28]:
### Contextualize question ###
contextualize_q_system_prompt = """Given a chat history and the latest user question \
which might reference context in the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)
history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)


In [29]:
# Retrieve and generate using the relevant snippets of the blog.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

### Answer question ###
qa_system_prompt = """You are an assistant for question-answering tasks related to scientific papers, articles and investigations. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)


In [30]:
### Statefully manage chat history ###
store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]


conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [32]:
conversational_rag_chain.invoke(
    {"input": "what was the last paper about?"},
    config={
        "configurable": {"session_id": "luis"}
    },  # constructs a key "abc123" in `store`.
)["answer"]
# print(store)

'The last paper mentioned in the context is "Multiaccuracy: Black-box post-processing for fairness in classification" by Michael P Kim, Amirata Ghorbani, and James Zou. This paper discusses the concept of multiaccuracy, which is a black-box post-processing technique used to enhance fairness in classification tasks. The focus is on addressing fairness concerns in machine learning models by adjusting the decision boundaries to achieve fairness across different demographic groups.'