In [24]:
import os
import operator
from typing import TypedDict, Sequence, List, Dict
from typing_extensions import Annotated
from dotenv import load_dotenv

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.messages import BaseMessage, AIMessage
from langgraph.graph import StateGraph, START, END
from typing import Annotated
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS

from sentence_transformers import CrossEncoder
from langgraph.graph import StateGraph, END

# --------------------------------------------------
# ENV
# --------------------------------------------------
# Load variables from a local .env file into the process environment before
# any client object is constructed.
# NOTE(review): presumably this is where the OpenAI API key comes from for
# OpenAIEmbeddings / ChatOpenAI further down — confirm against the .env file.
load_dotenv()

# --------------------------------------------------
# STATE
# --------------------------------------------------
class ResearchAgentState(TypedDict):
    """Shared state dictionary handed to every node of the research graph.

    Each node reads the keys it needs and returns a partial dict containing
    only the keys it wants to update.
    """

    # Message log. The operator.add metadata is the reducer LangGraph applies
    # to this key, so a node's returned list is concatenated onto the existing
    # sequence instead of replacing it.
    messages: Annotated[Sequence[BaseMessage], operator.add]
    # Filled by load_pdfs with the objects returned from PyPDFLoader.load().
    # NOTE(review): annotated as dicts, but these are loader outputs that are
    # later fed to split_documents — confirm the intended element type.
    papers: List[dict]
    # The user's research question; read by retrieval, reranking and answer
    # generation.
    query: str
    # Not read or written by any node visible in this file.
    extracted_info: dict
    # Hits returned by the vector store's similarity_search in
    # retrieve_documents.
    # NOTE(review): annotated as dicts, yet rerank_documents reads
    # `.page_content` on each element — confirm the intended element type.
    search_results: List[dict]
    # Not read or written by any node visible in this file.
    comparison_matrix: dict | None
    # Not read or written by any node visible in this file.
    research_gaps: List[str]
    # Not read or written by any node visible in this file.
    iteration_count: int
    # Short status note; generate_answer overwrites it after producing an answer.
    reflection: str
    # Output of split_documents: the papers cut into smaller pieces.
    chunks: list
    # FAISS index built by create_vectorstore; None until that node has run.
    vectorstore: FAISS | None
    # The highest-scoring search results kept by rerank_documents (at most 5).
    reranked_docs: list
    # Directory that load_pdfs scans for PDF files.
    folder_path: str

# --------------------------------------------------
# NODES
# --------------------------------------------------
def load_pdfs(state: ResearchAgentState):
    """Load every PDF located directly inside ``state["folder_path"]``.

    Entries are visited in sorted name order because ``os.listdir`` returns
    them in arbitrary order; sorting keeps the document order (and therefore
    the chunks and the index built from them) reproducible between runs.
    The extension test is case-insensitive so files such as ``paper.PDF``
    are not silently skipped.

    Args:
        state: Graph state; only ``folder_path`` is read.

    Returns:
        A partial state update ``{"papers": [...]}`` containing the objects
        returned by ``PyPDFLoader.load()`` for each file, concatenated in
        file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder is missing
            or is not a directory.
    """
    folder = state["folder_path"]
    papers = []
    for file_name in sorted(os.listdir(folder)):
        if not file_name.lower().endswith(".pdf"):
            continue
        papers.extend(PyPDFLoader(os.path.join(folder, file_name)).load())

    return {"papers": papers}


def split_documents(state: ResearchAgentState):
    """Break the loaded papers into overlapping chunks.

    Uses a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=1000`` and ``chunk_overlap=200``.

    Args:
        state: Graph state; only ``papers`` is read.

    Returns:
        A partial state update ``{"chunks": [...]}`` with the splitter output.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    return {"chunks": text_splitter.split_documents(state["papers"])}


def create_vectorstore(state: ResearchAgentState):
    """Embed the chunks and build a FAISS index over them.

    Args:
        state: Graph state; ``chunks`` is read (``folder_path`` is used only
            to make the error message actionable).

    Returns:
        A partial state update ``{"vectorstore": <FAISS index>}``.

    Raises:
        ValueError: If there are no chunks to index, e.g. because the papers
            folder contained no PDF files.
    """
    chunks = state["chunks"]
    if not chunks:
        # Fail early with a message that names the likely cause instead of
        # letting an empty list reach the embedding / index construction code.
        raise ValueError(
            "No document chunks to index; check that "
            f"{state.get('folder_path')!r} contains readable PDF files."
        )

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    return {"vectorstore": vectorstore}


def retrieve_documents(state: ResearchAgentState):
    """Fetch the 10 nearest chunks for the query from the vector store.

    Args:
        state: Graph state; ``vectorstore`` and ``query`` are read.

    Returns:
        A partial state update ``{"search_results": [...]}`` holding the
        ``similarity_search`` hits.
    """
    store = state["vectorstore"]
    top_k = 10
    matches = store.similarity_search(state["query"], k=top_k)
    return {"search_results": matches}


# Cross-encoder instances keyed by model name, so the model is constructed
# once and reused across graph invocations.
_RERANKER_CACHE = {}


def _get_reranker(model_name):
    """Return the CrossEncoder for *model_name*, constructing it on first use."""
    if model_name not in _RERANKER_CACHE:
        _RERANKER_CACHE[model_name] = CrossEncoder(model_name)
    return _RERANKER_CACHE[model_name]


def rerank_documents(state: ResearchAgentState):
    """Re-score the retrieved documents against the query and keep the best 5.

    Each ``(query, page_content)`` pair is scored with the
    ``BAAI/bge-reranker-base`` cross-encoder; documents are then ordered by
    descending score.

    Args:
        state: Graph state; ``query`` and ``search_results`` are read.

    Returns:
        A partial state update ``{"reranked_docs": [...]}`` with at most five
        documents, highest score first. An empty ``search_results`` yields an
        empty list without touching the model.
    """
    candidates = state["search_results"]
    if not candidates:
        # Nothing to score: skip the model entirely.
        return {"reranked_docs": []}

    query = state["query"]
    reranker = _get_reranker("BAAI/bge-reranker-base")
    scores = reranker.predict([(query, doc.page_content) for doc in candidates])

    ranked = sorted(
        zip(candidates, scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return {"reranked_docs": [doc for doc, _ in ranked[:5]]}


def generate_answer(state: ResearchAgentState):
    """Ask the chat model to answer the query from the reranked passages.

    The page contents of ``reranked_docs`` are joined with blank lines and
    embedded, together with the query, in a single prompt sent to
    ``gpt-4o-mini`` at temperature 0.

    Args:
        state: Graph state; ``query`` and ``reranked_docs`` are read.

    Returns:
        A partial state update with the model's reply wrapped in an
        ``AIMessage`` under ``messages`` and a fixed note under ``reflection``.
    """
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    passages = (doc.page_content for doc in state["reranked_docs"])
    context = "\n\n".join(passages)

    prompt = f"""
Answer the following research question using the provided context.

Question:
{state["query"]}

Context:
{context}
"""

    reply = chat_model.invoke(prompt)
    answer_message = AIMessage(content=reply.content)

    return {
        "messages": [answer_message],
        "reflection": "Answer generated using reranked evidence.",
    }

# --------------------------------------------------
# GRAPH
# --------------------------------------------------
graph = StateGraph(ResearchAgentState)

# The pipeline is strictly linear, so nodes and edges are both derived from
# this single ordered table of (node name, node function) pairs.
_PIPELINE = [
    ("load_pdfs", load_pdfs),
    ("split_documents", split_documents),
    ("create_vectorstore", create_vectorstore),
    ("retrieve_documents", retrieve_documents),
    ("rerank_documents", rerank_documents),
    ("generate_answer", generate_answer),
]

for _node_name, _node_fn in _PIPELINE:
    graph.add_node(_node_name, _node_fn)

_step_names = [_name for _name, _ in _PIPELINE]

# Execution starts at the first step of the pipeline.
graph.set_entry_point(_step_names[0])

# Chain each step to its successor; the last step leads to END.
for _upstream, _downstream in zip(_step_names, _step_names[1:] + [END]):
    graph.add_edge(_upstream, _downstream)

app = graph.compile()

# --------------------------------------------------
# RUN
# --------------------------------------------------
# Folder scanned for PDFs. It can be overridden with the PAPERS_DIR
# environment variable so the notebook is not tied to one machine's absolute
# path; when the variable is unset the original location is used.
DEFAULT_PAPERS_DIR = "C:/Users/sashi/OneDrive/Documents/langgraphProjects/researchPaper/papers"

# Every key of the state is initialised up front; nodes fill in papers,
# chunks, vectorstore, search_results, reranked_docs, messages and reflection
# as the graph runs.
initial_state: ResearchAgentState = {
    "folder_path": os.getenv("PAPERS_DIR", DEFAULT_PAPERS_DIR),
    "messages": [],
    "papers": [],
    "query": input("Enter your research question: "),
    "extracted_info": {},
    "search_results": [],
    "comparison_matrix": None,
    "research_gaps": [],
    "iteration_count": 0,
    "reflection": "",
    "chunks": [],
    "vectorstore": None,
    "reranked_docs": [],
}

# Run the whole pipeline once and collect the resulting state.
final_state = app.invoke(initial_state)

# The answer is the last message appended by generate_answer.
print("\n===== ANSWER =====\n")
print(final_state["messages"][-1].content)



===== ANSWER =====

The paper "Attention Is All You Need" introduces a novel neural network architecture called the Transformer, which relies entirely on attention mechanisms, eliminating the need for recurrent and convolutional layers that were prevalent in previous sequence transduction models. The key contributions of the paper include:

1. **Attention Mechanism**: The Transformer utilizes an attention function that maps queries to key-value pairs, allowing the model to focus on different parts of the input sequence when generating outputs. This is achieved through a process that computes a weighted sum of the values based on the attention scores derived from the queries and keys.

2. **Self-Attention**: The paper discusses self-attention, which enables the model to relate different positions within a single sequence, thereby capturing dependencies regardless of their distance in the sequence. This is particularly useful for tasks such as reading comprehension and summarization.

3

In [None]:
# Example query: What is the paper "Attention Is All You Need" about?