In [20]:
import os
from typing import TypedDict, List, Annotated
import operator
from dotenv import load_dotenv

# ---------------------------
# ENV SETUP
# ---------------------------
load_dotenv()
# Fail fast with an actionable message. Re-assigning the value read from the
# environment is a no-op when the key exists, and raises an opaque TypeError
# (os.environ only accepts str values) when it does not.
_openai_api_key = os.getenv("OPENAI_API_KEY")
if not _openai_api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it or add it to a .env file."
    )
os.environ["OPENAI_API_KEY"] = _openai_api_key

# ---------------------------
# LANGCHAIN / LANGGRAPH
# ---------------------------
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.messages import BaseMessage, AIMessage
from langgraph.graph import StateGraph, END

# ---------------------------
# STATE DEFINITION
# ---------------------------
class ResearchAgentState(TypedDict):
    """Shared state passed between the nodes of the paper-comparison graph.

    Each node function in this file receives this mapping and returns a dict
    containing only the keys it updates.
    """

    # --- Inputs supplied by the caller ---
    folder_path: str  # directory that load_pdfs lists for ".pdf" files
    query: str  # set in the initial state; not read by any node visible in this file

    # --- Ingestion / retrieval artifacts ---
    papers: list  # documents returned by PyPDFLoader.load() (written by load_pdfs)
    chunks: list  # output of the text splitter (written by split_documents)
    vectorstore: FAISS | None  # None until create_vectorstore has run
    retrieved_docs: list  # similarity-search hits (written by retrieve_relevant_sections)

    # --- LLM analysis outputs ---
    extracted_info: str  # response text from compare_methodologies
    comparison_matrix: str  # response text from build_comparison_matrix
    conflicts: List[str]  # response lines from identify_conflicts
    research_gaps: List[str]  # response lines from find_research_gaps

    # Annotated with operator.add; presumably the reducer LangGraph uses to
    # append rather than overwrite -- TODO confirm. No visible node writes it.
    messages: Annotated[List[BaseMessage], operator.add]


# ---------------------------
# NODE 1: LOAD PDFs
# ---------------------------
def load_pdfs(state: ResearchAgentState):
    """Load every PDF found directly inside ``state["folder_path"]``.

    Files are visited in sorted name order because ``os.listdir`` returns
    entries in arbitrary order; sorting keeps the resulting document order
    (and everything derived from it downstream) reproducible between runs.
    The extension check is case-insensitive so ``paper.PDF`` is not skipped.

    Args:
        state: Graph state; only ``folder_path`` is read.

    Returns:
        A partial state update ``{"papers": [...]}`` holding the documents
        produced by ``PyPDFLoader.load()`` for each file, concatenated.

    Raises:
        OSError: Propagated from ``os.listdir`` when the folder cannot be
            listed (for example, when it does not exist).
    """
    folder = state["folder_path"]
    papers = []
    for file_name in sorted(os.listdir(folder)):
        if not file_name.lower().endswith(".pdf"):
            continue
        loader = PyPDFLoader(os.path.join(folder, file_name))
        papers.extend(loader.load())

    print(f"‚úÖ Loaded {len(papers)} pages from PDFs")
    return {"papers": papers}


# ---------------------------
# NODE 2: SPLIT DOCUMENTS
# ---------------------------
def split_documents(state: ResearchAgentState):
    """Cut the loaded documents into overlapping text chunks.

    Reads ``state["papers"]`` and returns ``{"chunks": [...]}``. Chunks are
    at most 1200 characters with a 200-character overlap between neighbours.
    """
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=200)
    pieces = text_splitter.split_documents(state["papers"])

    piece_count = len(pieces)
    print(f"‚úÖ Created {piece_count} text chunks")
    return dict(chunks=pieces)


# ---------------------------
# NODE 3: CREATE VECTOR STORE
# ---------------------------
def create_vectorstore(state: ResearchAgentState):
    """Embed the text chunks and index them in an in-memory FAISS store.

    Args:
        state: Graph state; only ``chunks`` is read.

    Returns:
        A partial state update ``{"vectorstore": <FAISS store>}``.

    Raises:
        ValueError: If there are no chunks to index (for example, the input
            folder contained no readable PDFs).
    """
    chunks = state["chunks"]
    # Reject an empty corpus up front with an actionable message instead of
    # letting the index build fail on it (and before spending an API call).
    if not chunks:
        raise ValueError(
            "No text chunks to index; check that the input folder contains "
            "readable PDF files."
        )

    embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
    vectorstore = FAISS.from_documents(chunks, embeddings)
    print("‚úÖ Vector store created")
    return {"vectorstore": vectorstore}


# ---------------------------
# NODE 4: RETRIEVE SECTIONS
# ---------------------------
def retrieve_relevant_sections(state: ResearchAgentState):
    """Fetch the 12 chunks most similar to a fixed methodology-oriented query.

    The search text is hard-coded; ``state["query"]`` is not consulted here.
    Returns ``{"retrieved_docs": [...]}``.
    """
    search_text = "methodology experimental design dataset results findings"
    store = state["vectorstore"]
    matches = store.similarity_search(search_text, k=12)

    print(f"‚úÖ Retrieved {len(matches)} relevant sections")
    return {"retrieved_docs": matches}


# ---------------------------
# NODE 5: COMPARE METHODOLOGIES
# ---------------------------
def compare_methodologies(state: ResearchAgentState):
    """Ask the LLM to extract methodology details for each paper.

    The prompt requests a per-paper breakdown, so every excerpt is labelled
    with the file it came from; without that label the model has no way to
    tell which excerpt belongs to which paper.

    Args:
        state: Graph state; only ``retrieved_docs`` is read.

    Returns:
        A partial state update ``{"extracted_info": <response text>}``.
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    excerpts = []
    for doc in state["retrieved_docs"]:
        # Presumably the PDF loader records the file path under "source";
        # fall back to a placeholder when it is absent -- TODO confirm.
        metadata = getattr(doc, "metadata", None) or {}
        source = os.path.basename(str(metadata.get("source", "unknown")))
        # Cap each excerpt at 1500 characters, as before.
        excerpts.append(f"[Source: {source}]\n{doc.page_content[:1500]}")
    context = "\n\n".join(excerpts)

    prompt = f"""
Analyze the methodologies used in the following research papers.

Extract for each paper:
- Methodology
- Dataset
- Model / Algorithm
- Evaluation Metrics
- Key Strengths
- Limitations

Return structured text.

Each excerpt below starts with a [Source: ...] label naming the paper it was
taken from; excerpts sharing a label belong to the same paper.

Papers:
{context}
"""

    response = llm.invoke(prompt)
    print("‚úÖ Methodology comparison completed")
    return {"extracted_info": response.content}


# ---------------------------
# NODE 6: BUILD COMPARISON MATRIX
# ---------------------------
def build_comparison_matrix(state: ResearchAgentState):
    """Turn the extracted methodology notes into a paper-by-attribute matrix.

    Reads ``state["extracted_info"]`` and returns
    ``{"comparison_matrix": <response text>}``.
    """
    chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    extracted = state["extracted_info"]
    # Assembled line by line so the exact whitespace (including the two
    # trailing spaces after "Rows = Papers") stays visible.
    prompt = (
        "\n"
        "Using the extracted information below, build a comparison matrix.\n"
        "\n"
        "Rows = Papers  \n"
        "Columns = Method, Dataset, Strengths, Limitations\n"
        "\n"
        "Extracted Info:\n"
        f"{extracted}\n"
    )

    reply = chat_model.invoke(prompt)
    print("‚úÖ Comparison matrix built")
    return {"comparison_matrix": reply.content}


# ---------------------------
# NODE 7: IDENTIFY CONFLICTS
# ---------------------------
def identify_conflicts(state: ResearchAgentState):
    """Ask the LLM for contradictory findings in the comparison matrix.

    Args:
        state: Graph state; only ``comparison_matrix`` is read.

    Returns:
        A partial state update ``{"conflicts": [...]}`` with one entry per
        non-blank line of the model's response.
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    prompt = f"""
Identify conflicting or contradicting findings across the papers.

Comparison Matrix:
{state["comparison_matrix"]}

Return bullet points.
"""

    response = llm.invoke(prompt)
    print("‚úÖ Conflicts identified")
    # Drop blank lines so consumers that print one bullet per entry do not
    # emit empty bullets; leading indentation of nested bullets is preserved.
    conflicts = [
        line.rstrip() for line in response.content.splitlines() if line.strip()
    ]
    return {"conflicts": conflicts}


# ---------------------------
# NODE 8: FIND RESEARCH GAPS
# ---------------------------
def find_research_gaps(state: ResearchAgentState):
    """Ask the LLM for research gaps given the matrix and the conflicts.

    The prompt embeds both the comparison matrix and the conflict list;
    referring to them by name only would leave the model with nothing to
    analyse.

    Args:
        state: Graph state; ``comparison_matrix`` and ``conflicts`` are read.

    Returns:
        A partial state update ``{"research_gaps": [...]}`` with one entry
        per non-blank line of the model's response.
    """
    llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

    conflicts_text = "\n".join(state["conflicts"])

    prompt = f"""
Based on:
- Comparison matrix
- Conflicting results

Identify:
- Research gaps
- Missing datasets
- Underexplored methods
- Future research directions

Comparison Matrix:
{state["comparison_matrix"]}

Conflicting Results:
{conflicts_text}

Return bullet points.
"""

    response = llm.invoke(prompt)
    print("‚úÖ Research gaps identified")
    # Drop blank lines so consumers that print one bullet per entry do not
    # emit empty bullets; leading indentation of nested bullets is preserved.
    research_gaps = [
        line.rstrip() for line in response.content.splitlines() if line.strip()
    ]
    return {"research_gaps": research_gaps}


# ---------------------------
# BUILD LANGGRAPH
# ---------------------------
graph = StateGraph(ResearchAgentState)

# The pipeline is strictly linear, so it is described once as an ordered
# list of (node name, node function) pairs and wired up from that.
_pipeline_steps = [
    ("load_pdfs", load_pdfs),
    ("split_documents", split_documents),
    ("create_vectorstore", create_vectorstore),
    ("retrieve_relevant_sections", retrieve_relevant_sections),
    ("compare_methodologies", compare_methodologies),
    ("build_comparison_matrix", build_comparison_matrix),
    ("identify_conflicts", identify_conflicts),
    ("find_research_gaps", find_research_gaps),
]
_step_names = [_name for _name, _ in _pipeline_steps]

for _name, _node_fn in _pipeline_steps:
    graph.add_node(_name, _node_fn)

graph.set_entry_point(_step_names[0])

# Chain each step to its successor, then terminate after the last one.
for _src, _dst in zip(_step_names, _step_names[1:]):
    graph.add_edge(_src, _dst)
graph.add_edge(_step_names[-1], END)

app = graph.compile()

# ---------------------------
# RUN
# ---------------------------
if __name__ == "__main__":
    # Seed every state key so each node finds the fields it reads.
    initial_state: ResearchAgentState = dict(
        folder_path="papers",
        query="Compare research methodologies",
        papers=[],
        chunks=[],
        vectorstore=None,
        retrieved_docs=[],
        extracted_info="",
        comparison_matrix="",
        conflicts=[],
        research_gaps=[],
        messages=[],
    )

    final_state = app.invoke(initial_state)

    print("\n================ FINAL OUTPUT ================\n")
    print("üìä COMPARISON MATRIX:\n")
    print(final_state["comparison_matrix"])

    # Both list-valued results are printed the same way: heading, then one
    # dash-prefixed line per entry.
    bullet_sections = (
        ("\n‚ö†Ô∏è CONFLICTS:\n", "conflicts"),
        ("\nüîç RESEARCH GAPS:\n", "research_gaps"),
    )
    for heading, state_key in bullet_sections:
        print(heading)
        for entry in final_state[state_key]:
            print("-", entry)


✅ Loaded 85 pages from PDFs
✅ Created 252 text chunks
✅ Vector store created
✅ Retrieved 12 relevant sections
✅ Methodology comparison completed
✅ Comparison matrix built
✅ Conflicts identified
✅ Research gaps identified


📊 COMPARISON MATRIX:

Here is a comparison matrix based on the extracted information from the three research papers:

| Papers                          | Method                                                                 | Dataset                                                                 | Strengths                                                                                          | Limitations                                                                                     |
|---------------------------------|------------------------------------------------------------------------|------------------------------------------------------------------------|-----------------------------------------------------------------------------