In [5]:

# ==========================================================
# LANGGRAPH PDF EXTRACTION AGENT (USING ResearchAgentState)
# Extraction-only: sections, concepts, methods, findings,
# citations, tables/figures/statistics
# ==========================================================

import os
import re
import operator
from typing import List, Dict, Sequence
from typing_extensions import TypedDict, Annotated

from langchain_core.messages import BaseMessage
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import StateGraph, END
from sklearn.feature_extraction.text import TfidfVectorizer


# ==========================================================
# STATE (AS REQUESTED)
# ==========================================================

class ResearchAgentState(TypedDict):
    messages: Annotated[Sequence[BaseMessage], operator.add]
    papers: list[dict]                 # {"name": str, "text": str}
    query: str
    extracted_info: dict               # All extracted structured data
    search_results: list[dict]         # Not used here (kept for compatibility)
    comparison_matrix: dict | None     # Not used
    research_gaps: list[str]            # Not used
    iteration_count: int
    reflection: str


# ==========================================================
# CONFIG
# ==========================================================

PDF_FOLDER = "C:/Users/sashi/OneDrive/Documents/langgraphProjects/researchPaper/papers"


# ==========================================================
# TOOLS / NODES
# ==========================================================

def load_pdfs_node(state: ResearchAgentState):
    papers = []

    for file in os.listdir(PDF_FOLDER):
        if file.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(PDF_FOLDER, file))
            docs = loader.load()
            text = "\n".join(d.page_content for d in docs)
            papers.append({
                "name": file,
                "text": text
            })

    return {"papers": papers}


def split_papers_node(state: ResearchAgentState):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=150
    )

    for paper in state["papers"]:
        paper["chunks"] = splitter.split_text(paper["text"])

    return {"papers": state["papers"]}


# ------------------ SECTION EXTRACTION ---------------------

SECTION_HEADERS = {
    "abstract": ["abstract"],
    "introduction": ["introduction"],
    "methods": ["methodology", "methods", "approach"],
    "results": ["results", "experiments"],
    "conclusion": ["conclusion", "future work"]
}

def extract_sections_node(state: ResearchAgentState):
    for paper in state["papers"]:
        text = paper["text"].lower()
        sections = {}

        for sec, headers in SECTION_HEADERS.items():
            for h in headers:
                pattern = rf"{h}\n(.+?)(?=\n[A-Z][a-z]+|\Z)"
                match = re.search(pattern, text, re.DOTALL)
                if match:
                    sections[sec] = match.group(1).strip()
                    break

        paper["sections"] = sections

    return {"papers": state["papers"]}


# ------------------ KEY CONCEPTS --------------------------

def extract_key_concepts_node(state: ResearchAgentState):
    for paper in state["papers"]:
        vectorizer = TfidfVectorizer(
            stop_words="english",
            ngram_range=(1, 2),
            max_features=500
        )
        X = vectorizer.fit_transform(paper["chunks"])
        scores = X.sum(axis=0).A1
        terms = vectorizer.get_feature_names_out()

        ranked = sorted(zip(terms, scores), key=lambda x: x[1], reverse=True)
        paper["concepts"] = [t for t, _ in ranked[:20]]

    return {"papers": state["papers"]}


# ------------------ METHODOLOGIES -------------------------

AI_ML_KEYWORDS = [
    "machine learning", "deep learning", "neural network",
    "cnn", "rnn", "transformer", "bert", "gpt",
    "random forest", "svm", "xgboost",
    "gradient descent", "backpropagation",
    "accuracy", "precision", "recall", "f1 score"
]

def extract_methodologies_node(state: ResearchAgentState):
    for paper in state["papers"]:
        found = set()
        for chunk in paper["chunks"]:
            c = chunk.lower()
            for kw in AI_ML_KEYWORDS:
                if kw in c:
                    found.add(kw)

        paper["methodologies"] = sorted(found)

    return {"papers": state["papers"]}


# ------------------ FINDINGS ------------------------------

def extract_findings_node(state: ResearchAgentState):
    for paper in state["papers"]:
        findings = []
        for chunk in paper["chunks"]:
            for sent in re.split(r"[.!?]", chunk):
                if any(w in sent.lower() for w in [
                    "outperform", "improve", "increase",
                    "significant", "accuracy"
                ]):
                    findings.append(sent.strip())

        paper["findings"] = findings[:25]

    return {"papers": state["papers"]}


# ------------------ CITATIONS -----------------------------

def extract_citations_node(state: ResearchAgentState):
    patterns = [
        r"\([A-Za-z]+ et al\., \d{4}\)",
        r"\[\d+\]"
    ]

    for paper in state["papers"]:
        citations = set()
        for p in patterns:
            citations.update(re.findall(p, paper["text"]))
        paper["citations"] = list(citations)

    return {"papers": state["papers"]}


# ------------------ TABLES / FIGURES / STATS --------------

def extract_tables_figures_node(state: ResearchAgentState):
    for paper in state["papers"]:
        extracted = []
        for chunk in paper["chunks"]:
            if any(k in chunk.lower() for k in [
                "table", "figure", "fig.", "%",
                "mean", "std", "accuracy"
            ]):
                extracted.append(chunk)

        paper["tables_figures_stats"] = extracted[:20]

    return {"papers": state["papers"]}


# ------------------ FINAL STRUCTURING ---------------------

def build_extracted_info_node(state: ResearchAgentState):
    extracted_info = {}

    for paper in state["papers"]:
        extracted_info[paper["name"]] = {
            "sections": paper.get("sections", {}),
            "concepts": paper.get("concepts", []),
            "methodologies": paper.get("methodologies", []),
            "findings": paper.get("findings", []),
            "citations": paper.get("citations", []),
            "tables_figures_stats": paper.get("tables_figures_stats", [])
        }

    return {"extracted_info": extracted_info}


# ==========================================================
# LANGGRAPH DEFINITION
# ==========================================================

graph = StateGraph(ResearchAgentState)

graph.add_node("load_pdfs", load_pdfs_node)
graph.add_node("split_papers", split_papers_node)
graph.add_node("extract_sections", extract_sections_node)
graph.add_node("extract_concepts", extract_key_concepts_node)
graph.add_node("extract_methods", extract_methodologies_node)
graph.add_node("extract_findings", extract_findings_node)
graph.add_node("extract_citations", extract_citations_node)
graph.add_node("extract_tables", extract_tables_figures_node)
graph.add_node("build_extracted_info", build_extracted_info_node)

graph.set_entry_point("load_pdfs")

graph.add_edge("load_pdfs", "split_papers")
graph.add_edge("split_papers", "extract_sections")
graph.add_edge("extract_sections", "extract_concepts")
graph.add_edge("extract_concepts", "extract_methods")
graph.add_edge("extract_methods", "extract_findings")
graph.add_edge("extract_findings", "extract_citations")
graph.add_edge("extract_citations", "extract_tables")
graph.add_edge("extract_tables", "build_extracted_info")
graph.add_edge("build_extracted_info", END)

research_extraction_graph = graph.compile()


# ==========================================================
# RUNNER
# ==========================================================

def run_extraction():
    state = {
        "messages": [],
        "papers": [],
        "query": "",
        "extracted_info": {},
        "search_results": [],
        "comparison_matrix": None,
        "research_gaps": [],
        "iteration_count": 0,
        "reflection": ""
    }

    result = research_extraction_graph.invoke(state)

    print("\n✅ EXTRACTION COMPLETE\n")

    for paper, info in result["extracted_info"].items():
        print(f"\n================ {paper} ================")
        for k, v in info.items():
            print(f"\n--- {k.upper()} ({len(v) if isinstance(v, list) else ''}) ---")
            if isinstance(v, dict):
                for sec, content in v.items():
                    print(f"\n{sec.upper()}:\n{content[:800]}...\n")
            else:
                for item in v[:10]:
                    print(f"- {str(item)[:300]}")


if __name__ == "__main__":
    run_extraction()



  from .autonotebook import tqdm as notebook_tqdm



✅ EXTRACTION COMPLETE



--- SECTIONS () ---

ABSTRACT:
the dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. the best
performing models also connect the encoder and decoder through an attention
mechanism. we propose a new simple network architecture, the transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring significantly
less time to train. our model achieves 28.4 bleu on the wmt 2014 english-
to-german translation task, improving over the existing best results, including
ensembles, by over 2 bleu. on the wmt 2014 english-to-french translation task,
our model establishes ...


INTRODUCTION:
recurrent neural networks, long short-term memory [13] and gated recurrent [7] neural networks
in particular, have been fir