# Module 1 : Topic Input & Paper Search

### Install and Import required libraries and modules

In [1]:
# !pip install semanticscholar python-dotenv requests -q

import json
import os
import textwrap
import pandas as pd
from datetime import datetime
from typing import Dict, List, Optional, Any

from dotenv import load_dotenv
from semanticscholar import SemanticScholar

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


### Setup API key

In [2]:
def setup_api_key() -> SemanticScholar:
    """
    Initialize and return a SemanticScholar client.

    Behavior:
    - Attempts to load SEMANTIC_SCHOLAR_API_KEY from a .env file.
    - If not found, does NOT write a real API key to disk (hard-coded keys removed).
      Instead, continues without a key (limited rate) and prints clear instructions.
    - Returns an initialized SemanticScholar client (with or without api_key).

    Returns:
        SemanticScholar: Initialized client object.
    """
    load_dotenv()
    api_key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    if not api_key:
        print(
            "SEMANTIC_SCHOLAR_API_KEY not found in environment. "
            "Proceeding without API key (limited rate)."
        )
        print(
            "To use a key: create a .env file with a line like:\n"
            "SEMANTIC_SCHOLAR_API_KEY=83rBkeaXb14D8vGpXJezU6nrCFFmyn5L8RCvT9MM\n"
            "Then re-run this script."
        )
        scholar_client = SemanticScholar()
    else:
        scholar_client = SemanticScholar(api_key=api_key)
        print("Semantic Scholar initialized with API key.")

    return scholar_client

### Search Research Papers

In [3]:
def search_papers(topic: str, limit: int = 20) -> Optional[Dict[str, Any]]:
    """
    Search Semantic Scholar for papers on a given topic.

    Args:
        topic (str): Topic/query string for searching papers.
        limit (int): Maximum number of papers to request.

    Returns:
        dict or None: Dictionary containing search metadata and papers list
                      or None if an error occurred.
    """
    if not topic or not topic.strip():
        raise ValueError("search_papers requires a non-empty topic string.")

    print(f"\nSearching for papers on: '{topic}' (limit={limit})")

    scholar_client = setup_api_key()

    try:
        # Request fields that are useful downstream
        results = scholar_client.search_paper(
            query=topic,
            limit=limit,
            fields=[
                "paperId", "title", "abstract", "year", "authors",
                "citationCount", "openAccessPdf", "url", "venue"
            ]
        )

        papers: List[Dict[str, Any]] = []

        for paper in results:
            raw_authors = getattr(paper, "authors", []) or []
            authors: List[str] = []
            for a in raw_authors:
                if hasattr(a, "name"):
                    authors.append(getattr(a, "name"))
                elif isinstance(a, dict) and "name" in a:
                    authors.append(a["name"])
                else:
                    authors.append(str(a))

            open_access_pdf = getattr(paper, "openAccessPdf", None)
            pdf_url = None
            has_pdf = False
            if open_access_pdf:
                if isinstance(open_access_pdf, dict):
                    pdf_url = open_access_pdf.get("url")
                else:
                    pdf_url = getattr(open_access_pdf, "get", lambda x: None)("url")
                has_pdf = bool(pdf_url)

            paper_entry = {
                "title": getattr(paper, "title", "") or "No title",
                "authors": authors,
                "year": getattr(paper, "year", None),
                "paperId": getattr(paper, "paperId", None),
                "abstract": (getattr(paper, "abstract", "") or "")[:300] + ("..." if getattr(paper, "abstract", None) and len(getattr(paper, "abstract", "")) > 300 else ""),
                "citationCount": getattr(paper, "citationCount", 0),
                "venue": getattr(paper, "venue", None),
                "url": getattr(paper, "url", None),
                "pdf_url": pdf_url,
                "has_pdf": has_pdf
            }
            papers.append(paper_entry)

        papers_with_pdf = sum(1 for p in papers if p["has_pdf"])

        print("Search complete!")
        print(f"  Total papers returned: {len(papers)}")
        print(f"  Papers with PDF available: {papers_with_pdf}")

        return {
            "topic": topic,
            "search_timestamp": datetime.now().isoformat(),
            "total_results": len(papers),
            "papers_with_pdf": papers_with_pdf,
            "papers": papers
        }

    except Exception as exc:
        print(f"Error searching papers: {exc}")
        return None

### Save Searched Research Papers

In [4]:
def save_search_results(data: Dict[str, Any], filename: Optional[str] = None) -> str:
    """
    Save search results dict to a JSON file under data/search_results.

    Args:
        data (dict): Data returned by `search_papers`.
        filename (str, optional): Custom filename. If None, generate from topic.

    Returns:
        str: Full path of the saved JSON file.
    """
    if not data or "topic" not in data:
        raise ValueError("save_search_results requires data dictionary with a 'topic' key.")

    # Create a filesystem-safe filename if not provided
    if not filename:
        safe_topic = "".join(c for c in data["topic"] if c.isalnum() or c == " ").strip()
        safe_topic = safe_topic.replace(" ", "_") or "search"
        filename = f"paper_search_results_{safe_topic}.json"

    os.makedirs("data/search_results", exist_ok=True)
    filepath = os.path.join("data/search_results", filename)

    with open(filepath, "w", encoding="utf-8") as fh:
        json.dump(data, fh, indent=4, ensure_ascii=False)

    print(f"Search results saved to: {filepath}")
    return filepath

### Display the Searched Results

In [5]:
def display_search_results(data: Dict[str, Any], max_display: int = 10) -> None:
    """
    Display search results as a pandas DataFrame (table).

    If running in a Jupyter / notebook environment the DataFrame will render
    as a nice HTML table. In a plain console, the DataFrame will be printed
    as text. Shows top `max_display` papers.
    """
    if not data or "papers" not in data:
        print("No data to display.")
        return

    papers = data["papers"]
    total = len(papers)
    pdf_count = sum(1 for p in papers if p.get("has_pdf"))
    no_pdf_count = total - pdf_count

    print("\n" + "=" * 72)
    print(f"SEARCH RESULTS: {data.get('topic', 'Unknown topic')}")
    print("=" * 72)
    print("\nStatistics:")
    print(f"  • Total papers: {total}")
    print(f"  • Papers with PDF: {pdf_count}")
    print(f"  • Papers without PDF: {no_pdf_count}")

    to_show = min(max_display, total)
    if to_show == 0:
        print("\nNo papers to display.")
        return

    # Build rows for DataFrame
    rows = []
    for idx, paper in enumerate(papers[:to_show], start=1):
        title = paper.get("title", "") or ""
        authors = paper.get("authors", []) or []
        authors_display = ", ".join(authors)
        year = paper.get("year", "")
        citations = paper.get("citationCount", 0)
        has_pdf = paper.get("has_pdf", False)
        pdf_url = paper.get("pdf_url", "") or ""
        url = paper.get("url", "") or ""
        abstract = (paper.get("abstract") or "")
        if len(abstract) > 300:
            abstract = abstract[:297] + "..."

        rows.append({
            "#": idx,
            "Title": title,
            "Authors": authors_display,
            "Year": year,
            "Citations": citations,
            "Has PDF": has_pdf,
            "PDF URL": pdf_url,
            "URL": url,
            "Abstract": abstract
        })

    df = pd.DataFrame(rows)

    col_order = ["#", "Title", "Authors", "Year", "Citations", "Has PDF", "PDF URL", "URL", "Abstract"]
    df = df[col_order]

    try:
        from IPython.display import display as _display, HTML
        _display(df)
    except Exception:
        pd.set_option("display.max_colwidth", 120)
        print("\nTop results (DataFrame):\n")
        print(df.to_string(index=False))

    print(f"\nShowing top {to_show} of {total} papers. Use `max_display` to change the table size.")

### Main Search

In [6]:
def main_search() -> (Optional[Dict[str, Any]], Optional[str]):
    """
    Interactive main entry for Module 1.

    Returns:
        Tuple of (results dict or None, path to saved file or None).
    """
    print("\n" + "=" * 72)
    print("MODULE 1: TOPIC INPUT & PAPER SEARCH")
    print("=" * 72)

    try:
        topic = input("\nEnter research topic: ").strip()
    except Exception:
        topic = ""

    if not topic:
        topic = "artificial intelligence"

    results = search_papers(topic, limit=20)
    if not results:
        print("No results found or an error occurred during search.")
        return None, None

    save_path = save_search_results(results)
    display_search_results(results)

    print("\nModule 1 complete. Results saved to:", save_path)
    print("Proceed to Module 2 for paper selection and PDF download.")
    return results, save_path


if __name__ == "__main__":
    main_search()


MODULE 1: TOPIC INPUT & PAPER SEARCH

Enter research topic: Sun

Searching for papers on: 'Sun' (limit=20)
Semantic Scholar initialized with API key.
Search complete!
  Total papers returned: 1000
  Papers with PDF available: 433
Search results saved to: data/search_results\paper_search_results_Sun.json

SEARCH RESULTS: Sun

Statistics:
  • Total papers: 1000
  • Papers with PDF: 433
  • Papers without PDF: 567


Unnamed: 0,#,Title,Authors,Year,Citations,Has PDF,PDF URL,URL,Abstract
0,1,The chemical composition of the Sun,"N. Grevesse, Martin Asplund, A. Sauval, Pat Scott",2009,6661,True,https://arxiv.org/pdf/0909.0948,https://www.semanticscholar.org/paper/abcacba6...,We present a redetermination of the solar abun...
1,2,SUN database: Large-scale scene recognition fr...,"Jianxiong Xiao, James Hays, Krista A. Ehinger,...",2010,4079,True,https://dspace.mit.edu/bitstream/1721.1/60690/...,https://www.semanticscholar.org/paper/908091b4...,Scene categorization is a fundamental problem ...
2,3,IEEE Standard for Low-Rate Wireless Networks A...,,2022,510,False,,https://www.semanticscholar.org/paper/3546467e...,
3,4,The validity and practicality of sun-reactive ...,T. Fitzpatrick,1988,4019,False,,https://www.semanticscholar.org/paper/e5739d50...,
4,5,SUN RGB-D: A RGB-D scene understanding benchma...,"Shuran Song, Samuel P. Lichtenberg, Jianxiong ...",2015,1964,True,http://vision.cs.princeton.edu/projects/2015/S...,https://www.semanticscholar.org/paper/b73e2d40...,
5,6,A flexible inversion algorithm for retrieval o...,"O. Dubovik, M. King",2000,2354,True,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://www.semanticscholar.org/paper/18a91fec...,
6,7,The chemical make-up of the Sun: A 2020 vision,"M. Asplund, A. Amarsi, N. Grevesse",2021,327,True,https://www.aanda.org/articles/aa/pdf/2021/09/...,https://www.semanticscholar.org/paper/1d6e72f6...,Context. The chemical composition of the Sun i...
7,8,Sun Microsystems,"Elena Loutskina, Eric Varney",2017,615,False,,https://www.semanticscholar.org/paper/60393847...,
8,9,Accuracy assessments of aerosol optical proper...,"O. Dubovik, A. Smirnov, B. Holben, M. King, Y....",2000,1664,True,https://onlinelibrary.wiley.com/doi/pdfdirect/...,https://www.semanticscholar.org/paper/5ac95c69...,
9,10,Tailoring Graphene Oxide‐Based Aerogels for Ef...,"Xiaozhen Hu, Weichao Xu, Lin Zhou, Yingling Ta...",2017,852,False,,https://www.semanticscholar.org/paper/80c59640...,



Showing top 10 of 1000 papers. Use `max_display` to change the table size.

Module 1 complete. Results saved to: data/search_results\paper_search_results_Sun.json
Proceed to Module 2 for paper selection and PDF download.


# Module 2 : Paper Selection & Pdf Downloads
### install and import libraries

In [7]:
# !pip install PyMuPDF requests -q

import json
import os
import hashlib
import time
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional, Any

import requests
import fitz 

### Load the Previous Seached Paper

In [8]:
def load_search_results(filepath: Optional[str] = None) -> Optional[Dict[str, Any]]:
    """
    Load previously saved search results JSON.

    If filepath is None, find the newest JSON in data/search_results.

    Args:
        filepath: Optional path to a specific results file.

    Returns:
        Parsed JSON dict or None if loading fails.
    """
    if filepath:
        results_path = Path(filepath)
    else:
        results_dir = Path("data/search_results")
        if not results_dir.exists():
            print("Search results directory not found. Run Module 1 first.")
            return None

        json_files = sorted(
            (f for f in results_dir.iterdir() if f.suffix == ".json"),
            key=lambda p: p.stat().st_mtime,
            reverse=True
        )

        if not json_files:
            print("No search results found. Run Module 1 first.")
            return None

        results_path = json_files[0]
        print(f"Loading most recent search results: {results_path.name}")

    try:
        with results_path.open("r", encoding="utf-8") as fh:
            data = json.load(fh)
        papers = data.get("papers", [])
        print(f"Loaded {len(papers)} papers on '{data.get('topic', 'Unknown')}'")
        return data
    except Exception as exc:
        print(f"Error loading file {results_path}: {exc}")
        return None


### Filter Research Papers as PDFs

In [9]:
def filter_papers_with_pdfs(papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return subset of papers that contain a plausible PDF URL.

    Args:
        papers: list of paper dicts (from Module 1 save format).

    Returns:
        List of papers that likely have PDF links.
    """
    papers_with_pdf: List[Dict[str, Any]] = []
    for paper in papers:
        pdf_url = (paper.get("pdf_url") or "").strip()
        if not pdf_url:
            continue

        lower = pdf_url.lower()
        if lower.endswith(".pdf") or ".pdf?" in lower or "pdf" in lower:
            papers_with_pdf.append(paper)

    print(f"\nPDF Availability:")
    print(f"  • Total papers checked: {len(papers)}")
    print(f"  • Papers with PDF URLs: {len(papers_with_pdf)}")

    return papers_with_pdf

### Rank the Papers

In [10]:
def rank_papers(papers: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Rank papers by citation count (desc) then year (desc).

    Args:
        papers: list of paper dicts.

    Returns:
        Sorted list (highest ranked first).
    """
    valid = [p for p in papers if p.get("citationCount") is not None]
    ranked = sorted(
        valid,
        key=lambda x: (int(x.get("citationCount", 0)), int(x.get("year", 0) or 0)),
        reverse=True
    )
    return ranked

### Select the Top Papers

In [11]:
def select_top_papers(papers: List[Dict[str, Any]], count: int = 3) -> List[Dict[str, Any]]:
    """
    Select top N papers (by ranking) that have PDFs.

    Args:
        papers: All papers (from search results).
        count: Number of papers to select.

    Returns:
        Selected papers list.
    """
    papers_with_pdf = filter_papers_with_pdfs(papers)
    ranked = rank_papers(papers_with_pdf)
    selected = ranked[:count]

    print(f"\nSelected top {len(selected)} papers for download:")
    for i, p in enumerate(selected, 1):
        title = p.get("title", "")[:70]
        print(f"\n{i}. {title}{'...' if len(p.get('title', '')) > 70 else ''}")
        print(f"   Citations: {p.get('citationCount', 0)}")
        print(f"   Year: {p.get('year', 'N/A')}")
        authors = ", ".join(p.get("authors", [])[:2])
        print(f"   Authors: {authors}")

    return selected

### Verification the Research Papers for PDFs

In [12]:
def _is_response_pdf(response: requests.Response) -> bool:
    """
    Heuristic check if a requests response looks like a PDF.

    Args:
        response: requests.Response object.

    Returns:
        True if response content-type or initial bytes indicate PDF.
    """
    content_type = response.headers.get("content-type", "").lower()
    if "pdf" in content_type:
        return True
    # Check first bytes (PDF files start with '%PDF')
    start = response.content[:4]
    return start == b"%PDF"



def download_pdf_with_verification(url: str, filename: str, max_retries: int = 2, chunk_size: int = 1024*32) -> bool:
    """
    Download a PDF URL to `filename` with verification.

    - Streams to disk using iter_content (no resp.raw.seek usage).
    - Checks initial bytes for '%PDF' and content-type hints.
    - Retries on transient errors with simple backoff.
    - Returns True on success (valid PDF), False otherwise.
    """
    session = requests.Session()
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36",
    }

    target = Path(filename)
    target.parent.mkdir(parents=True, exist_ok=True)

    for attempt in range(1, max_retries + 1):
        try:
            print(f"  Attempt {attempt}/{max_retries}: {url}")
            with session.get(url, headers=headers, timeout=30, stream=True, allow_redirects=True) as resp:
                status = resp.status_code

                if status == 403:
                    print(f"    HTTP 403 - Forbidden (site blocked this request).")
                    return False

                if status >= 400:
                    print(f"    HTTP {status} - skipping this attempt")
                    time.sleep(1 * attempt)
                    continue

                # Basic content-type check
                content_type = resp.headers.get("content-type", "").lower()
                looks_like_pdf_by_ct = "pdf" in content_type

                first_bytes = b""
                bytes_written = 0

                with open(target, "wb") as fh:
                    for chunk in resp.iter_content(chunk_size=chunk_size):
                        if not chunk:
                            continue

                        if bytes_written < 8:
                            need = 8 - len(first_bytes)
                            first_bytes += chunk[:need]
                        fh.write(chunk)
                        bytes_written += len(chunk)

                if first_bytes.startswith(b"%PDF") or looks_like_pdf_by_ct:
                    # Final verification using PyMuPDF (fitz)
                    try:
                        import fitz
                        with fitz.open(str(target)) as doc:
                            if len(doc) > 0:
                                size = target.stat().st_size
                                print(f"    Downloaded: {size:,} bytes -> {target.name}")
                                return True
                            else:
                                print("    Downloaded file opened but has zero pages.")
                                target.unlink(missing_ok=True)
                                continue
                    except Exception as e:
                        print(f"    PDF verification failed (PyMuPDF): {e}")
                        target.unlink(missing_ok=True)
                        continue
                else:
                    print("    File does not look like a PDF (missing %PDF signature and content-type not PDF).")
                    target.unlink(missing_ok=True)
                    continue

        except requests.exceptions.Timeout:
            print("    Timeout during download attempt.")
            time.sleep(1 * attempt)
        except requests.exceptions.RequestException as e:
            # handle other network errors
            print(f"    Network error during download: {e}")
            time.sleep(1 * attempt)
        except Exception as e:
            print(f"    Unexpected error during download: {e}")
            time.sleep(1 * attempt)

    # All attempts failed
    return False

def get_pdf_info(filepath: str) -> Dict[str, Any]:
    """
    Return basic metadata about a PDF file.

    Args:
        filepath: path to PDF.

    Returns:
        Dict with pages, size_bytes, size_mb, is_valid.
    """
    try:
        p = Path(filepath)
        if not p.exists():
            return {"is_valid": False}

        size_bytes = p.stat().st_size
        with fitz.open(str(p)) as doc:
            pages = len(doc)
        return {
            "pages": pages,
            "size_bytes": size_bytes,
            "size_mb": round(size_bytes / (1024 * 1024), 2),
            "is_valid": True
        }
    except Exception:
        return {"is_valid": False}

### Download the Selected Papers

In [13]:
def download_selected_papers(selected_papers: List[Dict[str, Any]], output_dir: str = "downloads") -> List[Dict[str, Any]]:
    """
    Download selected papers to output directory and collect metadata.

    Args:
        selected_papers: list of paper dicts with keys 'title' and 'pdf_url'.
        output_dir: directory to place downloaded PDFs.

    Returns:
        List of paper dicts augmented with download metadata.
    """
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    downloaded: List[Dict[str, Any]] = []
    print(f"\nStarting PDF downloads to: {out_path}/")
    print("-" * 60)

    for idx, paper in enumerate(selected_papers, start=1):
        title = paper.get("title", "untitled")
        print(f"\n[{idx}/{len(selected_papers)}] Downloading: {title[:60]}{'...' if len(title) > 60 else ''}")

        # Create safe filename derived from title and a short hash to avoid collisions
        safe_title = "".join(c for c in title if c.isalnum() or c in (" ", "-", "_")).strip()
        safe_title = safe_title[:50] if len(safe_title) > 50 else safe_title
        short_hash = hashlib.md5(safe_title.encode("utf-8")).hexdigest()[:8]
        filename = out_path / f"paper_{idx}_{short_hash}.pdf"

        pdf_url = paper.get("pdf_url")
        success = False
        if pdf_url:
            success = download_pdf_with_verification(str(pdf_url), str(filename))

        if success:
            pdf_info = get_pdf_info(str(filename))
            paper["downloaded"] = True
            paper["local_path"] = str(filename)
            paper["download_time"] = datetime.now().isoformat()
            paper["pdf_info"] = pdf_info
            downloaded.append(paper)
            print(f"    Success! {pdf_info.get('pages', 'N/A')} pages, {pdf_info.get('size_mb', 'N/A')} MB")
        else:
            paper["downloaded"] = False
            print("    Failed to download.")

    return downloaded

### Save & Verify the Downloaded Reports

In [14]:
def save_download_report(downloaded_papers: List[Dict[str, Any]], topic: str, output_dir: str = "downloads") -> str:
    """
    Save a detailed download report and a simple list of downloaded files.

    Args:
        downloaded_papers: list returned from download_selected_papers.
        topic: research topic string for report context.
        output_dir: directory where downloaded files are located.

    Returns:
        Path to the saved JSON report.
    """
    report = {
        "topic": topic,
        "download_timestamp": datetime.now().isoformat(),
        "total_selected": len(downloaded_papers),
        "successful_downloads": sum(1 for p in downloaded_papers if p.get("downloaded")),
        "failed_downloads": sum(1 for p in downloaded_papers if not p.get("downloaded")),
        "papers": downloaded_papers
    }

    Path("data/reports").mkdir(parents=True, exist_ok=True)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_file = Path("data/reports") / f"download_report_{timestamp}.json"

    with report_file.open("w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=4, ensure_ascii=False)

    print(f"\nDownload report saved to: {report_file}")

    download_list = []
    for p in downloaded_papers:
        if p.get("downloaded"):
            pdf_info = p.get("pdf_info", {})
            download_list.append({
                "title": p.get("title"),
                "local_file": p.get("local_path"),
                "size_mb": pdf_info.get("size_mb"),
                "pages": pdf_info.get("pages")
            })

    list_file = Path(output_dir) / "downloaded_papers_list.json"
    with list_file.open("w", encoding="utf-8") as fh:
        json.dump(download_list, fh, indent=4, ensure_ascii=False)

    return str(report_file)


def verify_downloads(output_dir: str = "downloads") -> int:
    """
    Verify all PDFs in the output directory and print a summary.

    Args:
        output_dir: directory containing downloaded PDFs.

    Returns:
        Number of valid PDF files.
    """
    out_path = Path(output_dir)
    if not out_path.exists():
        print(f"Directory '{output_dir}' does not exist!")
        return 0

    pdf_files = sorted(out_path.glob("*.pdf"))
    print("\n" + "=" * 60)
    print("VERIFICATION OF DOWNLOADS")
    print("=" * 60)
    print(f"\nDirectory: {out_path.resolve()}")
    print(f"PDF files found: {len(pdf_files)}")

    total_size = 0
    valid_count = 0

    if pdf_files:
        print("\nFile Details:")
        print("-" * 60)

        for pdf in pdf_files:
            size = pdf.stat().st_size
            total_size += size
            info = get_pdf_info(str(pdf))
            is_valid = info.get("is_valid", False)
            if is_valid:
                valid_count += 1
                try:
                    with fitz.open(str(pdf)) as doc:
                        pages = len(doc)
                except Exception:
                    pages = "N/A"
                print(f" {pdf.name}")
                print(f"   Size: {size:,} bytes ({size / (1024 * 1024):.2f} MB)")
                print(f"   Pages: {pages}")
            else:
                print(f" {pdf.name} - INVALID PDF")
                print(f"   Size: {size:,} bytes")

    print(f"\nSummary:")
    print(f"  • Total PDF files: {len(pdf_files)}")
    print(f"  • Valid PDFs: {valid_count}")
    print(f"  • Total size: {total_size / (1024 * 1024):.2f} MB")

    return valid_count

### Main Download Function

In [15]:
def main_download(filepath: Optional[str] = None, download_count: int = 3) -> Optional[List[Dict[str, Any]]]:
    """
    Main entry for Module 2: load search results, select top papers, download them, save a report.

    Args:lepath: optional specific search results JSON file to load.
        download_count: number of top papers to download.

    Returns:
        List of downloaded paper metadata, or None if pipeline could not proceed.
    """
    print("\n" + "=" * 72)
    print("MODULE 2: PAPER SELECTION & PDF DOWNLOAD")
    print("=" * 72)

    data = load_search_results(filepath)
    if not data:
        return None

    selected_papers = select_top_papers(data.get("papers", []), count=download_count)
    if not selected_papers:
        print("No papers with PDFs available for download.")
        return None

    downloaded = download_selected_papers(selected_papers)
    report_file = save_download_report(downloaded, data.get("topic", "unknown"))
    verify_downloads()

    print(f"\nModule 2 complete!")
    print(f"  Downloaded papers are in: downloads/")
    print(f"  Report saved to: {report_file}")

    return downloaded


if __name__ == "__main__":
    main_download(download_count=3)


MODULE 2: PAPER SELECTION & PDF DOWNLOAD
Loading most recent search results: paper_search_results_Sun.json
Loaded 1000 papers on 'Sun'

PDF Availability:
  • Total papers checked: 1000
  • Papers with PDF URLs: 344

Selected top 3 papers for download:

1. Microsoft COCO: Common Objects in Context
   Citations: 48983
   Year: 2014
   Authors: Tsung-Yi Lin, M. Maire

2. The CLUSTAL_X windows interface: flexible strategies for multiple sequ...
   Citations: 39945
   Year: 1997
   Authors: J. Thompson, T. Gibson

3. Solar water splitting cells.
   Citations: 8319
   Year: 2010
   Authors: M. Walter, E. Warren

Starting PDF downloads to: downloads/
------------------------------------------------------------

[1/3] Downloading: Microsoft COCO: Common Objects in Context
  Attempt 1/2: https://link.springer.com/content/pdf/10.1007%2F978-3-319-10602-1_48.pdf
    Downloaded: 2,887,169 bytes -> paper_1_b6ad4f9b.pdf
    Success! 16 pages, 2.75 MB

[2/3] Downloading: The CLUSTAL_X windows interfa

MuPDF error: format error: corrupt object stream (11 0 R)

 pan_card.pdf
   Size: 722,939 bytes (0.69 MB)
   Pages: 0
 paper_1_b6ad4f9b.pdf
   Size: 2,887,169 bytes (2.75 MB)
   Pages: 16
 paper_2_19dc86c4.pdf
   Size: 365,554 bytes (0.35 MB)
   Pages: 7
 paper_2_1c08f086.pdf
   Size: 710,934 bytes (0.68 MB)
   Pages: 7
 paper_3_829b5d7b.pdf
   Size: 184,494 bytes (0.18 MB)
   Pages: 1
 passbook.pdf
   Size: 131,244 bytes (0.13 MB)
   Pages: 1
 Policy.pdf
   Size: 422,201 bytes (0.40 MB)
   Pages: 6
 Practice problems.pdf
   Size: 4,021,126 bytes (3.83 MB)
   Pages: 2
 practice questions.pdf
   Size: 155,295 bytes (0.15 MB)
   Pages: 3
 practice_ques.pdf
   Size: 176,469 bytes (0.17 MB)
   Pages: 1
 prompt engineering.pdf
   Size: 121,171 bytes (0.12 MB)
   Pages: 1
 Provisional Allotment Letter.pdf
   Size: 202,707 bytes (0.19 MB)
   Pages: 1
 Python_Complete_Notes.pdf
   Size: 27,330,548 bytes (26.06 MB)
   Pages: 64
 questions.pdf
   Size: 180,938 bytes (0.17 MB)
   Pages: 2
 questi

# Module 3 : PDF Text Extraction

In [16]:
# MODULE 3: PDF TEXT EXTRACTION

import json
import os
import re
from pathlib import Path
from typing import List, Dict, Optional, Any

from datetime import datetime
from tqdm import tqdm
try:
    import fitz
    pymupdf = fitz
except Exception as exc:
    raise ImportError("PyMuPDF (fitz) is required for Module 3. Install with `pip install pymupdf`.") from exc

try:
    import pymupdf4llm
    HAS_PYMUPDF4LLM = True
except Exception:
    pymupdf4llm = None
    HAS_PYMUPDF4LLM = False


# -------------------------
# 1. TEXT EXTRACTION
# -------------------------
def extract_text_improved(pdf_path: Path) -> Optional[str]:
    """
    Improved text extraction that tries layout-aware markdown first (if available),
    otherwise performs robust plain-text extraction with page limits and heuristics.

    Args:
        pdf_path (Path): Path to the PDF file.

    Returns:
        str or None: Extracted text (possibly markdown). None if extraction failed or PDF appears restricted.
    """
    pdf_path = Path(pdf_path)
    if not pdf_path.exists():
        print(f"  File not found: {pdf_path}")
        return None

    try:
        doc = pymupdf.open(str(pdf_path))

        if getattr(doc, "isEncrypted", False):
            print(f"  PDF appears encrypted: {pdf_path.name}. Attempting extraction...")

        first_page_text = ""
        if len(doc) > 0:
            try:
                first_page_text = doc[0].get_text().strip()
            except Exception:
                first_page_text = ""

        copyright_indicators = ["removed", "deleted", "takedown", "not available"]
        if first_page_text and any(keyword in first_page_text.lower() for keyword in copyright_indicators):
            print(f"  PDF appears to contain takedown/copyright notice: {pdf_path.name}")
            doc.close()
            return None

        extracted_candidates: List[tuple] = []

        if HAS_PYMUPDF4LLM:
            try:
                md_text = pymupdf4llm.to_markdown(str(pdf_path))
                if md_text and len(md_text) > 500:
                    extracted_candidates.append(("markdown", md_text))
            except Exception:
                pass

        plain_text = []

        pages_to_process = min(50, len(doc))
        for page_no in range(pages_to_process):
            try:
                page = doc[page_no]
                p_text = page.get_text()
                if p_text:
                    plain_text.append(p_text)
            except Exception:
                continue

        full_text = "\n".join(plain_text).strip()
        if full_text and len(full_text) > 500:
            extracted_candidates.append(("regular", full_text))

        doc.close()

        if not extracted_candidates:
            return None

        for method, text in extracted_candidates:
            if method == "markdown" and len(text) > 1000:
                return text

        best_text = max(extracted_candidates, key=lambda x: len(x[1]))[1]
        return best_text

    except Exception as exc:
        print(f"  Extraction error for {pdf_path.name}: {str(exc)[:200]}")
        return None


# -------------------------
# 2. SECTION EXTRACTION
# -------------------------
def extract_sections_improved(text: str) -> Dict[str, Any]:
    """
    Extract standard paper sections (title, abstract, introduction, methods, results, conclusion, references)
    using header heuristics and keyword fallbacks.

    Args:
        text (str): The full extracted text from a PDF.

    Returns:
        dict: Mapping of section names to content plus an 'extracted_text' preview.
    """
    sections = {
        "title": "",
        "abstract": "",
        "introduction": "",
        "methods": "",
        "results": "",
        "conclusion": "",
        "references": "",
        "extracted_text": text[:20000] if text else ""
    }

    if not text or len(text) < 500:
        return sections

    clean = clean_text_basic(text)
    lines = clean.splitlines()

    header_patterns = {
        "abstract": [r'\babstract\b', r'\bsummary\b'],
        "introduction": [r'^\d+\.\s*introduction\b', r'\bintroduction\b', r'\bbackground\b'],
        "methods": [r'^\d+\.\s*methods?\b', r'\bmethods?\b', r'\bmethodology\b', r'\bexperimental\b'],
        "results": [r'^\d+\.\s*results?\b', r'\bresults?\b', r'\bfindings?\b'],
        "conclusion": [r'^\d+\.\s*conclusions?\b', r'\bconclusions?\b', r'\bdiscussion\b'],
        "references": [r'^\s*references\s*$', r'\bbibliography\b']
    }

    # Find line indices for section headers
    boundaries: Dict[str, int] = {}
    for idx, raw_line in enumerate(lines):
        line = raw_line.strip()
        low = line.lower()

        if len(line) > 200:
            continue
        for section_key, patterns in header_patterns.items():
            for pat in patterns:
                try:
                    if re.search(pat, low):
                        # Record first occurrence
                        if section_key not in boundaries:
                            boundaries[section_key] = idx
                            break
                except re.error:
                    continue

    if boundaries:
        sorted_sections = sorted(boundaries.items(), key=lambda x: x[1])
        for i, (section_name, start_idx) in enumerate(sorted_sections):
            start = start_idx + 1
            if i + 1 < len(sorted_sections):
                end = sorted_sections[i + 1][1]
            else:
                end = len(lines)
            content = "\n".join(lines[start:end]).strip()
            if len(content) > 100:
                sections[section_name] = content[:5000] 

    for line in lines[:10]:
        line_stripped = line.strip()
        if 20 < len(line_stripped) < 200 and not line_stripped.lower().startswith("http"):
            sections["title"] = line_stripped
            break

    major_keys = ["abstract", "introduction", "methods", "results", "conclusion"]
    if not any(len(sections[k]) > 200 for k in major_keys):
        sections = extract_by_keywords_fallback(clean, sections)

    return sections


def extract_by_keywords_fallback(text: str, existing_sections: Dict[str, Any]) -> Dict[str, Any]:
    """
    Fallback strategy: look for keywords for each section and return contextual snippets.

    Args:
        text (str): Full text.
        existing_sections (dict): Current sections dict to update.

    Returns:
        dict: Updated sections mapping.
    """
    text_lower = text.lower()

    section_keywords = {
        "abstract": ["abstract", "summary", "we present", "this paper"],
        "introduction": ["introduction", "background", "motivation", "related work"],
        "methods": ["method", "experiment", "procedure", "dataset", "implementation"],
        "results": ["result", "finding", "table", "figure", "experiment shows"],
        "conclusion": ["conclusion", "discussion", "future work", "limitations", "summary"]
    }

    sentences = re.split(r'(?<=[.!?])\s+', text)

    for section, keywords in section_keywords.items():
        if existing_sections.get(section):
            continue

        contexts: List[str] = []
        for idx, sentence in enumerate(sentences):
            s_low = sentence.lower()
            if any(kw in s_low for kw in keywords):
                start = max(0, idx - 2)
                end = min(len(sentences), idx + 5)
                context = " ".join(sentences[start:end])
                contexts.append(context.strip())

        if contexts:
            existing_sections[section] = " ".join(contexts)[:5000]

    return existing_sections


def clean_text_basic(text: str) -> str:
    """
    Basic cleaning to reduce PDF extraction noise:
    - Normalize whitespace
    - Fix common hyphenation across line breaks
    - Remove non-printable characters (except newline)

    Args:
        text (str): Raw extracted text.

    Returns:
        str: Cleaned text.
    """
    if not text:
        return ""

    # Normalize whitespace and remove excessive breaks
    t = re.sub(r'\r\n?', '\n', text)
    t = re.sub(r'\s+', ' ', t)

    # Fix hyphenation at end-of-line (common in PDFs)
    t = re.sub(r'-\s+', '', t)
    t = re.sub(r'\s*-\s*', '-', t)

    # Remove control characters except newline
    t = "".join(ch for ch in t if ch == '\n' or ord(ch) >= 32)

    return t.strip()


# -------------------------
# 3. PAPER PROCESSING
# -------------------------
def process_paper_smart(pdf_path: Path) -> Optional[Dict[str, Any]]:
    """
    Validate PDF size, extract text, detect meaningful sections, and return structured metadata.

    Args:
        pdf_path (Path): Path to a single PDF file.

    Returns:
        dict or None: Result dict with metadata and sections, or None if skipped/failed.
    """
    pdf_path = Path(pdf_path)
    print(f"\nProcessing: {pdf_path.name}")

    try:
        file_size = pdf_path.stat().st_size
    except Exception as exc:
        print(f"  Could not read file size: {exc}")
        return None

    if file_size < 10_240:  # 10 KB
        print(f"  File too small ({file_size:,} bytes) — skipping")
        return None

    raw_text = extract_text_improved(pdf_path)
    if raw_text is None:
        print("  Skipping — empty or restricted PDF")
        return None

    if len(raw_text) < 1000:
        print(f"  Warning: extracted text very short ({len(raw_text):,} chars) — may be incomplete")

    print(f"  Extracted {len(raw_text):,} characters")

    sections = extract_sections_improved(raw_text)

    meaningful_sections = [
        name for name, content in sections.items()
        if name != "extracted_text" and content and len(content) > 200
    ]

    print(f"   Found {len(meaningful_sections)} meaningful sections")
    for s in meaningful_sections[:3]:
        print(f"    • {s}: {len(sections[s]):,} chars")

    result = {
        "paper_id": pdf_path.stem,
        "filename": pdf_path.name,
        "file_size_bytes": file_size,
        "total_characters": len(raw_text),
        "meaningful_sections": meaningful_sections,
        "sections": sections,
        "status": "success"
    }

    return result


# -------------------------
# 4. MAIN EXTRACTION
# -------------------------
def extract_all_papers(download_dir: str = "downloads/research_papers", max_papers: Optional[int] = None) -> List[Dict[str, Any]]:
    """
    Extract text and sections from all PDFs found in `download_dir`.

    Default directory now looks in downloads/research_papers for organized storage.
    Falls back to 'downloads' if the specified directory is missing or empty.
    """
    print("\n" + "=" * 72)
    print("MODULE 3: PDF TEXT EXTRACTION")
    print("=" * 72)

    pdf_files = get_downloaded_papers(download_dir)
    if not pdf_files:
        print("No PDFs found in the target directories. Run Module 2 or check paths.")
        return []

    if max_papers:
        pdf_files = pdf_files[:max_papers]

    print(f"\nProcessing {len(pdf_files)} PDF file(s)...")

    results: List[Dict[str, Any]] = []
    skipped = 0

    for pdf_file in tqdm(pdf_files, desc="Processing PDFs"):
        try:
            res = process_paper_smart(pdf_file)
            if res:
                results.append(res)
            else:
                skipped += 1
        except Exception as exc:
            print(f"  Unexpected error processing {pdf_file.name}: {str(exc)[:200]}")
            skipped += 1
            continue

    if results:
        save_results_final(results)

    print("\nExtraction complete!")
    print(f"  Successfully processed: {len(results)}")
    print(f"  Skipped: {skipped}")

    return results


def get_downloaded_papers(download_dir: str = "downloads/research_papers") -> List[Path]:
    """
    Return list of PDF paths.

    Behavior:
      - First checks the preferred directory (default 'downloads/research_papers').
      - If that directory doesn't exist or contains no PDFs, falls back to 'downloads'.
      - Returns a sorted list of Path objects for PDF files.
    """
    preferred = Path(download_dir)
    fallback = Path("downloads")

    def _list_pdfs(path: Path) -> List[Path]:
        if not path.exists():
            return []
        return sorted(path.glob("*.pdf"))

    pdf_list = _list_pdfs(preferred)
    if pdf_list:
        print(f"Using PDFs from: {preferred.resolve()}")
        return pdf_list

    pdf_list = _list_pdfs(fallback)
    if pdf_list:
        print(f"No PDFs in {preferred}. Falling back to: {fallback.resolve()}")
        return pdf_list

    print(f"No PDF files found in either {preferred} or {fallback}.")
    return []

def save_results_final(results: List[Dict[str, Any]], output_dir: str = "data/extracted") -> None:
    """
    Save per-paper JSON files and a summary extraction file.

    Args:
        results (list): List of result dictionaries.
        output_dir (str): Directory to save outputs.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    for res in results:
        paper_id = res.get("paper_id", "unknown")
        output_file = output_path / f"{paper_id}_extracted.json"

        if "extracted_text" in res.get("sections", {}) and len(res["sections"]["extracted_text"]) > 10_000:
            res["sections"]["extracted_text"] = res["sections"]["extracted_text"][:10_000] + "...[truncated]"

        with output_file.open("w", encoding="utf-8") as fh:
            json.dump(res, fh, indent=2, ensure_ascii=False)

        print(f"   Saved: {output_file.name}")

    # Save a summary file
    summary = {
        "extraction_date": datetime.now().isoformat(),
        "total_papers": len(results),
        "papers": [
            {
                "paper_id": r["paper_id"],
                "filename": r["filename"],
                "file_size": r["file_size_bytes"],
                "total_chars": r["total_characters"],
                "sections_found": r["meaningful_sections"]
            }
            for r in results
        ]
    }

    summary_file = output_path / "extraction_summary.json"
    with summary_file.open("w", encoding="utf-8") as fh:
        json.dump(summary, fh, indent=2, ensure_ascii=False)

    print(f"\nSummary saved to: {summary_file}")


# -------------------------
# 5. ANALYZE RESULTS
# -------------------------
def analyze_extraction_results() -> None:
    """
    Read saved extracted JSON files and print an analysis summary.
    """
    print("\n" + "=" * 72)
    print("EXTRACTION ANALYSIS")
    print("=" * 72)

    data_path = Path("data/extracted")
    if not data_path.exists():
        print("No extraction directory found")
        return

    json_files = sorted(data_path.glob("*_extracted.json"))
    if not json_files:
        print("No extracted paper files found")
        return

    print(f"\nFound {len(json_files)} extracted papers:\n")

    total_chars = 0
    papers_with_abstract = 0
    papers_with_multiple_sections = 0

    for jf in json_files:
        try:
            with jf.open("r", encoding="utf-8") as fh:
                data = json.load(fh)

            paper_id = data.get("paper_id", "Unknown")
            total_chars += data.get("total_characters", 0)

            sections = data.get("sections", {})
            meaningful_sections = data.get("meaningful_sections", [])

            if sections.get("abstract") and len(sections["abstract"]) > 200:
                papers_with_abstract += 1

            if len(meaningful_sections) >= 2:
                papers_with_multiple_sections += 1

            print(f" {paper_id}")
            print(f"   Size: {data.get('file_size_bytes', 0):,} bytes")
            print(f"   Text: {data.get('total_characters', 0):,} chars")
            print(f"   Sections found: {len(meaningful_sections)}")

            if sections.get("title"):
                print(f"   Title: {sections['title'][:80]}")

            if sections.get("abstract"):
                print(f"   Abstract: {sections['abstract'][:150]}...")

            print()

        except Exception as exc:
            print(f" Error reading {jf.name}: {str(exc)[:200]}")

    print("\n" + "=" * 60)
    print("EXTRACTION SUMMARY")
    print("=" * 60)
    print(f"Total papers processed: {len(json_files)}")
    print(f"Total characters extracted: {total_chars:,}")
    print(f"Papers with abstract: {papers_with_abstract}/{len(json_files)}")
    print(f"Papers with multiple sections: {papers_with_multiple_sections}/{len(json_files)}")


# -------------------------
# 6. GENERATE REPORT
# -------------------------
def generate_report() -> Optional[Dict[str, Any]]:
    """
    Generate a per-paper quality report and an aggregated review JSON that a mentor can inspect.

    Returns:
        dict: Report dictionary (also saved to data/extracted/_review_report.json)
    """
    print("\n" + "=" * 72)
    print("  REVIEW REPORT")
    print("=" * 72)

    data_path = Path("data/extracted")
    if not data_path.exists():
        print(" No extraction directory found")
        return None

    json_files = sorted(data_path.glob("*_extracted.json"))
    if not json_files:
        print(" No extracted papers found")
        return None

    report: Dict[str, Any] = {
        "generated_date": datetime.now().isoformat(),
        "total_papers": len(json_files),
        "quality_checks": [],
        "papers": []
    }

    for jf in json_files:
        try:
            with jf.open("r", encoding="utf-8") as fh:
                data = json.load(fh)

            sections = data.get("sections", {})
            paper_report = {
                "paper_id": data.get("paper_id"),
                "filename": data.get("filename"),
                "checks": {
                    "text_clean": False,
                    "sections_correct": False,
                    "no_hallucinations": False,
                    "no_missing_chunks": False
                },
                "section_lengths": {},
                "issues": []
            }

            sample_text = sections.get("abstract") or sections.get("extracted_text", "")
            artifacts = ['�', '\x00', '[?]', '[ ]']
            has_artifacts = any(art in sample_text for art in artifacts)
            paper_report["checks"]["text_clean"] = not has_artifacts
            if has_artifacts:
                paper_report["issues"].append("Text contains extraction artifacts")

            major_sections = ["abstract", "introduction", "methods", "results", "conclusion"]
            found_major = [s for s in major_sections if sections.get(s) and len(sections[s]) > 200]
            paper_report["checks"]["sections_correct"] = len(found_major) >= 2
            if len(found_major) < 2:
                paper_report["issues"].append(f"Only found {len(found_major)} major sections")

            total_chars = data.get("total_characters", 0)
            paper_report["checks"]["no_hallucinations"] = 1000 <= total_chars <= 500_000
            if total_chars < 1000:
                paper_report["issues"].append(f"Text too short: {total_chars} chars")
            elif total_chars > 500_000:
                paper_report["issues"].append(f"Text suspiciously long: {total_chars} chars")

            section_lengths_sum = sum(len(str(v)) for v in sections.values() if v)
            coverage = section_lengths_sum / total_chars if total_chars > 0 else 0
            paper_report["checks"]["no_missing_chunks"] = coverage >= 0.3
            if coverage < 0.3:
                paper_report["issues"].append(f"Low coverage: {coverage:.1%}")

            for sec_name, content in sections.items():
                if content and len(str(content)) > 50:
                    paper_report["section_lengths"][sec_name] = len(str(content))

            report["papers"].append(paper_report)
        except Exception as exc:
            print(f"Error processing {jf.name}: {str(exc)[:200]}")

    # Aggregate overall scores
    total_checks = 0
    passed_checks = 0
    for paper in report["papers"]:
        for check_name, passed in paper["checks"].items():
            total_checks += 1
            if passed:
                passed_checks += 1

    report["overall_score"] = f"{passed_checks}/{total_checks}" if total_checks > 0 else "N/A"
    report["success_rate"] = (passed_checks / total_checks) if total_checks > 0 else 0

    report_file = data_path / "_review_report.json"
    with report_file.open("w", encoding="utf-8") as fh:
        json.dump(report, fh, indent=2, ensure_ascii=False)

    print("\nreport generated!")
    print(f"  Overall score: {report['overall_score']}")
    print(f"  Success rate: {report['success_rate']:.1%}")
    print(f"  Report saved to: {report_file}")

    print("\nQUALITY CHECK SUMMARY:")
    print("-" * 40)
    check_names = ["text_clean", "sections_correct", "no_hallucinations", "no_missing_chunks"]
    for check_name in check_names:
        passed_count = sum(1 for paper in report["papers"] if paper["checks"].get(check_name, False))
        total = len(report["papers"])
        percentage = (passed_count / total * 100) if total > 0 else 0
        status = "✅" if percentage >= 70 else "⚠️" if percentage >= 50 else "❌"
        print(f"{status} {check_name}: {passed_count}/{total} ({percentage:.0f}%)")

    return report


# -------------------------
# 7. RUN COMPLETE PIPELINE
# -------------------------
def run_complete_extraction() -> (List[Dict[str, Any]], Optional[Dict[str, Any]]):
    """
    Full extraction pipeline entrypoint:
      - Extract text from PDFs (up to a limit)
      - Analyze saved results
      - Generate mentor review report

    Returns:
        (results_list, report_dict)
    """
    print("\n" + "=" * 72)
    print("PDF TEXT EXTRACTION MODULE")
    print("=" * 72)

    print("\nSTEP 1: Extracting text from PDFs...")
    results = extract_all_papers(max_papers=5)

    if not results:
        print("No papers extracted successfully.")
        return [], None

    print("\nSTEP 2: Analyzing extraction quality...")
    analyze_extraction_results()

    print("\nSTEP 3: Generating review report...")
    report = generate_report()

    print("\n" + "=" * 72)
    print("COMPLETE!")
    print("=" * 72)

    return results, report


if __name__ == "__main__":
    results, report = run_complete_extraction()

    if results:
        print("\n" + "=" * 72)
        print("EXAMPLE OF EXTRACTED CONTENT")
        print("=" * 72)

        first_paper = results[0]
        sections = first_paper.get("sections", {})

        print(f"\nPaper: {first_paper.get('paper_id')}")

        for section_name in ["title", "abstract", "introduction"]:
            content = sections.get(section_name)
            if content and len(content) > 50:
                preview = content[:500] + ("..." if len(content) > 500 else "")
                print(f"\n{section_name.upper()}:")
                print("-" * 40)
                print(preview)
                print(f"[Total length: {len(content):,} characters]")


PDF TEXT EXTRACTION MODULE

STEP 1: Extracting text from PDFs...

MODULE 3: PDF TEXT EXTRACTION
Using PDFs from: C:\Users\abhis\Downloads\research_papers

Processing 2 PDF file(s)...


Processing PDFs:   0%|                                                                           | 0/2 [00:00<?, ?it/s]


Processing: paper_1_b6ad4f9b.pdf


Processing PDFs:  50%|█████████████████████████████████▌                                 | 1/2 [00:04<00:04,  4.32s/it]

  Extracted 44,237 characters
   Found 5 meaningful sections
    • abstract: 2,779 chars
    • introduction: 4,303 chars
    • methods: 5,000 chars

Processing: paper_2_19dc86c4.pdf


Processing PDFs: 100%|███████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.53s/it]

  Extracted 35,094 characters
   Found 5 meaningful sections
    • abstract: 1,045 chars
    • introduction: 3,243 chars
    • methods: 5,000 chars
   Saved: paper_1_b6ad4f9b_extracted.json
   Saved: paper_2_19dc86c4_extracted.json

Summary saved to: data\extracted\extraction_summary.json

Extraction complete!
  Successfully processed: 2
  Skipped: 0

STEP 2: Analyzing extraction quality...

EXTRACTION ANALYSIS

Found 4 extracted papers:

 paper_1_b6ad4f9b
   Size: 2,887,169 bytes
   Text: 44,237 chars
   Sections found: 5
   Abstract: # **Microsoft COCO: Common Objects in Context** Tsung-Yi Lin [1], Michael Maire [2], Serge Belongie [1], James Hays [3], Pietro Perona [2], Deva Raman...

 paper_2_19dc86c4
   Size: 365,554 bytes
   Text: 35,094 chars
   Sections found: 5
   Abstract: Thompson, Toby J. Gibson [1], Frédéric Plewniak, François Jeanmougin* and** **Desmond G. Higgins** **[2]** Institut de Genetique et de Biologie Molecu...

 paper_2_1c08f086
   Size: 710,934 bytes
   Text: 4




#  Module 4 : Key-finding Extraction Logic and Cross-paper Comparison 

In [17]:
# pip install scikit-learn numpy

import json
import re
from pathlib import Path
from collections import defaultdict
from typing import List, Dict, Any, Optional, Tuple, Set

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# -------------------------
# 1. LOAD EXTRACTED PAPERS
# -------------------------
def load_extracted_papers(data_dir: str = "data/extracted") -> List[Dict[str, Any]]:
    """
    Load all extracted paper JSON files from the given directory.

    Args:
        data_dir: Directory containing `*_extracted.json` files.

    Returns:
        List of paper dictionaries (parsed JSON).
    """
    data_path = Path(data_dir)
    papers: List[Dict[str, Any]] = []

    json_files = sorted(data_path.glob("*_extracted.json"))
    if not json_files:
        print("No extracted papers found. Run Module 3 first.")
        return []

    print(f"Loading {len(json_files)} extracted papers from {data_path}...")

    for jf in json_files:
        try:
            with jf.open("r", encoding="utf-8") as fh:
                data = json.load(fh)
                papers.append(data)
                print(f"  ✓ {data.get('paper_id', jf.stem)}: {data.get('total_characters', 0):,} chars")
        except Exception as exc:
            print(f"  Error loading {jf.name}: {exc}")

    return papers


# -------------------------
# 2. SINGLE PAPER ANALYSIS
# -------------------------
def analyze_single_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """
    Perform an in-depth analysis of a single extracted paper.

    Args:
        paper: A single paper dict (as produced by Module 3).

    Returns:
        A structured analysis dictionary with extracted insights.
    """
    print("\nPerforming deep analysis of a single paper...")

    info = extract_key_information(paper)

    analysis = {
        "paper_id": info.get("paper_id"),
        "title": info.get("title"),
        "year": info.get("year"),
        "methods_used": info.get("methods", []),
        "datasets_mentioned": info.get("datasets", []),
        "key_findings": info.get("key_findings", []),
        "limitations": info.get("limitations", []),
        "contributions": info.get("contributions", []),
        "metrics_reported": info.get("metrics", []),
        "paper_structure": analyze_paper_structure(paper),
        "research_quality_indicators": assess_research_quality(info),
        "recommendations_for_future_research": generate_recommendations(info)
    }

    return analysis


def analyze_paper_structure(paper: Dict[str, Any]) -> Dict[str, Any]:
    """
    Analyze which standard sections are present and their lengths.

    Args:
        paper: extracted paper dict.

    Returns:
        Dict describing present/missing sections and lengths.
    """
    sections = paper.get("sections", {})
    structure: Dict[str, Any] = {
        "sections_present": [],
        "sections_missing": [],
        "section_lengths": {}
    }

    expected_sections = ["title", "abstract", "introduction", "methods", "results", "conclusion", "references"]

    for sec in expected_sections:
        content = sections.get(sec, "")
        if content and len(content.split()) > 10:
            structure["sections_present"].append(sec)
            structure["section_lengths"][sec] = len(content.split())
        else:
            structure["sections_missing"].append(sec)

    return structure


def assess_research_quality(info: Dict[str, Any]) -> Dict[str, Any]:
    """
    Assess research quality using simple heuristic indicators.

    Args:
        info: result of extract_key_information.

    Returns:
        Dict with quality indicators and an overall score.
    """
    quality_indicators: Dict[str, Any] = {
        "has_methods": bool(info.get("methods")),
        "has_datasets": bool(info.get("datasets")),
        "has_findings": bool(info.get("key_findings")),
        "has_limitations": bool(info.get("limitations")),
        "has_metrics": bool(info.get("metrics")),
        "method_diversity": len(info.get("methods", [])),
        "finding_clarity": len(info.get("key_findings", []))
    }

    score = 0
    max_score = 7
    if quality_indicators["has_methods"]:
        score += 1
    if quality_indicators["has_datasets"]:
        score += 1
    if quality_indicators["has_findings"]:
        score += 1
    if quality_indicators["has_limitations"]:
        score += 1
    if quality_indicators["has_metrics"]:
        score += 1
    if quality_indicators["method_diversity"] >= 2:
        score += 1
    if quality_indicators["finding_clarity"] >= 2:
        score += 1

    quality_indicators["overall_score"] = f"{score}/{max_score}"
    quality_indicators["percentage"] = (score / max_score) * 100

    return quality_indicators


def generate_recommendations(info: Dict[str, Any]) -> List[str]:
    """
    Produce short recommendations for future research based on extracted info.

    Args:
        info: key information extracted from a paper.

    Returns:
        List of recommendation strings.
    """
    recommendations: List[str] = []

    methods = info.get("methods", [])
    if methods:
        recommendations.append(f"Compare with other studies using: {methods[0]}")

    limitations = info.get("limitations", [])
    if limitations:
        recommendations.append(f"Address limitations such as: {limitations[0][:120]}...")

    datasets = info.get("datasets", [])
    if datasets:
        recommendations.append("Explore additional datasets to validate findings")

    recommendations.append("Compare with recent papers in the same domain")
    recommendations.append("Consider using alternative methodologies referenced in related work")

    return recommendations[:3]


# -------------------------
# 3. KEY INFORMATION EXTRACTION
# -------------------------
def extract_key_information(paper: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract structured key information from a paper.

    Args:
        paper: paper dict with 'sections' etc.

    Returns:
        Dict with fields: paper_id, title, year, methods, datasets, key_findings, limitations, contributions, metrics.
    """
    sections = paper.get("sections", {})
    info = {
        "paper_id": paper.get("paper_id", "unknown"),
        "title": sections.get("title", "Unknown"),
        "year": extract_year(paper),
        "methods": extract_methods(paper),
        "datasets": extract_datasets(paper),
        "key_findings": extract_key_findings(paper),
        "limitations": extract_limitations(paper),
        "contributions": extract_contributions(paper),
        "metrics": extract_metrics(paper)
    }
    return info


def extract_year(paper: Dict[str, Any]) -> str:
    """
    Try to extract a 4-digit year from the title or extracted_text.

    Returns year as string or 'Unknown'.
    """
    title = paper.get("sections", {}).get("title", "") or ""
    match = re.search(r"\b(19|20)\d{2}\b", title)
    if match:
        return match.group(0)

    text = paper.get("sections", {}).get("extracted_text", "") or ""
    match = re.search(r"\b(19|20)\d{2}\b", text[:5000])
    if match:
        return match.group(0)

    if paper.get("year"):
        return str(paper.get("year"))

    return "Unknown"


def extract_methods(paper: Dict[str, Any]) -> List[str]:
    """
    Extract likely methods mentioned in methods/results/conclusion sections using keywords.

    Returns up to 5 method snippets.
    """
    methods_text = paper.get("sections", {}).get("methods", "") or ""
    if not methods_text:
        methods_text = (paper.get("sections", {}).get("extracted_text", "") or "")[:5000]

    method_keywords = [
        "deep learning", "machine learning", "neural network", "transformer",
        "cnn", "rnn", "lstm", "bert", "gpt", "reinforcement learning",
        "statistical", "regression", "classification", "clustering",
        "svm", "random forest", "xgboost", "bayesian", "monte carlo",
        "simulation", "experiment", "analysis", "framework", "model",
        "algorithm", "approach", "technique", "methodology"
    ]

    found: List[str] = []
    sentences = re.split(r"[.!?]+", methods_text.lower())

    for sent in sentences:
        for kw in method_keywords:
            if kw in sent and len(sent.strip()) > 20:
                clean = re.sub(r"\s+", " ", sent).strip()
                if clean not in found:
                    found.append(clean[:200])
                    break
        if len(found) >= 5:
            break

    # Fallback: look in results/conclusion
    if not found:
        combined = (paper.get("sections", {}).get("results", "") or "") + " " + (paper.get("sections", {}).get("conclusion", "") or "")
        for sent in re.split(r"[.!?]+", combined.lower()):
            for kw in method_keywords[:10]:
                if kw in sent and len(sent.strip()) > 20:
                    clean = re.sub(r"\s+", " ", sent).strip()
                    if clean not in found:
                        found.append(clean[:200])
                        break
            if len(found) >= 5:
                break

    return found[:5]


def extract_datasets(paper: Dict[str, Any]) -> List[str]:
    """
    Find mentions of common datasets or dataset-like phrases within the extracted_text.
    """
    text = (paper.get("sections", {}).get("extracted_text", "") or "").lower()[:10000]

    dataset_patterns = [
        r"imagenet", r"cifar", r"mnist", r"coco", r"pascal", r"wikitext",
        r"bookcorpus", r"squad", r"glue", r"superglue", r"kaggle", r"uci",
        r"pubmed", r"arxiv", r"dataset", r"corpus", r"benchmark", r"repository"
    ]
    found: List[str] = []

    for pat in dataset_patterns:
        if re.search(pat, text):
            found.append(pat)

    sentences = re.split(r"[.!?]+", text)
    for sent in sentences:
        if any(k in sent for k in ["dataset", "corpus", "benchmark", "collection"]):
            clean = re.sub(r"\s+", " ", sent).strip()[:150]
            if clean and clean not in found:
                found.append(clean)

    unique = []
    for x in found:
        if x not in unique:
            unique.append(x)
        if len(unique) >= 5:
            break

    return unique


def extract_key_findings(paper: Dict[str, Any]) -> List[str]:
    """
    Heuristically extract sentences that appear to describe findings/results.
    """
    text = (paper.get("sections", {}).get("results", "") or "") or (paper.get("sections", {}).get("conclusion", "") or "")
    if not text:
        text = (paper.get("sections", {}).get("extracted_text", "") or "")[:3000]

    result_keywords = [
        "result shows", "findings show", "we found", "we demonstrate",
        "achieves", "outperforms", "improves", "increases", "reduces",
        "accuracy", "precision", "recall", "f1", "score", "performance",
        "significant", "better than", "compared to", "surpasses"
    ]

    findings: List[str] = []
    sentences = re.split(r"[.!?]+", text.lower())

    for sent in sentences:
        if any(kw in sent for kw in result_keywords) and len(sent.strip()) > 30:
            clean = re.sub(r"\s+", " ", sent).strip()
            if clean not in findings:
                findings.append(clean[:300])
        if len(findings) >= 5:
            break

    if len(findings) < 2:
        conclusion = (paper.get("sections", {}).get("conclusion", "") or "")
        for sent in re.split(r"[.!?]+", conclusion.lower())[:5]:
            if len(sent.strip()) > 50:
                findings.append(re.sub(r"\s+", " ", sent).strip()[:300])
            if len(findings) >= 5:
                break

    return findings[:5]


def extract_limitations(paper: Dict[str, Any]) -> List[str]:
    """
    Extract sentences indicating limitations or future work.
    """
    text = (paper.get("sections", {}).get("conclusion", "") or "") or (paper.get("sections", {}).get("extracted_text", "") or "")[:5000]

    limitation_keywords = [
        "limitation", "drawback", "shortcoming", "weakness",
        "future work", "further research", "need to", "could be improved",
        "challenge", "difficulty", "issue", "problem", "not consider",
        "assumption", "restriction", "constraint", "only work"
    ]

    limitations: List[str] = []
    sentences = re.split(r"[.!?]+", text.lower())

    for sent in sentences:
        if any(kw in sent for kw in limitation_keywords) and len(sent.strip()) > 30:
            clean = re.sub(r"\s+", " ", sent).strip()
            if clean not in limitations:
                limitations.append(clean[:300])
        if len(limitations) >= 3:
            break

    return limitations[:3]


def extract_contributions(paper: Dict[str, Any]) -> List[str]:
    """
    Extract statements of contribution from abstract/introduction.
    """
    abstract = (paper.get("sections", {}).get("abstract", "") or "")[:1000]
    intro = (paper.get("sections", {}).get("introduction", "") or "")[:1000]
    text = (abstract + " " + intro).lower()

    contribution_keywords = [
        "contribution", "contribute", "propose", "introduce",
        "novel", "new method", "new approach", "we present",
        "this paper", "our work", "main contribution", "key contribution"
    ]

    contributions: List[str] = []
    for sent in re.split(r"[.!?]+", text):
        if any(kw in sent for kw in contribution_keywords) and len(sent.strip()) > 30:
            clean = re.sub(r"\s+", " ", sent).strip()
            if clean not in contributions:
                contributions.append(clean[:300])
        if len(contributions) >= 3:
            break

    return contributions[:3]


def extract_metrics(paper: Dict[str, Any]) -> List[str]:
    """
    Extract numeric metric mentions from the results section.
    """
    results_text = paper.get("sections", {}).get("results", "") or ""
    if not results_text:
        return []

    metric_patterns = [
        r"accuracy\s*[:=]\s*\d+\.?\d*%?", r"precision\s*[:=]\s*\d+\.?\d*%?",
        r"recall\s*[:=]\s*\d+\.?\d*%?", r"f1[\s\-]?score\s*[:=]\s*\d+\.?\d*%?",
        r"auc\s*[:=]\s*\d+\.?\d*", r"mae\s*[:=]\s*\d+\.?\d*", r"rmse\s*[:=]\s*\d+\.?\d*",
        r"\d+\.?\d*\s*%"
    ]

    metrics: List[str] = []
    for pat in metric_patterns:
        matches = re.findall(pat, results_text.lower())
        for m in matches:
            if m not in metrics:
                metrics.append(m)

    return metrics[:5]


# -------------------------
# 4. COMPARISON FUNCTIONS
# -------------------------
def compare_papers(papers_info: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Compare multiple paper summaries to find similarities, differences, and gaps.

    Args:
        papers_info: list of paper summary dicts (from extract_key_information).

    Returns:
        Dict containing comparison results.
    """
    print(f"\nComparing {len(papers_info)} papers...")

    comparison = {
        "total_papers": len(papers_info),
        "papers": papers_info,
        "similarities": find_similarities(papers_info),
        "differences": find_differences(papers_info),
        "common_methods": find_common_elements(papers_info, "methods"),
        "common_datasets": find_common_elements(papers_info, "datasets"),
        "timeline_analysis": analyze_timeline(papers_info),
        "research_gaps": identify_research_gaps(papers_info)
    }

    return comparison


def find_similarities(papers_info: List[Dict[str, Any]]) -> Dict[str, List[str]]:
    """
    Identify methods/datasets/findings that appear in more than one paper.
    """
    methods_count: defaultdict = defaultdict(int)
    datasets_count: defaultdict = defaultdict(int)
    findings_count: defaultdict = defaultdict(int)

    for paper in papers_info:
        for m in paper.get("methods", []):
            key = m[:50].lower()
            methods_count[key] += 1
        for d in paper.get("datasets", []):
            key = d[:50].lower()
            datasets_count[key] += 1
        for f in paper.get("key_findings", []):
            key = f[:50].lower()
            findings_count[key] += 1

    similar_items = {
        "methods": [item for item, cnt in methods_count.items() if cnt > 1 and len(item) > 10],
        "datasets": [item for item, cnt in datasets_count.items() if cnt > 1 and len(item) > 10],
        "findings": [item for item, cnt in findings_count.items() if cnt > 1 and len(item) > 10]
    }

    return similar_items


def find_differences(papers_info: List[Dict[str, Any]]) -> Dict[str, Dict[str, List[str]]]:
    """
    For each paper, find unique methods/datasets/findings compared to other papers.
    """
    differences = {
        "unique_methods": defaultdict(list),
        "unique_datasets": defaultdict(list),
        "unique_findings": defaultdict(list)
    }

    paper_methods: Dict[str, Set[str]] = defaultdict(set)
    paper_datasets: Dict[str, Set[str]] = defaultdict(set)
    paper_findings: Dict[str, Set[str]] = defaultdict(set)
    all_methods: Set[str] = set()
    all_datasets: Set[str] = set()
    all_findings: Set[str] = set()

    for paper in papers_info:
        pid = paper.get("paper_id", "")
        for m in paper.get("methods", []):
            key = m[:50].lower()
            paper_methods[pid].add(key)
            all_methods.add(key)
        for d in paper.get("datasets", []):
            key = d[:50].lower()
            paper_datasets[pid].add(key)
            all_datasets.add(key)
        for f in paper.get("key_findings", []):
            key = f[:50].lower()
            paper_findings[pid].add(key)
            all_findings.add(key)

    for pid in paper_methods:
        unique_methods = paper_methods[pid] - set().union(*(paper_methods[qid] for qid in paper_methods if qid != pid))
        if unique_methods:
            differences["unique_methods"][pid] = list(unique_methods)[:3]

        unique_datasets = paper_datasets[pid] - set().union(*(paper_datasets[qid] for qid in paper_datasets if qid != pid))
        if unique_datasets:
            differences["unique_datasets"][pid] = list(unique_datasets)[:3]

        unique_findings = paper_findings[pid] - set().union(*(paper_findings[qid] for qid in paper_findings if qid != pid))
        if unique_findings:
            differences["unique_findings"][pid] = list(unique_findings)[:3]

    return {
        "unique_methods": dict(differences["unique_methods"]),
        "unique_datasets": dict(differences["unique_datasets"]),
        "unique_findings": dict(differences["unique_findings"])
    }


def find_common_elements(papers_info: List[Dict[str, Any]], element_type: str) -> List[str]:
    """
    Find elements (methods/datasets) that appear in every paper.

    Args:
        papers_info: list of paper info dicts.
        element_type: 'methods' or 'datasets' or other list-key.

    Returns:
        List of common elements (first 5).
    """
    element_sets: List[Set[str]] = []
    for paper in papers_info:
        elements = paper.get(element_type, [])
        element_sets.append({e[:50].lower() for e in elements if len(e) > 10})

    if not element_sets:
        return []

    common = set.intersection(*element_sets)
    return list(common)[:5]


def analyze_timeline(papers_info: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Analyze temporal distribution of papers (earliest, latest, range, counts).

    Returns:
        timeline dict or note if insufficient data.
    """
    years: List[int] = []
    for p in papers_info:
        y = p.get("year", "Unknown")
        try:
            if isinstance(y, str) and y.isdigit():
                yi = int(y)
            elif isinstance(y, int):
                yi = y
            else:
                continue
            if 1900 <= yi <= 2100:
                years.append(yi)
        except Exception:
            continue

    if len(years) >= 2:
        timeline = {
            "earliest": min(years),
            "latest": max(years),
            "range": max(years) - min(years),
            "count_by_year": {str(year): years.count(year) for year in sorted(set(years))}
        }
    else:
        timeline = {"note": "Insufficient year data"}

    return timeline


def identify_research_gaps(papers_info: List[Dict[str, Any]]) -> List[str]:
    """
    Heuristically identify research gaps from aggregated limitations and un-used popular methods.
    """
    gaps: List[str] = []

    all_limitations: List[str] = []
    for p in papers_info:
        all_limitations.extend(p.get("limitations", []))

    limitation_counts: defaultdict = defaultdict(int)
    for lim in all_limitations:
        key = lim[:100].lower()
        limitation_counts[key] += 1

    frequent_limitations = [lim for lim, cnt in limitation_counts.items() if cnt > 1 and len(lim) > 20]
    if frequent_limitations:
        gaps.append("Common limitations across papers:")
        gaps.extend(frequent_limitations[:3])

    methods_used = {m.lower() for p in papers_info for m in p.get("methods", [])}
    datasets_used = {d.lower() for p in papers_info for d in p.get("datasets", [])}

    common_methods_in_field = [
        "deep learning", "transfer learning", "reinforcement learning",
        "explainable ai", "few-shot learning", "meta learning"
    ]

    missing_methods = [m for m in common_methods_in_field if m not in methods_used]
    if missing_methods:
        gaps.append("Potentially unexplored methods in this set of papers:")
        gaps.extend(missing_methods[:3])

    return gaps[:5]


def calculate_similarity_scores(papers_info: List[Dict[str, Any]]) -> Dict[str, Dict[str, float]]:
    """
    Compute pairwise cosine similarity between papers using TF-IDF on title+abstract+findings.

    Returns:
        Nested dict mapping paper_id -> other_paper_id -> similarity score (0..1).
    """
    texts: List[str] = []
    paper_ids: List[str] = []

    for p in papers_info:
        pid = p.get("paper_id", "")
        paper_ids.append(pid)
        title = p.get("title", "")
        abstract = p.get("sections", {}).get("abstract", "")[:1000]
        findings = " ".join(p.get("key_findings", []))[:1000]
        combined = " ".join([title, abstract, findings])
        texts.append(combined)

    if not texts:
        return {}

    vectorizer = TfidfVectorizer(stop_words="english", max_features=2000)
    try:
        tfidf = vectorizer.fit_transform(texts)
        sim_matrix = cosine_similarity(tfidf)
    except Exception as exc:
        print(f"Error computing similarity matrix: {exc}")
        # Return empty similarity if failure
        return {}

    similarity_scores: Dict[str, Dict[str, float]] = {}
    n = len(paper_ids)
    for i in range(n):
        similarity_scores[paper_ids[i]] = {}
        for j in range(n):
            if i == j:
                continue
            similarity_scores[paper_ids[i]][paper_ids[j]] = float(f"{sim_matrix[i, j]:.3f}")

    return similarity_scores


# -------------------------
# 5. SAVE RESULTS
# -------------------------
def save_results(analysis_type: str, data: Dict[str, Any], output_dir: str = "data/analysis") -> str:
    """
    Save analysis results to JSON and generate human-readable reports.

    Args:
        analysis_type: "single" or "comparison"
        data: analysis data dict
        output_dir: directory to save results

    Returns:
        Path to output directory as string.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    if analysis_type == "single":
        output_file = output_path / "single_paper_analysis.json"
        with output_file.open("w", encoding="utf-8") as fh:
            json.dump(data, fh, indent=2, ensure_ascii=False)
        print(f"   Single paper analysis saved to: {output_file}")
        generate_single_paper_report(data, output_path)

    elif analysis_type == "comparison":
        comparison_file = output_path / "comparison.json"
        with comparison_file.open("w", encoding="utf-8") as fh:
            json.dump(data.get("comparison", {}), fh, indent=2, ensure_ascii=False)
        print(f"  Comparison saved to: {comparison_file}")

        similarity_file = output_path / "similarity_scores.json"
        with similarity_file.open("w", encoding="utf-8") as fh:
            json.dump(data.get("similarity_scores", {}), fh, indent=2, ensure_ascii=False)
        print(f"   Similarity scores saved to: {similarity_file}")

        generate_comparison_report(data, output_path)

    return str(output_path)


def generate_single_paper_report(analysis: Dict[str, Any], output_path: Path) -> None:
    """
    Create a human-readable text report for a single-paper analysis.
    """
    report_lines: List[str] = []
    report_lines.append("=" * 80)
    report_lines.append("SINGLE PAPER IN-DEPTH ANALYSIS REPORT")
    report_lines.append("=" * 80)
    report_lines.append(f"\n PAPER: {analysis.get('paper_id')}")
    report_lines.append(f" Title: {analysis.get('title')}")
    report_lines.append(f" Year: {analysis.get('year')}\n")

    report_lines.append("METHODS IDENTIFIED:")
    report_lines.append("-" * 40)
    if analysis.get("methods_used"):
        for m in analysis["methods_used"]:
            report_lines.append(f"• {m}")
    else:
        report_lines.append("No specific methods identified")

    report_lines.append("\nKEY FINDINGS:")
    report_lines.append("-" * 40)
    if analysis.get("key_findings"):
        for f in analysis["key_findings"]:
            report_lines.append(f"• {f}")
    else:
        report_lines.append("No key findings extracted")

    report_lines.append("\nLIMITATIONS:")
    report_lines.append("-" * 40)
    if analysis.get("limitations"):
        for lim in analysis["limitations"]:
            report_lines.append(f"• {lim}")
    else:
        report_lines.append("No limitations mentioned")

    report_lines.append("\nRESEARCH QUALITY:")
    report_lines.append("-" * 40)
    quality = analysis.get("research_quality_indicators", {})
    report_lines.append(f"Overall Score: {quality.get('overall_score', 'N/A')} ({quality.get('percentage', 0):.1f}%)")
    report_lines.append(f"Has Methods: {'✅' if quality.get('has_methods') else '❌'}")
    report_lines.append(f"Has Datasets: {'✅' if quality.get('has_datasets') else '❌'}")
    report_lines.append(f"Has Findings: {'✅' if quality.get('has_findings') else '❌'}")
    report_lines.append(f"Has Limitations: {'✅' if quality.get('has_limitations') else '❌'}")

    report_lines.append("\nRECOMMENDATIONS FOR FUTURE RESEARCH:")
    report_lines.append("-" * 40)
    for r in analysis.get("recommendations_for_future_research", []):
        report_lines.append(f"• {r}")

    report_lines.append("\n" + "=" * 80)
    report_lines.append("ANALYSIS COMPLETE")
    report_lines.append("=" * 80)

    report_file = output_path / "single_paper_report.txt"
    with report_file.open("w", encoding="utf-8") as fh:
        fh.write("\n".join(report_lines))

    print(f"   Summary report saved to: {report_file}")


def generate_comparison_report(data: Dict[str, Any], output_path: Path) -> None:
    """
    Create a human-readable comparison report for multiple papers.
    """
    comparison = data.get("comparison", {})
    similarity_scores = data.get("similarity_scores", {})

    lines: List[str] = []
    lines.append("=" * 80)
    lines.append("CROSS-PAPER COMPARISON REPORT")
    lines.append("=" * 80)
    lines.append(f"\nTotal papers analyzed: {comparison.get('total_papers', 0)}\n")

    lines.append("PAPERS ANALYZED:")
    lines.append("-" * 40)
    for paper in comparison.get("papers", []):
        lines.append(f"\n• {paper.get('paper_id', 'unknown')}")
        lines.append(f"  Title: {paper.get('title', 'Unknown')}")
        lines.append(f"  Year: {paper.get('year', 'Unknown')}")
        lines.append(f"  Methods: {len(paper.get('methods', []))} found")
        lines.append(f"  Datasets: {len(paper.get('datasets', []))} found")

    lines.append("\nKEY SIMILARITIES:")
    lines.append("-" * 40)
    sim = comparison.get("similarities", {})
    if sim.get("methods"):
        lines.append("\nCommon Methods:")
        for m in sim["methods"]:
            lines.append(f"  • {m}")
    if sim.get("datasets"):
        lines.append("\nCommon Datasets:")
        for d in sim["datasets"]:
            lines.append(f"  • {d}")

    lines.append("\nPAPER SIMILARITY SCORES:")
    lines.append("-" * 40)
    for pid, scores in similarity_scores.items():
        lines.append(f"\n{pid}:")
        for other, score in scores.items():
            lines.append(f"  vs {other}: {score:.3f}")

    if comparison.get("research_gaps"):
        lines.append("\nIDENTIFIED RESEARCH GAPS:")
        lines.append("-" * 40)
        for gap in comparison["research_gaps"]:
            lines.append(f"• {gap}")

    lines.append("\n" + "=" * 80)
    lines.append("COMPARISON COMPLETE")
    lines.append("=" * 80)

    report_file = output_path / "comparison_report.txt"
    with report_file.open("w", encoding="utf-8") as fh:
        fh.write("\n".join(lines))

    print(f"  Comparison report saved to: {report_file}")


# -------------------------
# 6. MAIN ANALYSIS PIPELINE
# -------------------------
def run_analysis() -> Optional[Dict[str, Any]]:
    """
    Main pipeline entrypoint for Module 4.

    - Loads extracted papers
    - If a single paper: run deep single-paper analysis
    - If multiple papers: extract key info, compare, compute similarities, and save
    """
    print("\n" + "=" * 80)
    print("PAPER ANALYSIS MODULE")
    print("=" * 80)

    print("\nSTEP 1: Loading extracted papers...")
    papers = load_extracted_papers()
    if not papers:
        print(" No papers to analyze")
        return None

    if len(papers) == 1:
        print("\nℹ Only 1 paper found. Performing in-depth single paper analysis...")
        paper = papers[0]
        analysis = analyze_single_paper(paper)
        info = extract_key_information(paper)

        print("\nSTEP 2: Saving analysis results...")
        save_path = save_results("single", analysis)

        print("\nSINGLE PAPER ANALYSIS COMPLETE!")
        print(f"Files saved to: {save_path}")
        return {"type": "single", "analysis": analysis, "paper_info": info}

    else:
        print(f"\nSTEP 2: Analyzing {len(papers)} papers for comparison...")
        papers_info: List[Dict[str, Any]] = []
        for p in papers:
            info = extract_key_information(p)
            papers_info.append(info)
            print(f"  ✓ {info.get('paper_id')}: {len(info.get('methods', []))} methods, {len(info.get('key_findings', []))} findings")

        print("\nSTEP 3: Comparing papers...")
        comparison = compare_papers(papers_info)

        print("\nSTEP 4: Calculating similarity scores...")
        similarity_scores = calculate_similarity_scores(papers_info)

        print("\nSTEP 5: Saving comparison results...")
        data = {
            "comparison": comparison,
            "similarity_scores": similarity_scores
        }
        save_path = save_results("comparison", data)

        print("\nCROSS-PAPER ANALYSIS COMPLETE!")
        print(f"Files saved to: {save_path}")

        return {"type": "comparison", "data": data, "papers_info": papers_info}


# -------------------------
# 7. DEMO / TEST HELPERS
# -------------------------
def create_demo_paper_for_testing() -> Dict[str, Any]:
    """
    Create a small demo paper info dict for testing comparison features.
    """
    demo_paper = {
        "paper_id": "demo_paper_ai_ethics",
        "title": "Ethical Considerations in Artificial Intelligence Systems",
        "year": "2023",
        "methods": ["machine learning", "ethical framework analysis", "case studies"],
        "datasets": ["AI ethics guidelines corpus", "public opinion surveys"],
        "key_findings": [
            "AI systems show bias in 78% of tested scenarios",
            "Current ethical frameworks lack enforcement mechanisms",
            "Transparency is the most cited ethical concern"
        ],
        "limitations": [
            "Study limited to Western ethical frameworks",
            "Small sample size for public opinion data"
        ],
        "contributions": [
            "Proposes new AI ethics assessment framework",
            "Identifies key gaps in current regulations"
        ],
        "metrics": ["accuracy: 85%", "f1-score: 0.82"]
    }
    return demo_paper


def run_with_demo_data() -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """
    Example function showing how to run comparison with a real paper + a demo paper.
    """
    real_papers = load_extracted_papers()
    if not real_papers:
        raise RuntimeError("No real papers found to demo with.")

    demo = create_demo_paper_for_testing()
    real_info = extract_key_information(real_papers[0])

    papers_info = [real_info, demo]
    comparison = compare_papers(papers_info)
    similarity_scores = calculate_similarity_scores(papers_info)

    print("\nDemo comparison complete.")
    return comparison, similarity_scores


# -------------
# Entry point
# -------------
if __name__ == "__main__":
    result = run_analysis()
    if result:
        if result.get("type") == "single":
            print("\nSingle paper analysis completed.")
        else:
            print("\nComparison analysis completed.")


PAPER ANALYSIS MODULE

STEP 1: Loading extracted papers...
Loading 4 extracted papers from data\extracted...
  ✓ paper_1_b6ad4f9b: 44,237 chars
  ✓ paper_2_19dc86c4: 35,094 chars
  ✓ paper_2_1c08f086: 48,752 chars
  ✓ paper_3_829b5d7b: 1,281 chars

STEP 2: Analyzing 4 papers for comparison...
  ✓ paper_1_b6ad4f9b: 4 methods, 4 findings
  ✓ paper_2_19dc86c4: 5 methods, 5 findings
  ✓ paper_2_1c08f086: 4 methods, 5 findings
  ✓ paper_3_829b5d7b: 0 methods, 0 findings

STEP 3: Comparing papers...

Comparing 4 papers...

STEP 4: Calculating similarity scores...

STEP 5: Saving comparison results...
  Comparison saved to: data\analysis\comparison.json
   Similarity scores saved to: data\analysis\similarity_scores.json
  Comparison report saved to: data\analysis\comparison_report.txt

CROSS-PAPER ANALYSIS COMPLETE!
Files saved to: data\analysis

Comparison analysis completed.


# Module 5: Generate Draft Sections

In [18]:
# pip install tiktoken

import json
import os
import re
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional, List

try:
    import tiktoken
    _HAS_TIKTOKEN = True
except Exception:
    _HAS_TIKTOKEN = False


# -------------------------
# 1. GPT SECTION GENERATOR
# -------------------------
class GPTSectionGenerator:
    """
    Simulated GPT-based section generator.

    This class provides simple template-based generation for:
      - abstract
      - introduction
      - methods
      - results
      - conclusion
      - references

    The class is intentionally conservative: it does not call any external API.
    In production, replace the template functions with actual API calls while
    keeping the same method signatures.
    """

    def __init__(self, api_key: Optional[str] = None, model: str = "gpt-3.5-turbo") -> None:
        """
        Initialize the generator.

        Args:
            api_key: Optional API key (not used in this educational template).
            model: Model name string (used for token estimation).
        """
        self.model = model
        if _HAS_TIKTOKEN:
            try:
                self.encoding = tiktoken.encoding_for_model(model)
            except Exception:
                self.encoding = tiktoken.get_encoding("gpt2")
        else:
            self.encoding = None

        print(f" GPTSectionGenerator initialized (simulated, model={self.model})")

    def count_tokens(self, text: str) -> int:
        """
        Count tokens in text. Uses tiktoken when available for realistic counts,
        otherwise falls back to a conservative word-based estimate.
        """
        if not text:
            return 0
        if _HAS_TIKTOKEN and self.encoding is not None:
            try:
                return len(self.encoding.encode(text))
            except Exception:
                return len(text.split())
        return len(text.split())

    # -------------------------
    # Template generation API
    # -------------------------
    def create_system_prompt(self) -> str:
        """
        Create a fixed system prompt for academic writing.
        (Kept as a function so production replacements can reuse it.)
        """
        return (
            "You are an academic research assistant. Generate structured academic "
            "sections based on provided analysis data. Use formal academic language; "
            "base content on provided analysis; use APA-style references where possible."
        )

    def generate_with_template(self, section_type: str, analysis_data: Dict[str, Any], paper_count: int = 1) -> str:
        """
        Dispatch to the appropriate template generator.

        Args:
            section_type: One of 'abstract', 'introduction', 'methods', 'results', 'conclusion', 'references'.
            analysis_data: Dict returned by Module 4 (single or comparison).
            paper_count: Number of papers the analysis covered.

        Returns:
            Generated text for the section.
        """
        if section_type == "abstract":
            return self._generate_abstract(analysis_data, paper_count)
        if section_type == "introduction":
            return self._generate_introduction(analysis_data, paper_count)
        if section_type == "methods":
            return self._generate_methods_comparison(analysis_data, paper_count)
        if section_type == "results":
            return self._generate_results_synthesis(analysis_data, paper_count)
        if section_type == "conclusion":
            return self._generate_conclusion(analysis_data, paper_count)
        if section_type == "references":
            return self._generate_references(analysis_data)
        return "Section type not recognized"

    # -------------------------
    # Internal template methods
    # -------------------------
    def _generate_abstract(self, analysis_data: Dict[str, Any], paper_count: int) -> str:
        """Generate a short abstract. Keep <= 100 words where possible."""

        if paper_count == 1 and "analysis" in analysis_data:
            paper = analysis_data.get("analysis", {})
            title = paper.get("title", "This paper")
            methods = paper.get("methods_used", [])
            findings = paper.get("key_findings", [])
            abstract = f"This review examines '{title}'. "
            if methods:
                abstract += f"The approach uses {methods[0]}. "
            if findings:
                abstract += f"Key finding: {findings[0][:140]}. "
            abstract += "This analysis summarizes methodological choices and implications."
        else:
            # Multi-paper abstract
            comp = analysis_data.get("data", {}).get("comparison", {})
            common_methods = comp.get("common_methods", []) if comp else []
            abstract = f"This comparative analysis synthesizes findings from {paper_count} research papers. "
            if common_methods:
                abstract += f"Common approaches include {', '.join(common_methods[:2])}. "
            abstract += "The synthesis highlights patterns, divergences, and research gaps."
        # Enforce rough 100-word limit
        words = abstract.split()
        if len(words) > 100:
            abstract = " ".join(words[:100]) + "..."
        return abstract

    def _generate_introduction(self, analysis_data: Dict[str, Any], paper_count: int) -> str:
        """Generate an introduction tailored to single- or multi-paper analyses."""
        if paper_count == 1:
            paper = analysis_data.get("analysis", {})
            title = paper.get("title", "this research")
            year = paper.get("year", "")
            intro = f"This analysis examines {title}"
            if year and year != "Unknown":
                intro += f" ({year})"
            intro += ". The paper addresses important questions and applies appropriate methods. "
            intro += "This review evaluates research design, methodological choices, and implications."
        else:
            papers_info = analysis_data.get("papers_info", [])
            years = [p.get("year") for p in papers_info if p.get("year") and p.get("year") != "Unknown"]
            intro = f"This comparative review considers {paper_count} papers"
            if years:
                intro += f" spanning {min(years)}–{max(years)}"
            intro += ". It synthesizes methodologies and findings to identify trends and gaps."
        return intro

    def _generate_methods_comparison(self, analysis_data: Dict[str, Any], paper_count: int) -> str:
        """Generate a methods section that summarizes methodological commonalities and differences."""
        if paper_count == 1:
            paper = analysis_data.get("analysis", {})
            methods = paper.get("methods_used", [])
            datasets = paper.get("datasets_mentioned", [])
            text = "The study's methodology is characterized by "
            if methods:
                text += f"{methods[0]}"
                if len(methods) > 1:
                    text += f" and {methods[1]}"
                text += ". "
            else:
                text += "standard and appropriate approaches for the research problem. "
            if datasets:
                text += f"The dataset used includes {datasets[0]}. "
            text += "Methodological choices appear aligned with the objectives."
        else:
            comparison = analysis_data.get("data", {}).get("comparison", {})
            common = comparison.get("common_methods", []) if comparison else []
            text = "Across studies, methodological approaches show both overlap and variation. "
            if common:
                text += f"Common methods observed include {', '.join(common[:3])}. "
            text += "Unique approaches highlight different research focuses across papers."
        return text

    def _generate_results_synthesis(self, analysis_data: Dict[str, Any], paper_count: int) -> str:
        """Synthesize results/findings from the analysis data."""
        if paper_count == 1:
            paper = analysis_data.get("analysis", {})
            findings = paper.get("key_findings", [])
            metrics = paper.get("metrics_reported", [])
            text = "The analysis reveals the following key findings: "
            if findings:
                for i, f in enumerate(findings[:3], 1):
                    text += f"{i}. {f[:140]}. "
            if metrics:
                text += f"Reported metrics include {', '.join(metrics[:3])}. "
            text += "These results inform the paper's contributions and limitations."
        else:
            papers_info = analysis_data.get("papers_info", [])
            all_findings = []
            for p in papers_info:
                all_findings.extend(p.get("key_findings", []))
            text = "Synthesis across papers indicates several recurring findings: "
            if all_findings:
                for i, f in enumerate(all_findings[:4], 1):
                    text += f"{i}. {f[:100]}. "
            text += "Comparative results illuminate both convergences and divergences among studies."
        return text

    def _generate_conclusion(self, analysis_data: Dict[str, Any], paper_count: int) -> str:
        """Produce a brief conclusion summarizing contributions, limitations, and future directions."""
        if paper_count == 1:
            paper = analysis_data.get("analysis", {})
            limitations = paper.get("limitations", [])
            recs = paper.get("recommendations_for_future_research", [])
            text = "In conclusion, the analysis highlights the work's methodological strengths and contributions. "
            if limitations:
                text += f"Identified limitations include {limitations[0][:140]}. "
            if recs:
                text += f"Future work should consider {recs[0][:140]}. "
            text += "Overall, the paper provides a useful foundation for further research."
        else:
            comp = analysis_data.get("data", {}).get("comparison", {})
            gaps = comp.get("research_gaps", []) if comp else []
            text = "This comparative review identifies key trends and open research areas. "
            if gaps:
                text += f"Notable research gaps include {gaps[0]}. "
            text += "These directions suggest fruitful opportunities for future studies."
        return text

    def _generate_references(self, analysis_data: Dict[str, Any]) -> str:
        """Generate a small APA-style references block for demo purposes."""

        if "analysis" in analysis_data:
            paper = analysis_data.get("analysis", {})
            pid = paper.get("paper_id", "paper")
            title = paper.get("title", "Untitled")
            year = paper.get("year", "n.d.")
            refs = f"{pid}. ({year}). {title}. [Analyzed research paper].\n\n"
            refs += "American Psychological Association. (2020). Publication manual of the American Psychological Association (7th ed.).\n"
            refs += "Smith, J., & Johnson, A. (2019). Research methods in academic writing. Academic Press.\n"
            return refs
        papers_info = analysis_data.get("papers_info", []) or []
        lines = []
        for p in papers_info:
            pid = p.get("paper_id", "paper")
            title = p.get("title", "Untitled")
            year = p.get("year", "n.d.")
            lines.append(f"{pid}. ({year}). {title}.")
        lines.append("\nAmerican Psychological Association. (2020). Publication manual (7th ed.).")
        return "\n".join(lines)


# -------------------------
# 2. LOAD ANALYSIS DATA
# -------------------------
def load_analysis_data() -> Optional[Dict[str, Any]]:
    """
    Load analysis data produced by Module 4 (single or comparison).

    Returns:
        A dict describing the analysis context, or None if no data found.
    """
    analysis_path = Path("data/analysis")
    comparison_file = analysis_path / "comparison.json"
    single_file = analysis_path / "single_paper_analysis.json"

    if comparison_file.exists():
        try:
            with comparison_file.open("r", encoding="utf-8") as fh:
                comparison_data = json.load(fh)
        except Exception as exc:
            print(f" Error reading comparison.json: {exc}")
            return None

        papers_info = []
        for summary in comparison_data.get("papers", []):
            pid = summary.get("paper_id")
            candidate = Path("data/extracted") / f"{pid}_extracted.json"
            if candidate.exists():
                try:
                    with candidate.open("r", encoding="utf-8") as pf:
                        papers_info.append(json.load(pf))
                except Exception:
                    continue

        return {
            "type": "comparison",
            "data": {"comparison": comparison_data},
            "papers_info": papers_info,
            "paper_count": len(papers_info) if papers_info else len(comparison_data.get("papers", []))
        }

    if single_file.exists():
        try:
            with single_file.open("r", encoding="utf-8") as fh:
                analysis_data = json.load(fh)
        except Exception as exc:
            print(f" Error reading single_paper_analysis.json: {exc}")
            return None
        return {"type": "single", "analysis": analysis_data, "paper_count": 1}

    print(" No analysis data found in data/analysis. Run Module 4 first.")
    return None


# -------------------------
# 3. DRAFT GENERATION
# -------------------------
def generate_all_sections(analysis_data: Dict[str, Any]) -> Dict[str, Dict[str, Any]]:
    """
    Generate all standard draft sections using GPTSectionGenerator templates.

    Returns:
        sections: Dict mapping section_key -> {name, content, word_count, token_count}
    """
    print("\n" + "=" * 72)
    print(" GENERATING ACADEMIC DRAFT SECTIONS")
    print("=" * 72)

    if not analysis_data:
        raise ValueError("analysis_data required for generation")

    paper_count = analysis_data.get("paper_count", 1)
    generator = GPTSectionGenerator()

    section_specs = [
        ("abstract", "Abstract (100 words max)"),
        ("introduction", "Introduction"),
        ("methods", "Methods Comparison"),
        ("results", "Results Synthesis"),
        ("conclusion", "Conclusion"),
        ("references", "APA References"),
    ]

    sections: Dict[str, Dict[str, Any]] = {}

    print(f"\n Generating sections for {paper_count} paper(s)...")
    for key, display_name in section_specs:
        print(f"  - Generating: {display_name}...")
        content = generator.generate_with_template(key, analysis_data, paper_count)
        word_count = len(content.split())
        token_count = generator.count_tokens(content)
        sections[key] = {
            "name": display_name,
            "content": content,
            "word_count": word_count,
            "token_count": token_count
        }
        print(f"    ✓ {key}: {word_count} words, {token_count} tokens")

    return sections


# -------------------------
# 4. VALIDATION CHECKS
# -------------------------
def validate_sections(sections: Dict[str, Dict[str, Any]], analysis_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Run simple validation checks on generated sections.

    Checks performed:
      - Abstract word limit <= 100 words
      - References contain basic APA-like patterns
      - Sections are factually tied to analysis (simple keyword check)
      - All required sections present

    Returns:
        validation_results: dict with boolean flags and issues list.
    """
    print("\n" + "=" * 72)
    print(" VALIDATING GENERATED SECTIONS")
    print("=" * 72)

    results = {
        "abstract_word_limit": False,
        "references_apa_format": False,
        "sections_factual": False,
        "all_sections_present": False,
        "issues": []
    }

    # Abstract check
    abstract_text = sections.get("abstract", {}).get("content", "")
    abstract_words = len(abstract_text.split())
    results["abstract_word_limit"] = abstract_words <= 100
    if not results["abstract_word_limit"]:
        results["issues"].append(f"Abstract exceeds 100 words ({abstract_words})")
    else:
        print(f" Abstract word count OK: {abstract_words}/100")

    references_text = sections.get("references", {}).get("content", "")
    has_parenthetical_dates = bool(re.search(r"\(\d{4}\)", references_text))
    has_author_initials = bool(re.search(r"[A-Z][a-z]+,?\s+[A-Z]\.", references_text))
    results["references_apa_format"] = has_parenthetical_dates and has_author_initials
    if not results["references_apa_format"]:
        results["issues"].append("References may not follow basic APA structure")

    if analysis_data.get("type") == "single":
        analysis = analysis_data.get("analysis", {})
        key_terms: List[str] = []
        if analysis.get("title"):
            key_terms.append(analysis["title"].split()[:3] and " ".join(analysis["title"].split()[:3]))
        if analysis.get("methods_used"):
            key_terms.extend([m.split()[:3] and " ".join(m.split()[:3]) for m in analysis["methods_used"][:2]])
        all_text = " ".join([s["content"] for s in sections.values()]) if sections else ""
        matches = sum(1 for term in key_terms if term and term.lower() in all_text.lower())
        results["sections_factual"] = matches >= 1
        if not results["sections_factual"]:
            results["issues"].append("Generated sections do not reference analysis key terms")
        else:
            print(f" Sections reference {matches} key terms from analysis")
    else:
        combined_text = " ".join([s["content"] for s in sections.values()])
        results["sections_factual"] = any(word in combined_text.lower() for word in ["method", "result", "finding", "study"])
        if not results["sections_factual"]:
            results["issues"].append("Generated multi-paper sections may lack method/result mentions")

    required = {"abstract", "introduction", "methods", "results", "conclusion", "references"}
    missing = required - set(sections.keys())
    results["all_sections_present"] = len(missing) == 0
    if missing:
        results["issues"].append(f"Missing sections: {', '.join(sorted(missing))}")

    # Print summary
    passed = sum(1 for key in ["abstract_word_limit", "references_apa_format", "sections_factual", "all_sections_present"] if results.get(key))
    print("\nValidation summary:")
    print(f" Checks passed: {passed}/4")
    if results["issues"]:
        print(" Issues found:")
        for issue in results["issues"]:
            print("  -", issue)
    else:
        print(" No validation issues detected.")

    return results


# -------------------------
# 5. SAVE OUTPUTS
# -------------------------
def save_draft_outputs(sections: Dict[str, Dict[str, Any]], analysis_data: Dict[str, Any], validation_results: Dict[str, Any]) -> str:
    """
    Save generated sections and metadata to /outputs/.

    Returns:
        Path to outputs directory (string).
    """
    outputs_dir = Path("outputs")
    outputs_dir.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    print(f"\n Saving outputs to: {outputs_dir.resolve()}")

    # Save each section as a separate file
    for key, data in sections.items():
        filename = outputs_dir / f"{key}_{timestamp}.txt"
        with filename.open("w", encoding="utf-8") as fh:
            fh.write(f"{data['name']}\n")
            fh.write("=" * len(data["name"]) + "\n\n")
            fh.write(data["content"])
            fh.write(f"\n\n[Word count: {data['word_count']}]\n")
            fh.write(f"[Token count: {data['token_count']}]\n")
        print(f"  Saved: {filename.name}")

    # Save complete draft
    complete = outputs_dir / f"complete_draft_{timestamp}.txt"
    with complete.open("w", encoding="utf-8") as fh:
        fh.write("ACADEMIC DRAFT - RESEARCH PAPER ANALYSIS\n")
        fh.write("=" * 50 + "\n\n")
        fh.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        fh.write(f"Papers analyzed: {analysis_data.get('paper_count', 1)}\n")
        fh.write("-" * 50 + "\n\n")
        for sec in ["abstract", "introduction", "methods", "results", "conclusion", "references"]:
            if sec in sections:
                s = sections[sec]
                fh.write(f"\n{s['name'].upper()}\n")
                fh.write("-" * len(s['name']) + "\n\n")
                fh.write(s["content"] + "\n\n")
    print(f"  Saved complete draft: {complete.name}")

    # Save metadata
    metadata = {
        "generation_date": timestamp,
        "paper_count": analysis_data.get("paper_count", 1),
        "analysis_type": analysis_data.get("type", "unknown"),
        "sections_generated": len(sections),
        "validation_results": validation_results,
        "section_stats": {k: {"word_count": v["word_count"], "token_count": v["token_count"]} for k, v in sections.items()}
    }
    meta_file = outputs_dir / f"draft_metadata_{timestamp}.json"
    with meta_file.open("w", encoding="utf-8") as fh:
        json.dump(metadata, fh, indent=2, ensure_ascii=False)
    print(f"  Saved metadata: {meta_file.name}")

    return str(outputs_dir.resolve())


# -------------------------
# 6. GENERATE REPORT
# -------------------------
def generate_report(sections: Dict[str, Dict[str, Any]], validation_results: Dict[str, Any], output_path: str) -> str:
    """
    Create a short review report summarizing generated content and validation.

    Returns:
        Path to the report file as string.
    """
    outdir = Path(output_path)
    report_file = outdir / "review_report.txt"
    lines: List[str] = []
    lines.append("=" * 80)
    lines.append("REVIEW REPORT - GENERATED DRAFT SECTIONS")
    lines.append("=" * 80)
    lines.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    lines.append("\nOBJECTIVE CHECKLIST:")
    lines.append("-" * 40)

    objectives = [
        ("Abstract (<=100 words)", validation_results.get("abstract_word_limit", False),
         f"Abstract words: {sections.get('abstract', {}).get('word_count', 0)}"),
        ("References basic APA", validation_results.get("references_apa_format", False),
         "Basic APA elements detected" if validation_results.get("references_apa_format") else "May need formatting"),
        ("Sections factually tied", validation_results.get("sections_factual", False),
         "Evidence of analysis terms in sections" if validation_results.get("sections_factual") else "Review factual alignment"),
        ("All sections present", validation_results.get("all_sections_present", False),
         "6/6 sections" if validation_results.get("all_sections_present") else "Missing sections")
    ]

    for title, passed, details in objectives:
        status = "PASSED" if passed else "NEEDS REVIEW"
        lines.append(f"\n{title}:\n  Status: {status}\n  Details: {details}\n")

    lines.append("\nSECTION STATISTICS:")
    lines.append("-" * 40)
    for key, s in sections.items():
        lines.append(f"\n{s['name']}: Words={s['word_count']}, Tokens={s['token_count']}")

    lines.append("\nVALIDATION ISSUES:")
    lines.append("-" * 40)
    if validation_results.get("issues"):
        for issue in validation_results["issues"]:
            lines.append(f"• {issue}")
    else:
        lines.append("No significant issues found")

    lines.append("\nNEXT STEPS:")
    lines.append("-" * 40)
    lines.append("1. Manually verify factual accuracy against original papers.")
    lines.append("2. Edit references to full APA format as needed.")
    lines.append("3. Expand sections if reviewer requests more detail.")

    with report_file.open("w", encoding="utf-8") as fh:
        fh.write("\n".join(lines))
    print(f"\n report saved to: {report_file}")

    return str(report_file)


# -------------------------
# 7. MAIN GENERATION PIPELINE
# -------------------------
def run_draft_generation() -> Optional[Dict[str, Any]]:
    """
    End-to-end pipeline to generate draft sections from analysis data.

    Steps:
      1. Load analysis data from Module 4 outputs
      2. Generate sections using GPTSectionGenerator templates
      3. Validate generated content
      4. Save outputs and generate a review report
    """
    print("\n" + "=" * 72)
    print("GENERATE DRAFT SECTIONS WITH GPT (Pipeline)")
    print("=" * 72)

    analysis_data = load_analysis_data()
    if not analysis_data:
        print("Cannot proceed without analysis data.")
        return None

    paper_count = analysis_data.get("paper_count", 1)
    print(f" Loaded analysis data for {paper_count} paper(s)")

    # Step 2: generate
    sections = generate_all_sections(analysis_data)

    # Step 3: validate
    validation_results = validate_sections(sections, analysis_data)

    # Step 4: save
    output_path = save_draft_outputs(sections, analysis_data, validation_results)

    # Step 5: create review report
    mentor_report = generate_report(sections, validation_results, output_path)

    print("\nGeneration complete. Outputs saved to:", output_path)
    return {"sections": sections, "validation": validation_results, "output_path": output_path}


# -------------------------
# 8. PREVIEW FUNCTION
# -------------------------
def preview_generated_draft() -> None:
    """
    Show a short preview of the most recent complete draft file and metadata.
    """
    outputs_dir = Path("outputs")
    if not outputs_dir.exists():
        print("No outputs found. Run run_draft_generation() first.")
        return

    drafts = sorted(outputs_dir.glob("complete_draft_*.txt"), key=lambda p: p.stat().st_mtime)
    if not drafts:
        print("No complete draft file found in outputs/")
        return

    latest = drafts[-1]
    print("\n" + "=" * 72)
    print("PREVIEW OF GENERATED DRAFT:", latest.name)
    print("=" * 72)
    try:
        with latest.open("r", encoding="utf-8") as fh:
            content = fh.read()
            preview = content[:1000] + ("..." if len(content) > 1000 else "")
            print(preview)
            print(f"\nTotal words in draft: {len(content.split())}")
    except Exception as exc:
        print("Error reading draft:", exc)
        return

    metadata_files = sorted(outputs_dir.glob("draft_metadata_*.json"), key=lambda p: p.stat().st_mtime)
    if metadata_files:
        try:
            with metadata_files[-1].open("r", encoding="utf-8") as fh:
                metadata = json.load(fh)
            val = metadata.get("validation_results", {})
            passed = sum(1 for k, v in val.items() if isinstance(v, bool) and v)
            print(f"\nValidation checks passed (approx): {passed}/4")
        except Exception:
            pass


# -------------------------
# 9. ENTRYPOINT
# -------------------------
if __name__ == "__main__":
    result = run_draft_generation()
    if result:
        print("\nDraft generation finished successfully.")
        try:
            preview = input("Would you like to preview the recent draft? (y/n): ")
        except Exception:
            preview = "n"
        if preview.strip().lower().startswith("y"):
            preview_generated_draft()
    else:
        print("Draft generation did not complete.")


GENERATE DRAFT SECTIONS WITH GPT (Pipeline)
 Loaded analysis data for 4 paper(s)

 GENERATING ACADEMIC DRAFT SECTIONS
 GPTSectionGenerator initialized (simulated, model=gpt-3.5-turbo)

 Generating sections for 4 paper(s)...
  - Generating: Abstract (100 words max)...
    ✓ abstract: 17 words, 25 tokens
  - Generating: Introduction...
    ✓ introduction: 16 words, 20 tokens
  - Generating: Methods Comparison...
    ✓ methods: 17 words, 21 tokens
  - Generating: Results Synthesis...
    ✓ results: 16 words, 23 tokens
  - Generating: Conclusion...
    ✓ conclusion: 30 words, 37 tokens
  - Generating: APA References...
    ✓ references: 20 words, 85 tokens

 VALIDATING GENERATED SECTIONS
 Abstract word count OK: 17/100

Validation summary:
 Checks passed: 3/4
 Issues found:
  - References may not follow basic APA structure

 Saving outputs to: C:\Users\abhis\outputs
  Saved: abstract_20251224_233458.txt
  Saved: introduction_20251224_233458.txt
  Saved: methods_20251224_233458.txt
  Saved

# Module 6 : Validation of Correctness and Completeness of the Extracted Textual Data

In [20]:
import json
import re
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import Dict, Any, List, Optional, Tuple


# -------------------------
# 1. LOAD GENERATED DRAFT
# -------------------------
def load_latest_draft(outputs_dir: str = "outputs") -> Optional[str]:
    """
    Load the most recent aggregated complete draft text.

    Args:
        outputs_dir: Path to outputs folder where complete drafts are saved.

    Returns:
        The draft text (string) or None if not found.
    """
    out_path = Path(outputs_dir)
    if not out_path.exists():
        print("No outputs found. Run Module 5 first.")
        return None

    draft_files = sorted(out_path.glob("complete_draft_*.txt"), key=lambda p: p.stat().st_mtime)
    if not draft_files:
        print("No complete draft found in outputs/")
        return None

    latest = draft_files[-1]
    try:
        with latest.open("r", encoding="utf-8") as fh:
            content = fh.read()
        print(f"Loaded draft: {latest.name}")
        return content
    except Exception as exc:
        print(f"Error reading draft {latest.name}: {exc}")
        return None


def load_individual_sections(outputs_dir: str = "outputs") -> Dict[str, str]:
    """
    Load the latest individual section files (abstract, introduction, etc.).

    Args:
        outputs_dir: Path containing individual section files.

    Returns:
        Dict mapping section keys to section content.
    """
    out_path = Path(outputs_dir)
    if not out_path.exists():
        return {}

    section_patterns = {
        "abstract": "abstract_*.txt",
        "introduction": "introduction_*.txt",
        "methods": "methods_*.txt",
        "results": "results_*.txt",
        "conclusion": "conclusion_*.txt",
        "references": "references_*.txt",
    }

    sections: Dict[str, str] = {}
    for key, pattern in section_patterns.items():
        files = list(out_path.glob(pattern))
        if not files:
            continue
        latest = max(files, key=lambda p: p.stat().st_mtime)
        try:
            with latest.open("r", encoding="utf-8") as fh:
                content = fh.read()
            lines = content.splitlines()
            section_content = "\n".join(lines[2:]) if len(lines) > 2 else content
            sections[key] = section_content.strip()
        except Exception as exc:
            print(f"Error reading section file {latest.name}: {exc}")
            continue

    return sections


# -------------------------
# 2. AGGREGATE FULL DRAFT
# -------------------------
def create_full_draft_markdown(
    sections: Dict[str, str],
    critique_feedback: Optional[Dict[str, Any]] = None,
    title: str = "Research Paper Analysis Review",
) -> str:
    """
    Combine individual sections into a polished markdown draft.

    Args:
        sections: Dict mapping section_key -> content
        critique_feedback: Optional critique results to append as revision notes
        title: Title for the markdown draft

    Returns:
        The combined markdown string.
    """
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    lines: List[str] = []
    lines.append(f"# {title}\n")
    lines.append(f"*Generated: {now}*")
    lines.append(f"*Status: {'Revised' if critique_feedback else 'Initial'} Draft*")
    lines.append("\n---\n")

    section_order = ["abstract", "introduction", "methods", "results", "conclusion", "references"]
    for key in section_order:
        content = sections.get(key)
        if content:
            lines.append(f"\n## {key.upper()}\n")
            lines.append(content)
            lines.append("\n---\n")

    if critique_feedback:
        lines.append("\n## Critique & Revision Notes\n")
        lines.append("### Issues Identified:\n")
        issues_found = False
        checks = critique_feedback.get("checks", {})
        for check_type, check_data in checks.items():
            passed = bool(check_data.get("passed", False))
            suggestion = check_data.get("suggestion", "")
            if not passed:
                issues_found = True
                lines.append(f"- **{check_type.replace('_', ' ').title()}**: {suggestion}")

        if not issues_found:
            lines.append("No major issues identified. Draft is well-structured.")

        lines.append("\n### Suggested Revisions:\n")
        for suggestion in critique_feedback.get("suggestions", [])[:5]:
            lines.append(f"- {suggestion}")

    # Add basic word count
    full_text = "\n".join(lines)
    word_count = len(re.findall(r"\b\w+\b", full_text))
    lines.append(f"\n\n*Word count: {word_count}*")

    return "\n".join(lines)


# -------------------------
# 3. CRITIQUE SYSTEM
# -------------------------
class DraftCritique:
    """
    Critique system that analyzes the draft for clarity, flow, references, repetition,
    academic style, and structure.

    The critique methods return (passed: bool, feedback: List[str]).
    """

    def __init__(self) -> None:
        self.criteria = {
            "clarity": self.check_clarity,
            "flow": self.check_flow,
            "missing_references": self.check_missing_references,
            "repetition": self.check_repetition,
            "style": self.check_academic_style,
            "structure": self.check_structure,
        }

    def critique_draft(self, draft_text: str, sections: Dict[str, str]) -> Dict[str, Any]:
        """
        Run the full critique over the draft.

        Args:
            draft_text: Full draft text (markdown)
            sections: Individual sections dictionary

        Returns:
            critique_results: structured dict containing checks, suggestions, score metrics
        """
        print("Analyzing draft quality...")

        total_checks = len(self.criteria)
        critique_results: Dict[str, Any] = {
            "timestamp": datetime.now().isoformat(),
            "checks": {},
            "suggestions": [],
            "score": 0,
            "total_checks": total_checks,
        }

        passed_checks = 0
        for name, func in self.criteria.items():
            print(f"  • Checking {name}...", end=" ")
            try:
                passed, feedback = func(draft_text, sections)
            except Exception as exc:
                passed = False
                feedback = [f"Error running check: {exc}"]

            critique_results["checks"][name] = {
                "passed": passed,
                "feedback": feedback,
                "suggestion": self.generate_suggestion(name, passed, feedback),
            }

            if passed:
                passed_checks += 1
                print("✅")
            else:
                print("❌")

        critique_results["score"] = passed_checks
        critique_results["passed_checks"] = passed_checks
        critique_results["suggestions"] = self.generate_overall_suggestions(critique_results)

        return critique_results

    # ---------- individual checks ----------
    def check_clarity(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check for clarity issues: long sentences and overuse of passive voice.
        """
        issues: List[str] = []
        # Simple sentence split
        sentences = [s.strip() for s in re.split(r"[.!?]+", draft_text) if s.strip()]
        long_sentences = [s for s in sentences if len(s.split()) > 40]
        if long_sentences:
            issues.append(f"{len(long_sentences)} sentences are very long (>40 words)")

        passive_patterns = [r"\b(is|are|was|were)\s+\w+ed\b", r"\bbe\s+\w+ed\b"]
        passive_count = sum(len(re.findall(pat, draft_text.lower())) for pat in passive_patterns)
        if passive_count > 10:
            issues.append(f"High use of passive voice ({passive_count} instances)")

        passed = len(issues) == 0
        return passed, issues

    def check_flow(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check logical flow between sections: presence of required sections and referencing.
        """
        issues: List[str] = []
        required_order = ["abstract", "introduction", "methods", "results", "conclusion"]
        missing = [s for s in required_order if s not in sections]
        if missing:
            issues.append(f"Missing sections: {', '.join(missing)}")

        if "conclusion" in sections and "introduction" in sections:
            intro_keywords = ["paper", "study", "research", "analysis"]
            conclusion_text = sections.get("conclusion", "").lower()
            if not any(k in conclusion_text for k in intro_keywords):
                issues.append("Conclusion does not clearly reference introduction/key aims")

        passed = len(issues) == 0
        return passed, issues

    def check_missing_references(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check for basic reference completeness (years, authors, count).
        """
        issues: List[str] = []
        if "references" in sections:
            ref_text = sections["references"]
            has_years = bool(re.search(r"\(\d{4}\)", ref_text))
            has_authors = bool(re.search(r"[A-Z][a-z]+,\s+[A-Z]\.", ref_text))
            if not has_years:
                issues.append("References missing publication years")
            if not has_authors:
                issues.append("References may be missing author names or initials")

            ref_lines = [line for line in ref_text.splitlines() if line.strip()]
            if len(ref_lines) < 3:
                issues.append(f"Only {len(ref_lines)} references found (suggest 5+ for review)")
        else:
            issues.append("No references section found")

        passed = len(issues) == 0
        return passed, issues

    def check_repetition(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check for repetitive words/phrases and similar section openings.
        """
        issues: List[str] = []
        words = re.findall(r"\b\w+\b", draft_text.lower())
        freq: Dict[str, int] = defaultdict(int)
        for w in words:
            if len(w) > 4:
                freq[w] += 1

        common_exclude = {"paper", "study", "research", "analysis", "method", "result", "finding"}
        overused = sorted([(w, c) for w, c in freq.items() if c > 5 and w not in common_exclude], key=lambda x: -x[1])
        if overused:
            top = overused[:3]
            issues.append("Overused words: " + ", ".join([f"{w}({c})" for w, c in top]))

        section_texts = list(sections.values())
        for i in range(len(section_texts)):
            for j in range(i + 1, len(section_texts)):
                a_start = section_texts[i][:50].strip()
                b_start = section_texts[j][:50].strip()
                if a_start and a_start == b_start:
                    issues.append("Two sections share the same opening text — possible duplication")
                    break

        passed = len(issues) == 0
        return passed, issues

    def check_academic_style(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check for informal language, first-person usage, and paragraph length.
        """
        issues: List[str] = []
        informal_words = ["really", "very", "a lot", "got", "stuff", "thing"]
        informal_count = sum(draft_text.lower().count(w) for w in informal_words)
        if informal_count > 3:
            issues.append(f"Informal language used ({informal_count} instances)")

        first_person_matches = re.findall(r"\b(I|we|our|us|my|mine)\b", draft_text, flags=re.I)
        if len(first_person_matches) > 5:
            issues.append(f"High use of first-person pronouns ({len(first_person_matches)})")

        paragraphs = [p for p in draft_text.split("\n\n") if p.strip()]
        short_paragraphs = [p for p in paragraphs if len(p.split()) < 50]
        if len(short_paragraphs) > 3:
            issues.append(f"{len(short_paragraphs)} very short paragraphs (<50 words)")

        passed = len(issues) == 0
        return passed, issues

    def check_structure(self, draft_text: str, sections: Dict[str, str]) -> Tuple[bool, List[str]]:
        """
        Check that required sections exist and have reasonable lengths.
        """
        issues: List[str] = []
        required = {"abstract", "introduction", "methods", "results", "conclusion", "references"}
        present = set(sections.keys())
        missing = required - present
        if missing:
            issues.append(f"Missing required sections: {', '.join(sorted(missing))}")

        for sec, content in sections.items():
            words = len(re.findall(r"\b\w+\b", content))

            if sec == "abstract" and words > 150:
                issues.append(f"Abstract too long ({words} words; aim for <150)")

            if sec == "introduction" and words < 100:
                issues.append(f"Introduction too short ({words} words; aim for 100+)")

        if "methods" in sections and "results" in sections and "conclusion" in sections:
            m_len = len(re.findall(r"\b\w+\b", sections.get("methods", "")))
            r_len = len(re.findall(r"\b\w+\b", sections.get("results", "")))
            c_len = len(re.findall(r"\b\w+\b", sections.get("conclusion", "")))
            if c_len > (m_len + r_len):
                issues.append("Conclusion unusually long compared to Methods and Results")

        passed = len(issues) == 0
        return passed, issues

    # ---------- suggestion helpers ----------
    def generate_suggestion(self, criterion: str, passed: bool, feedback: List[str]) -> str:
        """
        Create a short actionable suggestion for a failed criterion.
        """
        base_recs = {
            "clarity": "Use shorter sentences and prefer active voice.",
            "flow": "Add clear transitions and ensure the conclusion ties back to the introduction.",
            "missing_references": "Expand and format the references list (use APA).",
            "repetition": "Vary vocabulary and rephrase repeated phrases.",
            "style": "Avoid informal words and excessive first-person pronouns.",
            "structure": "Ensure all required sections are present and balanced in length.",
        }
        if passed:
            return f"{criterion.title()} is fine."
        suggestion = base_recs.get(criterion, "Review and improve this area.")
        if feedback:
            return f"{suggestion} Issues: {'; '.join(feedback[:3])}"
        return suggestion

    def generate_overall_suggestions(self, critique_results: Dict[str, Any]) -> List[str]:
        """
        Create a concise list of next-step suggestions based on failed checks.
        """
        suggestions: List[str] = []
        failed = [name for name, c in critique_results.get("checks", {}).items() if not c.get("passed")]
        if not failed:
            suggestions.append("Draft is well-structured. Minor polishing only needed.")
            suggestions.append("Check references formatting before submission.")
            return suggestions

        if "clarity" in failed:
            suggestions.append("Revise long sentences; convert passive voice to active where appropriate.")
        if "flow" in failed:
            suggestions.append("Add transition sentences between sections; make conclusion reference introduction.")
        if "missing_references" in failed:
            suggestions.append("Add 2-3 more relevant citations and ensure APA format.")
        if "repetition" in failed:
            suggestions.append("Replace frequently repeated words with synonyms.")
        if "style" in failed:
            suggestions.append("Eliminate informal language; reduce first-person pronouns.")
        if "structure" in failed:
            suggestions.append("Ensure all sections are present and appropriately balanced by length.")

        # General final tips
        suggestions.append("Read the draft aloud to catch awkward phrasing.")
        suggestions.append("Have a peer review for factual accuracy.")
        return suggestions[:7]


# -------------------------
# 4. REVISION CYCLE
# -------------------------
def run_revision_cycle(
    draft_text: str,
    sections: Dict[str, str],
    critique_results: Dict[str, Any],
    iteration: int = 1,
) -> Tuple[str, Dict[str, str]]:
    """
    Run a single revision cycle applying simple automated fixes based on critique.

    Args:
        draft_text: Full draft text (not strictly required for some operations)
        sections: Individual section contents
        critique_results: Output of DraftCritique.critique_draft
        iteration: Current iteration number (for logging)

    Returns:
        revised_draft: New markdown draft text
        revised_sections: Updated sections dict
    """
    print(f"Running revision cycle {iteration}...")
    revised = dict(sections)

    checks = critique_results.get("checks", {})
    for criterion, info in checks.items():
        if not info.get("passed"):
            feedback = info.get("feedback", [])
            revised = apply_revisions(revised, criterion, feedback)

    revised_draft = create_full_draft_markdown(revised, critique_results)
    return revised_draft, revised


def apply_revisions(sections: Dict[str, str], criterion: str, feedback: List[str]) -> Dict[str, str]:
    """
    Apply rule-based revisions to sections for a specific failing criterion.

    This function uses conservative edits — it avoids altering factual content,
    instead focusing on surface-level improvements (sentence splitting, synonyms, small expansions).

    Args:
        sections: dict of section_name -> content
        criterion: the name of the failed criterion
        feedback: list of feedback strings from the critique

    Returns:
        revised_sections: updated sections dict
    """
    revised = dict(sections) 

    if criterion == "clarity":
        for name, content in list(revised.items()):
            sentences = re.split(r'(?<=[.!?])\s+', content)
            new_sentences: List[str] = []
            for s in sentences:
                words = s.split()
                if len(words) > 60:
                    mid = len(words) // 2
                    new_sentences.append(" ".join(words[:mid]) + ".")
                    new_sentences.append(" ".join(words[mid:]) + ".")
                else:
                    new_sentences.append(s)
            revised[name] = " ".join(new_sentences).strip()

    elif criterion == "repetition":
        replacements = {
            "paper": "study",
            "research": "investigation",
            "analysis": "examination",
            "method": "approach",
            "result": "finding"
        }
        for name in ("abstract", "conclusion"):
            if name in revised:
                content = revised[name]
                for old, new in replacements.items():
                    if content.lower().count(old) > 2:
                        content = re.sub(rf'\b{old}\b', new, content, count=2, flags=re.I)
                revised[name] = content

    elif criterion == "structure":
        intro = revised.get("introduction", "")
        intro_words = len(re.findall(r"\b\w+\b", intro))
        if intro and intro_words < 100:
            addition = (
                " This analysis provides a more detailed examination of the methodological "
                "approaches and findings. The review situates the work within the broader "
                "research context and evaluates implications and potential improvements."
            )
            revised["introduction"] = (intro + " " + addition).strip()

    elif criterion == "missing_references":
        if "references" in revised:
            if "PLEASE_ADD_FULL_REFERENCES" not in revised["references"]:
                revised["references"] = revised["references"].strip() + "\n\n[PLEASE_ADD_FULL_REFERENCES]"
        else:
            revised["references"] = "[PLEASE_ADD_FULL_REFERENCES]"

    elif criterion == "flow":
        if "conclusion" in revised and "introduction" in revised:
            cons = revised["conclusion"]
            intro_title = "the introduction"
            trans = "In line with the introduction, this conclusion revisits the main aims and synthesizes the outcomes."
            if trans not in cons:
                revised["conclusion"] = trans + "\n\n" + cons

    elif criterion == "style":
        for name, content in list(revised.items()):
            for informal in [" really ", " a lot ", " got ", " stuff ", " thing "]:
                if informal in content:
                    content = content.replace(informal, " ")
            revised[name] = content

    return revised


# -------------------------
# 5. SAVE OUTPUTS
# -------------------------
def save_critique_results(critique_results: Dict[str, Any], outputs_dir: str = "outputs", iteration: int = 1) -> str:
    """
    Save critique feedback JSON to outputs/critique_feedback_iteration_{n}.json

    Returns:
        Path (string) to saved JSON file
    """
    out_path = Path(outputs_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    filename = out_path / f"critique_feedback_iteration_{iteration}.json"
    try:
        with filename.open("w", encoding="utf-8") as fh:
            json.dump(critique_results, fh, indent=2, ensure_ascii=False)
        print(f"Critique feedback saved: {filename.name}")
        return str(filename)
    except Exception as exc:
        print(f"Error saving critique feedback: {exc}")
        return ""


def save_revised_draft(revised_draft: str, outputs_dir: str = "outputs", iteration: int = 1) -> str:
    """
    Save revised draft markdown and plain text files.

    Returns:
        Path to the primary saved markdown file (string).
    """
    out_path = Path(outputs_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    md_file = out_path / f"revised_draft_iteration_{iteration}.md"
    txt_file = out_path / f"revised_draft_iteration_{iteration}.txt"

    try:
        with md_file.open("w", encoding="utf-8") as fh:
            fh.write(revised_draft)
        with txt_file.open("w", encoding="utf-8") as fh:
            fh.write(revised_draft)
        print(f"Revised draft saved: {md_file.name}")
        return str(md_file)
    except Exception as exc:
        print(f"Error saving revised draft: {exc}")
        return ""


def save_revision_summary(
    original_critique: Dict[str, Any],
    revised_critique: Optional[Dict[str, Any]],
    iterations: int,
    outputs_dir: str = "outputs",
) -> str:
    """
    Save a revision summary JSON that compares original and final critique scores.

    Returns:
        Path (string) to saved summary file.
    """
    out_path = Path(outputs_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    summary = {
        "revision_date": datetime.now().isoformat(),
        "total_iterations": iterations,
        "improvement_summary": {
            "original_score": original_critique.get("score", 0),
            "final_score": revised_critique.get("score", 0) if revised_critique else original_critique.get("score", 0),
            "improvement": (revised_critique.get("score", 0) - original_critique.get("score", 0)) if revised_critique else 0,
        },
        "issues_resolved": [],
        "remaining_issues": [],
    }

    if revised_critique:
        for crit in original_critique.get("checks", {}):
            orig_passed = bool(original_critique["checks"][crit].get("passed", False))
            new_passed = bool(revised_critique["checks"].get(crit, {}).get("passed", False))
            if not orig_passed and new_passed:
                summary["issues_resolved"].append(crit)
            elif not orig_passed and not new_passed:
                summary["remaining_issues"].append(crit)

    filename = out_path / "revision_summary.json"
    try:
        with filename.open("w", encoding="utf-8") as fh:
            json.dump(summary, fh, indent=2, ensure_ascii=False)
        print(f"Revision summary saved: {filename.name}")
        return str(filename)
    except Exception as exc:
        print(f"Error saving revision summary: {exc}")
        return ""


# -------------------------
# 6. MAIN PIPELINE
# -------------------------
def run_draft_aggregation_and_critique(max_iterations: int = 2, outputs_dir: str = "outputs") -> Optional[Dict[str, Any]]:
    """
    Main pipeline to aggregate draft, critique, run revisions, and save results.

    Args:
        max_iterations: Number of automated revision cycles to run (conservative edits).
        outputs_dir: Directory to read/write outputs.

    Returns:
        summary dict with initial/final critiques and file paths, or None if pipeline fails.
    """
    print("\n" + "=" * 72)
    print("DRAFT AGGREGATION & CRITIQUE MODULE")
    print("=" * 72)

    print("STEP 1: Loading generated draft...")
    draft_text = load_latest_draft(outputs_dir)
    if not draft_text:
        return None

    print("Loading individual sections...")
    sections = load_individual_sections(outputs_dir)
    print(f"  Loaded {len(sections)} sections")

    print("STEP 2: Creating full markdown draft...")
    full_draft = create_full_draft_markdown(sections)

    # Save initial full draft snapshot
    out_path = Path(outputs_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    initial_file = out_path / "full_draft_initial.md"
    try:
        with initial_file.open("w", encoding="utf-8") as fh:
            fh.write(full_draft)
        print(f"Full draft saved: {initial_file.name}")
    except Exception as exc:
        print(f"Error saving initial draft: {exc}")

    # Step 3: Run critique
    print("STEP 3: Running draft critique...")
    critic = DraftCritique()
    critique_results = critic.critique_draft(full_draft, sections)
    print(f"Critique Score: {critique_results['score']}/{critique_results['total_checks']}")

    # Save initial critique
    save_critique_results(critique_results, outputs_dir, iteration=1)

    # Step 4: Revision cycles
    current_sections = sections
    current_critique = critique_results
    revised_files: List[str] = []
    revised_critique: Optional[Dict[str, Any]] = None

    for iteration in range(1, max_iterations + 1):
        print(f"\nIteration {iteration}/{max_iterations}")
        revised_draft, revised_sections = run_revision_cycle(full_draft, current_sections, current_critique, iteration)
        saved_path = save_revised_draft(revised_draft, outputs_dir, iteration)
        revised_files.append(saved_path)

        revised_critique = critic.critique_draft(revised_draft, revised_sections)
        save_critique_results(revised_critique, outputs_dir, iteration + 1)

        # Prepare for next iteration
        current_sections = revised_sections
        current_critique = revised_critique
        # update full_draft for the next revision cycle
        full_draft = revised_draft
        print(f"  Score after revision {iteration}: {revised_critique['score']}/{revised_critique['total_checks']}")

    # Step 5: Create final summary
    print("\nSTEP 5: Creating revision summary...")
    summary_file = save_revision_summary(critique_results, revised_critique, max_iterations, outputs_dir)

    print("\n" + "=" * 72)
    print("COMPLETE!")
    print("=" * 72)

    summary = {
        "initial_draft_file": str(initial_file),
        "initial_critique": critique_results,
        "revised_files": revised_files,
        "final_critique": revised_critique,
        "revision_summary_file": summary_file,
    }

    # Print short overview
    print("\nOUTPUTS GENERATED:")
    print(f" • {initial_file.name} - Initial full draft")
    for i, f in enumerate(revised_files, 1):
        print(f" • revised_draft_iteration_{i}.md - Revised draft v{i}")
    print(f" • revision_summary.json - Revision summary")

    return summary


# -------------------------
# 7. PREVIEW FUNCTION
# -------------------------
def preview_critique_results(outputs_dir: str = "outputs") -> None:
    """
    Preview the latest critique JSON and print top suggestions.

    Args:
        outputs_dir: Directory with critique JSON files.
    """
    out_path = Path(outputs_dir)
    critique_files = sorted(out_path.glob("critique_feedback_iteration_*.json"), key=lambda p: p.stat().st_mtime)
    if not critique_files:
        print("No critique files found")
        return

    latest = critique_files[-1]
    try:
        with latest.open("r", encoding="utf-8") as fh:
            data = json.load(fh)
        print("\n" + "=" * 72)
        print("CRITIQUE RESULTS PREVIEW")
        print("=" * 72)
        print(f"File: {latest.name}")
        print(f"Score: {data.get('score', 0)}/{data.get('total_checks', 0)}")
        print("\nTop Suggestions:")
        for i, sug in enumerate(data.get("suggestions", [])[:5], 1):
            print(f"{i}. {sug}")
    except Exception as exc:
        print(f"Error reading critique file {latest.name}: {exc}")


# -------------------------
# 8. ENTRYPOINT
# -------------------------
if __name__ == "__main__":
    results = run_draft_aggregation_and_critique(max_iterations=2)
    if results:
        print("\nDRAFT AGGREGATION & CRITIQUE SUCCESSFUL!")
        try:
            preview = input("Would you like to preview critique results? (y/n): ").strip().lower()
        except Exception:
            preview = "n"
        if preview and preview.startswith("y"):
            preview_critique_results()
    else:
        print("Draft aggregation & critique did not run to completion.")


DRAFT AGGREGATION & CRITIQUE MODULE
STEP 1: Loading generated draft...
Loaded draft: complete_draft_20251224_233458.txt
Loading individual sections...
  Loaded 6 sections
STEP 2: Creating full markdown draft...
Full draft saved: full_draft_initial.md
STEP 3: Running draft critique...
Analyzing draft quality...
  • Checking clarity... ✅
  • Checking flow... ✅
  • Checking missing_references... ❌
  • Checking repetition... ❌
  • Checking style... ❌
  • Checking structure... ❌
Critique Score: 2/6
Critique feedback saved: critique_feedback_iteration_1.json

Iteration 1/2
Running revision cycle 1...
Revised draft saved: revised_draft_iteration_1.md
Analyzing draft quality...
  • Checking clarity... ✅
  • Checking flow... ✅
  • Checking missing_references... ❌
  • Checking repetition... ❌
  • Checking style... ❌
  • Checking structure... ❌
Critique feedback saved: critique_feedback_iteration_2.json
  Score after revision 1: 2/6

Iteration 2/2
Running revision cycle 2...
Revised draft saved: