In [3]:
# ============================================
# MODULE 1: Topic Input & Paper Search (arXiv)
# ============================================

!pip install feedparser requests python-dotenv -q

import json
import os
import requests
import feedparser
from dotenv import load_dotenv

# ====================
# 1. SETUP
# ====================

def setup_arxiv():
    """No-op initializer kept for pipeline symmetry.

    The arXiv API needs no credentials, so this only prints a
    confirmation message.

    Returns:
        Always True.
    """
    status_message = "arXiv API initialized (no API key required)"
    print(status_message)
    return True


# ====================
# 2. PAPER SEARCH (arXiv)
# ====================

def _arxiv_entry_to_paper(entry):
    """Convert one feedparser entry into the pipeline's paper dictionary.

    Missing fields fall back to empty values so that a single incomplete
    entry does not abort the whole search.
    """
    pdf_link = None
    for link in entry.get("links", []):
        # The "alternate" link is the HTML abstract page, never the PDF.
        if link.get("rel") == "alternate":
            continue
        if link.get("type") == "application/pdf":
            pdf_link = link.get("href")
            break

    entry_id = entry.get("id")
    return {
        "title": entry.get("title", ""),
        "authors": [a.get("name", "") for a in entry.get("authors", [])],
        "year": entry.get("published", "")[:4],
        "paperId": entry_id,                      # arXiv ID
        "abstract": entry.get("summary", "").replace("\n", " ").strip(),
        "citationCount": None,                    # arXiv does NOT provide citations
        "venue": "arXiv",
        "url": entry_id,
        "pdf_url": pdf_link,
        "has_pdf": pdf_link is not None
    }


def search_papers(topic, limit=20):
    """
    Search arXiv for papers on a given topic.

    Args:
        topic: Free-text search phrase, sent to arXiv as ``all:<topic>``.
        limit: Maximum number of results to request.

    Returns:
        A dictionary with the keys ``topic``, ``search_timestamp``
        (ISO-8601 local time), ``total_results``, ``papers_with_pdf`` and
        ``papers`` (a list of per-paper dictionaries), or None when the
        request or the feed parsing fails.
    """
    # Local import: this cell does not import datetime at the top level.
    from datetime import datetime

    print(f"\n Searching for papers on: '{topic}'")
    print(f"   Requesting {limit} papers from arXiv...")

    setup_arxiv()

    # Let requests encode the query so that characters such as '&', '#'
    # or '+' inside the topic cannot corrupt the query string.
    base_url = "http://export.arxiv.org/api/query"
    params = {
        "search_query": f"all:{topic}",
        "start": 0,
        "max_results": limit,
    }

    try:
        response = requests.get(base_url, params=params, timeout=20)
    except requests.RequestException as e:
        print(f" Error searching arXiv: {e}")
        return None

    if response.status_code != 200:
        print(" Error fetching arXiv results:", response.status_code)
        return None

    try:
        feed = feedparser.parse(response.text)
        papers = [_arxiv_entry_to_paper(entry) for entry in feed.entries]
    except Exception as e:
        # Boundary of the module: report and signal failure to the caller.
        print(f" Error searching arXiv: {e}")
        return None

    papers_with_pdf = sum(1 for p in papers if p["has_pdf"])

    print(" Search complete!")
    print(f"   Total papers found: {len(papers)}")
    print(f"   Papers with PDF available: {papers_with_pdf}")

    return {
        "topic": topic,
        "search_timestamp": datetime.now().isoformat(),
        "total_results": len(papers),
        "papers_with_pdf": papers_with_pdf,
        "papers": papers
    }


# ====================
# 3. SAVE METADATA
# ====================

def save_search_results(data, filename=None):
    """
    Persist a search-result dictionary as JSON under data/search_results.

    Args:
        data: Result dictionary; ``data["topic"]`` is used to derive the
            file name when ``filename`` is not given.
        filename: Optional explicit file name (without directory).

    Returns:
        The path of the written file.
    """
    target_dir = "data/search_results"

    if not filename:
        # Keep only letters, digits and spaces, then turn spaces into "_".
        kept_chars = [ch for ch in data["topic"] if ch == " " or ch.isalnum()]
        slug = "".join(kept_chars).replace(" ", "_")
        filename = f"paper_search_results_{slug}.json"

    os.makedirs(target_dir, exist_ok=True)
    filepath = os.path.join(target_dir, filename)

    with open(filepath, "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)

    print(f" Search results saved to: {filepath}")
    return filepath


# ====================
# 4. DISPLAY RESULTS
# ====================

def display_search_results(data, max_display=10):
    """
    Print a human-readable summary of a search-result dictionary.

    Args:
        data: Dictionary with at least the keys ``topic`` and ``papers``.
        max_display: Maximum number of papers to list in detail.

    Returns:
        None. Output goes to stdout only.
    """
    if not data or "papers" not in data:
        print("No data to display")
        return

    papers = data["papers"]
    rule = "=" * 80
    with_pdf = sum(1 for p in papers if p['has_pdf'])
    without_pdf = len(papers) - with_pdf
    shown = min(max_display, len(papers))

    print("\n" + rule)
    print(f"SEARCH RESULTS: {data['topic']}")
    print(rule)

    print("\nStatistics:")
    print(f"  • Total papers: {len(papers)}")
    print(f"  • Papers with PDF: {with_pdf}")
    print(f"  • Papers without PDF: {without_pdf}")

    print(f"\n Top {shown} Papers:")
    print("-" * 80)

    for number, paper in enumerate(papers[:max_display], start=1):
        title = paper['title']
        title_suffix = '...' if len(title) > 80 else ''
        authors = paper['authors']
        authors_suffix = "..." if len(authors) > 3 else ""
        pdf_mark = '✅' if paper['has_pdf'] else '❌'

        print(f"\n{number}. {title[:80]}{title_suffix}")
        print(f"   Authors: {', '.join(authors[:3])}" + authors_suffix)
        print(f"   Year: {paper['year']} | Citations: {paper['citationCount']}")
        print(f"   PDF Available: {pdf_mark}")
        print(f"   Abstract: {paper['abstract'][:100]}...")


# ====================
# 5. MAIN SEARCH FUNCTION
# ====================

def main_search():
    """Interactive driver for Module 1.

    Prompts for a topic (defaulting to "machine learning" when the input
    is blank), runs the arXiv search, saves the results and displays them.

    Returns:
        ``(results, save_path)`` on success, ``(None, None)`` when the
        search yields nothing.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("MODULE 1: TOPIC INPUT & PAPER SEARCH (arXiv)")
    print(banner)

    # A blank answer falls back to the default topic.
    topic = input("\nEnter research topic: ").strip() or "machine learning"

    results = search_papers(topic, limit=20)

    if not results:
        print(" No results found. Please try a different topic.")
        return None, None

    save_path = save_search_results(results)
    display_search_results(results)

    print(f"\n Module 1 complete! Results saved to: {save_path}")
    print("   Proceed to Module 2 for paper selection and PDF download.")

    return results, save_path


# Run the interactive Module 1 search when this code is executed directly.
if __name__ == "__main__":
    main_search()



MODULE 1: TOPIC INPUT & PAPER SEARCH (arXiv)

Enter research topic: Stock price forecasting

 Searching for papers on: 'Stock price forecasting'
   Requesting 20 papers from arXiv...
arXiv API initialized (no API key required)
 Search complete!
   Total papers found: 20
   Papers with PDF available: 20
 Search results saved to: data/search_results/paper_search_results_Stock_price_forecasting.json

SEARCH RESULTS: Stock price forecasting

Statistics:
  • Total papers: 20
  • Papers with PDF: 20
  • Papers without PDF: 0

 Top 10 Papers:
--------------------------------------------------------------------------------

1. Probabilistic Forecasting in Day-Ahead Electricity Markets: Simulating Peak and ...
   Authors: Peru Muniain, Florian Ziel
   Year: 2018 | Citations: None
   PDF Available: ✅
   Abstract: In this paper we include dependency structures for electricity price forecasting and forecasting eva...

2. Machine learning approach to stock price crash risk
   Authors: Abdullah Kar

In [4]:
# ============================================
# MODULE 2: Paper Selection & PDF Download (arXiv Version)
# ============================================

!pip install PyMuPDF requests -q

import json
import os
import requests
import fitz   # PyMuPDF
import hashlib
from datetime import datetime

# ============================================================
# 1. LOAD SEARCH RESULTS
# ============================================================

def load_search_results(filepath=None):
    """
    Load a saved arXiv search-result JSON file.

    Args:
        filepath: Path of the JSON file. When omitted, the most recently
            modified ``*.json`` file in data/search_results is used.

    Returns:
        The parsed dictionary, or None when nothing could be loaded.
    """
    if not filepath:
        results_dir = "data/search_results"

        if not os.path.exists(results_dir):
            print(" Search results folder not found.")
            return None

        candidates = [name for name in os.listdir(results_dir) if name.endswith('.json')]
        if not candidates:
            print(" No search results found.")
            return None

        # Pick the file with the newest modification time.
        newest = max(
            candidates,
            key=lambda name: os.path.getmtime(os.path.join(results_dir, name)),
        )
        filepath = os.path.join(results_dir, newest)
        print(f" Loading latest search results: {newest}")

    try:
        with open(filepath, "r", encoding="utf-8") as handle:
            data = json.load(handle)

        print(f" Loaded {len(data['papers'])} papers for topic '{data['topic']}'")
        return data

    except Exception as err:
        print(" Error loading:", err)
        return None


# ============================================================
# 2. FILTER & SELECT PAPERS
# ============================================================

def filter_papers_with_pdfs(papers):
    """
    Keep only the papers that carry a non-empty ``pdf_url``.

    arXiv PDF links typically look like:
    https://arxiv.org/pdf/XXXX.XXXX.pdf

    Returns:
        A new list with the papers that have a PDF link, in input order.
    """
    available = []
    for paper in papers:
        if paper.get("pdf_url"):
            available.append(paper)

    print("\n PDF Availability:")
    print(f" • Total papers: {len(papers)}")
    print(f" • Papers with PDF: {len(available)}")

    return available


def rank_papers(papers):
    """
    Rank papers newest first.

    Ranking criteria:
    - newest year first
    - (optional future ranking: citations)

    The ``year`` field is stored as a string by the search module, so it
    is converted to an integer before comparison. Papers with a missing
    or unparsable year are treated as year 0 and therefore sort last,
    instead of raising ``TypeError`` from a str/int comparison.

    Args:
        papers: Iterable of paper dictionaries.

    Returns:
        A new list sorted by descending year; ties keep their input order.
    """
    def _year_key(paper):
        try:
            return int(paper.get("year") or 0)
        except (TypeError, ValueError):
            return 0

    return sorted(papers, key=_year_key, reverse=True)


def select_top_papers(papers, count=3):
    """Pick the ``count`` highest-ranked papers that have a PDF link.

    Filters with ``filter_papers_with_pdfs``, orders with ``rank_papers``,
    prints a short listing of the selection and returns it.
    """
    ranked = rank_papers(filter_papers_with_pdfs(papers))
    selected = ranked[:count]

    print(f"\n Selected top {len(selected)} papers:")
    for position, paper in enumerate(selected, start=1):
        first_authors = ', '.join(paper['authors'][:3])
        print(f"\n{position}. {paper['title'][:70]}...")
        print(f"   Year: {paper['year']}")
        print(f"   Authors: {first_authors}")

    return selected


# ============================================================
# 3. DOWNLOAD PDF
# ============================================================

def download_pdf_with_verification(url, filename, max_retries=2):
    """
    Download a PDF and keep it only if it passes verification.

    Args:
        url: Address of the PDF.
        filename: Local path to write the file to.
        max_retries: Total number of download attempts.

    Returns:
        True as soon as one attempt produces a file accepted by
        ``verify_pdf``; False when every attempt fails.
    """
    request_headers = {"User-Agent": "Mozilla/5.0"}

    for attempt_no in range(1, max_retries + 1):
        try:
            print(f"  Attempt {attempt_no}/{max_retries}")
            reply = requests.get(url, headers=request_headers, timeout=30)

            if reply.status_code != 200:
                print("   HTTP Error:", reply.status_code)
                continue

            # Every PDF starts with the magic bytes "%PDF".
            body = reply.content
            if body[:4] != b"%PDF":
                print("   Not a PDF file")
                continue

            with open(filename, "wb") as target:
                target.write(body)

            if not verify_pdf(filename):
                print("   Invalid PDF, retrying...")
                os.remove(filename)
            else:
                print("   PDF verified successfully.")
                return True

        except Exception as err:
            print("   Error:", err)

    return False


def verify_pdf(filepath):
    """
    Verify PDF integrity.

    A file is accepted when it exists, is at least 2048 bytes long and
    can be opened by PyMuPDF with at least one page.

    Args:
        filepath: Path of the file to check.

    Returns:
        True when the file passes all checks, False otherwise (including
        when opening the file raises an error).
    """
    try:
        if not os.path.exists(filepath):
            return False

        # Anything smaller is treated as a truncated or placeholder file.
        if os.path.getsize(filepath) < 2048:
            return False

        with fitz.open(filepath) as doc:
            return len(doc) > 0

    except Exception:
        # Not a bare ``except``: KeyboardInterrupt / SystemExit must still
        # propagate so the user can interrupt a long batch.
        return False


def get_pdf_info(filepath):
    """
    Collect basic facts about a PDF file.

    Args:
        filepath: Path of the PDF.

    Returns:
        A dictionary with ``pages`` (page count), ``size_mb`` (file size in
        MiB, rounded to two decimals) and ``is_valid``. When the file
        cannot be opened, ``is_valid`` is False and the other two keys are
        zero, so callers can always read all three keys.
    """
    try:
        with fitz.open(filepath) as doc:
            return {
                "pages": len(doc),
                "size_mb": round(os.path.getsize(filepath) / (1024 * 1024), 2),
                "is_valid": True
            }
    except Exception:
        # Not a bare ``except``: let KeyboardInterrupt / SystemExit through.
        return {"pages": 0, "size_mb": 0.0, "is_valid": False}


def download_selected_papers(selected, output_dir="downloads"):
    """
    Download the PDFs of the selected papers into ``output_dir``.

    Each paper dictionary is updated in place: ``downloaded`` is always
    set, and on success ``local_path``, ``pdf_info`` and ``download_time``
    are added.

    Args:
        selected: List of paper dictionaries (``title`` and ``pdf_url``
            are read).
        output_dir: Directory for the downloaded files; created if needed.

    Returns:
        The list of papers whose download succeeded.
    """
    os.makedirs(output_dir, exist_ok=True)

    print("\n Starting Downloads...")
    print("-" * 60)

    downloaded = []

    for i, paper in enumerate(selected):
        print(f"\n[{i+1}/{len(selected)}] {paper['title'][:60]}...")

        safe_title = "".join(c for c in paper["title"] if c.isalnum() or c in " _-")
        safe_title = safe_title[:60]

        # A short hash of the title keeps file names unique and filesystem-safe.
        digest = hashlib.md5(safe_title.encode()).hexdigest()[:8]
        filename = f"{output_dir}/paper_{i+1}_{digest}.pdf"

        # A paper without a link counts as a failed download, not a crash.
        pdf_url = paper.get("pdf_url")
        success = bool(pdf_url) and download_pdf_with_verification(pdf_url, filename)

        if success:
            pdf_info = get_pdf_info(filename)
            paper["downloaded"] = True
            paper["local_path"] = filename
            paper["pdf_info"] = pdf_info
            paper["download_time"] = datetime.now().isoformat()

            downloaded.append(paper)
            # .get(): the info dict may lack these keys if the file could
            # not be re-opened after verification.
            print(f"   SUCCESS: {pdf_info.get('pages', '?')} pages ({pdf_info.get('size_mb', '?')} MB)")
        else:
            print("   FAILED to download")
            paper["downloaded"] = False

    return downloaded


# ============================================================
# 4. SAVE REPORT
# ============================================================

def save_download_report(downloaded, topic, output_dir="downloads"):
    """
    Write a JSON report about a download run to data/reports.

    Args:
        downloaded: List of paper dictionaries; the ``downloaded`` flag of
            each one is counted (a missing flag counts as failed).
        topic: Search topic the papers belong to.
        output_dir: Accepted for interface compatibility; not used here.

    Returns:
        The path of the written report file.
    """
    # One timestamp for both the report body and the file name.
    now = datetime.now()
    successful = sum(1 for p in downloaded if p.get("downloaded"))

    report = {
        "topic": topic,
        "timestamp": now.isoformat(),
        "total_selected": len(downloaded),
        "successful": successful,
        "failed": len(downloaded) - successful,
        "papers": downloaded
    }

    os.makedirs("data/reports", exist_ok=True)
    file = f"data/reports/download_report_{now.strftime('%Y%m%d_%H%M%S')}.json"

    with open(file, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII titles/authors readable, in
        # line with how the search results are saved.
        json.dump(report, f, indent=4, ensure_ascii=False)

    print(f"\n Report saved to: {file}")
    return file


# ============================================================
# 5. VERIFY DIRECTORY
# ============================================================

def verify_downloads(output_dir="downloads"):
    """
    Check every ``*.pdf`` file in ``output_dir`` and print a summary.

    Args:
        output_dir: Directory to scan.

    Returns:
        The number of files accepted by ``verify_pdf``; 0 when the
        directory does not exist.
    """
    print("\n Verifying downloaded PDFs...")
    print("=" * 60)

    # Nothing was downloaded yet: report it instead of crashing in listdir.
    if not os.path.isdir(output_dir):
        print(f" Directory not found: {output_dir}")
        return 0

    # Sorted so the listing is stable between runs.
    files = sorted(f for f in os.listdir(output_dir) if f.endswith(".pdf"))
    valid = 0
    total_size = 0

    for f in files:
        path = os.path.join(output_dir, f)
        size = os.path.getsize(path)
        total_size += size

        if verify_pdf(path):
            valid += 1
            print(f" ✔ {f} ({size/1024/1024:.2f} MB)")
        else:
            print(f" ❌ INVALID: {f}")

    print("\n Summary:")
    print(f" • PDFs found: {len(files)}")
    print(f" • Valid PDFs: {valid}")
    print(f" • Total Size: {total_size/1024/1024:.2f} MB")

    return valid


# ============================================================
# 6. MAIN DRIVER
# ============================================================

def main_download(filepath=None, download_count=3):
    """Driver for Module 2: load results, select, download, report, verify.

    Args:
        filepath: Optional path of a saved search-result file; passed on
            to ``load_search_results``.
        download_count: Number of papers to select for download.

    Returns:
        The list of successfully downloaded papers, or None when no
        search results could be loaded.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" MODULE 2 — arXiv PDF DOWNLOAD")
    print(rule)

    data = load_search_results(filepath)
    if not data:
        return None

    chosen = select_top_papers(data["papers"], count=download_count)
    downloaded = download_selected_papers(chosen)
    report_file = save_download_report(downloaded, data["topic"])

    verify_downloads()

    print("\n Module 2 Complete!")
    print(" Downloaded PDFs stored in: downloads/")
    print(" Report:", report_file)

    return downloaded


# Allow direct execution: download the top three papers from the most
# recently saved search results.
if __name__ == "__main__":
    main_download(download_count=3)


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m85.8 MB/s[0m eta [36m0:00:00[0m
[?25h
 MODULE 2 — arXiv PDF DOWNLOAD
 Loading latest search results: paper_search_results_Stock_price_forecasting.json
 Loaded 20 papers for topic 'Stock price forecasting'

 PDF Availability:
 • Total papers: 20
 • Papers with PDF: 20

 Selected top 3 papers:

1. Machine learning approach to stock price crash risk...
   Year: 2025
   Authors: Abdullah Karasan, Ozge Sezgin Alp, Gerhard-Wilhelm Weber

2. Tokenizing Stock Prices for Enhanced Multi-Step Forecast and Predictio...
   Year: 2025
   Authors: Zhuohang Zhu, Haodong Chen, Qiang Qu

3. CSPO: Cross-Market Synergistic Stock Price Movement Forecasting with P...
   Year: 2025
   Authors: Sida Lin, Yankai Chen, Yiyan Qi

 Starting Downloads...
------------------------------------------------------------

[1/3] Machine learning approach to stock price crash risk...
  Attempt 1/2
   PDF verified successfully.
   SUCCES

In [5]:
# MODULE 3: FINAL PDF TEXT EXTRACTION
# ============================ INSTALLS =============================
!pip install PyMuPDF4LLM tqdm pymupdf -q

# ============================ IMPORTS ===============================
import os
import re
import json
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

import pymupdf
import pymupdf4llm


# ============================ CLEAN TEXT ============================
def clean_text_basic(text):
    """Flatten and tidy extracted text.

    Steps, in order: collapse every whitespace run (including newlines)
    to one space, drop a hyphen that is followed by whitespace, remove
    whitespace around the remaining hyphens, strip characters below
    code point 32, and trim the ends.

    Returns:
        The cleaned single-line string; "" for empty or None input.
    """
    if not text:
        return ""

    substitutions = (
        (r'\s+', ' '),
        (r'-\s+', ''),
        (r'\s*-\s*', '-'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    printable = [ch for ch in text if ord(ch) >= 32]
    return ''.join(printable).strip()


# ============================ EXTRACTION ============================
def extract_text_improved(pdf_path):
    """
    Improved PDF text extraction with safe fallbacks.

    Two extractions are attempted: Markdown via pymupdf4llm and plain
    text from the first 50 pages. Markdown is preferred when it is longer
    than 1000 characters; otherwise the longer of the candidates (each
    must exceed 500 characters) is returned.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        The extracted text, or None when the first page contains one of
        the restriction keywords, nothing usable was extracted, or an
        error occurred.
    """
    try:
        # Context manager: the document is closed on every exit path,
        # including the early return and exceptions.
        with pymupdf.open(pdf_path) as doc:
            # Encrypted PDF? Try but warn
            if doc.is_encrypted:
                print(" ⚠ PDF encrypted, attempting extraction...")

            # Extract first-page check
            first_page = doc[0].get_text().strip().lower() if len(doc) else ""
            copyright_words = ["copyright", "takedown", "removed", "restricted"]

            if any(word in first_page for word in copyright_words):
                print(" ⚠ Copyright / restricted content detected")
                return None

            # Regular extraction (first 50 pages), joined once.
            raw = "".join(
                doc[i].get_text() + "\n" for i in range(min(len(doc), 50))
            )

        # Markdown extraction is best-effort; plain text is the fallback.
        md = None
        try:
            md = pymupdf4llm.to_markdown(str(pdf_path))
        except Exception as e:
            print(f" ⚠ Markdown extraction failed, falling back to plain text: {e}")

        candidates = []
        if md and len(md) > 500:
            candidates.append(md)
        if len(raw) > 500:
            candidates.append(raw)

        if not candidates:
            return None

        # Prefer markdown if long enough
        if md and len(md) > 1000:
            return md

        # Otherwise longest text (markdown wins ties, it is listed first)
        return max(candidates, key=len)

    except Exception as e:
        print(f" Extraction error: {e}")
        return None


# ================= SECTION EXTRACTION HELPERS =======================
def extract_by_keywords_fallback(text, sections):
    """
    Backup section extraction based on keyword sentence search.

    For every section that is still empty (or absent) in ``sections``,
    each sentence containing one of the section's keywords contributes a
    context window of two sentences before and four after it. The joined
    windows are stored, capped at 5000 characters.

    Args:
        text: Full document text.
        sections: Section dictionary; updated in place.

    Returns:
        The same ``sections`` dictionary.
    """
    sentences = re.split(r'[.!?]', text)
    # Lower-case once instead of once per section.
    lowered = [s.lower() for s in sentences]

    keywords = {
        "abstract": ["abstract", "summary", "this paper"],
        "introduction": ["introduction", "motivation", "background"],
        "methods": ["method", "experiment", "procedure", "dataset"],
        "results": ["result", "finding"],
        "conclusion": ["conclusion", "summary", "future work"]
    }

    for sec, keys in keywords.items():
        # .get(): a partially filled dictionary must not raise KeyError.
        if sections.get(sec):
            continue

        contexts = [
            " ".join(sentences[max(0, i - 2):min(len(sentences), i + 5)])
            for i, s_l in enumerate(lowered)
            if any(k in s_l for k in keys)
        ]

        if contexts:
            sections[sec] = " ".join(contexts)[:5000]

    return sections


def _normalize_heading(line):
    """Strip markdown markers and leading numbering from a candidate heading."""
    heading = line.strip().strip("#*_ \t").lower()
    return re.sub(r"^\d+(?:\.\d+)*[.)]?\s+", "", heading)


def extract_sections_improved(text):
    """
    Extract structured sections from academic PDFs.

    The text is split into lines first and each line is cleaned
    separately, because ``clean_text_basic`` collapses newlines and would
    otherwise leave a single line with no detectable headings. A line
    counts as a heading when, after removing markdown markers and
    numbering, it is short (under 80 characters, at most 8 words, no
    trailing period) and matches one of the section patterns. The first
    heading found for each section wins.

    Args:
        text: Raw extracted document text (plain or markdown).

    Returns:
        A dictionary with the keys ``title``, ``abstract``,
        ``introduction``, ``methods``, ``results``, ``conclusion``,
        ``references`` and ``extracted_text``. For input shorter than 500
        characters all sections are empty and ``extracted_text`` is the
        input itself ("" for None).
    """
    sections = {
        "title": "",
        "abstract": "",
        "introduction": "",
        "methods": "",
        "results": "",
        "conclusion": "",
        "references": "",
        "extracted_text": text or ""
    }

    if not text or len(text) < 500:
        return sections

    # Split BEFORE flattening so that line structure survives.
    cleaned_lines = [clean_text_basic(raw_line) for raw_line in text.split("\n")]
    lines = [line for line in cleaned_lines if line]

    text = clean_text_basic(text)
    sections["extracted_text"] = text[:20000]

    # Identify section headers
    header_patterns = {
        "abstract": r"abstract",
        "introduction": r"introduction",
        "methods": r"method|experiment|methodology",
        "results": r"results?|findings",
        "conclusion": r"conclusion|discussion",
        "references": r"references|bibliography"
    }

    boundaries = {}

    for i, line in enumerate(lines):
        heading = _normalize_heading(line)
        if not heading or len(heading) >= 80 or heading.endswith("."):
            continue
        if len(heading.split()) > 8:
            continue

        for name, pat in header_patterns.items():
            # re.search, not ``pat in heading``: the patterns are regexes.
            if name not in boundaries and re.search(pat, heading):
                boundaries[name] = i
                # One heading opens one section only.
                break

    # Create text segments
    if boundaries:
        order = sorted(boundaries.items(), key=lambda x: x[1])

        for idx, (sec, start) in enumerate(order):
            end = order[idx + 1][1] if idx + 1 < len(order) else len(lines)
            content = "\n".join(lines[start + 1:end]).strip()

            if len(content) > 100:
                sections[sec] = content[:5000]

    # Extract title: first plausible line near the top, markdown markers removed.
    for line in lines[:10]:
        candidate = line.strip("#*_ ")
        if 20 < len(candidate) < 200 and not candidate.lower().startswith("http"):
            sections["title"] = candidate
            break

    # Use keyword fallback if sections missing
    core_sections = ["abstract", "introduction", "methods", "results", "conclusion"]
    if sum(len(sections[s]) > 200 for s in core_sections) < 2:
        sections = extract_by_keywords_fallback(text, sections)

    return sections


# ===================== PROCESS ONE PDF =============================
def process_paper_smart(pdf_path):
    """Extract text and sections from one PDF.

    Args:
        pdf_path: ``pathlib.Path`` of the PDF file.

    Returns:
        A result dictionary (ids, sizes, sections, status "success"), or
        None when the file is smaller than 10240 bytes or no text could
        be extracted.
    """
    print(f"\nProcessing: {pdf_path.name}")

    file_size = pdf_path.stat().st_size
    if file_size < 10240:
        print(" ⚠ File too small")
        return None

    text = extract_text_improved(pdf_path)
    if not text:
        print(" ⚠ Empty or restricted PDF")
        return None

    print(f" Extracted: {len(text):,} chars")

    sections = extract_sections_improved(text)

    core_names = ("abstract", "introduction", "methods", "results", "conclusion")
    found_count = len([name for name in core_names if len(sections[name]) > 200])
    print(f" Sections found: {found_count}")

    # Every section with substantial content, except the raw text dump.
    meaningful = [
        name
        for name, body in sections.items()
        if len(body) > 200 and name != "extracted_text"
    ]

    return {
        "paper_id": pdf_path.stem,
        "filename": pdf_path.name,
        "file_size_bytes": file_size,
        "total_characters": len(text),
        "meaningful_sections": meaningful,
        "sections": sections,
        "status": "success"
    }


# ===================== UTILITIES ==================================
def get_downloaded_papers(download_dir="downloads"):
    """List the ``*.pdf`` files in ``download_dir``.

    The result is sorted by path so that processing order, and any
    ``max_papers`` truncation applied by the caller, is the same on
    every run.

    Args:
        download_dir: Directory to scan.

    Returns:
        A sorted list of ``Path`` objects; empty when the directory does
        not exist.
    """
    path = Path(download_dir)
    if not path.is_dir():
        return []
    return sorted(path.glob("*.pdf"))


def save_results_final(results, output_dir="data/extracted"):
    """Write one JSON file per extracted paper plus a summary file.

    ``extracted_text`` longer than 10000 characters is truncated (in the
    passed-in dictionaries as well) before writing. The summary is saved
    as ``extraction_summary.json`` in the same directory.

    Args:
        results: List of result dictionaries from the extraction step.
        output_dir: Target directory; created if needed.
    """
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    text_limit = 10000

    for record in results:
        target = out_dir / f"{record['paper_id']}_extracted.json"

        # Avoid dumping huge raw text
        paper_sections = record["sections"]
        if "extracted_text" in paper_sections and len(paper_sections["extracted_text"]) > text_limit:
            paper_sections["extracted_text"] = (
                paper_sections["extracted_text"][:text_limit] + "...[truncated]"
            )

        with open(target, "w", encoding="utf-8") as handle:
            json.dump(record, handle, indent=2, ensure_ascii=False)

        print(f" ✓ Saved {target.name}")

    summary = {
        "extraction_date": datetime.now().isoformat(),
        "total_papers": len(results),
        "papers": [],
    }
    for record in results:
        summary["papers"].append({
            "paper_id": record["paper_id"],
            "filename": record["filename"],
            "total_chars": record["total_characters"],
            "file_size_bytes": record["file_size_bytes"],
            "sections_found": record["meaningful_sections"],
        })

    with open(out_dir / "extraction_summary.json", "w", encoding="utf-8") as handle:
        json.dump(summary, handle, indent=2, ensure_ascii=False)


# ===================== MAIN EXTRACTION DRIVER ======================
def extract_all_papers(download_dir="downloads", max_papers=None):
    """Driver for Module 3: extract text from every downloaded PDF.

    Args:
        download_dir: Directory containing the PDFs.
        max_papers: Optional cap on the number of PDFs to process.

    Returns:
        The list of successful extraction results (empty when no PDF
        files are found).
    """
    divider = "=" * 80
    print("\n" + divider)
    print(" MODULE 3: PDF TEXT EXTRACTION")
    print(divider)

    pdfs = get_downloaded_papers(download_dir)
    if not pdfs:
        print(" No PDF files found")
        return []

    if max_papers:
        pdfs = pdfs[:max_papers]

    # Process everything first, then separate successes from skips.
    outcomes = [process_paper_smart(pdf) for pdf in tqdm(pdfs)]
    results = [outcome for outcome in outcomes if outcome]
    skipped = len(outcomes) - len(results)

    if results:
        save_results_final(results)

    print("\nExtraction complete.")
    print(f" Processed: {len(results)}")
    print(f" Skipped:   {skipped}")

    return results


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/66.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hConsider using the pymupdf_layout package for a greatly improved page layout analysis.


In [6]:
# ===============================================================
# MODULE 4: CROSS-PAPER ANALYSIS (arXiv Compatible Version)
# ===============================================================

!pip install scikit-learn numpy -q

import json
import re
from pathlib import Path
from collections import defaultdict
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# ===============================================================
# 1. LOAD EXTRACTED PAPERS (arXiv-based)
# ===============================================================

def load_extracted_papers(data_dir="data/extracted"):
    """
    Read every ``*_extracted.json`` file in ``data_dir``.

    Files that cannot be read or parsed are reported and skipped.

    Returns:
        A list with the loaded paper dictionaries (empty when no matching
        file exists).
    """
    json_files = list(Path(data_dir).glob("*_extracted.json"))

    if not json_files:
        print("❌ No extracted papers found. Run Module 3 first.")
        return []

    print(f"📄 Loading {len(json_files)} extracted papers...")

    papers = []
    for path in json_files:
        try:
            with open(path, 'r', encoding='utf-8') as handle:
                record = json.load(handle)
            papers.append(record)
            print(f"   ✓ Loaded {record['paper_id']} ({record['total_characters']:,} chars)")
        except Exception as err:
            print(f"   ❌ Error loading {path}: {err}")

    return papers


# ===============================================================
# 2. SINGLE PAPER ANALYSIS
# ===============================================================

def analyze_single_paper(paper):
    """
    Build the full analysis record for one extracted paper.

    Combines the fields from ``extract_key_information`` with the
    structure report, the quality assessment and the recommendations.
    """
    print("\n🔍 Deep analyzing single paper...")

    info = extract_key_information(paper)

    # Fields copied straight from the extracted information.
    analysis = {
        "paper_id": info["paper_id"],
        "title": info["title"],
        "year": info["year"],
        "methods_used": info["methods"],
        "datasets_mentioned": info["datasets"],
        "key_findings": info["key_findings"],
        "limitations": info["limitations"],
        "contributions": info["contributions"],
        "metrics_reported": info["metrics"],
    }

    # Derived assessments.
    analysis["paper_structure"] = analyze_paper_structure(paper)
    analysis["research_quality_indicators"] = assess_research_quality(info)
    analysis["recommendations_for_future_research"] = generate_recommendations(info)

    return analysis


# ===============================================================
# 2A. PAPER STRUCTURE
# ===============================================================

def analyze_paper_structure(paper):
    """Report which standard sections a paper contains.

    A section counts as present when its text is longer than 50
    characters.

    Returns:
        A dictionary with ``sections_present``, ``sections_missing`` and
        ``section_lengths`` (character count per present section).
    """
    sections = paper.get("sections", {})
    expected = ("title", "abstract", "introduction", "methods", "results", "conclusion", "references")

    present = []
    missing = []
    lengths = {}

    for name in expected:
        body = sections.get(name, "")
        if not body or len(body) <= 50:
            missing.append(name)
            continue
        present.append(name)
        lengths[name] = len(body)

    return {
        "sections_present": present,
        "sections_missing": missing,
        "section_lengths": lengths
    }


# ===============================================================
# 2B. RESEARCH QUALITY SCORING
# ===============================================================

def assess_research_quality(info):
    """Score a paper on seven simple indicators.

    Five points come from having any methods, datasets, findings,
    limitations and metrics; two more from having at least two methods
    and at least two findings.

    Returns:
        A dictionary with the individual indicators, ``overall_score``
        (as "N/7") and ``percentage``.
    """
    method_count = len(info["methods"])
    finding_count = len(info["key_findings"])

    quality = {
        "has_methods": method_count > 0,
        "has_datasets": len(info["datasets"]) > 0,
        "has_findings": finding_count > 0,
        "has_limitations": len(info["limitations"]) > 0,
        "has_metrics": len(info["metrics"]) > 0,
        "method_diversity": method_count,
        "finding_clarity": finding_count
    }

    presence_flags = ("has_methods", "has_datasets", "has_findings", "has_limitations", "has_metrics")
    score = sum(quality[flag] for flag in presence_flags)
    score += int(method_count >= 2) + int(finding_count >= 2)

    quality["overall_score"] = f"{score}/7"
    quality["percentage"] = (score / 7) * 100

    return quality


def generate_recommendations(info):
    """Produce up to three follow-up suggestions for a paper.

    Paper-specific suggestions (based on methods, limitations, datasets)
    come first; two generic ones fill any remaining slots.
    """
    methods = info["methods"]
    limitations = info["limitations"]
    suggestions = []

    if methods:
        suggestions.append(f"Compare performance with other methods similar to: {methods[0]}")

    if limitations:
        suggestions.append(f"Address limitation: {limitations[0][:80]}...")

    if info["datasets"]:
        suggestions.append("Consider exploring more datasets for validation.")

    suggestions += [
        "Include more recent benchmarking comparisons.",
        "Add ablation studies for deeper analysis.",
    ]

    return suggestions[:3]


# ===============================================================
# 3. KEY INFORMATION EXTRACTION
# ===============================================================

def extract_key_information(paper):
    """Gather all per-paper facts used by the analysis step.

    Returns:
        A dictionary with ``paper_id``, ``title``, ``year``, ``methods``,
        ``datasets``, ``key_findings``, ``limitations``,
        ``contributions`` and ``metrics``.
    """
    sections = paper.get("sections", {})

    info = {}
    info["paper_id"] = paper.get("paper_id", "unknown")
    info["title"] = sections.get("title", "Unknown")

    # One extractor per remaining field, run in a fixed order.
    extractors = (
        ("year", extract_year),
        ("methods", extract_methods),
        ("datasets", extract_datasets),
        ("key_findings", extract_key_findings),
        ("limitations", extract_limitations),
        ("contributions", extract_contributions),
        ("metrics", extract_metrics),
    )
    for field, extractor in extractors:
        info[field] = extractor(paper)

    return info


def extract_year(paper):
    """Guess the publication year of a paper.

    Looks for the first four-digit number starting with 19 or 20, first
    in the title and then in the first 5000 characters of the extracted
    text. A missing ``sections`` entry or ``None`` field values are
    treated as empty text.

    Returns:
        The year as a string, or "Unknown" when nothing matches.
    """
    sections = paper.get("sections") or {}
    year_pattern = r"\b(19|20)\d{2}\b"

    candidates = (
        sections.get("title") or "",
        (sections.get("extracted_text") or "")[:5000],
    )
    for candidate in candidates:
        match = re.search(year_pattern, candidate)
        if match:
            return match.group()

    return "Unknown"


def extract_methods(paper):
    """Collect sentences that mention a method-related keyword.

    The methods section is searched when present, otherwise the
    extracted text; only the first 5000 characters are considered.
    Sentences of 20 characters or fewer are ignored. Each hit is trimmed
    to 200 characters, and duplicates are removed by comparing the
    trimmed snippets, so a repeated long sentence is kept only once.

    Returns:
        Up to five lower-cased sentence snippets, in document order.
    """
    sections = paper.get("sections") or {}
    methods_text = (
        sections.get("methods") or
        sections.get("extracted_text") or
        ""
    )[:5000].lower()

    keywords = [
        "deep learning", "neural network", "transformer", "cnn", "rnn", "lstm",
        "regression", "classification", "svm", "xgboost", "random forest",
        "framework", "model", "algorithm", "approach", "method"
    ]

    found = []

    for sentence in re.split(r"[.!?]+", methods_text):
        if len(sentence) <= 20 or not any(k in sentence for k in keywords):
            continue

        # Truncate BEFORE the duplicate check: stored entries are
        # truncated, so comparing the full sentence would miss repeats.
        snippet = sentence.strip().replace("\n", " ")[:200]
        if snippet not in found:
            found.append(snippet)

    return found[:5]


def extract_datasets(paper):
    """List dataset-related keywords that occur in the extracted text.

    Matching is a plain lower-case substring test, so short keywords can
    also match inside longer words. The result follows the fixed keyword
    order below, which makes it the same on every run.

    Returns:
        Up to five matching keywords.
    """
    sections = paper.get("sections") or {}
    text = (sections.get("extracted_text") or "").lower()

    dataset_keywords = [
        "dataset", "benchmark", "uci", "kaggle", "cifar", "imagenet",
        "mnist", "corpus", "collection", "repository"
    ]

    # The keyword list has no duplicates, so no set() is needed; a set
    # would make the order (and which five survive) vary between runs.
    return [k for k in dataset_keywords if k in text][:5]


def extract_key_findings(paper):
    """Pick sentences that read like reported findings.

    Uses the results section, falling back to the conclusion and then to
    the extracted text; only the first 3000 characters are scanned.

    Returns:
        Up to five lower-cased sentences (each at most 300 characters)
        that are longer than 40 characters and contain a finding keyword.
    """
    sections = paper["sections"]
    source = (
        sections.get("results") or
        sections.get("conclusion") or
        sections.get("extracted_text", "")
    )
    text = source[:3000].lower()

    markers = (
        "improves", "achieves", "outperforms", "better", "higher accuracy",
        "reduces error", "significant", "results show", "we found",
    )

    findings = []
    for sentence in re.split(r"[.!?]+", text):
        if len(sentence) <= 40:
            continue
        if not any(marker in sentence for marker in markers):
            continue
        findings.append(sentence.strip()[:300])

    return findings[:5]


def extract_limitations(paper):
    """
    Find sentences that talk about limitations or open problems.

    Searches the lower-cased "conclusion" section, or the whole
    "extracted_text" when the conclusion is missing or empty.

    Returns: List of at most 3 matching sentences, each cut to 250 chars
    """
    sections = paper["sections"]
    body = sections.get("conclusion") or sections.get("extracted_text", "")
    body = body.lower()

    cues = (
        "limitation", "future work", "could be improved", "drawback",
        "only considers", "challenge", "issue"
    )

    # A sentence qualifies when its raw length exceeds 40 and it has a cue
    matches = [
        piece.strip()[:250]
        for piece in re.split(r"[.!?]+", body)
        if len(piece) > 40 and any(cue in piece for cue in cues)
    ]

    return matches[:3]


def extract_contributions(paper):
    """
    Find sentences in which the authors state what they contribute.

    The "abstract" and "introduction" sections are concatenated directly
    (no separator is inserted) and lower-cased before the search.

    Returns: List of at most 3 matching sentences, each cut to 250 chars
    """
    sections = paper["sections"]
    abstract_part = sections.get("abstract", "")
    intro_part = sections.get("introduction", "")
    haystack = (abstract_part + intro_part).lower()

    phrases = ("contribution", "we propose", "we introduce", "novel", "new method")

    contribs = []
    for piece in re.split(r"[.!?]+", haystack):
        mentions_contribution = any(phrase in piece for phrase in phrases)
        # Raw (unstripped) length must exceed 40 characters
        if mentions_contribution and len(piece) > 40:
            contribs.append(piece.strip()[:250])

    return contribs[:3]


def extract_metrics(paper):
    """
    Extract metric mentions (e.g. "accuracy 0.95", "85%") from the results.

    Only the lower-cased "results" section is searched; a missing or None
    section yields an empty list.

    Returns: List of at most 5 distinct matches, ordered by pattern and then
             by position in the text
    """
    # `or ""` also covers a "results" key that is present but set to None
    text = (paper["sections"].get("results") or "").lower()

    patterns = [
        r"accuracy\s*\d+\.?\d*",
        r"precision\s*\d+\.?\d*",
        r"recall\s*\d+\.?\d*",
        r"f1\s*\d+\.?\d*",
        r"\d+\.?\d*\s*%"
    ]

    found = []
    for p in patterns:
        found.extend(re.findall(p, text))

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result (and which five survive the cut) is the same on every run.
    return list(dict.fromkeys(found))[:5]


# ===============================================================
# 4. MULTI-PAPER COMPARISON
# ===============================================================

def compare_papers(papers_info):
    """
    Build the cross-paper comparison report.

    Each entry of the report is produced by one of the helper functions in
    this module, all of which receive the same list of per-paper dicts.

    Returns: Dictionary with the paper count, similarities, differences,
             common methods/datasets, timeline analysis and research gaps
    """
    print(f"\n📘 Comparing {len(papers_info)} papers...")

    report = {"total_papers": len(papers_info)}
    report["similarities"] = find_similarities(papers_info)
    report["differences"] = find_differences(papers_info)
    report["common_methods"] = find_common_elements(papers_info, "methods")
    report["common_datasets"] = find_common_elements(papers_info, "datasets")
    report["timeline_analysis"] = analyze_timeline(papers_info)
    report["research_gaps"] = identify_research_gaps(papers_info)
    return report


def find_similarities(papers):
    """
    Find methods, datasets and findings that occur more than once.

    Items are compared by their first 40 characters. Every occurrence is
    counted, including repeats inside a single paper.

    Returns: Dictionary with "methods", "datasets" and "findings" lists of
             the 40-char prefixes seen at least twice
    """
    # (output label, key in the per-paper dict)
    field_map = (
        ("methods", "methods"),
        ("datasets", "datasets"),
        ("findings", "key_findings"),
    )

    tallies = {label: defaultdict(int) for label, _ in field_map}

    for paper in papers:
        for label, source_key in field_map:
            for item in paper[source_key]:
                tallies[label][item[:40]] += 1

    return {
        label: [prefix for prefix, hits in tallies[label].items() if hits >= 2]
        for label, _ in field_map
    }


def find_differences(papers):
    """
    List, per paper, the methods/datasets/findings that no other paper has.

    An item is unique to a paper when that paper is the only one in `papers`
    that mentions it. (Subtracting the union of ALL papers, the current one
    included, would always leave an empty list.)

    Returns: Dictionary with "unique_methods", "unique_datasets" and
             "unique_findings", each mapping paper_id -> list of unique
             items in their original order
    """
    # Imported locally so the function does not depend on the import cell
    from collections import Counter

    # (output label, key in the per-paper dict)
    fields = (
        ("unique_methods", "methods"),
        ("unique_datasets", "datasets"),
        ("unique_findings", "key_findings"),
    )

    diff = {label: {} for label, _ in fields}

    for label, key in fields:
        # Number of papers mentioning each item; set() makes a paper that
        # repeats an item count only once.
        paper_counts = Counter(item for p in papers for item in set(p[key]))

        for p in papers:
            # dict.fromkeys drops in-paper duplicates but keeps their order
            diff[label][p["paper_id"]] = [
                item for item in dict.fromkeys(p[key])
                if paper_counts[item] == 1
            ]

    return diff


def find_common_elements(papers, key):
    """
    Return the items listed under `key` by every single paper.

    Returns: List of shared items (order is not defined); empty list when
             `papers` is empty
    """
    shared = None
    for paper in papers:
        current = set(paper[key])
        shared = current if shared is None else shared & current

    if shared is None:
        # No papers at all, so there is nothing to intersect
        return []
    return list(shared)


def analyze_timeline(papers):
    """
    Summarise the publication years of the papers.

    Years whose string form is not purely digits (e.g. "Unknown") are ignored.

    Returns: Dictionary with earliest/latest year, their range and a
             per-year paper count, or a {"message": ...} dictionary when
             fewer than two usable years exist
    """
    years = [int(p["year"]) for p in papers if str(p["year"]).isdigit()]

    if len(years) < 2:
        return {"message": "Not enough data for timeline analysis"}

    earliest = min(years)
    latest = max(years)

    count_by_year = {}
    for distinct_year in set(years):
        count_by_year[distinct_year] = years.count(distinct_year)

    return {
        "earliest": earliest,
        "latest": latest,
        "range": latest - earliest,
        "count_by_year": count_by_year
    }


def identify_research_gaps(papers):
    """
    Flag fields that are empty in every paper.

    A gap is reported for "datasets" and/or "limitations" when no paper has
    any entry for it (which is also the case for an empty `papers` list).

    Returns: List of gap messages; a single "no gaps" message if none apply
    """
    checks = (
        ("datasets", "No datasets mentioned across papers"),
        ("limitations", "No limitations discussed"),
    )

    gaps = []
    for field, message in checks:
        someone_has_it = any(len(p[field]) != 0 for p in papers)
        if not someone_has_it:
            gaps.append(message)

    return gaps or ["No major research gaps detectable"]


In [8]:
# ================================================================
# MODULE 5: FETCH PAPER METADATA & DOWNLOAD PDF USING ARXIV API
# ================================================================

import os
import json
import requests
import xml.etree.ElementTree as ET
import time

# ----------------------------- CONFIG -----------------------------
# Endpoint queried by fetch_arxiv_papers()
ARXIV_SEARCH_URL = "https://export.arxiv.org/api/query"
# File that module5_pipeline() writes the paper list to (read back by Module 6)
SAVE_JSON = "module5_arxiv_results.json"
# Directory that download_pdf() saves PDFs into (relative to the working directory)
SAVE_PDF_DIR = "downloaded_papers"

# Create folder for PDFs (runs when this cell is executed; no error if it already exists)
os.makedirs(SAVE_PDF_DIR, exist_ok=True)

# ----------------------------- FUNCTIONS -----------------------------

def fetch_arxiv_papers(query, max_results=10):
    """
    Search papers from arXiv API.

    query: free-text search phrase, sent as "all:<query>"
    max_results: maximum number of entries requested from the API

    Returns: List of dictionaries with "title", "authors", "year",
             "abstract" and "pdf_url" (None when the entry has no PDF link).
             An empty list is returned when the request fails, the server
             answers with a non-200 status, or the response is not valid XML.
    """
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results
    }

    print("\n🔍 Fetching results from arXiv…")
    try:
        # Same 20 s timeout as download_pdf; without one a stalled
        # connection would block forever.
        response = requests.get(ARXIV_SEARCH_URL, params=params, timeout=20)
    except requests.RequestException as e:
        print("❌ API Request Failed:", e)
        return []

    if response.status_code != 200:
        print("❌ API Request Failed:", response.status_code)
        return []

    try:
        # Parse the raw bytes so the XML parser applies the encoding declared
        # by the document itself.
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print("❌ Could not parse arXiv response:", e)
        return []

    # The feed is Atom; "arxiv" is only the local prefix used in the paths
    namespace = {"arxiv": "http://www.w3.org/2005/Atom"}

    def _text(element, path):
        """Text of the first child matching `path`, or "" if absent/empty."""
        return element.findtext(path, default="", namespaces=namespace) or ""

    papers = []

    for entry in root.findall("arxiv:entry", namespace):

        title = _text(entry, "arxiv:title").strip()
        summary = _text(entry, "arxiv:summary").strip()

        # Authors (entries without a usable <name> are skipped)
        authors = []
        for author in entry.findall("arxiv:author", namespace):
            name = _text(author, "arxiv:name")
            if name:
                authors.append(name)

        # Published Year (first four characters of the timestamp)
        published = _text(entry, "arxiv:published")[:4]

        # PDF Link (if several PDF links exist, the last one wins)
        pdf_link = None
        for link in entry.findall("arxiv:link", namespace):
            if link.attrib.get("type") == "application/pdf" and "href" in link.attrib:
                pdf_link = link.attrib["href"]

        papers.append({
            "title": title,
            "authors": authors,
            "year": published,
            "abstract": summary,
            "pdf_url": pdf_link
        })

    print(f"✅ Retrieved {len(papers)} papers from arXiv")
    return papers


def download_pdf(title, url):
    """
    Download one PDF and store it in SAVE_PDF_DIR.

    The file name is the title reduced to alphanumeric characters, spaces,
    hyphens and underscores, followed by ".pdf".

    Returns: Path of the saved file, or None when there is no URL, the
             server does not answer with status 200, or any error occurs
    """
    if not url:
        print(f"❌ No PDF available for: {title}")
        return None

    kept_chars = [ch for ch in title if ch.isalnum() or ch in " -_"]
    file_path = os.path.join(SAVE_PDF_DIR, "".join(kept_chars) + ".pdf")

    print(f"⬇ Downloading: {title}")

    try:
        pdf = requests.get(url, timeout=20)
        if pdf.status_code != 200:
            print("❌ Failed:", pdf.status_code)
            return None

        with open(file_path, "wb") as out_file:
            out_file.write(pdf.content)
        print("✔ PDF Saved:", file_path)
        return file_path
    except Exception as e:
        # Best effort: report the problem and let the caller continue
        print("❌ Error downloading:", e)
        return None


# ----------------------------- MAIN -----------------------------

def module5_pipeline(search_keyword):
    """
    Run Module 5 end to end: search arXiv, download PDFs, save metadata.

    Each paper dictionary gains a "pdf_path" entry holding whatever
    download_pdf() returned for it. The full list is written to SAVE_JSON.

    Returns: The list of paper dictionaries
    """
    print("\n================ Module 5: ARXIV API ==================\n")

    # 1) metadata for up to 10 papers
    papers = fetch_arxiv_papers(search_keyword, max_results=10)

    # 2) PDFs, pausing 2 seconds before every download to avoid rate limits
    for record in papers:
        time.sleep(2)
        saved_to = download_pdf(record["title"], record["pdf_url"])
        record["pdf_path"] = saved_to

    # 3) persist the enriched metadata
    with open(SAVE_JSON, "w", encoding="utf-8") as out_file:
        json.dump(papers, out_file, indent=4)

    print("\n📁 JSON Saved:", SAVE_JSON)
    print("📚 PDF Download Folder:", SAVE_PDF_DIR)

    return papers


# ----------------------------- RUN -----------------------------
# Entry point: ask for a topic on stdin and run the whole Module 5 pipeline
# (search, PDF download, JSON export) for it.
if __name__ == "__main__":
    keyword = input("\nEnter topic to search in arXiv: ")
    module5_pipeline(keyword)



Enter topic to search in arXiv: Stock Price forecasting



🔍 Fetching results from arXiv…
✅ Retrieved 10 papers from arXiv
⬇ Downloading: Probabilistic Forecasting in Day-Ahead Electricity Markets: Simulating Peak and Off-Peak Prices
✔ PDF Saved: downloaded_papers/Probabilistic Forecasting in Day-Ahead Electricity Markets Simulating Peak and Off-Peak Prices.pdf
⬇ Downloading: Machine learning approach to stock price crash risk
✔ PDF Saved: downloaded_papers/Machine learning approach to stock price crash risk.pdf
⬇ Downloading: Quantile Regression for Qualifying Match of GEFCom2017 Probabilistic Load Forecasting
✔ PDF Saved: downloaded_papers/Quantile Regression for Qualifying Match of GEFCom2017 Probabilistic Load Forecasting.pdf
⬇ Downloading: Lasso estimation for GEFCom2014 probabilistic electric load forecasting
✔ PDF Saved: downloaded_papers/Lasso estimation for GEFCom2014 probabilistic electric load forecasting.pdf
⬇ Downloading: Probabilistic Hierarchical Forecasting with Deep P

In [9]:
# ================================================================
# MODULE 6: PROCESS, SUMMARIZE & EXTRACT KEYWORDS (arXiv Version)
# ================================================================

import json
import spacy
from collections import Counter
import re

# Paper list read by process_papers()
# NOTE(review): Module 7 later rebinds INPUT_JSON to a different file; if the
# cells share one namespace, re-running process_papers() afterwards would read
# that file instead — confirm and consider module-specific names.
INPUT_JSON = "module5_arxiv_results.json"
# File that process_papers() writes its results to
OUTPUT_JSON = "module6_processed_papers.json"

# Load spaCy model once (shared by summarize() and extract_keywords())
nlp = spacy.load("en_core_web_sm")

# ----------------------------- CLEAN TEXT -----------------------------

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    whitespace_run = re.compile(r"\s+")
    collapsed = whitespace_run.sub(" ", text)
    return collapsed.strip()


# -------------------------- ABSTRACT SUMMARY -------------------------

def summarize(text, max_sentences=3):
    """
    Build an extractive summary from the highest-scoring sentences.

    A sentence's score is the number of its tokens tagged NOUN or VERB.
    The chosen sentences are joined in descending score order (ties broken
    by sentence text, also descending), not in document order.

    Returns: Summary string made of at most `max_sentences` sentences
    """
    doc = nlp(text)
    content_tags = ["NOUN", "VERB"]

    ranked = []
    for sent in doc.sents:
        content_tokens = [tok for tok in sent if tok.pos_ in content_tags]
        ranked.append((len(content_tokens), sent.text))

    # Best sentences first
    ranked.sort(reverse=True)

    return " ".join(sentence_text for _, sentence_text in ranked[:max_sentences])


# -------------------------- KEYWORD EXTRACTION ------------------------

def extract_keywords(text, top_k=10):
    """
    Return the most frequent noun lemmas of a text.

    Only tokens tagged NOUN or PROPN that are not stop words and whose
    surface form is longer than 3 characters are counted.

    Returns: List of up to `top_k` lower-cased lemmas, most frequent first
    """
    doc = nlp(text)

    freq = Counter()
    for token in doc:
        if token.pos_ not in ["NOUN", "PROPN"]:
            continue
        if token.is_stop:
            continue
        if len(token.text) <= 3:
            continue
        freq[token.lemma_.lower()] += 1

    return [lemma for lemma, _ in freq.most_common(top_k)]


# --------------------------- PROCESS PAPERS ----------------------------

def process_papers():
    """
    Clean, summarize and keyword-tag every paper saved by Module 5.

    Reads INPUT_JSON, processes each paper's title and abstract, and writes
    the resulting records to OUTPUT_JSON.

    Returns: List of processed paper dictionaries, or None when INPUT_JSON
             does not exist
    """
    print("\n================ Module 6: PROCESSING PAPERS ==================\n")

    # Module 5 output is the input here
    try:
        with open(INPUT_JSON, "r", encoding="utf-8") as in_file:
            papers = json.load(in_file)
    except FileNotFoundError:
        print(f"❌ ERROR: {INPUT_JSON} not found. Run Module 5 first.")
        return

    processed_list = []

    for paper in papers:
        print(f"📄 Processing: {paper['title']}")

        title = clean_text(paper["title"])
        abstract = clean_text(paper["abstract"])

        record = {
            "title": title,
            "authors": paper.get("authors", []),
            "year": paper.get("year", "Unknown"),
            "pdf_path": paper.get("pdf_path", None),
            "abstract": abstract,
        }
        # Summary and keywords are both derived from the cleaned abstract
        record["summary"] = summarize(abstract)
        record["keywords"] = extract_keywords(abstract)

        processed_list.append(record)

    with open(OUTPUT_JSON, "w", encoding="utf-8") as out_file:
        json.dump(processed_list, out_file, indent=4)

    print("\n✅ Module 6 Completed Successfully")
    print("📁 Saved Processed Output →", OUTPUT_JSON)

    return processed_list


# ----------------------------- RUN DIRECTLY -----------------------------

# Entry point: process the papers stored in INPUT_JSON and write OUTPUT_JSON.
if __name__ == "__main__":
    process_papers()




📄 Processing: Probabilistic Forecasting in Day-Ahead Electricity Markets: Simulating Peak and Off-Peak Prices
📄 Processing: Machine learning approach to stock price crash risk
📄 Processing: Quantile Regression for Qualifying Match of GEFCom2017 Probabilistic Load Forecasting
📄 Processing: Lasso estimation for GEFCom2014 probabilistic electric load forecasting
📄 Processing: Probabilistic Hierarchical Forecasting with Deep Poisson Mixtures
📄 Processing: LASSO Principal Component Averaging -- a fully automated approach for point forecast pooling
📄 Processing: A Time Series Analysis-Based Stock Price Prediction Using Machine Learning and Deep Learning Models
📄 Processing: MASTER: Market-Guided Stock Transformer for Stock Price Forecasting
📄 Processing: Tokenizing Stock Prices for Enhanced Multi-Step Forecast and Prediction
📄 Processing: A Quantum Approach to Stock Price Fluctuations

✅ Module 6 Completed Successfully
📁 Saved Processed Output → module6_processed_papers.json


In [10]:
# ================================================================
# MODULE 7: PAPER EMBEDDINGS & SEMANTIC SEARCH PREPARATION
# ================================================================

import json
import os
from sentence_transformers import SentenceTransformer
import numpy as np

# Processed papers read by load_processed_papers()
# NOTE(review): this rebinds the INPUT_JSON name that Module 6 also defines
# (with a different file); relevant if all cells share one namespace.
INPUT_JSON = "module6_processed_papers.json"
# Compressed .npz file written by save_embeddings() (array key: "embeddings")
OUTPUT_EMBEDDINGS = "module7_paper_embeddings.npz"
# JSON file with title/authors/year/pdf_path per paper, written by save_embeddings()
OUTPUT_METADATA = "module7_paper_metadata.json"

# -------------------- Load Module 6 processed papers --------------------

def load_processed_papers():
    """
    Load the papers produced by Module 6 from INPUT_JSON.

    Returns: The parsed JSON content, or an empty list when the file is
             missing
    """
    try:
        with open(INPUT_JSON, "r", encoding="utf-8") as in_file:
            loaded = json.load(in_file)
    except FileNotFoundError:
        print(f"❌ ERROR: {INPUT_JSON} not found. Run Module 6 first.")
        return []
    else:
        print(f"✅ Loaded {len(loaded)} papers from {INPUT_JSON}")
        return loaded

# -------------------- Generate embeddings --------------------

def generate_embeddings(papers, model_name="all-MiniLM-L6-v2"):
    """
    Embed every paper with a SentenceTransformer model.

    The text embedded for a paper is its abstract and its summary joined by
    a single space (missing fields count as empty strings). Papers are
    encoded one at a time.

    Returns: NumPy array built from the per-paper embedding vectors
    """
    print(f"\n🧠 Loading embedding model: {model_name}")
    model = SentenceTransformer(model_name)

    vectors = [
        model.encode(
            " ".join((paper.get("abstract", ""), paper.get("summary", ""))).strip(),
            convert_to_numpy=True,
        )
        for paper in papers
    ]

    embeddings = np.array(vectors)
    print(f"✅ Generated embeddings shape: {embeddings.shape}")
    return embeddings

# -------------------- Save embeddings and metadata --------------------

def save_embeddings(embeddings, papers):
    """
    Persist the embeddings and the matching paper metadata.

    The array goes to OUTPUT_EMBEDDINGS (compressed .npz, key "embeddings");
    title, authors, year and pdf_path of every paper go to OUTPUT_METADATA.
    """
    np.savez_compressed(OUTPUT_EMBEDDINGS, embeddings=embeddings)
    print(f"✅ Embeddings saved to {OUTPUT_EMBEDDINGS}")

    # Only these fields are kept alongside the embeddings
    kept_fields = ("title", "authors", "year", "pdf_path")
    metadata = []
    for paper in papers:
        metadata.append({field: paper[field] for field in kept_fields})

    with open(OUTPUT_METADATA, "w", encoding="utf-8") as out_file:
        json.dump(metadata, out_file, indent=4)
    print(f"✅ Metadata saved to {OUTPUT_METADATA}")

# -------------------- Semantic Search Example --------------------

def semantic_search(query, embeddings, papers, top_k=5, model_name="all-MiniLM-L6-v2"):
    """
    Rank papers by cosine similarity between their embedding and the query.

    A SentenceTransformer model is instantiated on every call to encode
    the query. Row i of `embeddings` is taken to belong to `papers[i]`.

    Returns: List of up to `top_k` dictionaries (title, authors, year,
             pdf_path, similarity), most similar first
    """
    encoder = SentenceTransformer(model_name)
    query_emb = encoder.encode(query, convert_to_numpy=True)

    # Cosine similarity = dot product / (|paper vector| * |query vector|)
    norm_product = np.linalg.norm(embeddings, axis=1) * np.linalg.norm(query_emb)
    similarities = (embeddings @ query_emb) / norm_product
    best_first = similarities.argsort()[::-1][:top_k]

    return [
        {
            "title": papers[idx]["title"],
            "authors": papers[idx]["authors"],
            "year": papers[idx]["year"],
            "pdf_path": papers[idx]["pdf_path"],
            "similarity": float(similarities[idx])
        }
        for idx in best_first
    ]

# -------------------- MAIN --------------------

# Entry point: embed the Module 6 papers, save the results, then run one
# hard-coded example query against the fresh embeddings.
if __name__ == "__main__":
    papers = load_processed_papers()
    if not papers:
        # Nothing to embed (missing or empty input file)
        # NOTE(review): exit() is the interactive helper; sys.exit() is the
        # usual choice in scripts — confirm the intended behaviour in notebooks.
        exit()

    embeddings = generate_embeddings(papers)
    save_embeddings(embeddings, papers)

    # Example query
    query = "renewable energy optimization"
    top_results = semantic_search(query, embeddings, papers)
    print("\n🔍 Top search results for query:", query)
    # Results are numbered from 1; similarity is shown with three decimals
    for i, res in enumerate(top_results, 1):
        print(f"{i}. {res['title']} ({res['year']}) - similarity: {res['similarity']:.3f}")


✅ Loaded 10 papers from module6_processed_papers.json

🧠 Loading embedding model: all-MiniLM-L6-v2


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Generated embeddings shape: (10, 384)
✅ Embeddings saved to module7_paper_embeddings.npz
✅ Metadata saved to module7_paper_metadata.json

🔍 Top search results for query: renewable energy optimization
1. Lasso estimation for GEFCom2014 probabilistic electric load forecasting (2016) - similarity: 0.347
2. Quantile Regression for Qualifying Match of GEFCom2017 Probabilistic Load Forecasting (2018) - similarity: 0.307
3. LASSO Principal Component Averaging -- a fully automated approach for point forecast pooling (2022) - similarity: 0.295
4. Probabilistic Forecasting in Day-Ahead Electricity Markets: Simulating Peak and Off-Peak Prices (2018) - similarity: 0.287
5. Tokenizing Stock Prices for Enhanced Multi-Step Forecast and Prediction (2025) - similarity: 0.103


In [12]:
import json
import numpy as np
from pathlib import Path
from datetime import datetime
from sentence_transformers import SentenceTransformer
# Optional: use OpenAI GPT for more advanced summaries
# import openai

# Module 7 outputs consumed by load_embeddings_and_metadata()
INPUT_EMBEDDINGS = "module7_paper_embeddings.npz"
INPUT_METADATA = "module7_paper_metadata.json"
# File that save_reviews() writes the generated reviews to
OUTPUT_REVIEWS = "module8_paper_reviews.json"

# -------------------- Load embeddings and metadata --------------------

def load_embeddings_and_metadata():
    """
    Load the embeddings array and paper metadata written by Module 7.

    Returns: Tuple (embeddings, metadata); (None, None) when either
             INPUT_EMBEDDINGS or INPUT_METADATA does not exist
    """
    if not Path(INPUT_EMBEDDINGS).exists() or not Path(INPUT_METADATA).exists():
        print("❌ ERROR: Module 7 outputs not found. Run Module 7 first.")
        return None, None

    # For .npz files np.load returns an NpzFile that keeps the archive open;
    # the context manager closes it once the array has been read into memory.
    with np.load(INPUT_EMBEDDINGS) as embeddings_npz:
        embeddings = embeddings_npz["embeddings"]

    with open(INPUT_METADATA, "r", encoding="utf-8") as f:
        metadata = json.load(f)

    print(f"✅ Loaded {len(metadata)} papers and embeddings shape: {embeddings.shape}")
    return embeddings, metadata

# -------------------- Semantic search function --------------------

def semantic_search(query, embeddings, metadata, top_k=5, model_name="all-MiniLM-L6-v2"):
    """
    Return the papers whose embeddings are closest to the query.

    Closeness is cosine similarity. The query is encoded with a
    SentenceTransformer model that is created anew on each call; row i of
    `embeddings` is taken to describe `metadata[i]`.

    Returns: List of up to `top_k` result dictionaries (title, authors,
             year, pdf_path, similarity) in descending similarity order
    """
    encoder = SentenceTransformer(model_name)
    query_emb = encoder.encode(query, convert_to_numpy=True)

    paper_norms = np.linalg.norm(embeddings, axis=1)
    query_norm = np.linalg.norm(query_emb)
    similarities = embeddings @ query_emb / (paper_norms * query_norm)

    # argsort is ascending, so reverse it before cutting to top_k
    ranking = similarities.argsort()[::-1][:top_k]

    results = []
    for idx in ranking:
        entry = metadata[idx]
        hit = {field: entry[field] for field in ("title", "authors", "year", "pdf_path")}
        hit["similarity"] = float(similarities[idx])
        results.append(hit)

    return results

# -------------------- Summarization --------------------

def summarize_paper(paper, method="simple"):
    """
    Generate a summary/review for a paper.

    method: "simple" -> first three sentences of abstract + summary
            "gpt"    -> placeholder for a GPT call (requires API key and
                        a "title" entry in `paper`)

    Returns: Review string. For any other `method` value None is returned.
    """
    # `or ""` also covers fields that are present but set to None
    abstract = (paper.get("abstract") or "").strip()
    summary = (paper.get("summary") or "").strip()

    if method == "simple":
        combined_text = ". ".join(part for part in (abstract, summary) if part)

        if not combined_text:
            return "No abstract or summary available for a review."

        # Simple: take first 3 sentences of abstract + summary
        sentences = [s.strip() for s in combined_text.split(".") if s.strip()]
        summary_text = ". ".join(sentences[:3])

        if not summary_text:
            # Text made only of periods/whitespace: fall back to the raw
            # text, truncated to 200 characters with an ellipsis when longer.
            # (The original conditional expression bound last and therefore
            # returned "" for every text of 200 characters or fewer.)
            fallback = abstract or summary
            if len(fallback) > 200:
                return fallback[:200] + "..."
            return fallback

        return summary_text + ("" if summary_text.endswith(".") else ".")

    elif method == "gpt":
        # GPT-based summarization (replace with actual API call if available)
        # `prompt` is only consumed by the commented-out example below.
        prompt = f"Write a concise review for the following research paper:\n\nTitle: {paper['title']}\nAbstract: {abstract}\nSummary: {summary}\n\nReview:"
        # Example:
        # response = openai.ChatCompletion.create(
        #     model="gpt-4",
        #     messages=[{"role": "user", "content": prompt}],
        #     max_tokens=200
        # )
        # return response['choices'][0]['message']['content']
        return "GPT-based summary placeholder for paper: " + paper['title']

    # Unknown method: keep the historical behaviour of returning None
    return None

# -------------------- Generate reviews for top papers --------------------

def generate_reviews(query, embeddings, metadata, top_k=5, method="simple"):
    """
    Produce a review for each of the papers most similar to the query.

    The search results only carry metadata, so the abstract and summary are
    looked up by title in module6_processed_papers.json. If that file is
    missing, or a title is not found in it, the search result itself is
    passed to summarize_paper().

    Returns: List of dictionaries with title, authors, year, pdf_path,
             similarity and review
    """
    top_papers = semantic_search(query, embeddings, metadata, top_k)

    # Full Module 6 records, indexed by title
    try:
        with open("module6_processed_papers.json", "r", encoding="utf-8") as in_file:
            processed_papers = json.load(in_file)
    except FileNotFoundError:
        print("❌ ERROR: module6_processed_papers.json not found for detailed summarization.")
        by_title = {}
    else:
        by_title = {record["title"]: record for record in processed_papers}

    reviews = []
    for hit in top_papers:
        full_record = by_title.get(hit["title"], hit)
        review_text = summarize_paper(full_record, method=method)

        entry = {
            field: hit[field]
            for field in ("title", "authors", "year", "pdf_path", "similarity")
        }
        entry["review"] = review_text
        reviews.append(entry)

    return reviews

# -------------------- Save reviews --------------------

def save_reviews(reviews):
    """Write the reviews to OUTPUT_REVIEWS as indented JSON and report the count."""
    review_count = len(reviews)
    with open(OUTPUT_REVIEWS, "w", encoding="utf-8") as out_file:
        json.dump(reviews, out_file, indent=4)
    print(f"✅ Saved {review_count} paper reviews to {OUTPUT_REVIEWS}")

# -------------------- MAIN --------------------

# Entry point: load the Module 7 outputs, ask for a query on stdin, generate
# "simple" reviews for the five best matches, save them and print them.
if __name__ == "__main__":
    embeddings, metadata = load_embeddings_and_metadata()
    if embeddings is None:
        # Module 7 outputs are missing; the loader has already printed why
        # NOTE(review): exit() is the interactive helper; sys.exit() is the
        # usual choice in scripts — confirm the intended behaviour in notebooks.
        exit()

    query = input("Enter your research query/topic: ").strip()
    reviews = generate_reviews(query, embeddings, metadata, top_k=5, method="simple")
    save_reviews(reviews)

    print("\n🔹 Top Reviews Generated:")
    # Reviews are numbered from 1, in the order returned by generate_reviews
    for i, r in enumerate(reviews, 1):
        print(f"{i}. {r['title']} ({r['year']})")
        print(f"   Review: {r['review']}\n")

✅ Loaded 10 papers and embeddings shape: (10, 384)
Enter your research query/topic: stock price forecasting
✅ Saved 5 paper reviews to module8_paper_reviews.json

🔹 Top Reviews Generated:
1. Tokenizing Stock Prices for Enhanced Multi-Step Forecast and Prediction (2025)
   Review: Effective stock price forecasting (estimating future prices) and prediction (estimating future price changes) are pivotal for investors, regulatory agencies, and policymakers. These tasks enable informed decision-making, risk management, strategic planning, and superior portfolio returns. Despite their importance, forecasting and prediction are challenging due to the dynamic nature of stock price data, which exhibit significant temporal variations in distribution and statistical properties.

2. A Time Series Analysis-Based Stock Price Prediction Using Machine Learning and Deep Learning Models (2020)
   Review: Prediction of future movement of stock prices has always been a challenging task for the researchers.