AI System to Automatically Review and Summarize Research Papers

Module 1: Topic Input and Paper Search

In [13]:
#Import Libraries
!pip install semanticscholar python-dotenv requests -q
import json
import os
from datetime import datetime
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

In [14]:
# Fallback papers: a small hard-coded sample used in place of live search
# results when the Semantic Scholar client is unavailable or a search fails.
# Each entry uses the same keys as the dictionaries built in search_papers().
# The "paperId" values here (DL001, ...) are local placeholders.
FALLBACK_PAPERS = [
    {
        "title": "Deep Learning",
        "authors": ["LeCun", "Bengio", "Hinton"],
        "year": 2015,
        "paperId": "DL001",
        "abstract": "Overview of deep learning...",
        "citationCount": 50000,
        "venue": "Nature",
        # NOTE(review): arXiv 1502.01852 appears to be a different paper
        # (He et al., "Delving Deep into Rectifiers") rather than the Nature
        # review described by this entry - confirm and correct the link.
        "url": "https://arxiv.org/abs/1502.01852",
        "pdf_url": "https://arxiv.org/pdf/1502.01852.pdf",
        "has_pdf": True
    },
    {
        "title": "Attention Is All You Need",
        "authors": ["Vaswani", "Shazeer"],
        "year": 2017,
        "paperId": "DL002",
        "abstract": "Transformer architecture...",
        "citationCount": 100000,
        "venue": "NeurIPS",
        "url": "https://arxiv.org/abs/1706.03762",
        "pdf_url": "https://arxiv.org/pdf/1706.03762.pdf",
        "has_pdf": True
    },
    {
        # Entry without a PDF: exercises the "has_pdf": False path downstream.
        "title": "Machine Learning Foundations",
        "authors": ["Mitchell"],
        "year": 1997,
        "paperId": "DL003",
        "abstract": "Introduction to machine learning foundations...",
        "citationCount": 20000,
        "venue": "McGraw Hill",
        "url": None,
        "pdf_url": None,
        "has_pdf": False
    }
]
# Safe API initialization
def setup_api_key():
    """Create a Semantic Scholar client from the environment, or return None.

    Loads variables via ``load_dotenv()`` and reads
    ``SEMANTIC_SCHOLAR_API_KEY``. When a key is present, a client is built
    and a one-result probe search is issued to check that the key works.

    Returns:
        The initialized client on success; ``None`` when no key is set or
        when client creation / the probe search raises.
    """
    load_dotenv()
    key = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    if key:
        try:
            client = SemanticScholar(api_key=key)
            # Probe request: a rejected key surfaces here, not mid-search.
            client.search_paper("test", limit=1)
            print("Semantic Scholar initialized with API key")
            return client
        except Exception as err:
            print(f"API key failed ({err}) → Using fallback mode.")
            return None

    print("No API key found. Running without API (fallback mode).")
    return None
# Build result dictionary
def build_result(topic, papers):
    """Wrap a list of paper dictionaries in the search-result envelope.

    Args:
        topic: The search topic the papers belong to.
        papers: Iterable of paper dictionaries (``None`` is treated as empty).
            A paper without a ``"has_pdf"`` key counts as having no PDF.

    Returns:
        dict with keys ``topic``, ``search_timestamp`` (local time formatted
        as ``YYYY-MM-DD HH:MM:SS``), ``total_results``, ``papers_with_pdf``
        and ``papers``.
    """
    # Copy into a fresh list so the result never aliases the caller's list
    # (e.g. a shared module-level fallback list) and so len() always works.
    paper_list = list(papers or [])
    return {
        "topic": topic,
        "search_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total_results": len(paper_list),
        # .get() tolerates entries that lack the flag instead of raising KeyError.
        "papers_with_pdf": sum(1 for p in paper_list if p.get("has_pdf")),
        "papers": paper_list
    }
# search papers
def search_papers(topic, limit=20):
    """Search Semantic Scholar for ``topic`` and return a result dictionary.

    Falls back to ``FALLBACK_PAPERS`` when ``setup_api_key()`` yields no
    client or when the search raises.

    Args:
        topic: Free-text query.
        limit: Maximum number of papers to return. It is forwarded to the
            client's ``search_paper`` call and also enforced locally.

    Returns:
        The dictionary produced by ``build_result(topic, papers)``.
    """
    from itertools import islice  # local import: keeps this cell self-contained

    print(f"\n Searching for papers on topic: '{topic}'")

    sch = setup_api_key()

    # If API not available → fallback
    if sch is None:
        print(" Using fallback sample dataset.\n")
        return build_result(topic, FALLBACK_PAPERS)

    try:
        results = sch.search_paper(
            query=topic,
            limit=limit,
            fields=["paperId", "title", "abstract", "year", "authors",
                    "citationCount", "openAccessPdf", "url", "venue"]
        )

        papers = []
        # Cap the iteration at `limit` as well: if the results object fetches
        # further pages while being iterated (see the semanticscholar docs on
        # paginated results), an uncapped loop would pull far more than asked.
        for p in islice(results, limit):
            abstract = p.abstract
            if not abstract:
                summary = "No abstract"
            elif len(abstract) > 300:
                summary = abstract[:300] + "..."
            else:
                # Short abstracts are kept whole; no misleading ellipsis.
                summary = abstract

            # A PDF counts only when the record carries a non-empty URL.
            # Assumes openAccessPdf is dict-like with a "url" key, as the
            # original indexing did.
            pdf_info = p.openAccessPdf
            pdf_url = (pdf_info.get("url") if pdf_info else None) or None

            papers.append({
                "title": p.title,
                "authors": [a["name"] for a in p.authors] if p.authors else [],
                "year": p.year,
                "paperId": p.paperId,
                "abstract": summary,
                "citationCount": p.citationCount or 0,
                "venue": getattr(p, "venue", None),
                "url": p.url,
                "pdf_url": pdf_url,
                "has_pdf": bool(pdf_url)
            })

        print("\n Semantic Scholar search completed successfully!")
        return build_result(topic, papers)

    except Exception as e:
        print(f"\n Semantic Scholar search failed: {e}")
        print(" Using fallback dataset.\n")
        return build_result(topic, FALLBACK_PAPERS)
# save search results
def save_search_results(data):
    """Write a search-result dictionary to ``data/search_results`` as JSON.

    The file name is derived from ``data['topic']``: spaces become
    underscores, and characters that are invalid or dangerous in file names
    (path separators, ``: * ? " < > |`` and control characters) are also
    replaced with underscores, so a user-typed topic cannot break or escape
    the output directory.

    Args:
        data: Result dictionary containing at least a ``"topic"`` key; it
            must be JSON-serializable.

    Returns:
        The path of the written file, as a ``data/search_results/...`` string.
    """
    os.makedirs("data/search_results", exist_ok=True)

    unsafe_chars = '<>:"/\\|?*'
    topic_slug = "".join(
        "_" if (ch == " " or ch in unsafe_chars or ord(ch) < 32) else ch
        for ch in str(data["topic"])
    )
    fname = f"{topic_slug}_results.json"
    path = f"data/search_results/{fname}"

    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"\n Results saved to: {path}")
    return path
# display results
def display_search_results(data):
    """Print a plain-text report for a search-result dictionary.

    Reads ``topic``, ``total_results``, ``papers_with_pdf`` and ``papers``
    from ``data``; for each paper prints its title, authors, year, citation
    count and whether a PDF is available. Returns nothing.
    """
    print(f" SEARCH RESULTS FOR: {data['topic']}")

    print(f"\n Total papers found: {data['total_results']}")
    print(f" Papers with PDF: {data['papers_with_pdf']}")

    print("\n TOP PAPERS:")

    position = 0
    for paper in data["papers"]:
        position += 1  # 1-based numbering in the printed list
        print(f"\n{position}. {paper['title']}")
        author_line = ", ".join(paper["authors"])
        print(f"   Authors: {author_line}")
        print(f"    Year: {paper['year']}")
        print(f"    Citations: {paper['citationCount']}")
        if paper["has_pdf"]:
            pdf_flag = "YES"
        else:
            pdf_flag = "NO"
        print(f"    PDF: {pdf_flag}")
#main function
def main_search():
    """Run Module 1 end to end: prompt, search, save, display.

    Prompts for a research topic (a blank answer selects
    "machine learning"), then calls ``search_papers``,
    ``save_search_results`` and ``display_search_results`` in that order.

    Returns:
        tuple ``(results, path)``: the search-result dictionary and the path
        returned by ``save_search_results``.
    """
    print(" MODULE 1: TOPIC INPUT & PAPER SEARCH")

    # An empty / whitespace-only answer falls back to the default topic.
    topic = input("\nEnter research topic: ").strip() or "machine learning"

    results = search_papers(topic)
    saved_path = save_search_results(results)
    display_search_results(results)

    print("\n MODULE 1 COMPLETE!")
    print(" Proceed to Module 2\n")

    return results, saved_path

Module 2: Paper Selection and PDF Download

In [19]:
#Import libraries
!pip install PyMuPDF -q
import os
import json
import requests
import fitz  # PyMuPDF
from datetime import datetime

# Create the "downloads" directory up front; download_pdf() writes every
# PDF into it. exist_ok=True makes re-running this cell harmless.
os.makedirs("downloads", exist_ok=True)
#load results from module 1
def load_search_results(path):
    """Load the JSON search results written by Module 1.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON content, or ``None`` when the file cannot be opened
        or does not contain valid JSON (the reason is printed).
    """
    print("\n Loading Module 1 results...")
    try:
        with open(path, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(" Results loaded successfully.\n")
        return data
    # Only expected load failures are handled: missing/unreadable file
    # (OSError), malformed JSON or bad encoding (ValueError), or a non-path
    # argument such as None (TypeError). KeyboardInterrupt and SystemExit
    # are deliberately no longer swallowed.
    except (OSError, ValueError, TypeError) as e:
        print(f" ERROR: Could not load search results from {path!r}: {e}")
        return None
# RANK PAPERS (CITATIONS → YEAR)
def rank_papers(papers):
    """Return a new list of papers ordered best-first.

    Papers are ordered by citation count, then by year, both descending.
    Missing or falsy values for either field are treated as 0. The input
    is not modified.
    """
    def _rank_key(paper):
        citations = paper.get("citationCount") or 0
        year = paper.get("year") or 0
        return citations, year

    ordered = list(papers)
    ordered.sort(key=_rank_key, reverse=True)
    return ordered
# DOWNLOAD + VALIDATE PDF
def download_pdf(url, title):
    """Download a PDF into ``downloads/`` and check that it opens.

    Args:
        url: Address of the PDF; a falsy value means "no PDF available".
        title: Paper title, used for the file name and progress messages.
            ``None`` or an empty title is stored as "untitled".

    Returns:
        tuple ``(success, info)``. On success ``info`` is the saved file
        path. On failure it is a reason string: ``"no_pdf"``,
        ``"HTTP <status>"``, ``"invalid_pdf"``, ``"empty_pdf"``, or the text
        of an unexpected exception.
    """
    import zlib  # local import: stable checksum for the file-name suffix

    if not url:
        print(f"\n No PDF link for: {title}")
        return False, "no_pdf"

    # A missing title used to raise TypeError before the try block.
    title = str(title) if title else "untitled"
    safe_title = "".join(c if c.isalnum() or c in " _-" else "_" for c in title)
    # Built-in hash() of a str is salted per interpreter run, so the same URL
    # got a different file name on every run; CRC32 is stable across runs.
    url_tag = zlib.crc32(url.encode("utf-8")) % 99999
    filename = f"{safe_title[:80]}_{url_tag}.pdf"
    # Do not rely on another cell having created the directory.
    os.makedirs("downloads", exist_ok=True)
    filepath = os.path.join("downloads", filename)

    print(f"\n Downloading: {title}")

    try:
        r = requests.get(url, timeout=20)
        if r.status_code != 200:
            return False, f"HTTP {r.status_code}"

        with open(filepath, "wb") as f:
            f.write(r.content)

        # Validate PDF: it must open and contain at least one page.
        try:
            doc = fitz.open(filepath)
            try:
                page_count = doc.page_count
            finally:
                # Release the handle before any os.remove() below; an open
                # document can block deletion on some platforms.
                doc.close()
        except Exception:
            os.remove(filepath)
            return False, "invalid_pdf"

        if page_count == 0:
            os.remove(filepath)
            return False, "empty_pdf"

        print(f"    Saved at: {filepath}")
        return True, filepath

    except Exception as e:
        return False, str(e)
# MODULE 2 MAIN
def main_module_2(results_path, top_n=3):
    """Run Module 2: pick the top-ranked papers with PDFs and download them.

    Loads the Module 1 results from ``results_path``, keeps papers flagged
    ``has_pdf``, ranks them with ``rank_papers`` and downloads the first
    ``top_n`` via ``download_pdf``. If no paper has a PDF, one hard-coded
    fallback PDF entry is used instead.

    Args:
        results_path: Path to the JSON file saved by Module 1.
        top_n: How many ranked papers to download (default 3).

    Returns:
        None. Progress and per-paper status are printed.
    """
    print(" MODULE 2: PAPER SELECTION & PDF DOWNLOAD")
    data = load_search_results(results_path)
    if not data:
        return

    # Tolerate a results file without a "papers" entry instead of raising.
    papers = data.get("papers") or []
    print(f" Total papers: {len(papers)}")

    pdf_papers = [p for p in papers if p.get("has_pdf")]
    print(f" Papers with PDF: {len(pdf_papers)}")

    if not pdf_papers:
        print("\n No PDFs found. Using fallback PDF paper.")
        # NOTE(review): this arXiv id appears to belong to a different paper
        # than the title suggests - confirm the intended fallback document.
        pdf_papers = [{
            "title": "Deep Learning (Fallback PDF)",
            "pdf_url": "https://arxiv.org/pdf/1502.01852.pdf",
            "citationCount": 50000,
            "year": 2015
        }]

    ranked = rank_papers(pdf_papers)
    selected = ranked[:top_n]

    print("\n SELECTED PAPERS:")
    for p in selected:
        print(f" - {p.get('title')} (Citations: {p.get('citationCount') or 0})")

    print("\n DOWNLOADING PDFs...\n")

    downloaded = 0
    for p in selected:
        success, info = download_pdf(p.get("pdf_url"), p.get("title"))
        if success:
            downloaded += 1
        print(f"   Status: {'SUCCESS' if success else 'FAILED'} ({info})")

    print("\n MODULE 2 COMPLETE!")
    # Report the real outcome so failed downloads are not presented as saved.
    print(f" Downloaded {downloaded} of {len(selected)} selected PDFs.")
    print(" PDFs saved in: downloads/\n")

In [21]:
# Run Module 1 interactively; `path` (the saved JSON file) is the input
# that the Module 2 cell below passes to main_module_2().
results, path = main_search()

 MODULE 1: TOPIC INPUT & PAPER SEARCH

Enter research topic: Machine Learning

 Searching for papers on topic: 'Machine Learning'
No API key found. Running without API (fallback mode).
 Using fallback sample dataset.


 Results saved to: data/search_results/Machine_Learning_results.json
 SEARCH RESULTS FOR: Machine Learning

 Total papers found: 3
 Papers with PDF: 2

 TOP PAPERS:

1. Deep Learning
   Authors: LeCun, Bengio, Hinton
    Year: 2015
    Citations: 50000
    PDF: YES

2. Attention Is All You Need
   Authors: Vaswani, Shazeer
    Year: 2017
    Citations: 100000
    PDF: YES

3. Machine Learning Foundations
   Authors: Mitchell
    Year: 1997
    Citations: 20000
    PDF: NO

 MODULE 1 COMPLETE!
 Proceed to Module 2



In [20]:
# Requires `path` from the Module 1 cell above, so run that cell first.
# NOTE(review): the recorded execution counts (In [21] above, In [20] here)
# are out of order - re-check with Restart Kernel -> Run All.
main_module_2(path)


 MODULE 2: PAPER SELECTION & PDF DOWNLOAD

 Loading Module 1 results...
 Results loaded successfully.

 Total papers: 3
 Papers with PDF: 2

 SELECTED PAPERS:
 - Attention Is All You Need (Citations: 100000)
 - Deep Learning (Citations: 50000)

 DOWNLOADING PDFs...


 Downloading: Attention Is All You Need
    Saved at: downloads/Attention Is All You Need_88725.pdf
   Status: SUCCESS (downloads/Attention Is All You Need_88725.pdf)

 Downloading: Deep Learning
    Saved at: downloads/Deep Learning_64099.pdf
   Status: SUCCESS (downloads/Deep Learning_64099.pdf)

 MODULE 2 COMPLETE!
 PDFs saved in: downloads/

