AI System to Automatically Review and Summarize Research Papers

Module 1: Topic Input and Paper Search

In [None]:
#Import Libraries
!pip install semanticscholar python-dotenv requests -q
import json
import os
from datetime import datetime
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

In [None]:
# Fallback papers: a small hard-coded sample that search_papers() returns when the
# Semantic Scholar client is unavailable or the search itself fails.
# Each entry carries the same keys as the records search_papers() builds from API results.
# NOTE(review): the "paperId" values look like local placeholders rather than real
# Semantic Scholar IDs — confirm nothing downstream tries to look them up.
FALLBACK_PAPERS = [
    {
        "title": "Deep Learning",
        "authors": ["LeCun", "Bengio", "Hinton"],
        "year": 2015,
        "paperId": "DL001",
        "abstract": "Overview of deep learning...",
        "citationCount": 50000,
        "venue": "Nature",
        # NOTE(review): verify that arXiv 1502.01852 really is this Nature article;
        # the ID may resolve to a different paper than the title/authors above.
        "url": "https://arxiv.org/abs/1502.01852",
        "pdf_url": "https://arxiv.org/pdf/1502.01852.pdf",
        "has_pdf": True
    },
    {
        "title": "Attention Is All You Need",
        "authors": ["Vaswani", "Shazeer"],
        "year": 2017,
        "paperId": "DL002",
        "abstract": "Transformer architecture...",
        "citationCount": 100000,
        "venue": "NeurIPS",
        "url": "https://arxiv.org/abs/1706.03762",
        "pdf_url": "https://arxiv.org/pdf/1706.03762.pdf",
        "has_pdf": True
    },
    {
        # Entry without a PDF: exercises the has_pdf=False path in Module 2's filtering.
        "title": "Machine Learning Foundations",
        "authors": ["Mitchell"],
        "year": 1997,
        "paperId": "DL003",
        "abstract": "Introduction to machine learning foundations...",
        "citationCount": 20000,
        "venue": "McGraw Hill",
        "url": None,
        "pdf_url": None,
        "has_pdf": False
    }
]
# Safe API initialization
def setup_api_key():
    """Build a Semantic Scholar client from the key found in the environment.

    Loads variables via ``load_dotenv()`` and reads ``SEMANTIC_SCHOLAR_API_KEY``.
    When a key is present, a one-result test search is issued to check that the
    key is accepted.

    Returns:
        The ``SemanticScholar`` client when the key is present and the test
        search succeeds; ``None`` otherwise (callers treat ``None`` as
        "use the fallback dataset").
    """
    load_dotenv()
    key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY")

    if key:
        try:
            client = SemanticScholar(api_key=key)
            # Probe request: any failure here sends the caller to fallback mode.
            client.search_paper("test", limit=1)
            print("Semantic Scholar initialized with API key")
            return client
        except Exception as err:
            print(f"API key failed ({err}) → Using fallback mode.")
    else:
        print("No API key found. Running without API (fallback mode).")

    return None
# Build result dictionary
def build_result(topic, papers):
    """Wrap a list of paper records in the result envelope used by the pipeline.

    Args:
        topic: The search topic the papers belong to.
        papers: List of paper dicts. A paper counts as having a PDF only when
            its ``"has_pdf"`` value is truthy; a missing key or ``None`` counts
            as "no PDF" instead of raising.

    Returns:
        dict with the keys ``topic``, ``search_timestamp`` (local time,
        ``YYYY-MM-DD HH:MM:SS``), ``total_results``, ``papers_with_pdf`` and
        ``papers`` (the list passed in).
    """
    # .get() keeps records that lack the flag (or carry None) from breaking the count.
    pdf_count = sum(1 for p in papers if p.get("has_pdf"))
    return {
        "topic": topic,
        "search_timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "total_results": len(papers),
        "papers_with_pdf": pdf_count,
        "papers": papers
    }
# search papers
def search_papers(topic, limit=20):
    """Search Semantic Scholar for `topic` and return a result envelope.

    Falls back to ``FALLBACK_PAPERS`` when no client is available or when the
    search raises.

    Args:
        topic: Free-text query.
        limit: Maximum number of papers to request and to keep.

    Returns:
        The dict produced by ``build_result(topic, papers)``.
    """
    # Local import keeps this function self-contained within the notebook cell.
    from itertools import islice

    print(f"\n Searching for papers on topic: '{topic}'")

    sch = setup_api_key()

    # If API not available → fallback
    if sch is None:
        print(" Using fallback sample dataset.\n")
        return build_result(topic, FALLBACK_PAPERS)

    try:
        results = sch.search_paper(
            query=topic,
            limit=limit,
            fields=["paperId", "title", "abstract", "year", "authors",
                    "citationCount", "openAccessPdf", "url", "venue"]
        )

        papers = []
        # Cap the iteration explicitly: the returned object is paginated, and a
        # plain `for` over it is not guaranteed to stop after `limit` items.
        for p in islice(results, limit):
            abstract = p.abstract
            if not abstract:
                snippet = "No abstract"
            elif len(abstract) > 300:
                # Only mark the text as truncated when it really was cut.
                snippet = abstract[:300] + "..."
            else:
                snippet = abstract

            # Treat a missing or empty URL as "no PDF" so has_pdf and pdf_url
            # can never disagree.
            pdf_url = (p.openAccessPdf or {}).get("url") or None

            papers.append({
                "title": p.title,
                "authors": [a["name"] for a in p.authors] if p.authors else [],
                "year": p.year,
                "paperId": p.paperId,
                "abstract": snippet,
                "citationCount": p.citationCount or 0,
                "venue": getattr(p, "venue", None),
                "url": p.url,
                "pdf_url": pdf_url,
                "has_pdf": bool(pdf_url)
            })

        print("\n Semantic Scholar search completed successfully!")
        return build_result(topic, papers)

    except Exception as e:
        # Best-effort: any API/client failure degrades to the sample dataset.
        print(f"\n Semantic Scholar search failed: {e}")
        print(" Using fallback dataset.\n")
        return build_result(topic, FALLBACK_PAPERS)
# save search results
def save_search_results(data):
    """Write a search result envelope to ``data/search_results`` as JSON.

    The file name is derived from ``data["topic"]``. Every character that is
    not a letter, digit, underscore, dot or hyphen is replaced by ``_``, so
    topics containing spaces, slashes, colons and similar characters still
    produce a single valid file name.

    Args:
        data: Result dict; must contain a ``"topic"`` key and be JSON-serializable.

    Returns:
        The relative path of the written file.
    """
    # Local import: `re` is only imported in a later cell of this notebook.
    import re

    os.makedirs("data/search_results", exist_ok=True)

    # "Machine Learning" -> "Machine_Learning"; "AI/ML" -> "AI_ML" (no stray subfolder).
    safe_topic = re.sub(r"[^\w.\-]", "_", str(data["topic"]))
    fname = f"{safe_topic}_results.json"
    path = f"data/search_results/{fname}"

    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4)

    print(f"\n Results saved to: {path}")
    return path
# display results
def display_search_results(data):
    """Print a plain-text report of a search result envelope.

    Shows the topic, the total/PDF counts, and then one numbered entry per
    paper with its title, authors, year, citation count and PDF availability.

    Args:
        data: Result dict with ``topic``, ``total_results``, ``papers_with_pdf``
            and ``papers`` (each paper needs ``title``, ``authors``, ``year``,
            ``citationCount`` and ``has_pdf``).
    """
    # Header section
    print(f" SEARCH RESULTS FOR: {data['topic']}")
    print(f"\n Total papers found: {data['total_results']}")
    print(f" Papers with PDF: {data['papers_with_pdf']}")
    print("\n TOP PAPERS:")

    # One numbered entry per paper, counted from 1
    position = 0
    for paper in data["papers"]:
        position += 1
        print(f"\n{position}. {paper['title']}")

        author_line = ", ".join(paper["authors"])
        print(f"   Authors: {author_line}")

        print(f"    Year: {paper['year']}")
        print(f"    Citations: {paper['citationCount']}")

        if paper["has_pdf"]:
            print("    PDF: YES")
        else:
            print("    PDF: NO")
#main function
def main_search():
    """Run Module 1 end to end: prompt for a topic, search, save, and display.

    An empty answer at the prompt selects the default topic "machine learning".

    Returns:
        Tuple ``(results, path)``: the result envelope from ``search_papers``
        and the path returned by ``save_search_results``.
    """
    print(" MODULE 1: TOPIC INPUT & PAPER SEARCH")

    # Blank input falls through to the default topic.
    topic = input("\nEnter research topic: ").strip() or "machine learning"

    found = search_papers(topic)
    saved_to = save_search_results(found)
    display_search_results(found)

    print("\n MODULE 1 COMPLETE!")
    print(" Proceed to Module 2\n")

    return found, saved_to

Module 2: Paper Selection and PDF Download

In [None]:
import requests
import fitz  # PyMuPDF

# Create the folder download_pdf() writes into as soon as this cell runs.
os.makedirs("downloads", exist_ok=True)
def load_search_results(path):
    """Read the JSON file written by Module 1.

    Args:
        path: Location of the saved search results.

    Returns:
        The parsed JSON content, or ``None`` when `path` is empty, does not
        exist, or cannot be read/parsed (a message is printed in each case).
    """
    print("\n Loading Module 1 results...")

    if path and os.path.exists(path):
        try:
            with open(path, "r", encoding="utf-8") as handle:
                loaded = json.load(handle)
            print("Results loaded.\n")
            return loaded
        except Exception as err:
            # Unreadable or malformed file: report and signal failure with None.
            print(f" JSON read error: {err}")
            return None

    print(f" ERROR: Cannot read results → {path}")
    return None

def rank_papers(papers):
    """Return the papers ordered by citation count, then year, highest first.

    Missing or falsy ``citationCount`` / ``year`` values are ranked as 0.
    The input list is left untouched; a new list is returned.
    """
    def _rank_key(paper):
        citations = paper.get("citationCount") or 0
        year = paper.get("year") or 0
        return (citations, year)

    ordered = list(papers)
    ordered.sort(key=_rank_key, reverse=True)
    return ordered
def download_pdf(url, title):
    """Download one PDF into ``downloads/`` and check that it opens.

    Args:
        url: Direct PDF URL; a falsy value short-circuits with ``"no_url"``.
        title: Paper title, used (sanitized, max 50 chars) as the file-name stem.
            A missing title falls back to "untitled".

    Returns:
        ``(True, filepath)`` on success, otherwise ``(False, reason)`` where
        reason is ``"no_url"``, ``"http_<status>"``, ``"empty_pdf"``,
        ``"invalid_pdf"`` or the text of the exception that was raised.
    """
    # Local import keeps this function self-contained within the notebook cell.
    import hashlib

    print(f"\n Downloading: {title}")

    if not url:
        print("    No PDF URL available.")
        return False, "no_url"

    # Safe standardized filename
    safe_title = "".join(
        char if char.isalnum() or char in " _-" else "_"
        for char in str(title or "untitled")
    )[:50]
    # Built-in hash() of a str is salted per interpreter process, so it would
    # give the same URL a different file name after every kernel restart.
    # A content digest keeps the name stable across runs.
    url_digest = int(hashlib.sha256(str(url).encode("utf-8")).hexdigest(), 16) % 99999
    filename = f"{safe_title}_{url_digest}.pdf"
    filepath = os.path.join("downloads", filename)

    try:
        # Do not rely on another cell having created the folder.
        os.makedirs("downloads", exist_ok=True)

        r = requests.get(url, timeout=20)
        if r.status_code != 200:
            print(f"   HTTP error: {r.status_code}")
            return False, f"http_{r.status_code}"

        with open(filepath, "wb") as f:
            f.write(r.content)

        # Validate PDF. The document is closed before any os.remove() so the
        # file handle is not leaked (and the delete cannot be blocked by it).
        try:
            with fitz.open(filepath) as doc:
                page_count = doc.page_count
        except Exception:
            os.remove(filepath)
            print("    Could not open PDF.")
            return False, "invalid_pdf"

        if page_count == 0:
            os.remove(filepath)
            print("    Corrupted PDF.")
            return False, "empty_pdf"

        print(f"    Saved at: {filepath}")
        return True, filepath

    except Exception as e:
        # Network, filesystem, or cleanup failure: report it as the reason.
        print(f"    ERROR: {e}")
        return False, str(e)
def ask_user_to_select(papers):
    """List the available papers and let the user pick which ones to download.

    Args:
        papers: List of paper dicts with ``title``, ``citationCount`` and ``year``.

    Returns:
        A list of the chosen paper dicts (in the order entered, duplicates
        removed), or ``None`` to request automatic selection. ``None`` is
        returned when the user presses ENTER, types something that is not a
        comma-separated list of integers, or enters only out-of-range numbers.
    """
    print("\n AVAILABLE PAPERS WITH PDF:")
    for i, p in enumerate(papers, start=1):
        print(f"{i}. {p['title'][:60]}")
        print(f"    Citations: {p['citationCount']} | Year: {p['year']}")
        print()

    print("\n Press ENTER to auto-select top papers")
    choice = input("Or enter paper numbers (comma separated): ").strip()

    if choice == "":
        print("\n Auto-selecting top papers based on citation count.\n")
        return None  # automatic mode

    try:
        # Blank pieces (e.g. from a trailing comma) are skipped rather than
        # invalidating the whole entry.
        pieces = (part.strip() for part in choice.split(","))
        indexes = [int(piece) for piece in pieces if piece]
    except ValueError:
        # Only a non-numeric entry is expected here; other errors should surface.
        print("Invalid input → Using automatic selection.")
        return None

    selected = []
    seen = set()
    for i in indexes:
        # Ignore out-of-range numbers and repeated picks.
        if 1 <= i <= len(papers) and i not in seen:
            seen.add(i)
            selected.append(papers[i - 1])

    if not selected:
        # An empty list would make the caller download nothing at all.
        print("No valid paper numbers entered → Using automatic selection.")
        return None

    print("\n User-selected papers:")
    return selected

def main_module_2(results_path, top_n=3):
    """Run Module 2: choose papers (manually or automatically) and download their PDFs.

    Args:
        results_path: Path of the JSON file saved by Module 1.
        top_n: How many top-ranked papers to take in automatic mode (default 3).

    Returns:
        None. Progress and per-paper results are printed; files land in
        ``downloads/``.
    """
    print(" MODULE 2: PDF DOWNLOAD WITH USER OR AUTO SELECTION")
    data = load_search_results(results_path)
    if not data:
        return

    # `or []` also covers a results file whose "papers" value is null.
    papers = data.get("papers") or []
    # Require an actual URL as well as the flag, so entries that cannot be
    # downloaded are never offered for selection.
    pdf_papers = [p for p in papers if p.get("has_pdf") and p.get("pdf_url")]

    print(f"Total papers: {len(papers)}")
    print(f" Papers with PDF: {len(pdf_papers)}")

    if len(pdf_papers) == 0:
        print("\n⚠ No PDF papers found → Using fallback PDF paper.")
        pdf_papers = [{
            "title": "Deep Learning (Fallback PDF)",
            "pdf_url": "https://arxiv.org/pdf/1502.01852.pdf",
            "citationCount": 50000,
            "year": 2015,
            "has_pdf": True
        }]
    selected = ask_user_to_select(pdf_papers)
    if selected is None:
        # Automatic mode: best-ranked papers first, capped at top_n.
        selected = rank_papers(pdf_papers)[:top_n]

    print("\n SELECTED PAPERS FOR DOWNLOAD:")
    for p in selected:
        print(f" - {p['title']} (Citations: {p.get('citationCount',0)})")
    print("\n STARTING DOWNLOADS...\n")

    for p in selected:
        success, info = download_pdf(p.get("pdf_url"), p["title"])
        print(f"  RESULT: {'SUCCESS' if success else 'FAILED'} ({info})")

    print("\n MODULE 2 COMPLETE!")
    print(" PDFs stored in: downloads/")

In [None]:
results, path = main_search()

 MODULE 1: TOPIC INPUT & PAPER SEARCH

Enter research topic: Machine Learning

 Searching for papers on topic: 'Machine Learning'
No API key found. Running without API (fallback mode).
 Using fallback sample dataset.


 Results saved to: data/search_results/Machine_Learning_results.json
 SEARCH RESULTS FOR: Machine Learning

 Total papers found: 3
 Papers with PDF: 2

 TOP PAPERS:

1. Deep Learning
   Authors: LeCun, Bengio, Hinton
    Year: 2015
    Citations: 50000
    PDF: YES

2. Attention Is All You Need
   Authors: Vaswani, Shazeer
    Year: 2017
    Citations: 100000
    PDF: YES

3. Machine Learning Foundations
   Authors: Mitchell
    Year: 1997
    Citations: 20000
    PDF: NO

 MODULE 1 COMPLETE!
 Proceed to Module 2



In [None]:
main_module_2(path)


 MODULE 2: PDF DOWNLOAD WITH USER OR AUTO SELECTION

 Loading Module 1 results...
Results loaded.

Total papers: 3
 Papers with PDF: 2

 AVAILABLE PAPERS WITH PDF:
1. Deep Learning
    Citations: 50000 | Year: 2015

2. Attention Is All You Need
    Citations: 100000 | Year: 2017


 Press ENTER to auto-select top papers
Or enter paper numbers (comma separated): 

 Auto-selecting top papers based on citation count.


 SELECTED PAPERS FOR DOWNLOAD:
 - Attention Is All You Need (Citations: 100000)
 - Deep Learning (Citations: 50000)

 STARTING DOWNLOADS...


 Downloading: Attention Is All You Need
    Saved at: downloads/Attention Is All You Need_27546.pdf
  RESULT: SUCCESS (downloads/Attention Is All You Need_27546.pdf)

 Downloading: Deep Learning
    Saved at: downloads/Deep Learning_62071.pdf
  RESULT: SUCCESS (downloads/Deep Learning_62071.pdf)

 MODULE 2 COMPLETE!
 PDFs stored in: downloads/


Module 3: PDF Text Extraction

In [None]:
import fitz  # PyMuPDF
from datetime import datetime

# Folder scanned by run_module_3() for PDFs (the same folder download_pdf() writes to).
DOWNLOAD_DIR = "downloads"
# Folder that receives the extracted .txt files and their *_meta.json companions.
EXTRACT_DIR = "data/extracted_text"

# Create the output folder when this cell runs so run_module_3() can write into it.
os.makedirs(EXTRACT_DIR, exist_ok=True)


def extract_text_from_pdf(pdf_path, max_pages=20):
    """Return the text of the first `max_pages` pages of a PDF.

    Args:
        pdf_path: Path of the PDF to read.
        max_pages: Upper bound on the number of pages read from the start.

    Returns:
        The concatenated page text with surrounding whitespace stripped, or
        ``""`` when the file cannot be opened or read (the error is printed).
    """
    try:
        with fitz.open(pdf_path) as doc:
            page_limit = min(len(doc), max_pages)
            # Collect page texts in order, then join them once.
            chunks = []
            for index in range(page_limit):
                chunks.append(doc.load_page(index).get_text())
            combined = "".join(chunks)

        return combined.strip()

    except Exception as err:
        print(f" Failed to extract text from {pdf_path}: {err}")
        return ""


def run_module_3():
    """Extract text from every PDF in ``DOWNLOAD_DIR`` into ``EXTRACT_DIR``.

    For each PDF that yields text, writes ``<name>.txt`` plus a
    ``<name>_meta.json`` file recording the source PDF, the character count
    and the extraction time. PDFs are processed in sorted name order.

    Returns:
        List of the text-file paths written (empty when there is nothing to do).
    """
    print("MODULE 3: PDF TEXT EXTRACTION")

    # A missing download folder is handled like an empty one instead of
    # letting os.listdir() raise.
    if os.path.isdir(DOWNLOAD_DIR):
        # sorted() makes the processing order deterministic; the extension
        # check is case-insensitive so "X.PDF" is not skipped.
        pdf_files = sorted(
            f for f in os.listdir(DOWNLOAD_DIR) if f.lower().endswith(".pdf")
        )
    else:
        pdf_files = []

    if not pdf_files:
        print("No PDF files found. Run Module 2 first.")
        return []

    # Do not rely on the folder created at cell-execution time still existing.
    os.makedirs(EXTRACT_DIR, exist_ok=True)

    extracted_files = []

    for pdf in pdf_files:
        pdf_path = os.path.join(DOWNLOAD_DIR, pdf)
        print(f"\n Processing: {pdf}")

        text = extract_text_from_pdf(pdf_path)

        if not text:
            print(" No text extracted.")
            continue

        base_name = os.path.splitext(pdf)[0]
        text_path = os.path.join(EXTRACT_DIR, base_name + ".txt")
        meta_path = os.path.join(EXTRACT_DIR, base_name + "_meta.json")

        # Save extracted text
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)

        # Save metadata
        metadata = {
            "source_pdf": pdf,
            "characters_extracted": len(text),
            "extraction_time": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=4)

        extracted_files.append(text_path)

        print(f" Text extracted and saved: {text_path}")

    print("\n MODULE 3 COMPLETE!")
    print(f" Extracted files saved in: {EXTRACT_DIR}")

    return extracted_files


In [None]:
run_module_3()

MODULE 3: PDF TEXT EXTRACTION

 Processing: Attention Is All You Need_27546.pdf
 Text extracted and saved: data/extracted_text/Attention Is All You Need_27546.txt

 Processing: Deep Learning_62071.pdf
 Text extracted and saved: data/extracted_text/Deep Learning_62071.txt

 MODULE 3 COMPLETE!
 Extracted files saved in: data/extracted_text


['data/extracted_text/Attention Is All You Need_27546.txt',
 'data/extracted_text/Deep Learning_62071.txt']

Module 4: Paper Summarization

In [None]:
import re
from datetime import datetime

# Folder run_module_4() reads the extracted .txt files from.
EXTRACT_DIR = "data/extracted_text"
# Folder that receives the *_summary.txt files and their *_summary_meta.json companions.
SUMMARY_DIR = "data/summaries"

# Create the output folder when this cell runs so run_module_4() can write into it.
os.makedirs(SUMMARY_DIR, exist_ok=True)


def simple_summary(text, max_sentences=5):
    """Build an extractive summary from the leading sentences of `text`.

    Whitespace runs are collapsed to single spaces, the text is split after
    each ``.``, ``!`` or ``?`` that is followed by whitespace, and the first
    `max_sentences` pieces are joined back together with single spaces.

    Args:
        text: Source text.
        max_sentences: Number of leading sentences to keep.

    Returns:
        The summary string with surrounding whitespace removed.
    """
    # Step 1: normalize all whitespace (newlines, tabs, repeats) to one space.
    flattened = re.sub(r"\s+", " ", text)

    # Step 2: cut at sentence-ending punctuation followed by whitespace.
    pieces = re.split(r"(?<=[.!?])\s+", flattened)

    # Step 3: keep only the leading sentences.
    leading = pieces[:max_sentences]
    summary = " ".join(leading)
    return summary.strip()


def run_module_4(max_sentences=5):
    """Summarize every extracted text file in ``EXTRACT_DIR`` into ``SUMMARY_DIR``.

    For each non-empty ``.txt`` file, writes ``<name>_summary.txt`` plus a
    ``<name>_summary_meta.json`` file describing how the summary was made.
    Files are processed in sorted name order.

    Args:
        max_sentences: Sentence cap passed to ``simple_summary`` and recorded
            in the metadata as ``sentences_used`` (default 5).

    Returns:
        None. Progress is printed.
    """
    print(" MODULE 4: PAPER SUMMARIZATION")

    if not os.path.exists(EXTRACT_DIR):
        print(" Extracted text folder missing. Run Module 3 first.")
        return

    # sorted() makes the processing order deterministic across runs.
    text_files = sorted(f for f in os.listdir(EXTRACT_DIR) if f.endswith(".txt"))

    print(f" Text files found: {len(text_files)}")

    if not text_files:
        print(" No extracted text files found. Module 4 cannot proceed.")
        return

    # Do not rely on the folder created at cell-execution time still existing.
    os.makedirs(SUMMARY_DIR, exist_ok=True)

    for txt in text_files:
        txt_path = os.path.join(EXTRACT_DIR, txt)
        print(f"\n Generating summary for: {txt}")

        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()

        if not text.strip():
            print(" Empty text. Skipping.")
            continue

        summary = simple_summary(text, max_sentences=max_sentences)

        base = os.path.splitext(txt)[0]
        summary_path = os.path.join(SUMMARY_DIR, base + "_summary.txt")
        meta_path = os.path.join(SUMMARY_DIR, base + "_summary_meta.json")

        with open(summary_path, "w", encoding="utf-8") as f:
            f.write(summary)

        meta = {
            "source_text": txt,
            "summary_method": "extractive_first_n_sentences",
            # Recorded from the argument actually used, so the metadata cannot
            # drift from the summarizer setting.
            "sentences_used": max_sentences,
            "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        }

        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=4)

        print(f" Summary saved to: {summary_path}")

    print("\n MODULE 4 COMPLETE!")
    print(f" Summaries stored in: {SUMMARY_DIR}")


In [None]:
run_module_4()

 MODULE 4: PAPER SUMMARIZATION
 Text files found: 2

 Generating summary for: Deep Learning_62071.txt
 Summary saved to: data/summaries/Deep Learning_62071_summary.txt

 Generating summary for: Attention Is All You Need_27546.txt
 Summary saved to: data/summaries/Attention Is All You Need_27546_summary.txt

 MODULE 4 COMPLETE!
 Summaries stored in: data/summaries


Module 5: Knowledge Indexing & Question Answering

MODULE 6: TOPIC CLUSTERING & INSIGHTS GENERATION

1) Read summaries from Module 4

2) Convert text into TF-IDF vectors

3) Apply K-Means clustering

4) Display top keywords per cluster

5) Assign each paper to a cluster