<a href="https://colab.research.google.com/github/springboardmentor3847a-cloud/AI-System-to-Automatically-Review-and-Summarize-Research-Papers-/blob/HarshithaNancharla-Branch/Milestone1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install requests semanticscholar crossrefapi -q

import json
import os
from datetime import datetime
import requests
from crossref.restful import Works
from semanticscholar import SemanticScholar

# ============================================
# 1. SEMANTIC SCHOLAR SEARCH (Main Source)
# ============================================

def search_semantic_scholar(query, max_results=10):
    print("üîç Searching Semantic Scholar...")
    sch = SemanticScholar(timeout=10)

    papers = []
    try:
        results = sch.search_paper(query, limit=max_results)

        for p in results:
            papers.append({
                "title": p.title,
                "authors": [a.name for a in p.authors] if p.authors else [],
                "year": p.year,
                "pdf_url": p.openAccessPdf.get("url") if p.openAccessPdf else None,
                "citationCount": p.citationCount,
                "abstract": p.abstract,
                "url": p.url
            })
    except Exception as e:
        print("Semantic Scholar error:", e)

    return papers

# ============================================
# 2. CROSSREF SEARCH (Backup Source)
# ============================================

def search_crossref(query, max_results=5):
    print("üîç Searching CrossRef...")
    works = Works()
    papers = []

    try:
        results = works.query(query).sort("score")

        count = 0
        for item in results:
            if count >= max_results:
                break

            papers.append({
                "title": item.get("title", [""])[0],
                "authors": [a.get("family","") for a in item.get("author", [])],
                "year": item.get("issued", {}).get("date-parts", [[None]])[0][0],
                "pdf_url": None,
                "citationCount": None,
                "abstract": None,
                "url": item.get("URL", "")
            })
            count += 1
    except Exception as e:
        print("CrossRef error:", e)

    return papers

# ============================================
# 3. MAIN FUNCTION (Module-1)
# ============================================

def run_module1(topic):
    print("\n===============================")
    print("MODULE 1: Paper Search Started")
    print("===============================\n")

    ss_results = search_semantic_scholar(topic, max_results=10)
    cr_results = search_crossref(topic, max_results=5)

    all_papers = ss_results + cr_results

    print(f"\nüìö Total papers found: {len(all_papers)}")

    # Create folder if not exists
    os.makedirs("data/search_results", exist_ok=True)

    filename = f"data/search_results/search_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(filename, "w", encoding="utf-8") as f:
        json.dump({
            "topic": topic,
            "papers": all_papers
        }, f, indent=4, ensure_ascii=False)

    print("\n‚úÖ Module 1 Completed!")
    print(f"üìÅ Results saved to: {filename}")

    return filename


# --------------------------------------------
# RUN MODULE-1 (Change your topic here)
# --------------------------------------------

topic = "AI system to automatically review and summarize research papers"
run_module1(topic)



MODULE 1: Paper Search Started

üîç Searching Semantic Scholar...
üîç Searching CrossRef...

üìö Total papers found: 16

‚úÖ Module 1 Completed!
üìÅ Results saved to: data/search_results/search_results_20251211_145103.json


'data/search_results/search_results_20251211_145103.json'

In [7]:
# ===============================
# MODULE 2 : PAPER SELECTION & PDF DOWNLOAD
# ===============================

!pip install PyMuPDF requests -q

import json
import os
import requests
import fitz  # PyMuPDF
import hashlib
from datetime import datetime

# -----------------------------------
# 1. LOAD SEARCH RESULTS
# -----------------------------------

def load_search_results(filepath=None):
    """
    Load previously saved search results
    """
    if not filepath:
        print("‚ùå ERROR: You must provide the JSON filepath from Module 1.")
        return None

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f" Loaded {len(data['papers'])} papers on topic: {data['topic']}")
        return data
    except Exception as e:
        print(f" Error loading search results: {e}")
        return None

# -----------------------------------
# 2. PAPER SELECTION
# -----------------------------------

def filter_papers_with_pdfs(papers):
    """
    Filter only papers that have PDF URLs
    """
    papers_with_pdf = [
        p for p in papers
        if p.get("pdf_url") and ("pdf" in p["pdf_url"].lower())
    ]

    print(f"\n PDF Check:")
    print(f" ‚Ä¢ Total papers: {len(papers)}")
    print(f" ‚Ä¢ Papers with PDF URLs: {len(papers_with_pdf)}")

    return papers_with_pdf


def rank_papers(papers):
    """
    Sort papers by citation count & year
    """
    valid = [
        p for p in papers
        if p.get("citationCount") is not None and p.get("year")
    ]

    ranked = sorted(valid, key=lambda x: (x["citationCount"], x["year"]), reverse=True)
    return ranked


def select_top_papers(papers, count=3):
    papers_with_pdf = filter_papers_with_pdfs(papers)
    ranked = rank_papers(papers_with_pdf)
    selected = ranked[:count]

    print(f"\n Top {count} Selected Papers:")
    for i, p in enumerate(selected):
        print(f"\n{i+1}. {p['title']}")
        print(f"   ‚Ü≥ Citations: {p['citationCount']}, Year: {p['year']}")

    return selected

# -----------------------------------
# 3. VERIFY PDF
# -----------------------------------

def verify_pdf(filepath):
    """
    Verify if the downloaded file is a valid PDF
    """
    try:
        if not os.path.exists(filepath):
            return False
        if os.path.getsize(filepath) < 1024:  # less than 1 KB
            return False
        with fitz.open(filepath) as doc:
            if len(doc) > 0:
                return True
        return False
    except:
        return False

# -----------------------------------
# 4. PDF DOWNLOAD
# -----------------------------------

def download_pdf_with_verification(url, filename, max_retries=3):
    """
    Download PDF with verification and retry
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36',
        'Accept': 'application/pdf,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    }

    for attempt in range(max_retries):
        try:
            print(f"  Attempt {attempt + 1}/{max_retries}...")
            response = requests.get(url, headers=headers, timeout=30, stream=True)

            if response.status_code == 403:
                print("    HTTP 403 Forbidden. Retrying...")
                continue
            if response.status_code != 200:
                print(f"    HTTP Error: {response.status_code}")
                continue

            # Save PDF
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

            # Verify PDF
            if verify_pdf(filename):
                size = os.path.getsize(filename)
                print(f"    ‚úÖ Downloaded: {size:,} bytes")
                return True
            else:
                print("    ‚ùå Invalid PDF. Retrying...")
                os.remove(filename)

        except requests.exceptions.Timeout:
            print("    Timeout. Retrying...")
        except Exception as e:
            print(f"    Error: {str(e)[:50]}")

    return False

def download_selected_papers(selected, output_dir="downloads"):
    os.makedirs(output_dir, exist_ok=True)
    downloaded = []

    for i, paper in enumerate(selected):
        print(f"\n[{i+1}/{len(selected)}] Downloading:")
        print(" ", paper['title'])

        safe_title = "".join(c for c in paper['title'] if c.isalnum())[:30]
        filename = f"{output_dir}/{safe_title}.pdf"

        if download_pdf_with_verification(paper["pdf_url"], filename):
            print("   ‚úÖ Success:", filename)
            paper["local_file"] = filename
            downloaded.append(paper)
        else:
            print("   ‚ùå Failed")

    return downloaded

# -----------------------------------
# 5. SAVE REPORT
# -----------------------------------

def save_download_report(downloaded, topic):
    os.makedirs("data/reports", exist_ok=True)

    report = {
        "topic": topic,
        "download_count": len(downloaded),
        "papers": downloaded
    }

    output_file = f"data/reports/download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=4)

    print("\n Report saved to:", output_file)
    return output_file

# -----------------------------------
# MAIN FUNCTION
# -----------------------------------

def main_module2(json_path, top_n=3):
    print("\n========== MODULE 2 STARTED ==========\n")

    data = load_search_results(json_path)
    if not data:
        return

    selected = select_top_papers(data["papers"], count=top_n)
    downloaded = download_selected_papers(selected)
    save_download_report(downloaded, data["topic"])

    print("\n========== MODULE 2 COMPLETED ==========\n")


In [9]:
json_path = "data/search_results/search_results_20251211_145103.json"
main_module2(json_path, top_n=3)




 Loaded 16 papers on topic: AI system to automatically review and summarize research papers

 PDF Check:
 ‚Ä¢ Total papers: 16
 ‚Ä¢ Papers with PDF URLs: 4

 Top 3 Selected Papers:

1. Automatic assessment of text-based responses in post-secondary education: A systematic review
   ‚Ü≥ Citations: 91, Year: 2023

2. Editorial for Special Issue on Large-scale Pre-training: Data, Models, and Fine-tuning
   ‚Ü≥ Citations: 2, Year: 2023

3. Special issue on future hybrid artificial intelligence and machine learning for smart expert systems
   ‚Ü≥ Citations: 0, Year: 2021

[1/3] Downloading:
  Automatic assessment of text-based responses in post-secondary education: A systematic review
  Attempt 1/3...
    ‚úÖ Downloaded: 962,703 bytes
   ‚úÖ Success: downloads/Automaticassessmentoftextbased.pdf

[2/3] Downloading:
  Editorial for Special Issue on Large-scale Pre-training: Data, Models, and Fine-tuning
  Attempt 1/3...
    ‚úÖ Downloaded: 380,064 bytes
   ‚úÖ Success: downloads/Editorialfor