<a href="https://colab.research.google.com/github/springboardmentor3847a-cloud/AI-System-to-Automatically-Review-and-Summarize-Research-Papers-/blob/sravanipemmasani/Milestone_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
# Module-1
!pip install semanticscholar python-dotenv requests -q

In [9]:
import json
import os
from semanticscholar import SemanticScholar
from dotenv import load_dotenv

In [10]:
def setup_api_key():
    """Set up API key either from .env file or directly"""
    # Method 1: Try loading from .env file
    load_dotenv()
    API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    # Method 2: If not in .env, use direct key
    if not API_KEY:
        # Create .env file with your API key
        with open(".env", "w") as f:
            f.write("SEMANTIC_SCHOLAR_API_KEY=83rBkeaXb14D8vGpXJezU6nrCFFmyn5L8RCvT9MM\n")
        load_dotenv()
        API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")

    # Initialize Semantic Scholar
    if API_KEY:
        sch = SemanticScholar(api_key=API_KEY)
        print("Semantic Scholar initialized with API key")
    else:
        sch = SemanticScholar()
        print(" Using Semantic Scholar without API key (limited rate)")

    return sch


In [11]:
def search_papers(topic, limit=20):
    """
    Search Semantic Scholar for papers on a given topic
    Returns: Dictionary with search results
    """
    print(f"\n Searching for papers on: '{topic}'")
    print(f"   Requesting {limit} papers from Semantic Scholar...")

    sch = setup_api_key()

    try:
        # Search for papers
        results = sch.search_paper(
            query=topic,
            limit=limit,
            fields=["paperId", "title", "abstract", "year", "authors",
                   "citationCount", "openAccessPdf", "url", "venue"]
        )

        papers = []
        for paper in results:
            paper_data = {
                "title": paper.title,
                "authors": [author['name'] for author in paper.authors] if paper.authors else [],
                "year": paper.year,
                "paperId": paper.paperId,
                "abstract": paper.abstract[:300] + "..." if paper.abstract else "No abstract available",
                "citationCount": paper.citationCount,
                "venue": paper.venue if hasattr(paper, 'venue') else None,
                "url": paper.url,
                "pdf_url": paper.openAccessPdf['url'] if paper.openAccessPdf else None,
                "has_pdf": bool(paper.openAccessPdf)
            }
            papers.append(paper_data)

        # Calculate statistics
        papers_with_pdf = sum(1 for p in papers if p["has_pdf"])

        print(f"Search complete!")
        print(f"   Total papers found: {len(papers)}")
        print(f"   Papers with PDF available: {papers_with_pdf}")

        return {
            "topic": topic,
            "search_timestamp": "timestamp_placeholder",
            "total_results": len(papers),
            "papers_with_pdf": papers_with_pdf,
            "papers": papers
        }

    except Exception as e:
        print(f" Error searching papers: {e}")
        return None

In [12]:
def save_search_results(data, filename=None):
    """
    Save search results to JSON file
    """
    if not filename:
        # Create filename from topic
        safe_topic = "".join(c for c in data["topic"] if c.isalnum() or c == " ").replace(" ", "_")
        filename = f"paper_search_results_{safe_topic}.json"

    os.makedirs("data/search_results", exist_ok=True)
    filepath = os.path.join("data/search_results", filename)

    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f" Search results saved to: {filepath}")
    return filepath

In [13]:
import pandas as pd

def display_results_table(data):
    """
    Convert search results into a clean, readable table using pandas.
    """
    if not data or "papers" not in data:
        print("No data to display in table")
        return

    table_data = []

    for p in data["papers"]:
        table_data.append({
            "Title": p["title"],
            "Authors": ", ".join(p["authors"][:3]) + ("..." if len(p["authors"]) > 3 else ""),
            "Year": p["year"],
            "Citations": p["citationCount"],
            "PDF": "Yes" if p["has_pdf"] else "No",
            "Venue": p["venue"]
        })

    df = pd.DataFrame(table_data)

    print("\n" + "="*80)
    print("TABLE VIEW OF RESULTS")
    print("="*80)
    display(df)

    return df


In [14]:
def main_search():
    """
    Main function for Module 1: Get topic and search for papers
    """
    print("\n" + "="*80)
    print("MODULE 1: TOPIC INPUT & PAPER SEARCH")
    print("="*80)

    # Get topic from user
    topic = input("\nEnter research topic: ").strip()
    if not topic:
        topic = "machine learning"  # Default topic

    # Search for papers
    results = search_papers(topic, limit=20)

    if results:
        # Save results
        save_path = save_search_results(results)

        # Display results
        display_results_table(results)

        print(f"\n Module 1 complete! Results saved to: {save_path}")
        print("   Proceed to Module 2 for paper selection and PDF download.")

        return results, save_path
    else:
        print(" No results found. Please try a different topic.")
        return None, None

# Run Module 1 directly if needed
if __name__ == "__main__":
    main_search()


MODULE 1: TOPIC INPUT & PAPER SEARCH

Enter research topic:  Alzheimer Detection and Classification Using SVM

 Searching for papers on: 'Alzheimer Detection and Classification Using SVM'
   Requesting 20 papers from Semantic Scholar...
Semantic Scholar initialized with API key
Search complete!
   Total papers found: 1000
   Papers with PDF available: 1000
 Search results saved to: data/search_results/paper_search_results_Alzheimer_Detection_and_Classification_Using_SVM.json

TABLE VIEW OF RESULTS


Unnamed: 0,Title,Authors,Year,Citations,PDF,Venue
0,Alzheimer Detection and Classification Using S...,"Sanchit Vashisht, Bhanu Sharma",2024.0,1,Yes,2024 IEEE International Conference on Informat...
1,Alzheimer Disease Detection of 3D-CNN with SE-...,Et. al R. Hemalatha,2023.0,1,Yes,International Journal on Recent and Innovation...
2,Application of Convolutional Neural Networks f...,"Kumar Swarnkar, Dr.Rajkumar Jhapte, Dr. Abhish...",2024.0,2,Yes,Journal of Electrical Systems
3,Using SVM for Alzheimer’s Disease detection fr...,"R. Kumari, Shivani Goel, Subhranil Das",2022.0,4,Yes,2022 IEEE 21st Mediterranean Electrotechnical ...
4,Speech-based detection of multi-class Alzheime...,"Tripti Tripathi, Rakesh Kumar",2023.0,24,Yes,International Journal of Data Science and Anal...
...,...,...,...,...,...,...
995,Functional and operatorial statistics,"S. Dabo‐Niang, F. Ferraty",2008.0,39,Yes,
996,Multivariate profiling of neurodegeneration-as...,"S. K. Kumarasamy, Yunshi Wang, Vignesh Viswana...",2008.0,3,Yes,BioData Mining
997,"Evolutionary Multi-Criterion Optimization, 5th...",,2009.0,6,Yes,International Conference on Evolutionary Multi...
998,Artificial Neural Networks: Biological Inspira...,"Wlodzislaw Duch, J. Kacprzyk, E. Oja...",2005.0,70,Yes,International Conference on Artificial Neural ...



 Module 1 complete! Results saved to: data/search_results/paper_search_results_Alzheimer_Detection_and_Classification_Using_SVM.json
   Proceed to Module 2 for paper selection and PDF download.


In [15]:
# MODULE 2: Paper Selection & PDF Download

!pip install PyMuPDF requests -q

import json
import os
import requests
import fitz  # PyMuPDF
import hashlib
from datetime import datetime




In [16]:
# 1. Load Research papers
def load_search_results(filepath=None):

    if not filepath:
        results_dir = "data/search_results"
        if os.path.exists(results_dir):
            json_files = [f for f in os.listdir(results_dir) if f.endswith('.json')]
            if json_files:
                json_files.sort(key=lambda x: os.path.getmtime(os.path.join(results_dir, x)), reverse=True)
                filepath = os.path.join(results_dir, json_files[0])
                print(f" Loading most recent search results: {json_files[0]}")
            else:
                print(" No search results found. Run Module 1 first.")
                return None
        else:
            print(" Search results directory not found. Run Module 1 first.")
            return None

    try:
        with open(filepath, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f" Loaded {len(data['papers'])} papers on '{data['topic']}'")
        return data
    except Exception as e:
        print(f" Error loading file: {e}")
        return None


In [17]:
# 2. PAPER SELECTION
def filter_papers_with_pdfs(papers):
    papers_with_pdf = []
    for paper in papers:
        if paper.get("pdf_url") and paper["pdf_url"].strip():
            url = paper["pdf_url"].lower()
            if url.endswith('.pdf') or '.pdf?' in url or 'pdf' in url:
                papers_with_pdf.append(paper)

    print(f"\n PDF Availability:")
    print(f"  • Total papers: {len(papers)}")
    print(f"  • Papers with PDF URLs: {len(papers_with_pdf)}")

    return papers_with_pdf

def rank_papers(papers):
    valid_papers = []
    for paper in papers:
        if paper.get("year") and paper.get("citationCount") is not None:
            valid_papers.append(paper)
    ranked = sorted(valid_papers,
                   key=lambda x: (x["citationCount"], x["year"]),
                   reverse=True)

    return ranked

def select_top_papers(papers, count=3):
    papers_with_pdf = filter_papers_with_pdfs(papers)
    ranked_papers = rank_papers(papers_with_pdf)
    selected = ranked_papers[:count]
    print(f"\n Selected top {len(selected)} papers for download:")
    for i, paper in enumerate(selected):
        print(f"\n{i+1}. {paper['title'][:70]}...")
        print(f"   Citations: {paper['citationCount']}")
        print(f"   Year: {paper['year']}")
        print(f"   Authors: {', '.join(paper['authors'][:2])}")

    return selected


In [18]:
# 3. PDF DOWNLOAD
def download_pdf_with_verification(url, filename, max_retries=2):
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

        for attempt in range(max_retries):
            try:
                print(f"  Attempt {attempt + 1}/{max_retries}...")
                response = requests.get(url, headers=headers, timeout=30)

                if response.status_code != 200:
                    print(f"    HTTP Error: {response.status_code}")
                    continue

                # Check if it's a PDF
                if not (response.content[:4] == b'%PDF' or
                       'pdf' in response.headers.get('content-type', '').lower()):
                    print(f"    Not a PDF file")
                    continue

                # Save file
                with open(filename, 'wb') as f:
                    f.write(response.content)

                # Verify PDF
                if verify_pdf(filename):
                    size = os.path.getsize(filename)
                    print(f"    Downloaded: {size:,} bytes")
                    return True
                else:
                    print(f"     Invalid PDF")
                    os.remove(filename)
                    continue

            except requests.exceptions.Timeout:
                print(f"    Timeout")
            except Exception as e:
                print(f"    Error: {str(e)[:50]}")

        return False

    except Exception as e:
        print(f"   Download failed: {str(e)[:50]}")
        return False

def verify_pdf(filepath):
    try:
        if not os.path.exists(filepath):
            return False
            if os.path.getsize(filepath) < 1024:  # Less than 1KB
              return False
        with fitz.open(filepath) as doc:
            if len(doc) > 0:
                return True
        return False
    except:
        return False

def get_pdf_info(filepath):
    try:
        with fitz.open(filepath) as doc:
            return {
                'pages': len(doc),
                'size_bytes': os.path.getsize(filepath),
                'size_mb': round(os.path.getsize(filepath) / (1024 * 1024), 2),
                'is_valid': True
            }
    except:
        return {'is_valid': False}

def download_selected_papers(selected_papers, output_dir="downloads"):
    os.makedirs(output_dir, exist_ok=True)

    print(f"\n Starting PDF downloads to: {output_dir}/")
    print("-"*60)

    downloaded_papers = []

    for i, paper in enumerate(selected_papers):
        print(f"\n[{i+1}/{len(selected_papers)}] Downloading: {paper['title'][:60]}...")

        # Create safe filename
        safe_title = "".join(c for c in paper['title'] if c.isalnum() or c in (' ', '-', '_')).rstrip()
        if len(safe_title) > 50:
            safe_title = safe_title[:50]

        filename = f"{output_dir}/paper_{i+1}_{hashlib.md5(safe_title.encode()).hexdigest()[:8]}.pdf"

        # Download
        success = download_pdf_with_verification(paper['pdf_url'], filename)

        if success:
            # Get PDF info
            pdf_info = get_pdf_info(filename)

            # Update paper info
            paper['downloaded'] = True
            paper['local_path'] = filename
            paper['download_time'] = datetime.now().isoformat()
            paper['pdf_info'] = pdf_info

            downloaded_papers.append(paper)
            print(f"    Success! {pdf_info['pages']} pages, {pdf_info['size_mb']} MB")
        else:
            paper['downloaded'] = False
            print(f"   Failed to download")

    return downloaded_papers

In [19]:
# 4. SAVE DOWNLOAD INFO
def save_download_report(downloaded_papers, topic, output_dir="downloads"):
    report = {
        'topic': topic,
        'download_timestamp': datetime.now().isoformat(),
        'total_selected': len(downloaded_papers),
        'successful_downloads': sum(1 for p in downloaded_papers if p.get('downloaded', False)),
        'failed_downloads': sum(1 for p in downloaded_papers if not p.get('downloaded', False)),
        'papers': downloaded_papers
    }

    os.makedirs("data/reports", exist_ok=True)
    report_file = f"data/reports/download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(report_file, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=4, ensure_ascii=False)

    print(f"\n Download report saved to: {report_file}")
    download_list = []
    for paper in downloaded_papers:
        if paper.get('downloaded'):
            download_list.append({
                'title': paper['title'],
                'local_file': paper['local_path'],
                'size_mb': paper['pdf_info']['size_mb'],
                'pages': paper['pdf_info']['pages']
            })

    list_file = f"{output_dir}/downloaded_papers_list.json"
    with open(list_file, 'w', encoding='utf-8') as f:
        json.dump(download_list, f, indent=4, ensure_ascii=False)

    return report_file

In [20]:
# 5. VERIFICATION
def verify_downloads(output_dir="downloads"):
    print("\n" + "="*60)
    print(" VERIFICATION OF DOWNLOADS")
    print("="*60)

    if not os.path.exists(output_dir):
        print(f" Directory '{output_dir}' does not exist!")
        return 0

    pdf_files = [f for f in os.listdir(output_dir) if f.endswith('.pdf')]

    print(f"\n Directory: {os.path.abspath(output_dir)}")
    print(f" PDF files found: {len(pdf_files)}")

    if pdf_files:
        print("\nFile Details:")
        print("-"*60)

        total_size = 0
        valid_files = 0

        for pdf in pdf_files:
            filepath = os.path.join(output_dir, pdf)
            size = os.path.getsize(filepath)
            total_size += size

            # Verify PDF
            if verify_pdf(filepath):
                valid_files += 1
                with fitz.open(filepath) as doc:
                    pages = len(doc)
                print(f" {pdf}")
                print(f"   Size: {size:,} bytes ({size/1024/1024:.2f} MB)")
                print(f"   Pages: {pages}")
            else:
                print(f" {pdf} - INVALID PDF")
                print(f"   Size: {size:,} bytes")

    print(f"\n Summary:")
    print(f"  • Total PDF files: {len(pdf_files)}")
    print(f"  • Valid PDFs: {valid_files}")
    print(f"  • Total size: {total_size/1024/1024:.2f} MB")

    return valid_files


In [21]:
# 6. MAIN DOWNLOAD FUNCTION

def main_download(filepath=None, download_count=3):
    print("\n" + "="*80)
    print("MODULE 2: PAPER SELECTION & PDF DOWNLOAD")
    print("="*80)
    data = load_search_results(filepath)
    if not data:
        return None
    selected_papers = select_top_papers(data["papers"], count=download_count)

    if not selected_papers:
        print(" No papers with PDFs available for download.")
        return None
    downloaded = download_selected_papers(selected_papers)
    report_file = save_download_report(downloaded, data["topic"])
    verify_downloads()

    print(f"\n Module 2 complete!")
    print(f"   Downloaded papers are in: downloads/")
    print(f"   Report saved to: {report_file}")
    print(f"\n Milestone 1 complated!")

    return downloaded
if __name__ == "__main__":
    main_download(download_count=3)


MODULE 2: PAPER SELECTION & PDF DOWNLOAD
 Loading most recent search results: paper_search_results_Alzheimer_Detection_and_Classification_Using_SVM.json
 Loaded 1000 papers on 'Alzheimer Detection and Classification Using SVM'

 PDF Availability:
  • Total papers: 1000
  • Papers with PDF URLs: 164

 Selected top 3 papers for download:

1. CNN Features Off-the-Shelf: An Astounding Baseline for Recognition...
   Citations: 5048
   Year: 2014
   Authors: A. Razavian, Hossein Azizpour

2. Automatic classification of MR scans in Alzheimer's disease....
   Citations: 1203
   Year: 2008
   Authors: S. Klöppel, C. Stonnington

3. Bearing Health Monitoring Based on Hilbert–Huang Transform, Support Ve...
   Citations: 584
   Year: 2015
   Authors: A. Soualhi, K. Medjaher

 Starting PDF downloads to: downloads/
------------------------------------------------------------

[1/3] Downloading: CNN Features Off-the-Shelf: An Astounding Baseline for Recog...
  Attempt 1/2...
    Downloaded: 405,617 