In [1]:
!pip install semanticscholar langchain langgraph openai PyMuPDF4LLM python-dotenv gradio

Collecting semanticscholar
  Downloading semanticscholar-0.11.0-py3-none-any.whl.metadata (3.8 kB)
Collecting PyMuPDF4LLM
  Downloading pymupdf4llm-0.2.7-py3-none-any.whl.metadata (7.5 kB)
Collecting pymupdf>=1.26.6 (from PyMuPDF4LLM)
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading semanticscholar-0.11.0-py3-none-any.whl (26 kB)
Downloading pymupdf4llm-0.2.7-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.9/66.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m48.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf, PyMuPDF4LLM, semanticscholar
Successfully installed PyMuPDF4LLM-0.2.7 pymupdf-1.26.7 semanticscholar-0.11.0


In [4]:
import os
from getpass import getpass

# SECURITY: never hardcode an API key in a notebook. Any key that was
# previously committed here must be treated as leaked and revoked.
# The key is taken from the process environment when available, otherwise it
# is requested with a non-echoing prompt so it never appears in cell output.
ENV_PATH = ".env"

# Only create the file when it is missing, so existing settings are not clobbered.
if not os.path.exists(ENV_PATH):
    api_key = os.environ.get("SEMANTIC_SCHOLAR_API_KEY") or getpass(
        "Semantic Scholar API key (leave blank to skip): "
    ).strip()
    if api_key:
        with open(ENV_PATH, "w", encoding="utf-8") as f:
            f.write(f"SEMANTIC_SCHOLAR_API_KEY={api_key}\n")

In [5]:
import itertools
import json
import logging
import os
import re
from datetime import datetime
from getpass import getpass

import requests
from dotenv import load_dotenv
from semanticscholar import SemanticScholar

# Configure logging: timestamped messages at INFO level and above.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'
)
logger = logging.getLogger(__name__)

# Load environment variables (including any defined in a local .env file).
load_dotenv()

# API key handling: prefer the environment; otherwise prompt WITHOUT echo so
# the key is not written into the saved notebook output.
API_KEY = os.getenv("SEMANTIC_SCHOLAR_API_KEY")
if not API_KEY:
    print("API Key not found in environment.")
    API_KEY = getpass("Please enter your Semantic Scholar API Key: ").strip()

# Initialize the shared client used by ResearchAssistant.
# A blank answer is normalised to None (the client's "no key" default) rather
# than being sent as an empty key.
sch = SemanticScholar(api_key=API_KEY or None)

class ResearchAssistant:
    """Find, rank, and download open-access papers for a research topic.

    Files are written beneath ``base_dir`` in one sub-directory per topic:
    ``<base_dir>/<topic>/pdfs/*.pdf`` plus ``<base_dir>/<topic>/dataset_metadata.json``.
    Relies on the module-level ``sch`` client and ``logger``.
    """

    def __init__(self, base_dir="research_workspace"):
        """Remember the workspace root and create it if it does not exist."""
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)

    def sanitize_filename(self, text):
        """Return a filesystem-safe name derived from ``text``.

        Removes characters that are illegal in common filesystems as well as
        control characters (e.g. newlines inside titles), trims surrounding
        whitespace and limits the result to 100 characters.

        Args:
            text: Arbitrary string (``None`` is treated as empty).

        Returns:
            A non-empty string of at most 100 characters; ``"untitled"`` when
            nothing usable remains.
        """
        cleaned = re.sub(r'[\\/*?:"<>|\x00-\x1f]', "", text or "")
        # rstrip again: truncation can leave a trailing space behind.
        cleaned = cleaned.strip()[:100].rstrip()
        return cleaned or "untitled"

    def get_search_parameters(self):
        """Interactively ask for the research topic and optional filters.

        Returns:
            dict with keys ``topic`` (str, may be empty), ``min_year`` (int,
            2015 when skipped or invalid) and ``min_citations`` (int, 0 when
            skipped or invalid).
        """
        print("\n--- Research Configuration ---")
        topic = input("Enter research topic: ").strip()

        # Optional filters
        print("Optional Filters (press Enter to skip):")
        min_year = input("  Minimum Publication Year (e.g., 2020): ").strip()
        min_citations = input("  Minimum Citations (e.g., 10): ").strip()

        # isdecimal() rather than isdigit(): isdigit() also accepts characters
        # such as superscript digits, which int() cannot parse.
        return {
            "topic": topic,
            "min_year": int(min_year) if min_year.isdecimal() else 2015,
            "min_citations": int(min_citations) if min_citations.isdecimal() else 0
        }

    def search_and_rank(self, params, fetch_limit=50, selection_limit=3):
        """Search papers, filter them, and return the best-scoring candidates.

        Corresponds to the 'Automated paper search' and 'Smart paper
        selection' tasks.

        Args:
            params: dict with ``topic``, ``min_year`` and ``min_citations``.
            fetch_limit: Maximum number of search hits to examine.
            selection_limit: Number of top-ranked papers to return.

        Returns:
            List of dicts (``paperId``, ``title``, ``authors``, ``year``,
            ``citations``, ``venue``, ``url``, ``pdf_url``, ``score``) sorted
            by score, highest first. Empty when the search fails or nothing
            passes the filters.
        """
        logger.info(f"Searching for papers on: {params['topic']}")

        try:
            # Fetching extra fields for better ranking
            results = sch.search_paper(
                params['topic'],
                limit=fetch_limit,
                fields=['title', 'authors', 'year', 'citationCount', 'openAccessPdf', 'url', 'abstract', 'venue']
            )
            # Iterating the paginated result requests further pages on demand,
            # so cap the iteration at fetch_limit and keep it inside the try
            # block: those follow-up requests can fail as well.
            papers = list(itertools.islice(results, fetch_limit))
        except Exception as e:
            logger.error(f"Search API failed: {str(e)}")
            return []

        # Loop-invariant: papers published after this year earn recency points.
        recency_cutoff = datetime.now().year - 5

        candidates = []
        for paper in papers:
            # 1. Filter: Check basic data integrity
            if not paper.title or not paper.year:
                continue

            # 2. Filter: User constraints
            citations = paper.citationCount or 0
            if paper.year < params['min_year']:
                continue
            if citations < params['min_citations']:
                continue

            # 3. Score: Weighted Ranking Algorithm
            # Recency Score: 2 points per year newer than the 5-year cutoff
            recency_score = max(0, paper.year - recency_cutoff) * 2

            # Impact Score: linear in citations (1 point per 10), capped at 20
            impact_score = min(citations / 10, 20)

            # Accessibility Score: Huge bonus if we can actually download the PDF.
            # The open-access record may be missing or may lack a 'url' entry.
            open_access = paper.openAccessPdf
            pdf_url = (open_access.get('url') if open_access else None) or None
            access_score = 50 if pdf_url else 0

            total_score = recency_score + impact_score + access_score

            candidates.append({
                "paperId": paper.paperId,
                "title": paper.title,
                "authors": [a['name'] for a in paper.authors] if paper.authors else [],
                "year": paper.year,
                "citations": citations,
                "venue": paper.venue,
                "url": paper.url,
                "pdf_url": pdf_url,
                "score": total_score
            })

        # Sort by calculated score descending
        candidates.sort(key=lambda x: x['score'], reverse=True)

        selected = candidates[:selection_limit]
        logger.info(f"Screened {len(candidates)} papers. Selected top {len(selected)}.")
        return selected

    def download_pdfs(self, papers, topic):
        """Download the PDFs of the selected papers and write a metadata file.

        Corresponds to 'Automatic PDF retrieval'. Each paper dict is updated
        in place with ``local_path`` (path or ``None``) and
        ``download_status`` (``"Success"``, ``"Failed"``, ``"Error"`` or
        ``"Skipped"``), so every record in the metadata file has the same keys.

        Args:
            papers: List of paper dicts as produced by ``search_and_rank``.
            topic: Research topic; used to name the topic directory.

        Returns:
            The subset of ``papers`` whose PDF was downloaded successfully.
        """
        # Create organized structure: workspace/topic/pdfs/
        topic_slug = self.sanitize_filename(topic.replace(" ", "_"))
        topic_path = os.path.join(self.base_dir, topic_slug)
        pdf_dir = os.path.join(topic_path, "pdfs")
        os.makedirs(pdf_dir, exist_ok=True)

        logger.info("Starting PDF downloads...")

        successful_downloads = []

        for paper in papers:
            # Default for every outcome other than a successful download.
            paper['local_path'] = None

            if not paper['pdf_url']:
                logger.warning(f"Skipping download (No Open Access URL): {paper['title']}")
                paper['download_status'] = "Skipped"
                continue

            # Format: Year_FirstAuthorSurname_TitleTruncated.pdf
            # Guard against an empty/None author name and sanitise the surname,
            # since it becomes part of the path just like the title.
            name_parts = (paper['authors'][0] or "").split() if paper['authors'] else []
            first_author = self.sanitize_filename(name_parts[-1]) if name_parts else "Unknown"
            safe_title = self.sanitize_filename(paper['title'])
            filename = f"{paper['year']}_{first_author}_{safe_title}.pdf"
            save_path = os.path.join(pdf_dir, filename)

            try:
                response = requests.get(paper['pdf_url'], timeout=30)
                # Some "PDF" links return an HTML landing page; check the magic bytes.
                if response.status_code == 200 and b"%PDF" in response.content[:20]:
                    with open(save_path, "wb") as f:
                        f.write(response.content)

                    paper['local_path'] = save_path
                    paper['download_status'] = "Success"
                    successful_downloads.append(paper)
                    logger.info(f"Downloaded: {filename}")
                else:
                    logger.warning(f"Invalid PDF content: {paper['title']}")
                    paper['download_status'] = "Failed"
            except Exception as e:
                # Best effort: one failing download must not abort the batch.
                logger.error(f"Download error for {paper['title']}: {e}")
                paper['download_status'] = "Error"

        # Save Metadata
        metadata_path = os.path.join(topic_path, "dataset_metadata.json")
        with open(metadata_path, "w", encoding="utf-8") as f:
            json.dump(papers, f, indent=4)

        logger.info(f"Process Complete. Metadata saved to: {metadata_path}")
        return successful_downloads

# --- Main Execution Flow ---
# Prompt for a topic, rank the matching papers, then download what is available.
if __name__ == "__main__":
    assistant = ResearchAssistant()

    # Step 1: collect the topic and optional filters from the user
    params = assistant.get_search_parameters()

    if not params['topic']:
        logger.error("Topic is required.")
    else:
        # Step 2: query the API and keep the best-scoring candidates
        top_papers = assistant.search_and_rank(params)

        if not top_papers:
            logger.warning("No suitable papers found matching criteria.")
        else:
            print(f"\nTop {len(top_papers)} Papers Selected:")
            for i, p in enumerate(top_papers, 1):
                summary = f"{i}. [{p['year']}] {p['title']} (Citations: {p['citations']})"
                print(summary)

            # Step 3: fetch the PDFs and persist the metadata
            assistant.download_pdfs(top_papers, params['topic'])


--- Research Configuration ---
Enter research topic: machine learning
Optional Filters (press Enter to skip):
  Minimum Publication Year (e.g., 2020): 2020
  Minimum Citations (e.g., 10): 5

Top 3 Papers Selected:
1. [2024] Leveraging large language models for predictive chemistry (Citations: 289)
2. [2023] Understanding of Machine Learning with Deep Learning: Architectures, Workflow, Applications and Future Directions (Citations: 718)
3. [2023] Small data machine learning in materials science (Citations: 457)


