In [2]:

!pip install semanticscholar python-dotenv PyMuPDF requests tqdm -q


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/24.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/24.1 MB[0m [31m37.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/24.1 MB[0m [31m124.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━[0m [32m17.2/24.1 MB[0m [31m225.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━[0m [32m23.1/24.1 MB[0m [31m243.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m137.2 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m24.1/24.1 MB[0m [31m137.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [

In [3]:
import os
import sys
import json
import requests
import pandas as pd
import fitz  # PyMuPDF
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.notebook import tqdm
from semanticscholar import SemanticScholar
from google.colab import files


class Config:
    BASE_DIR = "/content/research_workspace"
    DATA_DIR = os.path.join(BASE_DIR, "metadata")
    PDF_DIR = os.path.join(BASE_DIR, "pdfs")
    DATASET_PATH = os.path.join(BASE_DIR, "research_dataset.csv")

    API_KEY = "esyhXUy8KZc2T3lZmJqlDDmda5pBVa4jheFA4Dg0"
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'

    @staticmethod
    def setup_directories():
        if os.path.exists(Config.BASE_DIR):
            import shutil
            pass
        os.makedirs(Config.DATA_DIR, exist_ok=True)
        os.makedirs(Config.PDF_DIR, exist_ok=True)
        print(f" Workspace ready at: {Config.BASE_DIR}")


class ResearchAgent:
    def __init__(self):
        Config.setup_directories()
        self.sch = SemanticScholar(api_key=Config.API_KEY)
        self.current_results = None
        self.downloaded_papers = []

    def search_topic(self, topic, limit=20):
        print(f"\n Searching for: '{topic}'...")
        try:
            results = self.sch.search_paper(
                query=topic,
                limit=limit,
                fields=["paperId", "title", "abstract", "year", "authors",
                        "citationCount", "openAccessPdf", "url"]
            )

            papers = []
            for item in results:
                pdf_info = getattr(item, 'openAccessPdf', None)
                pdf_url = pdf_info['url'] if pdf_info else None

                papers.append({
                    "title": item.title,
                    "year": item.year or 0,
                    "authors": [a.name for a in item.authors] if item.authors else [],
                    "citationCount": item.citationCount or 0,
                    "abstract": item.abstract,
                    "pdf_url": pdf_url,
                    "has_pdf": bool(pdf_url)
                })

            self.current_results = papers
            self._display_summary()
            return self.current_results
        except Exception as e:
            print(f" Search failed: {str(e)}")
            return None

    def _display_summary(self):
        if not self.current_results: return
        pdf_count = sum(1 for p in self.current_results if p['has_pdf'])
        print(f"\n FOUND: {len(self.current_results)} Papers | {pdf_count} PDFs Available")

    # --- 2. DOWNLOAD ---
    def process_and_download(self, top_n=3):
        if not self.current_results: return []

        available_pdfs = [p for p in self.current_results if p['has_pdf']]
        if not available_pdfs:
            print(" No PDFs found.")
            return []

        ranked = sorted(available_pdfs, key=lambda x: (x['citationCount'], x['year']), reverse=True)[:top_n]
        print(f"\n Downloading Top {len(ranked)} Papers...")

        self.downloaded_papers = []

        with ThreadPoolExecutor(max_workers=4) as executor:
            futures = {executor.submit(self._download_worker, p): p for p in ranked}

            for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
                result = future.result()
                if result:
                    self.downloaded_papers.append(result)

        return self.downloaded_papers

    def _download_worker(self, paper):
        safe_title = "".join(c for c in paper["title"] if c.isalnum() or c in " _-")[:50]
        filename = os.path.join(Config.PDF_DIR, f"{safe_title}.pdf")

        try:
            if not os.path.exists(filename):
                response = requests.get(paper['pdf_url'], headers={'User-Agent': Config.USER_AGENT}, timeout=15)
                if response.status_code == 200:
                    with open(filename, "wb") as f:
                        f.write(response.content)

            # Return combined metadata + local path
            paper['local_path'] = filename
            return paper
        except:
            return None

    def create_dataset(self):
        """Extracts text from downloaded PDFs and builds a DataFrame."""
        if not self.downloaded_papers:
            print(" No papers downloaded to process.")
            return None

        print(f"\n Extracting text modules from {len(self.downloaded_papers)} papers...")
        dataset = []

        for paper in tqdm(self.downloaded_papers, desc="Parsing PDFs"):
            path = paper.get('local_path')
            extracted_text = ""

            try:
                with fitz.open(path) as doc:
                    for page in doc:
                        extracted_text += page.get_text()
            except Exception as e:
                extracted_text = f"Error reading PDF: {str(e)}"

            dataset.append({
                "title": paper['title'],
                "year": paper['year'],
                "authors": ", ".join(paper['authors']),
                "citations": paper['citationCount'],
                "abstract": paper['abstract'],
                "full_text": extracted_text,
                "filename": os.path.basename(path)
            })

        df = pd.DataFrame(dataset)
        df.to_csv(Config.DATASET_PATH, index=False)
        print(f" Dataset built! Shape: {df.shape}")
        return Config.DATASET_PATH


agent = ResearchAgent()

topic = input("Enter research topic: ") or "Reinforcement Learning"
agent.search_topic(topic)

dl_choice = input("\nDownload papers and build dataset? (y/n): ")
if dl_choice.lower() == 'y':
    count = int(input("How many papers? (default 3): ") or 3)

    downloaded = agent.process_and_download(top_n=count)

    if downloaded:
        dataset_path = agent.create_dataset()

        print("\n Packaging files...")
        !zip -r /content/research_bundle.zip /content/research_workspace

        print("\n Downloading bundle (PDFs + Dataset CSV)...")
        files.download("/content/research_bundle.zip")
    else:
        print(" Download failed or no PDFs found.")
else:
    print("Skipped.")

 Workspace ready at: /content/research_workspace
Enter research topic: wild fire prediction

 Searching for: 'wild fire prediction'...

 FOUND: 1000 Papers | 459 PDFs Available

Download papers and build dataset? (y/n): y
How many papers? (default 3): 5

 Downloading Top 5 Papers...


Downloading:   0%|          | 0/5 [00:00<?, ?it/s]


 Extracting text modules from 5 papers...


Parsing PDFs:   0%|          | 0/5 [00:00<?, ?it/s]

 Dataset built! Shape: (5, 7)

 Packaging files...
  adding: content/research_workspace/ (stored 0%)
  adding: content/research_workspace/pdfs/ (stored 0%)
  adding: content/research_workspace/pdfs/A human-driven decline in global burned area.pdf (deflated 7%)
  adding: content/research_workspace/pdfs/Deep Learning Face Attributes in the Wild.pdf (deflated 12%)
  adding: content/research_workspace/research_dataset.csv (deflated 65%)
  adding: content/research_workspace/metadata/ (stored 0%)

 Downloading bundle (PDFs + Dataset CSV)...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>