In [6]:
!pip install semanticscholar python-dotenv PyMuPDF requests tqdm pandas -q

import os
import re
import json
import logging
import requests
import pandas as pd
import fitz  # PyMuPDF

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from semanticscholar import SemanticScholar
from collections import defaultdict
from google.colab import files


class Config:
    """Static configuration for the paper-mining workspace.

    All values are class attributes, so the class is used directly
    (``Config.PDF_DIR``) and never instantiated.
    """

    # Workspace layout: everything lives under BASE_DIR.
    BASE_DIR = "/content/research_workspace"
    PDF_DIR = os.path.join(BASE_DIR, "pdfs")
    OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
    DATASET_PATH = os.path.join(BASE_DIR, "dataset.csv")

    # Semantic Scholar API key. The SEMANTIC_SCHOLAR_API_KEY environment
    # variable takes precedence so the real secret does not have to live in
    # the notebook; the literal below is only the fallback and was marked by
    # the original author as still needing to be completed.
    API_KEY = os.environ.get(
        "SEMANTIC_SCHOLAR_API_KEY",
        "esyhXUy8KZc2T3lZmJqlDDmda5pBVa4jhe_api_full_fill",
    )
    # User-Agent header sent with PDF download requests.
    USER_AGENT = "Mozilla/5.0"

    @staticmethod
    def setup():
        """Create the workspace directories and configure root logging.

        Safe to call repeatedly: directories are created with
        ``exist_ok=True`` and logging is reconfigured each time.
        """
        os.makedirs(Config.PDF_DIR, exist_ok=True)
        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
        # basicConfig() is a no-op when the root logger already has handlers
        # (commonly the case inside notebook runtimes), which would leave the
        # level at WARNING and hide every logging.info() call in this file.
        # force=True replaces the existing handlers so INFO is really emitted.
        logging.basicConfig(
            level=logging.INFO,
            format="%(levelname)s: %(message)s",
            force=True,
        )


class PaperSearcher:
    """Searches Semantic Scholar and flattens results into plain dicts."""

    def __init__(self):
        """Create the Semantic Scholar client using ``Config.API_KEY``."""
        self.sch = SemanticScholar(api_key=Config.API_KEY)

    def search(self, topic, limit=10):
        """Return at most *limit* papers matching *topic*.

        Args:
            topic: Free-text search query.
            limit: Maximum number of papers to return; also passed to the
                client as the request limit.

        Returns:
            A list of dicts with the keys ``title``, ``year``, ``authors``
            (comma-separated names), ``citations``, ``abstract`` and
            ``pdf_url`` (``None`` when no open-access PDF is listed).
        """
        from itertools import islice

        logging.info(f"Searching papers for topic: {topic}")

        results = self.sch.search_paper(
            query=topic,
            limit=limit,
            fields=["title", "year", "authors", "citationCount", "abstract", "openAccessPdf"]
        )

        papers = []
        # The client returns a paginated result object; iterating it directly
        # keeps requesting further pages, so `limit` alone does not bound the
        # number of papers seen. islice() stops after `limit` items.
        for r in islice(results, limit):
            pdf = getattr(r, "openAccessPdf", None)
            # Some records come back without an author list.
            authors = getattr(r, "authors", None) or []
            papers.append({
                "title": r.title,
                "year": r.year or 0,
                "authors": ", ".join(a.name for a in authors if a.name),
                "citations": r.citationCount or 0,
                "abstract": r.abstract,
                # .get() tolerates an openAccessPdf entry without a "url" key.
                "pdf_url": pdf.get("url") if pdf else None
            })

        return papers


class PDFDownloader:
    """Downloads the open-access PDFs of the most-cited papers."""

    @staticmethod
    def _unique_path(title, taken):
        """Build a filesystem-safe PDF path for *title*, unique within *taken*.

        The stem is the sanitised title cut to 50 characters; a numeric
        suffix is appended only when that stem was already handed out, so
        two long titles with the same prefix do not overwrite each other.
        """
        stem = re.sub(r"[^\w\-]", "_", title or "untitled")[:50]
        candidate = stem
        n = 1
        while candidate in taken:
            n += 1
            candidate = f"{stem}_{n}"
        taken.add(candidate)
        return os.path.join(Config.PDF_DIR, f"{candidate}.pdf")

    @staticmethod
    def _fetch(paper, path):
        """Download one paper to *path*; return the paper dict or ``None``.

        On success the dict gains a ``pdf_path`` key. Every failure is
        logged and reported as ``None`` so one bad URL never aborts the
        whole batch.
        """
        try:
            r = requests.get(
                paper["pdf_url"],
                headers={"User-Agent": Config.USER_AGENT},
                timeout=20
            )
            if r.status_code != 200:
                logging.warning(f"Download skipped (HTTP {r.status_code}): {paper['pdf_url']}")
                return None
            # Open-access links sometimes serve an HTML landing page instead
            # of the document; a PDF carries the "%PDF" marker near the start.
            if b"%PDF" not in r.content[:1024]:
                logging.warning(f"Download skipped (not a PDF): {paper['pdf_url']}")
                return None
            with open(path, "wb") as f:
                f.write(r.content)
        except Exception as e:
            # Worker boundary: downloading is best-effort, so log and move on.
            logging.error(f"Download failed: {e}")
            return None
        paper["pdf_path"] = path
        return paper

    @staticmethod
    def download(papers, top_n=3):
        """Download PDFs for the *top_n* most-cited papers that have a URL.

        Args:
            papers: Iterable of paper dicts; ``pdf_url``, ``citations`` and
                ``title`` are read, and missing or empty values are tolerated.
            top_n: How many of the most-cited candidates to attempt.

        Returns:
            The successfully downloaded paper dicts (the same objects that
            were passed in, now carrying ``pdf_path``), ordered by citation
            count, highest first.
        """
        candidates = [p for p in papers if p.get("pdf_url")]
        candidates.sort(key=lambda p: p.get("citations") or 0, reverse=True)
        candidates = candidates[:top_n]
        if not candidates:
            return []

        # Paths are assigned up front, in one thread, so uniqueness checks
        # do not race between workers.
        taken = set()
        jobs = [(p, PDFDownloader._unique_path(p.get("title"), taken)) for p in candidates]

        with ThreadPoolExecutor(max_workers=4) as ex:
            futures = [ex.submit(PDFDownloader._fetch, p, path) for p, path in jobs]
            # as_completed() only drives the progress bar; results are read
            # from `futures` afterwards to keep the citation ranking order.
            for _ in tqdm(as_completed(futures), total=len(futures), desc="Downloading PDFs"):
                pass

        return [res for res in (f.result() for f in futures) if res]


# --------- PDF TEXT EXTRACTION ----------
class PDFTextExtractor:
    """Reads the text layer of a PDF file with PyMuPDF."""

    @staticmethod
    def extract(pdf_path):
        """Return the concatenated text of every page in *pdf_path*.

        Any error while opening or reading is logged rather than raised;
        whatever text was gathered before the error (possibly none) is
        still returned.
        """
        page_texts = []
        try:
            with fitz.open(pdf_path) as document:
                for page in document:
                    page_texts.append(page.get_text())
        except Exception as err:
            logging.error(f"PDF read error: {err}")
        return "".join(page_texts)


# --------- TEXT CLEANING ----------
class TextCleaner:
    """Normalises raw extracted text into a single ASCII line."""

    @staticmethod
    def clean(text):
        """Return *text* flattened to one line with noise removed.

        Steps, in order:
          1. turn every whitespace run (including newlines) into one space;
          2. drop bracketed numeric citations such as ``[3]``, ``[1, 2]``
             or ``[4-6]`` together with the space in front of them;
          3. drop every character outside ``A-Za-z0-9.,;:()-`` and space;
          4. collapse the space runs that steps 2-3 leave behind.

        Empty or ``None`` input yields ``""``.
        """
        if not text:
            return ""
        # Whitespace must become plain spaces first, otherwise step 3 would
        # delete newlines/tabs outright and glue adjacent words together.
        text = re.sub(r"\s+", " ", text)
        # Consuming the leading space avoids "word ." / double-space leftovers.
        text = re.sub(r" ?\[[0-9,\s\-\u2013]+\]", "", text)
        text = re.sub(r"[^A-Za-z0-9.,;:()\- ]+", "", text)
        # Removing symbols between two spaces ("a & b") leaves a double space.
        text = re.sub(r" {2,}", " ", text)
        return text.strip()


# --------- SECTION EXTRACTION ----------
class SectionExtractor:
    """Splits a paper's text into coarse sections by keyword position.

    This is a heuristic: each section starts at the FIRST place its keyword
    appears anywhere in the text (not necessarily a real heading) and runs
    until the next located keyword, or to the end of the text.
    """

    # Section name -> regex for the words that mark where it starts.
    HEADINGS = {
        "abstract": r"\babstract\b",
        "introduction": r"\bintroduction\b",
        "methodology": r"\b(methodology|methods|materials and methods)\b",
        "results": r"\b(results|experiments|evaluation)\b",
        "conclusion": r"\b(conclusion|future work)\b"
    }

    @staticmethod
    def extract(text):
        """Return ``{section_name: section_text}`` for the sections found.

        Args:
            text: Full paper text. Empty or ``None`` yields ``{}``.

        Returns:
            A dict holding only the sections whose keyword was located; each
            value is the slice of *text* from that keyword to the next one.
        """
        if not text:
            return {}

        positions = []
        for name, pattern in SectionExtractor.HEADINGS.items():
            # Search the original string case-insensitively. Matching on a
            # lower-cased copy is unsafe because lower() can change the
            # string length for some Unicode characters, which would make
            # the offsets point at the wrong place in `text`.
            m = re.search(pattern, text, re.IGNORECASE)
            if m:
                positions.append((name, m.start()))

        positions.sort(key=lambda item: item[1])

        # Pair each start offset with the following one; the last section
        # runs to the end of the text.
        ends = [start for _, start in positions[1:]] + [len(text)]
        return {
            name: text[start:end]
            for (name, start), end in zip(positions, ends)
        }


# --------- KEY INFORMATION EXTRACTION ----------
class KeyExtractor:
    """Detects which known method / metric / dataset keywords a paper mentions."""

    # Keyword vocabularies; matching is case-insensitive and whole-word.
    METHODS = ["cnn", "transformer", "svm", "lstm", "bert", "resnet", "reinforcement"]
    METRICS = ["accuracy", "precision", "recall", "f1", "auc", "rmse"]
    DATASETS = ["imagenet", "mnist", "cifar", "dataset", "benchmark"]

    @staticmethod
    def find(text, keywords):
        """Return the entries of *keywords* that occur in *text* as whole words.

        Keywords are treated as literal strings, so entries containing regex
        metacharacters (e.g. ``"c++"``) are safe. Order follows *keywords*.
        Empty or ``None`` text yields ``[]``.
        """
        if not text:
            return []
        # Lookarounds rather than \b: \b needs a word character at the edge
        # of the keyword and therefore can never anchor something like "c++".
        return [
            k for k in keywords
            if re.search(rf"(?<!\w){re.escape(k)}(?!\w)", text, re.I)
        ]

    @staticmethod
    def extract(sections):
        """Scan all section texts and report the keywords found per category.

        Args:
            sections: Mapping of section name to section text.

        Returns:
            A dict with the keys ``methods``, ``metrics`` and ``datasets``,
            each a list of matched keywords.
        """
        text = " ".join(sections.values())
        return {
            "methods": KeyExtractor.find(text, KeyExtractor.METHODS),
            "metrics": KeyExtractor.find(text, KeyExtractor.METRICS),
            "datasets": KeyExtractor.find(text, KeyExtractor.DATASETS)
        }


class Validator:
    """Checks that the core sections of a paper were extracted with enough text."""

    # Sections every paper is expected to contain.
    REQUIRED = ["introduction", "methodology", "results"]

    @staticmethod
    def validate(sections):
        """Return ``{required_section: bool}``.

        A section passes only when it is present in *sections* and its text
        is longer than 200 characters.
        """
        report = {}
        for required in Validator.REQUIRED:
            if required not in sections:
                report[required] = False
                continue
            report[required] = len(sections[required]) > 200
        return report


class Comparator:
    """Pivots a list of per-paper dicts into per-field lists."""

    @staticmethod
    def compare(papers):
        """Return ``{field: [value, ...]}`` gathered across *papers*.

        Values keep the order of the papers they came from; a field missing
        from some paper simply contributes no entry for that paper.
        """
        columns = {}
        for record in papers:
            for field in record:
                columns.setdefault(field, []).append(record[field])
        return columns


# ---- Pipeline driver: search -> download -> extract -> analyse -> export ----

# Create the workspace directories and configure logging.
Config.setup()

# An empty answer falls back to the default topic.
topic = input("Enter research topic: ") or "Reinforcement Learning"

searcher = PaperSearcher()
papers = searcher.search(topic, limit=10)

# Only papers that expose a PDF URL are attempted; `downloaded` holds the
# ones that were actually saved, each carrying a "pdf_path" key.
downloaded = PDFDownloader.download(papers, top_n=3)

dataset_records = []   # one row per paper for dataset.csv (includes the full cleaned text)
analysis_results = []  # one entry per paper for analysis.json

for p in tqdm(downloaded, desc="Processing Papers"):
    # Per-paper processing: raw text -> cleaned text -> sections -> keywords / validation.
    raw_text = PDFTextExtractor.extract(p["pdf_path"])
    clean_text = TextCleaner.clean(raw_text)
    sections = SectionExtractor.extract(clean_text)
    keys = KeyExtractor.extract(sections)
    validation = Validator.validate(sections)

    dataset_records.append({
        "title": p["title"],
        "year": p["year"],
        "citations": p["citations"],
        "full_text": clean_text
    })

    analysis_results.append({
        "title": p["title"],
        "sections_found": list(sections.keys()),
        # Spreads the "methods", "metrics" and "datasets" lists into this entry.
        **keys,
        "validation": validation
    })

# Persist the dataset as CSV (without the DataFrame index column).
df = pd.DataFrame(dataset_records)
df.to_csv(Config.DATASET_PATH, index=False)

# Persist the per-paper analysis as pretty-printed JSON.
with open(os.path.join(Config.OUTPUT_DIR, "analysis.json"), "w") as f:
    json.dump(analysis_results, f, indent=2)


# IPython shell magic: zip the whole workspace (PDFs, outputs, dataset),
# then hand the archive to the Colab file-download helper.
!zip -r /content/research_bundle.zip /content/research_workspace
files.download("/content/research_bundle.zip")


Enter research topic: forest fire


Downloading PDFs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing Papers:   0%|          | 0/2 [00:00<?, ?it/s]

  adding: content/research_workspace/ (stored 0%)
  adding: content/research_workspace/pdfs/ (stored 0%)
  adding: content/research_workspace/pdfs/Power_laws__Pareto_distributions_and_Zipf_s_law.pdf (deflated 10%)
  adding: content/research_workspace/pdfs/Global_fire_emissions_and_the_contribution_of_defo.pdf (deflated 14%)
  adding: content/research_workspace/outputs/ (stored 0%)
  adding: content/research_workspace/outputs/analysis.json (deflated 65%)
  adding: content/research_workspace/dataset.csv (deflated 67%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>