In [2]:
!pip install semanticscholar python-dotenv PyMuPDF requests tqdm pandas -q

import os
import re
import json
import logging
import requests
import pandas as pd
import fitz  # PyMuPDF

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from semanticscholar import SemanticScholar
from collections import defaultdict
from google.colab import files


class Config:
    BASE_DIR = "/content/research_workspace"
    PDF_DIR = os.path.join(BASE_DIR, "pdfs")
    OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
    DATASET_PATH = os.path.join(BASE_DIR, "dataset.csv")

    API_KEY = "esyhXUy8KZc2T3lZmJqlDDmda5pBVa4_fgjghjh"  # need to complete the api key
    USER_AGENT = "Mozilla/5.0"

    @staticmethod
    def setup():
        os.makedirs(Config.PDF_DIR, exist_ok=True)
        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
        logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")


class PaperSearcher:
    def __init__(self):
        self.sch = SemanticScholar(api_key=Config.API_KEY)

    def search(self, topic, limit=10):
        logging.info(f"Searching papers for topic: {topic}")

        results = self.sch.search_paper(
            query=topic,
            limit=limit,
            fields=["title", "year", "authors", "citationCount", "abstract", "openAccessPdf"]
        )

        papers = []
        for r in results:
            pdf = getattr(r, "openAccessPdf", None)
            papers.append({
                "title": r.title,
                "year": r.year or 0,
                "authors": ", ".join(a.name for a in r.authors),
                "citations": r.citationCount or 0,
                "abstract": r.abstract,
                "pdf_url": pdf["url"] if pdf else None
            })

        return papers


class PDFDownloader:
    @staticmethod
    def download(papers, top_n=3):
        papers = [p for p in papers if p["pdf_url"]]
        papers = sorted(papers, key=lambda x: x["citations"], reverse=True)[:top_n]

        downloaded = []

        def worker(p):
            name = re.sub(r"[^\w\-]", "_", p["title"])[:50]
            path = os.path.join(Config.PDF_DIR, f"{name}.pdf")
            try:
                r = requests.get(
                    p["pdf_url"],
                    headers={"User-Agent": Config.USER_AGENT},
                    timeout=20
                )
                if r.status_code == 200:
                    with open(path, "wb") as f:
                        f.write(r.content)
                    p["pdf_path"] = path
                    return p
            except Exception as e:
                logging.error(f"Download failed: {e}")
            return None

        with ThreadPoolExecutor(max_workers=4) as ex:
            futures = [ex.submit(worker, p) for p in papers]
            for f in tqdm(as_completed(futures), total=len(futures), desc="Downloading PDFs"):
                res = f.result()
                if res:
                    downloaded.append(res)

        return downloaded


# --------- PDF TEXT EXTRACTION ----------
class PDFTextExtractor:
    @staticmethod
    def extract(pdf_path):
        text = ""
        try:
            with fitz.open(pdf_path) as doc:
                for page in doc:
                    text += page.get_text()
        except Exception as e:
            logging.error(f"PDF read error: {e}")
        return text


# --------- TEXT CLEANING ----------
class TextCleaner:
    @staticmethod
    def clean(text):
        text = text.replace("\n", " ")
        text = re.sub(r"\s+", " ", text)
        text = re.sub(r"\[[0-9,\s]+\]", "", text)
        text = re.sub(r"[^A-Za-z0-9.,;:()\- ]+", "", text)
        return text.strip()


# --------- SECTION EXTRACTION ----------
class SectionExtractor:
    HEADINGS = {
        "abstract": r"\babstract\b",
        "introduction": r"\bintroduction\b",
        "methodology": r"\b(methodology|methods|materials and methods)\b",
        "results": r"\b(results|experiments|evaluation)\b",
        "conclusion": r"\b(conclusion|future work)\b"
    }

    @staticmethod
    def extract(text):
        text_l = text.lower()
        sections = {}
        positions = []

        for k, v in SectionExtractor.HEADINGS.items():
            m = re.search(v, text_l)
            if m:
                positions.append((k, m.start()))

        positions.sort(key=lambda x: x[1])

        for i, (name, start) in enumerate(positions):
            end = positions[i + 1][1] if i + 1 < len(positions) else len(text)
            sections[name] = text[start:end]

        return sections


# --------- KEY INFORMATION EXTRACTION ----------
class KeyExtractor:
    METHODS = ["cnn", "transformer", "svm", "lstm", "bert", "resnet", "reinforcement"]
    METRICS = ["accuracy", "precision", "recall", "f1", "auc", "rmse"]
    DATASETS = ["imagenet", "mnist", "cifar", "dataset", "benchmark"]

    @staticmethod
    def find(text, keywords):
        return [k for k in keywords if re.search(rf"\b{k}\b", text, re.I)]

    @staticmethod
    def extract(sections):
        text = " ".join(sections.values())
        return {
            "methods": KeyExtractor.find(text, KeyExtractor.METHODS),
            "metrics": KeyExtractor.find(text, KeyExtractor.METRICS),
            "datasets": KeyExtractor.find(text, KeyExtractor.DATASETS)
        }


class Validator:
    REQUIRED = ["introduction", "methodology", "results"]

    @staticmethod
    def validate(sections):
        return {
            k: (k in sections and len(sections[k]) > 200)
            for k in Validator.REQUIRED
        }


class Comparator:
    @staticmethod
    def compare(papers):
        summary = defaultdict(list)
        for p in papers:
            for k, v in p.items():
                summary[k].append(v)
        return dict(summary)


Config.setup()

topic = input("Enter research topic: ") or "Reinforcement Learning"

searcher = PaperSearcher()
papers = searcher.search(topic, limit=10)

downloaded = PDFDownloader.download(papers, top_n=3)

dataset_records = []
analysis_results = []

for p in tqdm(downloaded, desc="Processing Papers"):
    raw_text = PDFTextExtractor.extract(p["pdf_path"])
    clean_text = TextCleaner.clean(raw_text)
    sections = SectionExtractor.extract(clean_text)
    keys = KeyExtractor.extract(sections)
    validation = Validator.validate(sections)

    dataset_records.append({
        "title": p["title"],
        "year": p["year"],
        "citations": p["citations"],
        "full_text": clean_text
    })

    analysis_results.append({
        "title": p["title"],
        "sections_found": list(sections.keys()),
        **keys,
        "validation": validation
    })

df = pd.DataFrame(dataset_records)
df.to_csv(Config.DATASET_PATH, index=False)

with open(os.path.join(Config.OUTPUT_DIR, "analysis.json"), "w") as f:
    json.dump(analysis_results, f, indent=2)


!zip -r /content/research_bundle.zip /content/research_workspace
files.download("/content/research_bundle.zip")

Enter research topic: forest fire prediction


Downloading PDFs:   0%|          | 0/3 [00:00<?, ?it/s]

Processing Papers:   0%|          | 0/1 [00:00<?, ?it/s]

  adding: content/research_workspace/ (stored 0%)
  adding: content/research_workspace/dataset.csv (deflated 67%)
  adding: content/research_workspace/outputs/ (stored 0%)
  adding: content/research_workspace/outputs/analysis.json (deflated 52%)
  adding: content/research_workspace/pdfs/ (stored 0%)
  adding: content/research_workspace/pdfs/Multivariate_quantile_mapping_bias_correction__an_.pdf (deflated 15%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [14]:
import os
import re
import json
import logging
import requests
import pandas as pd
import fitz  # PyMuPDF
import time
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
from semanticscholar import SemanticScholar
from collections import defaultdict
from google.colab import files
from google import genai

# --- CONFIGURATION ---
class Config:
    BASE_DIR = "/content/research_workspace"
    PDF_DIR = os.path.join(BASE_DIR, "pdfs")
    OUTPUT_DIR = os.path.join(BASE_DIR, "outputs")
    DATASET_PATH = os.path.join(BASE_DIR, "dataset.csv")

    # API KEYS
    GEMINI_API_KEY = "AIzaSyCdkbi8wU4z8vlgCjJFwI-bhnmhbnb" # <--- PASTE KEY HERE
    SEMANTIC_SCHOLAR_KEY = "esyhXUy8KZc2T3lZmJqlDDmda5pBVa4jvgbhjh" # Optional

    USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"

    @staticmethod
    def setup():
        print("--> Setting up directories...")
        if os.path.exists(Config.BASE_DIR):
            import shutil
            shutil.rmtree(Config.BASE_DIR) # Clean start
        os.makedirs(Config.PDF_DIR, exist_ok=True)
        os.makedirs(Config.OUTPUT_DIR, exist_ok=True)
        logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# --- 1. SEARCH MODULE ---
class PaperSearcher:
    def __init__(self):
        self.sch = SemanticScholar(api_key=Config.SEMANTIC_SCHOLAR_KEY)

    def search(self, topic, limit=10):
        print(f"--> Searching Semantic Scholar for: '{topic}'...")
        try:
            results = self.sch.search_paper(
                query=topic,
                limit=limit,
                fields=["title", "year", "authors", "citationCount", "abstract", "openAccessPdf"]
            )
            print(f"--> Search complete. Found {len(results)} raw results.")
        except Exception as e:
            print(f"!! Search Error: {e}")
            return []

        papers = []
        for r in results:
            pdf = getattr(r, "openAccessPdf", None)
            if pdf and pdf.get("url"): # Only keep papers with PDFs
                papers.append({
                    "title": r.title,
                    "year": r.year or 0,
                    "authors": ", ".join(a.name for a in r.authors),
                    "citations": r.citationCount or 0,
                    "abstract": r.abstract,
                    "pdf_url": pdf["url"]
                })

        print(f"--> Filtered to {len(papers)} papers with valid PDF links.")
        return papers

# --- 2. DOWNLOAD MODULE ---
class PDFDownloader:
    @staticmethod
    def download(papers, top_n=3):
        # Sort by citations and take top N
        papers = sorted(papers, key=lambda x: x["citations"], reverse=True)[:top_n]
        print(f"--> Attempting to download top {len(papers)} papers...")

        downloaded = []

        def worker(p):
            name = re.sub(r"[^\w\-]", "_", p["title"])[:50]
            path = os.path.join(Config.PDF_DIR, f"{name}.pdf")
            try:
                r = requests.get(p["pdf_url"], headers={"User-Agent": Config.USER_AGENT}, timeout=15)
                if r.status_code == 200 and len(r.content) > 1000:
                    with open(path, "wb") as f:
                        f.write(r.content)
                    p["pdf_path"] = path
                    return p
            except Exception:
                pass
            return None

        with ThreadPoolExecutor(max_workers=3) as ex:
            futures = [ex.submit(worker, p) for p in papers]
            for f in tqdm(as_completed(futures), total=len(futures), desc="Downloading"):
                res = f.result()
                if res: downloaded.append(res)

        print(f"--> Successfully downloaded {len(downloaded)} PDFs.")
        return downloaded

# --- 3. EXTRACTION MODULES ---
class PDFTextExtractor:
    @staticmethod
    def extract(pdf_path):
        text = ""
        try:
            with fitz.open(pdf_path) as doc:
                for page in doc: text += page.get_text()
        except Exception as e:
            print(f"!! PDF Read Error: {e}")
        return text

class TextCleaner:
    @staticmethod
    def clean(text):
        text = text.replace("\n", " ")
        return re.sub(r"\s+", " ", text).strip()

class SectionExtractor:
    @staticmethod
    def extract(text):
        # Simple extraction based on common keywords
        text_l = text.lower()
        sections = {}
        keys = {"abstract": "abstract", "intro": "introduction", "methods": "method", "results": "result"}

        sorted_indices = []
        for key, word in keys.items():
            idx = text_l.find(word)
            if idx != -1: sorted_indices.append((key, idx))

        sorted_indices.sort(key=lambda x: x[1])

        for i, (name, start) in enumerate(sorted_indices):
            end = sorted_indices[i+1][1] if i+1 < len(sorted_indices) else min(start + 3000, len(text))
            sections[name] = text[start:end]

        return sections

# --- 4. AI WRITER (GEMINI) ---
class GeminiWriter:
    def __init__(self):
        try:
            self.client = genai.Client(api_key=Config.GEMINI_API_KEY)
            print("--> Gemini Client initialized.")
        except Exception as e:
            print(f"!! Gemini Init Error: {e}")

    def generate_draft(self, paper_title, sections_data):
        context = "\n".join([f"{k.upper()}: {v[:1000]}" for k, v in sections_data.items()])
        prompt = f"Analyze this paper: '{paper_title}'. Context: {context}. Write a structured summary (Abstract, Methods, Results)."
        try:
            time.sleep(2) # Rate limit safety
            response = self.client.models.generate_content(model="gemini-1.5-flash", contents=prompt)
            return response.text
        except Exception as e:
            return f"AI Error: {e}"

    def synthesize(self, summaries):
        prompt = f"Synthesize these paper summaries into one paragraph comparing their results:\n{summaries[:5000]}"
        try:
            time.sleep(2)
            response = self.client.models.generate_content(model="gemini-1.5-flash", contents=prompt)
            return response.text
        except Exception as e:
            return "Synthesis failed."

# --- MAIN EXECUTION ---
if __name__ == "__main__":
    Config.setup()

    # 1. Get Topic
    topic = input("Enter Research Topic (or press Enter for default): ")
    if not topic.strip(): topic = "Machine Learning in Traffic Control"
    print(f"\n=== PROCESSING TOPIC: {topic} ===\n")

    # 2. Search
    searcher = PaperSearcher()
    found_papers = searcher.search(topic)

    if not found_papers:
        print("!!! No papers found with PDFs. Try a different topic.")
    else:
        # 3. Download
        downloader = PDFDownloader()
        docs = downloader.download(found_papers)

        if not docs:
            print("!!! Failed to download any PDFs. Check your internet or topic.")
        else:
            # 4. Analyze
            writer = GeminiWriter()
            all_summaries = ""
            final_data = []

            print("--> Analyzing Papers with AI...")
            for p in tqdm(docs, desc="AI Analysis"):
                raw_text = PDFTextExtractor.extract(p["pdf_path"])
                clean_text = TextCleaner.clean(raw_text)
                sections = SectionExtractor.extract(clean_text)

                draft = writer.generate_draft(p["title"], sections)
                all_summaries += f"\nPaper: {p['title']}\n{draft}\n"

                final_data.append({"title": p["title"], "summary": draft})

            # 5. Synthesize
            print("--> Generatng Final Synthesis...")
            synthesis = writer.synthesize(all_summaries)

            # 6. Save Output
            output_file = os.path.join(Config.OUTPUT_DIR, "Final_Report.txt")
            with open(output_file, "w") as f:
                f.write(f"TOPIC: {topic}\n\nSYNTHESIS:\n{synthesis}\n\nDETAILS:\n{all_summaries}")

            print(f"\nSUCCESS! Report saved to: {output_file}")

            # 7. Download
            print("--> Zipping files...")
            !zip -r /content/research_bundle.zip /content/research_workspace
            files.download("/content/research_bundle.zip")

--> Setting up directories...
Enter Research Topic (or press Enter for default): forest fire prediction

=== PROCESSING TOPIC: forest fire prediction ===

--> Searching Semantic Scholar for: 'forest fire prediction'...
--> Search complete. Found 10 raw results.
--> Filtered to 472 papers with valid PDF links.
--> Attempting to download top 3 papers...


Downloading:   0%|          | 0/3 [00:00<?, ?it/s]

--> Successfully downloaded 1 PDFs.
--> Gemini Client initialized.
--> Analyzing Papers with AI...


AI Analysis:   0%|          | 0/1 [00:00<?, ?it/s]

--> Generatng Final Synthesis...

SUCCESS! Report saved to: /content/research_workspace/outputs/Final_Report.txt
--> Zipping files...
updating: content/research_workspace/ (stored 0%)
updating: content/research_workspace/outputs/ (stored 0%)
updating: content/research_workspace/pdfs/ (stored 0%)
updating: content/research_workspace/pdfs/Multivariate_quantile_mapping_bias_correction__an_.pdf (deflated 15%)
  adding: content/research_workspace/outputs/Final_Report.txt (deflated 33%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>