In [None]:
!pip install semanticscholar python-dotenv requests -q
from openai import OpenAI
from typing import List
import json
import os
from semanticscholar import SemanticScholar
from dotenv import load_dotenv


In [None]:

# PDF-ONLY RESULTS (SemanticScholar-Compatible Output)

!pip install requests feedparser -q
from openai import OpenAI
from typing import List
import requests
import feedparser
import json
from datetime import datetime
import os


key = OpenAI(api_key="")


def clean(x):
    return x.replace("\n", " ").strip() if isinstance(x, str) else x


# ====================================================
# 1. Europe PMC ‚Üí only keep papers with PDF
# ====================================================

def search_europe_pmc(query, limit=20):
    print("\nüîç Searching Europe PMC...")

    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query={query}&format=json&pageSize={limit}"
    try:
        data = requests.get(url, timeout=10).json()
        results = data.get("resultList", {}).get("result", [])
    except:
        return []

    papers = []

    for p in results:
        pdf_url = None

        # FIXED: Improved PDF detection
        # Check open access status first
        if p.get("isOpenAccess") == "Y":
            # Try PMC ID based URL
            pmcid = p.get("pmcid")
            if pmcid:
                pdf_url = f"https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/pdf/"

            # If no PMC ID, check fullTextUrlList
            if not pdf_url and "fullTextUrlList" in p:
                urls = p["fullTextUrlList"].get("fullTextUrl", [])
                for u in urls:
                    doc_style = u.get("documentStyle", "").lower()
                    site = u.get("site", "").lower()
                    availability = u.get("availability", "")

                    # Look for PDF specifically
                    if "pdf" in doc_style or "pdf" in site:
                        pdf_url = u.get("url")
                        break
                    # Or free full text
                    elif availability == "Free" and u.get("url"):
                        potential_url = u.get("url")
                        if potential_url and "pdf" in potential_url.lower():
                            pdf_url = potential_url
                            break

        # üö´ Skip papers without PDFs (FIXED)
        if not pdf_url:
            continue
        papers.append({
            "title": clean(p.get("title", "")),
            "authors": [a.get("fullName", "") for a in p.get("authorList", {}).get("author", [])],
            "year": int(p["pubYear"]) if p.get("pubYear") else None,
            "paperId": p.get("id", ""),
            "abstract": clean(p.get("abstractText", "")),
            "citationCount": p.get("citedByCount", 0),
            "venue": p.get("journalTitle", ""),
            "url": p.get("pubmedUrl", ""),
            "pdf_url": pdf_url,
            "has_pdf": True,
            "source": "Europe PMC"
        })

    print(f"‚û° Europe PMC PDF results: {len(papers)}")
    return papers


# ====================================================
# 2. arXiv ‚Üí ALL papers have PDF
# ====================================================
def search_arxiv(query, limit=20):
    print("\nüîç Searching arXiv...")

    url = f"http://export.arxiv.org/api/query?search_query=all:{query}&start=0&max_results={limit}"
    try:
        feed = feedparser.parse(url)
    except:
        return []

    papers = []

    for entry in feed.entries:
        pdf_url = entry.id.replace("abs", "pdf") + ".pdf"

        papers.append({
            "title": clean(entry.title),
            "authors": [a.name for a in entry.authors],
            "year": int(entry.published[:4]),
            "paperId": entry.id,
            "abstract": clean(entry.summary),
            "citationCount": 0,
            "venue": "arXiv",
            "url": entry.link,
            "pdf_url": pdf_url,
            "has_pdf": True,
            "source": "arXiv"
        })

    print(f"‚û° arXiv PDF results: {len(papers)}")
    return papers


# ====================================================
# 3. Combine + PDF Only
# ====================================================
def search_papers(query, limit=20):
    print(f"\nüîé Searching for: {query}")

    pmc_papers = search_europe_pmc(query, limit)
    arxiv_papers = search_arxiv(query, limit)

    all_papers = pmc_papers + arxiv_papers

    print(f"\nüìä TOTAL PDF papers found: {len(all_papers)}")
    return all_papers


# ====================================================
# SAVE RESULTS
# ====================================================
def save_search_results(papers, topic):
    os.makedirs("data/search_results", exist_ok=True)

    safe_topic = "".join(c for c in topic if c.isalnum() or c == " ").replace(" ", "_")
    filename = f"paper_search_results_{safe_topic}.json"

    path = f"data/search_results/{filename}"

    with open(path, "w", encoding="utf-8") as f:
        json.dump({
            "topic": topic,
            "timestamp": datetime.now().isoformat(),
            "papers": papers
        }, f, indent=4)

    print(f"\nüíæ Saved results ‚Üí {path}")
    return path


# ====================================================
# DISPLAY RESULTS (PDF ONLY)
# ====================================================
def display_results(papers, limit=10):
    print("\n=========== PDF AVAILABLE PAPERS ===========\n")
    for i, p in enumerate(papers[:limit], 1):
        print(f"{i}. {p['title']}")
        print(f"   Authors: {', '.join(p['authors'][:4])}")
        print(f"   Year: {p['year']}  | Source: {p['source']}")
        print(f"   PDF: {p['pdf_url']}\n")


# ====================================================
# MAIN FUNCTION
# ====================================================
def main_search():
    query = input("Enter research topic: ").strip()

    papers = search_papers(query, limit=20)
    display_results(papers)
    save_search_results(papers, query)

    print("\nüéâ MODULE 1 COMPLETE ‚Äî PDF-ONLY MODE ENABLED ‚úî")
    return papers


# Run module
if __name__ == "__main__":
    main_search()
def main_search():
    query = input("Enter research topic: ").strip()

    papers = search_papers(query, limit=20)

    # ADDED: Check for empty results
    if not papers:
        print("\n‚ùå No papers with PDFs found. Try a different search query.")
        return []

    display_results(papers)
    save_search_results(papers, query)

    print("\nüéâ MODULE 1 COMPLETE ‚Äì PDF-ONLY MODE ENABLED ‚úì")
    return papers


Enter research topic: parkinson

üîé Searching for: parkinson

üîç Searching Europe PMC...
‚û° Europe PMC PDF results: 5

üîç Searching arXiv...
‚û° arXiv PDF results: 20

üìä TOTAL PDF papers found: 25


1. Correction: Metabolic modeling of microbial communities in the chicken ceca reveals a landscape of competition and co-operation.
   Authors: 
   Year: 2025  | Source: Europe PMC
   PDF: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC12720474/pdf/

2. Stigmatization and bias in interpreting lichen sclerosus risk factors.
   Authors: 
   Year: 2025  | Source: Europe PMC
   PDF: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC12685388/pdf/

3. A bibliometric analysis of non-coding RNAs in Parkinson disease: Research hotspots and emerging trends (2013-2022).
   Authors: 
   Year: 2025  | Source: Europe PMC
   PDF: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC12727337/pdf/

4. Recruitment of Latinx Older Adults With Parkinson Disease for a Remote Physical Activity Intervention Trial
   A

In [None]:
# ============================================
# MODULE 2: ROBUST PDF DOWNLOADER (multi-source + page-scan)
# ============================================
# Requirements:
# pip install requests python-docx PyMuPDF beautifulsoup4 -q

import os, json, requests, fitz
from pathlib import Path
from datetime import datetime
from docx import Document
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0 Safari/537.36"
}
REQUEST_TIMEOUT = 20  # seconds


# ------------------------------
# Helper: safe HEAD check if URL looks like a PDF
# ------------------------------
def try_head_is_pdf(url):
    """Return True if HEAD/GET indicates content-type PDF (handles redirects)."""
    if not url:
        return False
    try:
        # Try HEAD first (faster). Some servers disallow HEAD; fall back to GET with stream=True.
        r = requests.head(url, headers=HEADERS, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        if r.status_code == 200 and "pdf" in r.headers.get("content-type", "").lower():
            return True
        # fallback to GET but do not download body
        r = requests.get(url, headers=HEADERS, stream=True, allow_redirects=True, timeout=REQUEST_TIMEOUT)
        ct = r.headers.get("content-type", "").lower()
        r.close()
        if r.status_code == 200 and "pdf" in ct:
            return True
    except Exception:
        return False
    return False


# ------------------------------
# Helper: try download PDF safely
# ------------------------------
def download_pdf(url, out_path):
    """Download PDF and verify by opening with PyMuPDF; return True if ok."""
    try:
        r = requests.get(url, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        if r.status_code != 200:
            return False
        content_type = r.headers.get("content-type", "").lower()
        # quickly accept if content-type contains pdf OR url endswith .pdf
        if "pdf" not in content_type and not url.lower().split('?')[0].endswith('.pdf'):
            # still possible a PDF served with wrong header; try small peek
            chunk = r.raw.read(4)
            r.raw.close()
            r.close()
            if not chunk.startswith(b'%PDF'):
                return False
            # re-download properly
            r = requests.get(url, headers=HEADERS, stream=True, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        # write to file
        with open(out_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
        # verify with pymupdf
        fitz.open(str(out_path)).close()
        return True
    except Exception as e:
        # cleanup if file partially written
        try:
            if out_path.exists():
                out_path.unlink()
        except:
            pass
        return False


# ------------------------------
# Normalize / quick transforms for canonical PDF urls
# ------------------------------
def normalize_known_pdf_url(url):
    """Handle arXiv and PMC quick conversions, else return None."""
    if not url:
        return None
    u = url.strip()
    if "arxiv.org/abs/" in u:
        return u.replace("/abs/", "/pdf/") + ".pdf"
    if "ncbi.nlm.nih.gov/pmc/articles" in u:
        return u.rstrip("/") + "/pdf/"
    # add other simple heuristics if desired
    return None


# ------------------------------
# Extract candidate PDF links from landing page HTML
# ------------------------------
def extract_pdf_from_page(page_url):
    """Fetch landing page HTML and extract candidate PDF URLs using multiple heuristics."""
    candidates = []
    try:
        r = requests.get(page_url, headers=HEADERS, timeout=REQUEST_TIMEOUT, allow_redirects=True)
        if r.status_code != 200:
            return candidates
        text = r.text
        soup = BeautifulSoup(text, "html.parser")

        # 1) meta tags frequently used by publishers (citation_pdf_url)
        #    e.g., <meta name="citation_pdf_url" content="...pdf">
        for meta in soup.find_all("meta"):
            name = (meta.get("name") or "").lower()
            prop = (meta.get("property") or "").lower()
            content = meta.get("content") or meta.get("value") or ""
            if content and ("citation_pdf_url" in name or "citation_pdf_url" in prop):
                candidates.append(urljoin(page_url, content.strip()))

        # 2) link rel=alternate type=application/pdf
        for link in soup.find_all("link"):
            if (link.get("type") or "").lower() == "application/pdf" or (link.get("rel") and "alternate" in link.get("rel")):
                href = link.get("href")
                if href:
                    candidates.append(urljoin(page_url, href.strip()))

        # 3) anchor tags pointing to .pdf
        for a in soup.find_all("a", href=True):
            href = a["href"].strip()
            if ".pdf" in href.lower() or href.lower().endswith("/pdf"):
                candidates.append(urljoin(page_url, href))

        # 4) some publishers provide direct "download" links or query endpoints
        #    collect links that contain 'download' and later check via HEAD
        for a in soup.find_all("a", href=True):
            href = a["href"].lower()
            if "download" in href or "fulltext" in href:
                candidates.append(urljoin(page_url, a["href"]))

        # Deduplicate while preserving order
        seen = set()
        out = []
        for c in candidates:
            if c not in seen:
                seen.add(c)
                out.append(c)
        return out
    except Exception:
        return []


# ------------------------------
# Main search for a candidate PDF for a given article URL (pdf_url from search results)
# ------------------------------
def find_candidate_pdf(article_url):
    """Return a list of candidate PDF URLs in order of priority."""
    candidates = []

    # 1) If direct link already points to a PDF via header check, keep it
    if article_url:
        if try_head_is_pdf(article_url):
            candidates.append(article_url)

    # 2) Known quick normalizers (arXiv / PMC)
    norm = normalize_known_pdf_url(article_url)
    if norm:
        if try_head_is_pdf(norm):
            candidates.append(norm)
        else:
            # still include as candidate for download attempt (some arXiv links may require .pdf appended)
            candidates.append(norm)

    # 3) Try to parse landing page and extract PDF links
    page_candidates = extract_pdf_from_page(article_url) if article_url else []
    for pc in page_candidates:
        if pc not in candidates:
            candidates.append(pc)

    # 4) Also try Semantic Scholar PDF link pattern (optional):
    #    If article_url is a doi or known id, semantic scholar often hosts PDF. We'll try a search URL:
    #    e.g., https://www.semanticscholar.org/paper/<paper-slug>
    #    (Note: this is best-effort; may or may not help)
    # (Skipping automated semantic scholar search to avoid scraping policies ‚Äî rely on page parsing instead.)

    return candidates


# ------------------------------
# Create DOC fallback (metadata only)
# ------------------------------
def create_doc(paper, path):
    doc = Document()
    doc.add_heading(paper.get("title", "Untitled"), level=1)
    doc.add_paragraph("Authors: " + ", ".join(paper.get("authors", [])))
    doc.add_paragraph("Year: " + str(paper.get("year", "")))
    doc.add_paragraph("Source: " + str(paper.get("source", "")))
    doc.add_heading("Abstract", level=2)
    doc.add_paragraph(paper.get("abstract") or "Not available")
    doc.add_heading("Original URL", level=2)
    doc.add_paragraph(paper.get("pdf_url") or "")
    doc.save(path)


# ------------------------------
# Main pipeline for multiple papers
# ------------------------------
def download_papers_hybrid(papers, max_count=10, output_dir="downloads"):
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    summary = []

    for idx, p in enumerate(papers[:max_count], start=1):
        title = p.get("title", f"paper_{idx}")
        safe = "".join(ch for ch in title if ch.isalnum())[:40] or f"paper_{idx}"
        folder = out_dir / f"{idx}_{safe}"
        folder.mkdir(parents=True, exist_ok=True)

        meta = {
            "paper_id": folder.name,
            "title": title,
            "source": p.get("source"),
            "original_url": p.get("pdf_url"),
            "candidates": [],
            "downloaded": False,
            "downloaded_path": None,
            "timestamp": datetime.now().isoformat()
        }

        # find candidates
        candidates = find_candidate_pdf(p.get("pdf_url"))
        meta["candidates"] = candidates

        # try candidates in order
        pdf_file_path = folder / "paper.pdf"
        for cand in candidates:
            if not cand:
                continue
            ok = download_pdf(cand, pdf_file_path)
            if ok:
                meta["downloaded"] = True
                meta["downloaded_path"] = str(pdf_file_path)
                meta["download_candidate"] = cand
                break

        # Fallback: if nothing downloaded, keep metadata and write a DOC with abstract
        doc_path = folder / "paper.docx"
        try:
            create_doc(p, doc_path)
        except Exception:
            pass

        # Save metadata
        with open(folder / "metadata.json", "w", encoding="utf-8") as f:
            json.dump(meta, f, indent=2, ensure_ascii=False)

        summary.append(meta)
        print(f"[{idx}] {title[:80]} ‚Äî PDF downloaded: {meta['downloaded']}")

    # Save overall report
    report_path = out_dir / f"download_report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(report_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    return summary, str(report_path)


# ------------------------------
# Example usage (integrate with your Module 1 output)
# ------------------------------
if __name__ == "__main__":
    # Expecting data/search_results/latest.json with {"topic":..., "papers":[{...}, ...]}
    srch = None
    try:
        files = sorted(Path("data/search_results").glob("*.json"), key=lambda f: f.stat().st_mtime)
        if not files:
            print("‚ùå No search_results JSON files found. Run module 1 first.")
            raise SystemExit
        with open(files[-1], "r", encoding="utf-8") as f:
            srch = json.load(f)
    except Exception as e:
        print("Error loading search results:", e)
        raise SystemExit

    papers = srch.get("papers", [])
    summary, report = download_papers_hybrid(papers, max_count=10, output_dir="downloads")
    print("Done. Report:", report)


[1] Correction: Metabolic modeling of microbial communities in the chicken ceca reve ‚Äî PDF downloaded: False
[2] Stigmatization and bias in interpreting lichen sclerosus risk factors. ‚Äî PDF downloaded: False
[3] A bibliometric analysis of non-coding RNAs in Parkinson disease: Research hotspo ‚Äî PDF downloaded: False
[4] Recruitment of Latinx Older Adults With Parkinson Disease for a Remote Physical  ‚Äî PDF downloaded: False
[5] Revisiting the 2015 MDS diagnostic criteria for Parkinson disease: insights from ‚Äî PDF downloaded: False
[6] Deep 1D-Convnet for accurate Parkinson disease detection and severity prediction ‚Äî PDF downloaded: True
[7] A Three-groups Non-local Model for Combining Heterogeneous Data Sources to Ident ‚Äî PDF downloaded: True
[8] Optimizing baryon acoustic oscillation surveys II: curvature, redshifts, and ext ‚Äî PDF downloaded: True
[9] Detection of 16 Gamma-Ray Pulsars Through Blind Frequency Searches Using the Fer ‚Äî PDF downloaded: True
[10] Identifica

In [None]:
# ============================================================
# MODULE 3: ROBUST PDF TEXT EXTRACTION (FIXED VERSION)
# ============================================================

!pip install pymupdf tqdm -q

import fitz
import json
import re
from pathlib import Path
from tqdm import tqdm
from datetime import datetime

# ============================================================
# SIMPLE CLEANER
# ============================================================

def clean_text(text):
    if not text:
        return ""
    text = re.sub(r"\s+", " ", text)
    return text.strip()

# ============================================================
# FIND PDF FILES (NO FILTERING)
# ============================================================

def find_pdfs(root="downloads"):
    root = Path(root)
    if not root.exists():
        print(" downloads folder not found")
        return []

    pdfs = list(root.rglob("*.pdf"))
    print(f" Found {len(pdfs)} PDF files")
    return pdfs

# ============================================================
# TEXT EXTRACTION (SIMPLE & RELIABLE)
# ============================================================

def extract_text(pdf_path):
    try:
        doc = fitz.open(pdf_path)
        text = ""

        for page in doc:
            text += page.get_text("text") + "\n"

        doc.close()
        return clean_text(text)

    except Exception as e:
        print(f"‚ùå Failed: {pdf_path.name} ‚Üí {e}")
        return ""

# ============================================================
# BASIC SECTION SPLITTER (LENIENT)
# ============================================================

def split_sections(text):
    sections = {
        "title": "",
        "abstract": "",
        "methods": "",
        "results": "",
        "conclusion": "",
        "full_text": text[:20000]
    }

    lower = text.lower()

    def grab(start, end=None):
        s = lower.find(start)
        if s == -1:
            return ""
        s += len(start)
        e = lower.find(end, s) if end else s + 3000
        return text[s:e].strip()

    sections["abstract"] = grab("abstract", "introduction")
    sections["methods"] = grab("methods", "results")
    sections["results"] = grab("results", "conclusion")
    sections["conclusion"] = grab("conclusion")

    # title guess
    for line in text.split("\n")[:5]:
        if 20 < len(line) < 150:
            sections["title"] = line.strip()
            break

    return sections

# ============================================================
# PROCESS SINGLE PDF
# ============================================================

def process_pdf(pdf):
    print(f"üìÑ Processing: {pdf.name}")

    text = extract_text(pdf)
    if not text:
        print(" No text extracted")
        return None

    sections = split_sections(text)

    return {
        "paper_id": pdf.stem,
        "filename": pdf.name,
        "extracted_at": datetime.now().isoformat(),
        "text_length": len(text),
        "sections": sections,
        "status": "extracted"
    }

# ============================================================
# SAVE OUTPUT
# ============================================================

def save_results(results):
    out = Path("data/extracted")
    out.mkdir(parents=True, exist_ok=True)

    for r in results:
        with open(out / f"{r['paper_id']}.json", "w", encoding="utf-8") as f:
            json.dump(r, f, indent=2, ensure_ascii=False)

    print(f" Saved {len(results)} extracted papers")

# ============================================================
# RUN MODULE 3
# ============================================================

def run_module_3(max_papers=5):
    print("\n=== MODULE 3: PDF EXTRACTION ===")

    pdfs = find_pdfs()
    pdfs = pdfs[:max_papers]

    results = []
    for pdf in tqdm(pdfs):
        r = process_pdf(pdf)
        if r:
            results.append(r)

    if results:
        save_results(results)
    else:
        print(" No PDFs could be processed")

    return results

# ============================================================
# AUTO RUN
# ============================================================

results = run_module_3()



=== MODULE 3: PDF EXTRACTION ===
 Found 5 PDF files


 20%|‚ñà‚ñà        | 1/5 [00:00<00:00,  5.62it/s]

üìÑ Processing: paper.pdf
üìÑ Processing: paper.pdf


 40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [00:00<00:00,  7.40it/s]

üìÑ Processing: paper.pdf
üìÑ Processing: paper.pdf


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:00<00:00,  8.25it/s]

üìÑ Processing: paper.pdf
 Saved 5 extracted papers





In [None]:

import os

# üîë SET GEMINI API KEY (FLASH FREE)
os.environ["GEMINI_API_KEY"] = "AIzaSyA5WZeo2_KvziOWk0Doq3fu6VgZ15QBxFc"

print("‚úÖ GEMINI API KEY INITIALISED")


‚úÖ GEMINI API KEY INITIALISED


In [None]:
!pip uninstall -y google-generativeai




Found existing installation: google-generativeai 0.3.2
Uninstalling google-generativeai-0.3.2:
  Successfully uninstalled google-generativeai-0.3.2


In [None]:
import os
os.environ["GEMINI_API_KEY"] = "import os"
os.environ["GEMINI_API_KEY"] = "AIzaSyBbv3izkdoCge_Edsnf98A2CJ3jwar_L_o"



In [None]:
!pip uninstall -y google-generativeai
!pip install -U google-generativeai scikit-learn matplotlib seaborn


Found existing installation: google-generativeai 0.8.6
Uninstalling google-generativeai-0.8.6:
  Successfully uninstalled google-generativeai-0.8.6
Collecting google-generativeai
  Using cached google_generativeai-0.8.6-py3-none-any.whl.metadata (3.9 kB)
Using cached google_generativeai-0.8.6-py3-none-any.whl (155 kB)
Installing collected packages: google-generativeai
Successfully installed google-generativeai-0.8.6


In [None]:
!pip install -U google-cloud-aiplatform


Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.132.0-py2.py3-none-any.whl.metadata (46 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.1/46.1 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading google_cloud_aiplatform-1.132.0-py2.py3-none-any.whl (8.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.2/8.2 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: google-cloud-aiplatform
  Attempting uninstall: google-cloud-aiplatform
    Found existing installation: google-cloud-aiplatform 1.130.0
    Uninstalling google-cloud-aiplatform-1.130.0:
      Successfully uninstalled google-cloud-aiplatform-1.130.0
Successfully installed google-cloud-aiplatform-1.132.0


In [11]:
from google.colab import auth
auth.authenticate_user()



In [12]:
!pip install -U google-cloud-aiplatform




In [14]:
import vertexai
from vertexai.generative_models import GenerativeModel

# init vertex
vertexai.init(
    project="gemini-test-483718",
    location="us-central1"
)

# EXACT model from your screenshot
model = GenerativeModel("gemini-2.0-flash-lite-001")

response = model.generate_content("Say hello in one line.")
print(response.text)



Hello!



In [16]:
import json
from pathlib import Path
import vertexai
from vertexai.generative_models import GenerativeModel

vertexai.init(
    project="gemini-test-483718",
    location="us-central1"
)

model = GenerativeModel("gemini-2.0-flash-lite-001")


In [17]:
def load_extracted_papers():
    path = Path("data/extracted")
    papers = []

    for f in path.glob("*.json"):
        with open(f, "r", encoding="utf-8") as fp:
            data = json.load(fp)
            if "sections" in data:
                papers.append(data)

    print(f"‚úÖ Loaded {len(papers)} extracted papers")
    return papers

papers = load_extracted_papers()


‚úÖ Loaded 1 extracted papers


In [18]:
def build_summary(papers):
    summary = ""
    for p in papers:
        summary += f"""
TITLE: {p['sections'].get('title','')}
METHODS: {p['sections'].get('methods','')[:500]}
RESULTS: {p['sections'].get('results','')[:500]}
"""
    return summary

base_text = build_summary(papers)


In [19]:
abstract = model.generate_content(
    "Write an academic ABSTRACT based on the following papers:\n" + base_text
).text

methods = model.generate_content(
    "Write a METHODS COMPARISON section comparing the approaches:\n" + base_text
).text

results = model.generate_content(
    "Write a RESULTS SYNTHESIS highlighting trends and findings:\n" + base_text
).text

print("\n===== ABSTRACT =====\n", abstract)
print("\n===== METHODS =====\n", methods)
print("\n===== RESULTS =====\n", results)



===== ABSTRACT =====
 Here's an abstract based on the provided information, focusing on the study's cohort and methodology:

**Abstract:**

This study investigates the genetic architecture of Parkinson's disease (PD) through an analysis of a large cohort of patients. The research utilized whole-exome sequencing (WES) data from a cohort of 683 index PD patients, primarily characterized by early-onset disease, assembled by the French and Mediterranean Parkinson‚Äôs Disease Genetics Study group (FMPD cohort). Prior to the current analysis, known mutations in a panel of Parkinsonism-associated genes (listed in Supplementary Table 4) had been excluded through WES and multiplex ligation-dependent probe amplification. Furthermore, expansions in the SCA2 and SCA17 genes were excluded using ExpansionHunter. This approach aims to provide a comprehensive genetic characterization of PD by focusing on a well-defined cohort and implementing rigorous pre-screening of known genetic contributors.


==

In [20]:
!pip install gradio python-docx -q


In [21]:
import gradio as gr
from docx import Document
from pathlib import Path


In [22]:
def generate_final_report(
    abstract,
    methods,
    results,
    cross_analysis,
    references
):
    output_dir = Path("data/final_report")
    output_dir.mkdir(parents=True, exist_ok=True)

    file_path = output_dir / "final_research_report.docx"

    doc = Document()
    doc.add_heading("Final Research Report", level=1)

    doc.add_heading("Abstract", level=2)
    doc.add_paragraph(abstract)

    doc.add_heading("Cross-Paper Analysis", level=2)
    doc.add_paragraph(cross_analysis)

    doc.add_heading("Methods Comparison", level=2)
    doc.add_paragraph(methods)

    doc.add_heading("Results Synthesis", level=2)
    doc.add_paragraph(results)

    doc.add_heading("References", level=2)
    doc.add_paragraph(references)

    doc.save(file_path)

    return f"‚úÖ Final report generated at:\n{file_path.resolve()}"


In [25]:
# # ============================================
# # MODULE 5: REVIEW + FINAL REPORT UI (GEMINI)
# # ============================================

# !pip install gradio python-docx -q

# import gradio as gr
# import google.generativeai as genai
# import os
# from docx import Document
# from pathlib import Path
# from datetime import datetime

# # ============================================
# # GEMINI CONFIG
# # ============================================

# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
# model = genai.GenerativeModel("gemini-2.0-flash")

# def call_gemini(prompt: str) -> str:
#     response = model.generate_content(prompt)
#     return response.text.strip()

# # ============================================
# # CRITIQUE FUNCTION
# # ============================================

# def critique_text(abstract, methods, results):
#     prompt = f"""
# You are an academic reviewer.

# Critically evaluate the following sections.

# Provide:
# - Strengths
# - Weaknesses
# - Improvement Suggestions

# ABSTRACT:
# {abstract}

# METHODS:
# {methods}

# RESULTS:
# {results}
# """
#     return call_gemini(prompt)

# # ============================================
# # FINAL REPORT GENERATION
# # ============================================

# def generate_final_report(abstract, methods, results):
#     output_dir = Path("data/final_report")
#     output_dir.mkdir(parents=True, exist_ok=True)

#     filename = f"Final_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
#     file_path = output_dir / filename

#     # ---- Create DOCX ----
#     doc = Document()
#     doc.add_heading("Final Research Report", level=1)

#     doc.add_heading("Abstract", level=2)
#     doc.add_paragraph(abstract)

#     doc.add_heading("Methods Comparison", level=2)
#     doc.add_paragraph(methods)

#     doc.add_heading("Results Synthesis", level=2)
#     doc.add_paragraph(results)

#     doc.save(file_path)

#     return str(file_path)

# # ============================================
# # GRADIO UI
# # ============================================

# with gr.Blocks(title="Academic Paper Review & Report Generator") as demo:

#     gr.Markdown("## üìò Automated Academic Writing & Review System (Gemini)")

#     abstract_box = gr.Textbox(lines=8, label="Abstract")
#     methods_box = gr.Textbox(lines=8, label="Methods Comparison")
#     results_box = gr.Textbox(lines=8, label="Results Synthesis")

#     critique_btn = gr.Button("üîç Critique Sections")
#     generate_btn = gr.Button("üìÑ Generate Final Report")

#     critique_output = gr.Textbox(lines=10, label="Reviewer Feedback")
#     download_link = gr.File(label="Download Final Report")

#     critique_btn.click(
#         critique_text,
#         inputs=[abstract_box, methods_box, results_box],
#         outputs=critique_output
#     )

#     generate_btn.click(
#         generate_final_report,
#         inputs=[abstract_box, methods_box, results_box],
#         outputs=download_link
#     )

# demo.launch()


# ============================================
# MODULE 5: REVIEW + FINAL REPORT UI (GEMINI)
# ============================================

!pip install gradio python-docx -q

import gradio as gr
import google.generativeai as genai
import os
from docx import Document
from pathlib import Path
from datetime import datetime

# ============================================
# GEMINI CONFIG
# ============================================

genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-2.0-flash")

def call_gemini(prompt: str) -> str:
    response = model.generate_content(prompt)
    return response.text.strip()

# ============================================
# CRITIQUE FUNCTION
# ============================================

def critique_text(abstract, methods, results):
    prompt = f"""
You are an academic reviewer.

Critically evaluate the following sections.

Provide:
- Strengths
- Weaknesses
- Improvement Suggestions

ABSTRACT:
{abstract}

METHODS:
{methods}

RESULTS:
{results}
"""
    return call_gemini(prompt)

# ============================================
# FINAL REPORT GENERATION
# ============================================

def generate_final_report(abstract, methods, results):
    output_dir = Path("data/final_report")
    output_dir.mkdir(parents=True, exist_ok=True)

    filename = f"Final_Report_{datetime.now().strftime('%Y%m%d_%H%M%S')}.docx"
    file_path = output_dir / filename

    # ---- Create DOCX ----
    doc = Document()
    doc.add_heading("Final Research Report", level=1)

    doc.add_heading("Abstract", level=2)
    doc.add_paragraph(abstract)

    doc.add_heading("Methods Comparison", level=2)
    doc.add_paragraph(methods)

    doc.add_heading("Results Synthesis", level=2)
    doc.add_paragraph(results)

    doc.save(file_path)

    return str(file_path)

# ============================================
# GRADIO UI
# ============================================

with gr.Blocks(title="Academic Paper Review & Report Generator") as demo:

    gr.Markdown("## üìò Automated Academic Writing & Review System (Gemini)")

    abstract_box = gr.Textbox(lines=8, label="Abstract")
    methods_box = gr.Textbox(lines=8, label="Methods Comparison")
    results_box = gr.Textbox(lines=8, label="Results Synthesis")

    critique_btn = gr.Button("üîç Critique Sections")
    generate_btn = gr.Button("üìÑ Generate Final Report")

    critique_output = gr.Textbox(lines=10, label="Reviewer Feedback")
    download_link = gr.File(label="Download Final Report")

    critique_btn.click(
        critique_text,
        inputs=[abstract_box, methods_box, results_box],
        outputs=critique_output
    )

    generate_btn.click(
        generate_final_report,
        inputs=[abstract_box, methods_box, results_box],
        outputs=download_link
    )

demo.launch()



It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://50f1e29ff7411067ab.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# ============================================================
# MODULE 5: DATASET GENERATOR FOR RESEARCH PAPERS (FINAL FIXED)
# ============================================================

!pip install pandas openpyxl -q

import json
import pandas as pd
import re
from pathlib import Path
from datetime import datetime

# ============================================================
# 1. LOAD ALL EXTRACTED PAPERS (FIXED)
# ============================================================

def load_all_extracted(data_dir="data/extracted"):
    path = Path(data_dir)

    if not path.exists():
        print(f"‚ùå Directory not found: {data_dir}")
        print("   Please run Module 3 first.")
        return []

    # Load all JSON files except summary/stat files
    files = [
        f for f in path.glob("*.json")
        if not any(x in f.name.lower() for x in ["summary", "stats"])
    ]

    if not files:
        print("‚ùå No extracted paper JSON files found.")
        print(f"   Checked path: {path.resolve()}")
        return []

    print(f"üìÑ Found {len(files)} extracted paper files.")

    papers = []
    for f in files:
        try:
            with open(f, "r", encoding="utf-8") as fp:
                data = json.load(fp)
                if "sections" in data:
                    papers.append(data)
                else:
                    print(f"‚ö†Ô∏è Skipped invalid JSON: {f.name}")
        except Exception as e:
            print(f"‚ö†Ô∏è Failed reading {f.name}: {str(e)[:60]}")

    print(f"‚úÖ Loaded {len(papers)} valid papers.")
    return papers

# ============================================================
# 2. TEXT CLEANING HELPERS
# ============================================================

def clean_text(t):
    if not t:
        return ""
    t = re.sub(r'\s+', ' ', t)
    return t.strip()

def extract_year(paper):
    if "year" in paper and paper["year"]:
        return paper["year"]
    match = re.search(r"(19|20)\d{2}", paper.get("filename", ""))
    return int(match.group()) if match else None

# ============================================================
# 3. RULE-BASED INFO EXTRACTION
# ============================================================

def keyword_extract(text, keywords, max_items=5):
    if not text:
        return []

    found = []
    text_low = text.lower()
    sentences = re.split(r'[.!?]', text)

    for kw in keywords:
        for sent in sentences:
            if kw in sent.lower() and len(sent.strip()) > 25:
                found.append(clean_text(sent)[:300])
                if len(found) >= max_items:
                    return found
    return found

def extract_methods(p):
    return keyword_extract(
        p.get("sections", {}).get("methods", ""),
        ["we use", "our method", "approach", "technique", "implementation"]
    )

def extract_datasets(p):
    return keyword_extract(
        p.get("sections", {}).get("methods", ""),
        ["dataset", "benchmark", "data source", "collected data"]
    )

def extract_findings(p):
    return keyword_extract(
        p.get("sections", {}).get("results", ""),
        ["result", "significant", "improved", "outperforms"]
    )

def extract_limitations(p):
    return keyword_extract(
        p.get("sections", {}).get("conclusion", ""),
        ["limitation", "future work", "challenge", "not address"]
    )

def extract_contributions(p):
    return keyword_extract(
        p.get("sections", {}).get("introduction", ""),
        ["contribution", "we propose", "novel", "we present"]
    )

def extract_metrics(p):
    return keyword_extract(
        p.get("sections", {}).get("methods", ""),
        ["accuracy", "precision", "recall", "f1", "metric"]
    )

def normalize_list(lst):
    return "; ".join(lst) if lst else ""

# ============================================================
# 4. BUILD DATASET
# ============================================================

def build_dataset(papers):
    rows = []
    print("\nüìä Building dataset...")

    for i, p in enumerate(papers, 1):
        sections = p.get("sections", {})

        print(f"  [{i}/{len(papers)}] Processing: {p.get('paper_id', 'unknown')}")

        row = {
            "paper_id": p.get("paper_id", f"paper_{i}"),
            "filename": p.get("filename", ""),
            "year": extract_year(p),
            "total_characters": p.get("total_characters", 0),

            "title": clean_text(sections.get("title", ""))[:500],
            "abstract": clean_text(sections.get("abstract", ""))[:2000],
            "introduction": clean_text(sections.get("introduction", ""))[:2000],
            "methods_text": clean_text(sections.get("methods", ""))[:2000],
            "results_text": clean_text(sections.get("results", ""))[:2000],
            "conclusion_text": clean_text(sections.get("conclusion", ""))[:2000],

            "methods": normalize_list(extract_methods(p)),
            "datasets": normalize_list(extract_datasets(p)),
            "key_findings": normalize_list(extract_findings(p)),
            "limitations": normalize_list(extract_limitations(p)),
            "contributions": normalize_list(extract_contributions(p)),
            "metrics": normalize_list(extract_metrics(p)),

            "num_methods": len(extract_methods(p)),
            "num_datasets": len(extract_datasets(p)),
            "num_findings": len(extract_findings(p)),
            "num_limitations": len(extract_limitations(p)),
            "num_contributions": len(extract_contributions(p)),
        }

        rows.append(row)

    df = pd.DataFrame(rows)
    print(f"‚úÖ Dataset created: {df.shape[0]} rows √ó {df.shape[1]} columns")
    return df

# ============================================================
# 5. SAVE DATASET (PROPER FOLDER STRUCTURE)
# ============================================================

def save_dataset(df, base_dir="data/dataset"):
    base = Path(base_dir)
    base.mkdir(parents=True, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    folder = base / f"dataset_{timestamp}"
    formats = folder / "formats"
    analysis = folder / "analysis"

    formats.mkdir(parents=True, exist_ok=True)
    analysis.mkdir(exist_ok=True)

    csv_path = formats / "papers_dataset.csv"
    xlsx_path = formats / "papers_dataset.xlsx"
    json_path = formats / "papers_dataset.json"

    df.to_csv(csv_path, index=False)
    df.to_excel(xlsx_path, index=False)
    df.to_json(json_path, orient="records", indent=2, force_ascii=False)

    stats = {
        "created_at": datetime.now().isoformat(),
        "total_papers": len(df),
        "total_columns": len(df.columns),
        "papers_with_year": int(df["year"].notna().sum()),
        "avg_characters": int(df["total_characters"].mean()),
    }

    with open(analysis / "dataset_statistics.json", "w") as f:
        json.dump(stats, f, indent=2)

    # backward-compatible CSV
    df.to_csv(base / "papers_dataset.csv", index=False)

    print("\nüìÅ Dataset saved successfully!")
    print(f"üìÇ Location: {folder.resolve()}")

    return folder

# ============================================================
# 6. MAIN RUNNER
# ============================================================

def generate_paper_dataset():
    print("\n" + "="*70)
    print("MODULE 5: DATASET GENERATOR")
    print("="*70)

    papers = load_all_extracted()
    if not papers:
        print("‚ùå No papers loaded. Aborting.")
        return None

    df = build_dataset(papers)

    print("\nüìå Dataset Preview:")
    print(df.head(3).to_string())

    save_dataset(df)

    print("\nüéâ MODULE 5 COMPLETED SUCCESSFULLY!")
    return df

# ============================================================
# AUTO RUN (COLAB FRIENDLY)
# ============================================================

generate_paper_dataset()



MODULE 5: DATASET GENERATOR
‚ùå Directory not found: data/extracted
   Please run Module 3 first.
‚ùå No papers loaded. Aborting.
