In [None]:
#  !pip install requests pandas tqdm

import os
import re
import time
import json
import csv
import requests
import pandas as pd
from tqdm import tqdm
from typing import List, Dict, Optional

# ---------------------------
# CONFIG
# ---------------------------
# Semantic Scholar API key. Prefer supplying it through the
# SEMANTIC_SCHOLAR_API_KEY environment variable so it stays out of the source.
# NOTE(review): the literal fallback below is a credential committed to the
# file; rotate it and drop the fallback once every environment sets the variable.
API_KEY = os.environ.get(
    "SEMANTIC_SCHOLAR_API_KEY",
    "JhnIenwb3246rhRB3pNlw5FR2MuiNx5M4BCQ9DhS",
)
DATA_DIR = "data"
PDF_DIR = os.path.join(DATA_DIR, "pdfs")
METADATA_JSON = os.path.join(DATA_DIR, "metadata.json")
METADATA_CSV = os.path.join(DATA_DIR, "metadata.csv")
SEARCH_LIMIT = 12  # number of results to fetch by default

# Make sure the output folders exist before anything tries to write into them.
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(PDF_DIR, exist_ok=True)

# ---------------------------
# SMART SUGGESTIONS (Feature 1)
# ---------------------------
# Canned keyword variations, keyed by the short (lower-case) topic that triggers them.
SUGGESTIONS = {
    "ai": ["artificial intelligence healthcare", "deep learning medical imaging", "explainable AI"],
    "nlp": ["transformers for text summarization", "nlp in healthcare"],
    "ml": ["machine learning for diagnosis", "ml anomaly detection"],
    "covid": ["covid-19 vaccine efficacy", "covid-19 transmission modeling"],
}


def auto_suggest(topic: str):
    """Print keyword hints for *topic*.

    If the lower-cased, stripped topic is a key of ``SUGGESTIONS``, its
    variations are listed. Otherwise a general tip is printed when the topic
    has at most two whitespace-separated words. A blank line is always
    printed last.
    """
    variations = SUGGESTIONS.get(topic.lower().strip())
    if variations is not None:
        print("\nüí° Suggested research keyword variations:")
        for variation in variations:
            print("   üî∏", variation)
    elif len(topic.split()) <= 2:
        # Short, unknown topics get a generic hint instead.
        print("\nüí° Tip: Consider adding domain or method words (e.g., 'healthcare', 'CNN', 'transformer') for better search results.")
    print()

# ---------------------------
# UTILITIES
# ---------------------------
def sanitize_filename(name: str, max_len: int = 100) -> str:
    """Turn *name* into a string usable as a file name.

    The characters ``\\ / * ? : " < > |`` are removed, every run of whitespace
    becomes a single space, and the result is cut to *max_len* characters.
    Leading/trailing whitespace is stripped both before and after the cut, so
    the returned value never ends in a space.
    """
    without_reserved = re.sub(r'[\\/*?:"<>|]', "", name)
    single_spaced = re.sub(r'\s+', ' ', without_reserved)
    trimmed = single_spaced.strip()
    shortened = trimmed[:max_len]
    return shortened.strip()

# Domain keywords that become a tag whenever they occur (as a substring) in a title.
KEYWORD_TAGS = ["disease", "covid", "classification", "deep", "cnn", "transformer",
                "diagnosis", "health", "medical", "image", "nlp", "survey", "review", "anomaly"]


def auto_tags(title: str) -> List[str]:
    """Derive a short list of tags from a paper title.

    The result starts with every entry of ``KEYWORD_TAGS`` found in the
    lower-cased title (in ``KEYWORD_TAGS`` order), followed by up to three
    additional title words of six or more ASCII letters that are not already
    tags.

    The additional words are taken in the order they first appear in the
    title. They were previously sliced out of a ``set``, whose iteration order
    for strings differs between interpreter runs, so the same title could get
    different tags each time.
    """
    title_lower = title.lower()
    tags = [kw for kw in KEYWORD_TAGS if kw in title_lower]
    # dict.fromkeys de-duplicates while keeping first-occurrence order.
    long_words = dict.fromkeys(re.findall(r'\b[a-z]{6,}\b', title_lower))
    extra = [w for w in long_words if w not in tags][:3]
    return list(dict.fromkeys(tags + extra))  # preserve order, uniq

def title_relevance_score(title: str, query: str) -> int:
    """Score how well *title* matches *query* (higher is better).

    The query is lower-cased and split into ``\\w+`` tokens; single-character
    tokens are ignored. For each remaining token:

    * +2 if the token occurs anywhere in the lower-cased title;
    * +1 for every title word that contains the token without being equal to it.

    Repeated query tokens are scored once per occurrence.
    """
    lowered_title = title.lower()
    title_words = re.findall(r'\w+', lowered_title)

    total = 0
    for token in re.findall(r'\w+', query.lower()):
        if len(token) <= 1:
            continue
        if token in lowered_title:
            total += 2
        # Partial hits: the token sits inside a longer title word.
        total += sum(1 for word in title_words if token in word and token != word)
    return total

def is_duplicate(title: str, existing_meta: List[Dict]) -> bool:
    """Return True if *title* matches a paper already recorded in *existing_meta*.

    Titles are compared case-insensitively after stripping surrounding
    whitespace. A missing or ``None`` title in a record is treated as an empty
    string instead of raising.

    Records that carry a ``"status"`` other than ``"downloaded"`` (in this
    file: ``"failed_download"``, ``"skipped_no_pdf"``, ``"duplicate_skipped"``)
    are ignored. Otherwise a paper whose download failed once would be flagged
    as a duplicate and skipped on every later run. Records without a
    ``"status"`` key are still compared.
    """
    wanted = (title or "").lower().strip()
    for record in existing_meta:
        status = record.get("status")
        if status is not None and status != "downloaded":
            continue
        if (record.get("title") or "").lower().strip() == wanted:
            return True
    return False

def load_existing_metadata(path: Optional[str] = None) -> List[Dict]:
    """Load previously saved metadata records.

    Args:
        path: JSON file to read. Defaults to ``METADATA_JSON``.

    Returns:
        The list of record dicts stored in the file, or an empty list when the
        file is missing, unreadable, not valid JSON, or does not contain a
        list. Non-dict items inside the list are dropped.

    A missing file is the normal first-run case and stays silent. Every other
    failure prints a warning, because returning ``[]`` there means the caller
    will start from empty metadata.
    """
    if path is None:
        path = METADATA_JSON
    try:
        with open(path, "r", encoding="utf-8") as f:
            loaded = json.load(f)
    except FileNotFoundError:
        return []
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Warning: could not read metadata from {path}: {exc}")
        return []
    if not isinstance(loaded, list):
        print(f"Warning: metadata in {path} is not a list; ignoring it.")
        return []
    return [record for record in loaded if isinstance(record, dict)]

def save_metadata_list(metadata: List[Dict], json_path: Optional[str] = None,
                       csv_path: Optional[str] = None):
    """Write *metadata* as JSON and, when non-empty, as CSV.

    Args:
        metadata: Records to persist.
        json_path: Destination of the JSON dump. Defaults to ``METADATA_JSON``.
        csv_path: Destination of the CSV export. Defaults to ``METADATA_CSV``.

    The JSON file is always rewritten. The CSV file is only written when there
    is at least one record. Its header is the union of the keys of all
    records, in first-seen order; a record lacking a column gets an empty
    cell. Taking the header from the first record alone made ``DictWriter``
    raise ``ValueError`` as soon as a later record had an extra key.
    """
    json_path = METADATA_JSON if json_path is None else json_path
    csv_path = METADATA_CSV if csv_path is None else csv_path

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)

    if not metadata:
        return

    fieldnames = list(dict.fromkeys(key for record in metadata for key in record))
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames, restval="")
        writer.writeheader()
        writer.writerows(metadata)
# ---------------------------
# SEMANTIC SCHOLAR SEARCH (core)
# ---------------------------
def semantic_scholar_search(query: str, limit: int = SEARCH_LIMIT) -> List[Dict]:
    """Search Semantic Scholar for *query* and return normalised paper records.

    Args:
        query: Free-text search string.
        limit: Maximum number of results requested from the API.

    Returns:
        One dict per result with the keys ``title``, ``authors`` (names joined
        with ", "), ``year``, ``url``, ``is_open_access``, ``pdf_url``,
        ``external_ids`` and ``abstract``.

    Raises:
        requests.HTTPError: If the API answers with an error status
            (via ``raise_for_status``). Other ``requests`` errors propagate too.

    Fields the API returns as JSON ``null`` are normalised here: title becomes
    "No title", abstract "", external ids ``{}`` and an author name "". A
    plain ``dict.get(key, default)`` does not do this, because the default
    only applies when the key is missing, so ``None`` leaked into later string
    operations.
    """
    base = "https://api.semanticscholar.org/graph/v1/paper/search"
    fields = "title,authors,year,url,isOpenAccess,openAccessPdf,externalIds,abstract"
    params = {"query": query, "limit": limit, "fields": fields}
    # Only send the key header when a key is actually configured.
    headers = {"x-api-key": API_KEY} if API_KEY else {}
    resp = requests.get(base, params=params, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()

    papers = []
    for item in data.get("data") or []:
        author_names = [a.get("name") or "" for a in item.get("authors") or []]
        open_access_pdf = item.get("openAccessPdf") or {}
        papers.append({
            "title": item.get("title") or "No title",
            "authors": ", ".join(author_names),
            "year": item.get("year"),
            "url": item.get("url"),
            "is_open_access": bool(item.get("isOpenAccess", False)),
            # An empty URL string is as useless as a missing one.
            "pdf_url": open_access_pdf.get("url") or None,
            "external_ids": item.get("externalIds") or {},
            "abstract": item.get("abstract") or "",
        })
    return papers

# ---------------------------
# DISPLAY / INSIGHTS (Feature 2 & 3)
# ---------------------------
def prepare_and_rank(papers: List[Dict], query: str, existing_meta: List[Dict]) -> List[Dict]:
    """Annotate each paper in place and return the papers best-first.

    Every dict in *papers* gains three keys: ``score`` (title relevance to
    *query*), ``tags`` (derived from the title) and ``is_duplicate`` (whether
    the title is already in *existing_meta*).

    The returned list is a new list, ordered by score and then by year, both
    descending; a missing year counts as 0.
    """
    for paper in papers:
        paper_title = paper["title"]
        paper["score"] = title_relevance_score(paper_title, query)
        paper["tags"] = auto_tags(paper_title)
        paper["is_duplicate"] = is_duplicate(paper_title, existing_meta)

    def rank_key(paper):
        return paper["score"], paper.get("year") or 0

    ranked = list(papers)
    ranked.sort(key=rank_key, reverse=True)
    return ranked

def emoji_access(p):
    """Return the marker for open access when ``p["is_open_access"]`` is truthy, else the marker for restricted access."""
    if p["is_open_access"]:
        return "üîì"
    return "üîí"

def emoji_pdf(p):
    """Return the marker for an available PDF when ``p`` has a truthy ``pdf_url``, else the marker for no PDF."""
    if p.get("pdf_url"):
        return "üìÑ"
    return "‚ùå"

def show_insights_table(papers: List[Dict]):
    """Render a compact overview table of *papers* and return it as a DataFrame.

    Each row holds the list index, the title (cut to 70 characters plus "..."
    when longer), year, authors (cut to 40 characters plus "..." when longer),
    access and PDF markers, relevance score, comma-joined tags and a duplicate
    marker.

    The table is shown with ``display`` when that name exists, as it does
    inside IPython/Jupyter, and with ``print`` otherwise. ``display`` is never
    imported in this file, so calling it unconditionally raised ``NameError``
    when the code ran as a plain script.
    """
    rows = []
    for i, p in enumerate(papers):
        title = p["title"]
        authors = p.get("authors")
        rows.append({
            "Index": i,
            "Title (short)": (title[:70] + "...") if len(title) > 70 else title,
            "Year": p.get("year"),
            "Authors": (authors[:40] + "...") if authors and len(authors) > 40 else authors,
            "Access": emoji_access(p),
            "PDF": emoji_pdf(p),
            "Score": p.get("score", 0),
            "Tags": ", ".join(p.get("tags", [])),
            "Duplicate": "‚ö†" if p.get("is_duplicate") else ""
        })
    df = pd.DataFrame(rows)
    try:
        show = display  # only defined when running under IPython/Jupyter
    except NameError:
        print(df.to_string(index=False))
    else:
        show(df)
    return df

# ---------------------------
# DOWNLOAD LOGIC
# ---------------------------
def download_file(url: str, dest_path: str, max_retries: int = 3, retry_delay: float = 1.2) -> bool:
    """Download *url* to *dest_path*, retrying on failure.

    Args:
        url: Address of the file to fetch.
        dest_path: Where the finished file is stored.
        max_retries: Number of attempts before giving up.
        retry_delay: Base delay in seconds; the wait before the next attempt
            is ``retry_delay * attempt``.

    Returns:
        True once the file has been fully written, False if every attempt failed.

    The body is streamed into ``dest_path + ".part"`` and only moved onto
    *dest_path* after the transfer completed, so a failed attempt never leaves
    a truncated file (or clobbers an earlier good one) under the final name.
    The HTTP response is closed after each attempt, and no delay is spent
    after the last one.
    """
    part_path = dest_path + ".part"
    for attempt in range(1, max_retries + 1):
        try:
            with requests.get(url, stream=True, timeout=30) as r:
                r.raise_for_status()
                try:
                    total = int(r.headers.get('content-length', 0))
                except ValueError:
                    total = 0  # malformed header: treat the size as unknown
                # The progress bar is only shown when the size is known.
                with open(part_path, "wb") as f, tqdm(
                    total=total, unit="B", unit_scale=True, desc="Downloading",
                    leave=False, disable=total <= 0,
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
            os.replace(part_path, dest_path)
            return True
        except (requests.RequestException, OSError) as e:
            print(f"Warning: attempt {attempt} failed for URL: {url[:80]}... Error: {e}")
            try:
                os.remove(part_path)
            except OSError:
                pass  # nothing was written, or it is already gone
            if attempt < max_retries:
                time.sleep(retry_delay * attempt)
    return False

def download_selected_papers(papers: List[Dict], selected_indices: List[int], existing_meta: List[Dict]) -> List[Dict]:
    """Download the PDFs of the chosen papers and report what happened to each.

    Args:
        papers: Paper records; the ``is_duplicate``, ``score`` and ``tags``
            keys are read when present.
        selected_indices: Positions in *papers* to process. Out-of-range
            values are reported and skipped. An index listed more than once is
            processed once; it used to be downloaded again and recorded twice.
        existing_meta: Accepted for interface compatibility; not consulted
            here, since the duplicate flag is read from each paper record.

    Returns:
        One metadata dict per processed index, whose ``status`` is one of
        ``"downloaded"``, ``"failed_download"``, ``"skipped_no_pdf"`` or
        ``"duplicate_skipped"``. ``local_path`` is set only for successful
        downloads.
    """
    metadata_entries = []
    # dict.fromkeys removes repeats while keeping the order the user chose.
    for idx in dict.fromkeys(selected_indices):
        if idx < 0 or idx >= len(papers):
            print(f"Index {idx} out of range ‚Äî skipped.")
            continue
        p = papers[idx]
        # Fall back to a positional name when the title sanitises to nothing.
        title_clean = sanitize_filename(p["title"]) or f"paper_{idx}"
        filename = f"{p.get('year') or 'na'}_{title_clean}.pdf"
        dest_path = os.path.join(PDF_DIR, filename)

        entry = {
            "index_selected": idx,
            "title": p["title"],
            "authors": p.get("authors"),
            "year": p.get("year"),
            "source_url": p.get("url"),
            "pdf_url": p.get("pdf_url"),
            "local_path": None,
            "status": None,
            "score": p.get("score"),
            "tags": p.get("tags"),
            "is_duplicate": p.get("is_duplicate")
        }

        if p.get("is_duplicate"):
            entry["status"] = "duplicate_skipped"
            print(f"‚ö† Duplicate detected - skipped: {p['title']}")
            metadata_entries.append(entry)
            continue

        if not p.get("pdf_url"):
            entry["status"] = "skipped_no_pdf"
            print(f"‚ö† Skipped (no open PDF): {p['title']}")
            metadata_entries.append(entry)
            continue

        print(f"üì• Downloading [{idx}] {p['title']}")
        if download_file(p["pdf_url"], dest_path):
            entry["local_path"] = dest_path
            entry["status"] = "downloaded"
            print(f"‚úÖ Saved to {dest_path}")
        else:
            entry["status"] = "failed_download"
            print(f"‚ùå Failed to download: {p['title']}")
        metadata_entries.append(entry)
    return metadata_entries

# ---------------------------
# MAIN INTERACTIVE FLOW
# ---------------------------
def run_enhanced_flow():
    """Run the interactive search-and-download session.

    Steps, in order: load previously saved metadata, ask for a topic, print
    keyword suggestions, optionally ask for a year range, search Semantic
    Scholar, rank/tag/flag the results, apply the year filter, show the
    overview table, ask which entries to download, download them, and save
    the combined metadata as JSON and CSV.

    Returns early (with a message) on an empty topic, a failed or empty
    search, an empty filtered result, an unparsable selection, or when the
    user declines the final confirmation.
    """
    print("=== Enhanced Milestone-1: Paper Retriever (Your Independent Version) ===\n")
    existing_meta = load_existing_metadata()

    topic = input("Enter your research topic / query: ").strip()
    if not topic:
        print("Topic empty. Exiting.")
        return

    # Feature 1: Smart suggestions
    auto_suggest(topic)

    # Optional year filter
    apply_filter = input("Do you want to filter results by year range? (y/n): ").strip().lower().startswith("y")
    start_year = end_year = None
    if apply_filter:
        try:
            start_year = int(input("Enter start year (e.g., 2019): ").strip())
            end_year = int(input("Enter end year (e.g., 2025): ").strip())
        except ValueError:
            print("Invalid input for years; continuing without year filter.")
            apply_filter = False
        else:
            # Accept the bounds in either order.
            if start_year > end_year:
                start_year, end_year = end_year, start_year

    # Perform search
    print("\nüîé Searching Semantic Scholar...")
    try:
        papers = semantic_scholar_search(topic, limit=SEARCH_LIMIT)
    except (requests.RequestException, ValueError) as exc:
        # Network/HTTP problems (e.g. rate limiting) or an unparsable response
        # body should end the session with a message, not a traceback.
        print(f"Search failed: {exc}")
        return
    if not papers:
        print("No papers found.")
        return

    # Feature 2 & 4 & 5: ranking, tags, duplicate detection
    papers = prepare_and_rank(papers, topic, existing_meta)

    # optional year filtering
    if apply_filter:
        papers_filtered = [p for p in papers if p.get("year") and start_year <= int(p["year"]) <= end_year]
        print(f"Filtered: {len(papers_filtered)} out of {len(papers)} match year range {start_year}-{end_year}.")
        papers = papers_filtered
        if not papers:
            print("No papers after filtering by year.")
            return

    # show insights table (feature 3)
    print("\nüìä Results insights:")
    show_insights_table(papers)

    # selection
    selection_str = input("Select paper indices to download (comma-separated, e.g. 0,2,3) or 'all' to attempt all: ").strip()
    if selection_str.lower() == "all":
        selected_indices = list(range(len(papers)))
    else:
        try:
            selected_indices = [int(s.strip()) for s in selection_str.split(",") if s.strip() != ""]
        except ValueError:
            print("Invalid selection. Exiting.")
            return

    print(f"Selected indices: {selected_indices}")
    if not input("Proceed with downloads? (y/n): ").strip().lower().startswith("y"):
        print("Cancelled.")
        return

    # download selected
    metadata_new = download_selected_papers(papers, selected_indices, existing_meta)

    # combine metadata (append)
    combined_meta = existing_meta + metadata_new
    save_metadata_list(combined_meta)
    print(f"\nüìÅ Metadata saved to: {METADATA_JSON} and {METADATA_CSV}")
    print(f"üìÇ PDFs saved under: {PDF_DIR}")

    # show summary table
    if metadata_new:
        summary = pd.DataFrame(metadata_new)
        try:
            show = display  # only defined when running under IPython/Jupyter
        except NameError:
            print(summary.to_string(index=False))
        else:
            show(summary)
    else:
        print("No new metadata generated.")

# Entry point: start the interactive flow only when this module is executed as
# the main program, so that importing it does not trigger the prompts.
if __name__ == "__main__":
    run_enhanced_flow()


=== Enhanced Milestone-1: Paper Retriever (Your Independent Version) ===


üí° Tip: Consider adding domain or method words (e.g., 'healthcare', 'CNN', 'transformer') for better search results.


üîé Searching Semantic Scholar...
Filtered: 12 out of 12 match year range 2020-2025.

üìä Results insights:


Unnamed: 0,Index,Title (short),Year,Authors,Access,PDF,Score,Tags,Duplicate
0,0,Social Determinants of Mental Health,2024,Robert A. Horne,üîí,‚ùå,4,"health, determinants, mental, social",
1,1,The social determinants of mental health and d...,2024,"James B Kirkbride, Deidre M. Anglin, Ian...",üîì,üìÑ,4,"health, mental, disorder, prevention",
2,2,Factors Associated With Mental Health Outcomes...,2020,"Jianbo Lai, Simeng Ma, Ying Wang, Zhongx...",üîì,üìÑ,4,"disease, health, workers, mental, coronavirus",
3,3,Substance Abuse and Mental Health Services Adm...,2020,"Brian Altman, Jane Powers, Christian Huy...",üîì,üìÑ,4,"health, mental, administration, substance",
4,4,Impact of COVID-19 pandemic on mental health i...,2020,"Jiaqi Xiong, Orly Lipsitz, F. Nasri, L. ...",üîì,üìÑ,4,"covid, health, review, mental, pandemic, general",
5,5,Mental Health and the Covid-19 Pandemic.,2020,"B. Pfefferbaum, C. North",üîì,üìÑ,4,"covid, health, mental, pandemic",
6,6,Mental health problems and social media exposu...,2020,"Junling Gao, P. Zheng, Yingnan Jia, Hao ...",üîì,üìÑ,4,"covid, health, mental, during, problems",
7,7,Effects of COVID-19 on College Students‚Äô Menta...,2020,"Changwon Son, Sudeep Hegde, Alec Smith, ...",üîì,üìÑ,4,"covid, health, survey, mental, states, united",
8,8,The outbreak of COVID-19 coronavirus and its i...,2020,"J. Torales, M. O‚ÄôHiggins, J. Castaldelli...",üîì,üìÑ,4,"covid, health, global, coronavirus, mental",
9,9,"Mental Health, Substance Use, and Suicidal Ide...",2020,"Mark E Czeisler, R. Lane, Emiko Petrosky...",üîì,üìÑ,4,"covid, health, mental, pandemic, during",


Selected indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]
Proceed with downloads? (y/n): y
‚ö† Skipped (no open PDF): Social Determinants of Mental Health
üì• Downloading [1] The social determinants of mental health and disorder: evidence, prevention and recommendations
‚ùå Failed to download: The social determinants of mental health and disorder: evidence, prevention and recommendations
üì• Downloading [2] Factors Associated With Mental Health Outcomes Among Health Care Workers Exposed to Coronavirus Disease 2019
‚ùå Failed to download: Factors Associated With Mental Health Outcomes Among Health Care Workers Exposed to Coronavirus Disease 2019
üì• Downloading [3] Substance Abuse and Mental Health Services Administration




‚úÖ Saved to data/pdfs/2020_Substance Abuse and Mental Health Services Administration.pdf
üì• Downloading [4] Impact of COVID-19 pandemic on mental health in the general population: A systematic review
‚ùå Failed to download: Impact of COVID-19 pandemic on mental health in the general population: A systematic review
üì• Downloading [5] Mental Health and the Covid-19 Pandemic.
‚ùå Failed to download: Mental Health and the Covid-19 Pandemic.
üì• Downloading [6] Mental health problems and social media exposure during COVID-19 outbreak
‚úÖ Saved to data/pdfs/2020_Mental health problems and social media exposure during COVID-19 outbreak.pdf
üì• Downloading [7] Effects of COVID-19 on College Students‚Äô Mental Health in the United States: Interview Survey Study
‚úÖ Saved to data/pdfs/2020_Effects of COVID-19 on College Students‚Äô Mental Health in the United States Interview Survey Study.pdf
üì• Downloading [8] The outbreak of COVID-19 coronavirus and its impact on global mental health




‚úÖ Saved to data/pdfs/2020_Mental Health, Substance Use, and Suicidal Ideation During the COVID-19 Pandemic ‚Äî United States, Ju.pdf
üì• Downloading [10] COVID-19 and mental health: A review of the existing literature




‚úÖ Saved to data/pdfs/2020_COVID-19 and mental health A review of the existing literature.pdf
üì• Downloading [11] Timely mental health care for the 2019 novel coronavirus outbreak is urgently needed
‚ùå Failed to download: Timely mental health care for the 2019 novel coronavirus outbreak is urgently needed

üìÅ Metadata saved to: data/metadata.json and data/metadata.csv
üìÇ PDFs saved under: data/pdfs


Unnamed: 0,index_selected,title,authors,year,source_url,pdf_url,local_path,status,score,tags,is_duplicate
0,0,Social Determinants of Mental Health,Robert A. Horne,2024,https://www.semanticscholar.org/paper/d6414c0b...,,,skipped_no_pdf,4,"[health, determinants, mental, social]",False
1,1,The social determinants of mental health and d...,"James B Kirkbride, Deidre M. Anglin, Ian Colma...",2024,https://www.semanticscholar.org/paper/efcce83f...,https://onlinelibrary.wiley.com/doi/pdfdirect/...,,failed_download,4,"[health, mental, disorder, prevention]",False
2,2,Factors Associated With Mental Health Outcomes...,"Jianbo Lai, Simeng Ma, Ying Wang, Zhongxiang C...",2020,https://www.semanticscholar.org/paper/2f946366...,https://jamanetwork.com/journals/jamanetworkop...,,failed_download,4,"[disease, health, workers, mental, coronavirus]",False
3,3,Substance Abuse and Mental Health Services Adm...,"Brian Altman, Jane Powers, Christian Huygen, C...",2020,https://www.semanticscholar.org/paper/db87118c...,http://files.eric.ed.gov/fulltext/ED445460.pdf,data/pdfs/2020_Substance Abuse and Mental Heal...,downloaded,4,"[health, mental, administration, substance]",False
4,4,Impact of COVID-19 pandemic on mental health i...,"Jiaqi Xiong, Orly Lipsitz, F. Nasri, L. Lui, H...",2020,https://www.semanticscholar.org/paper/c400f685...,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7...,,failed_download,4,"[covid, health, review, mental, pandemic, gene...",False
5,5,Mental Health and the Covid-19 Pandemic.,"B. Pfefferbaum, C. North",2020,https://www.semanticscholar.org/paper/bd734c10...,https://www.nejm.org/doi/pdf/10.1056/NEJMp2008017,,failed_download,4,"[covid, health, mental, pandemic]",False
6,6,Mental health problems and social media exposu...,"Junling Gao, P. Zheng, Yingnan Jia, Hao Chen, ...",2020,https://www.semanticscholar.org/paper/e6352661...,https://journals.plos.org/plosone/article/file...,data/pdfs/2020_Mental health problems and soci...,downloaded,4,"[covid, health, mental, during, problems]",False
7,7,Effects of COVID-19 on College Students‚Äô Menta...,"Changwon Son, Sudeep Hegde, Alec Smith, Xiaome...",2020,https://www.semanticscholar.org/paper/a0da8c95...,https://www.jmir.org/2020/9/e21279/PDF,data/pdfs/2020_Effects of COVID-19 on College ...,downloaded,4,"[covid, health, survey, mental, states, united]",False
8,8,The outbreak of COVID-19 coronavirus and its i...,"J. Torales, M. O‚ÄôHiggins, J. Castaldelli-Maia,...",2020,https://www.semanticscholar.org/paper/de299e10...,https://journals.sagepub.com/doi/pdf/10.1177/0...,,failed_download,4,"[covid, health, global, coronavirus, mental]",False
9,9,"Mental Health, Substance Use, and Suicidal Ide...","Mark E Czeisler, R. Lane, Emiko Petrosky, J. W...",2020,https://www.semanticscholar.org/paper/5f816081...,https://doi.org/10.15585/mmwr.mm6932a1,"data/pdfs/2020_Mental Health, Substance Use, a...",downloaded,4,"[covid, health, mental, pandemic, during]",False
