In [3]:
2#  !pip install requests pandas tqdm

import os
import re
import time
import json
import csv
import requests
import pandas as pd
from tqdm import tqdm
from typing import List, Dict, Optional

# ---------------------------
# CONFIG - Replace API_KEY
# ---------------------------
API_KEY = "JhnIenwb3246rhRB3pNlw5FR2MuiNx5M4BCQ9DhS"  # <--Semantic Scholar API key
DATA_DIR = "data"
PDF_DIR = os.path.join(DATA_DIR, "pdfs")
METADATA_JSON = os.path.join(DATA_DIR, "metadata.json")
METADATA_CSV = os.path.join(DATA_DIR, "metadata.csv")
SEARCH_LIMIT = 12  # number of results to fetch by default

os.makedirs(PDF_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)

# ---------------------------
# SMART SUGGESTIONS (Feature 1)
# ---------------------------
SUGGESTIONS = {
    "ai": ["artificial intelligence healthcare", "deep learning medical imaging", "explainable AI"],
    "nlp": ["transformers for text summarization", "nlp in healthcare"],
    "ml": ["machine learning for diagnosis", "ml anomaly detection"],
    "covid": ["covid-19 vaccine efficacy", "covid-19 transmission modeling"]
}

def auto_suggest(topic: str):
    key = topic.lower().strip()
    printed = False
    for short, suggestions in SUGGESTIONS.items():
        if key == short:
            print("\nüí° Suggested research keyword variations:")
            for s in suggestions:
                print("   üî∏", s)
            printed = True
    if not printed and len(topic.split()) <= 2:
        # show general hints
        print("\nüí° Tip: Consider adding domain or method words (e.g., 'healthcare', 'CNN', 'transformer') for better search results.")
    print()

# ---------------------------
# UTILITIES
# ---------------------------
def sanitize_filename(name: str, max_len: int = 100) -> str:
    name = re.sub(r'[\\/*?:"<>|]', "", name)
    name = re.sub(r'\s+', ' ', name).strip()
    return name[:max_len].strip()

KEYWORD_TAGS = ["disease", "covid", "classification", "deep", "cnn", "transformer",
                "diagnosis", "health", "medical", "image", "nlp", "survey", "review", "anomaly"]

def auto_tags(title: str) -> List[str]:
    title_lower = title.lower()
    tags = [kw for kw in KEYWORD_TAGS if kw in title_lower]
    # also add words longer than 5 chars that appear frequently as naive tag (not too noisy)
    extra = [w for w in set(re.findall(r'\b[a-z]{6,}\b', title_lower)) if w not in tags][:3]
    return list(dict.fromkeys(tags + extra))  # preserve order, uniq

def title_relevance_score(title: str, query: str) -> int:
    # simple scoring: keyword exact matches + token overlap
    title_l = title.lower()
    query_tokens = [t for t in re.findall(r'\w+', query.lower()) if len(t) > 1]
    score = 0
    for t in query_tokens:
        if t in title_l:
            score += 2
        # partial token in title: +1
        for word in re.findall(r'\w+', title_l):
            if t in word and t != word:
                score += 1
    return score

def is_duplicate(title: str, existing_meta: List[Dict]) -> bool:
    t = title.lower().strip()
    for m in existing_meta:
        if m.get("title","").lower().strip() == t:
            return True
    return False

def load_existing_metadata() -> List[Dict]:
    if os.path.exists(METADATA_JSON):
        try:
            with open(METADATA_JSON, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception:
            return []
    return []

def save_metadata_list(metadata: List[Dict]):
    with open(METADATA_JSON, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=2, ensure_ascii=False)
    if metadata:
        keys = metadata[0].keys()
        with open(METADATA_CSV, "w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=list(keys))
            writer.writeheader()
            writer.writerows(metadata)

# ---------------------------
# SEMANTIC SCHOLAR SEARCH (core)
# ---------------------------
def semantic_scholar_search(query: str, limit: int = SEARCH_LIMIT) -> List[Dict]:
    base = "https://api.semanticscholar.org/graph/v1/paper/search"
    fields = "title,authors,year,url,isOpenAccess,openAccessPdf,externalIds,abstract"
    params = {"query": query, "limit": limit, "fields": fields}
    headers = {"x-api-key": API_KEY}
    resp = requests.get(base, params=params, headers=headers, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    papers = []
    for item in data.get("data", []):
        title = item.get("title", "No title")
        authors = ", ".join([a.get("name","") for a in item.get("authors", [])]) if item.get("authors") else ""
        year = item.get("year", None)
        url = item.get("url", None)
        is_open = bool(item.get("isOpenAccess", False))
        pdf_url = None
        if item.get("openAccessPdf"):
            pdf_url = item["openAccessPdf"].get("url")
        abstract = item.get("abstract", "")
        external = item.get("externalIds", {})
        papers.append({
            "title": title,
            "authors": authors,
            "year": year,
            "url": url,
            "is_open_access": is_open,
            "pdf_url": pdf_url,
            "external_ids": external,
            "abstract": abstract
        })
    return papers

# ---------------------------
# DISPLAY / INSIGHTS (Feature 2 & 3)
# ---------------------------
def prepare_and_rank(papers: List[Dict], query: str, existing_meta: List[Dict]) -> List[Dict]:
    for p in papers:
        p["score"] = title_relevance_score(p["title"], query)
        p["tags"] = auto_tags(p["title"])
        p["is_duplicate"] = is_duplicate(p["title"], existing_meta)
    # sort by score (desc) then year (desc if exists)
    papers_sorted = sorted(papers, key=lambda x: (x["score"], x.get("year") or 0), reverse=True)
    return papers_sorted

def emoji_access(p):
    return "üîì" if p["is_open_access"] else "üîí"

def emoji_pdf(p):
    return "üìÑ" if p.get("pdf_url") else "‚ùå"

def show_insights_table(papers: List[Dict]):
    rows = []
    for i,p in enumerate(papers):
        rows.append({
            "Index": i,
            "Title (short)": (p["title"][:70] + "...") if len(p["title"])>70 else p["title"],
            "Year": p.get("year"),
            "Authors": (p["authors"][:40] + "...") if p.get("authors") and len(p["authors"])>40 else p.get("authors"),
            "Access": emoji_access(p),
            "PDF": emoji_pdf(p),
            "Score": p.get("score", 0),
            "Tags": ", ".join(p.get("tags", [])),
            "Duplicate": "‚ö†" if p.get("is_duplicate") else ""
        })
    df = pd.DataFrame(rows)
    display(df)
    return df

# ---------------------------
# DOWNLOAD LOGIC
# ---------------------------
def download_file(url: str, dest_path: str, max_retries: int = 3, retry_delay: float = 1.2) -> bool:
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, stream=True, timeout=30)
            r.raise_for_status()
            total = int(r.headers.get('content-length', 0))
            with open(dest_path, "wb") as f:
                if total and total > 0:
                    pbar = tqdm(total=total, unit="B", unit_scale=True, desc=f"Downloading", leave=False)
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
                    pbar.close()
                else:
                    f.write(r.content)
            return True
        except Exception as e:
            print(f"Warning: attempt {attempt} failed for URL: {url[:80]}... Error: {e}")
            time.sleep(retry_delay * attempt)
    return False

def download_selected_papers(papers: List[Dict], selected_indices: List[int], existing_meta: List[Dict]) -> List[Dict]:
    metadata_entries = []
    for idx in selected_indices:
        if idx < 0 or idx >= len(papers):
            print(f"Index {idx} out of range ‚Äî skipped.")
            continue
        p = papers[idx]
        title_clean = sanitize_filename(p["title"]) or f"paper_{idx}"
        filename = f"{p.get('year') or 'na'}_{title_clean}.pdf"
        dest_path = os.path.join(PDF_DIR, filename)

        entry = {
            "index_selected": idx,
            "title": p["title"],
            "authors": p.get("authors"),
            "year": p.get("year"),
            "source_url": p.get("url"),
            "pdf_url": p.get("pdf_url"),
            "local_path": None,
            "status": None,
            "score": p.get("score"),
            "tags": p.get("tags"),
            "is_duplicate": p.get("is_duplicate")
        }

        if p.get("is_duplicate"):
            entry["status"] = "duplicate_skipped"
            print(f"‚ö† Duplicate detected - skipped: {p['title']}")
            metadata_entries.append(entry)
            continue

        if not p.get("pdf_url"):
            entry["status"] = "skipped_no_pdf"
            print(f"‚ö† Skipped (no open PDF): {p['title']}")
            metadata_entries.append(entry)
            continue

        print(f"üì• Downloading [{idx}] {p['title']}")
        ok = download_file(p["pdf_url"], dest_path)
        if ok:
            entry["local_path"] = dest_path
            entry["status"] = "downloaded"
            print(f"‚úÖ Saved to {dest_path}")
        else:
            entry["status"] = "failed_download"
            print(f"‚ùå Failed to download: {p['title']}")
        metadata_entries.append(entry)
    return metadata_entries

# ---------------------------
# MAIN INTERACTIVE FLOW
# ---------------------------
def run_enhanced_flow():
    print("=== Enhanced Milestone-1: Paper Retriever (Your Independent Version) ===\n")
    existing_meta = load_existing_metadata()

    topic = input("Enter your research topic / query: ").strip()
    if not topic:
        print("Topic empty. Exiting.")
        return

    # Feature 1: Smart suggestions
    auto_suggest(topic)

    # Ask year filter optionally (user chose B earlier)
    apply_filter = input("Do you want to filter results by year range? (y/n): ").strip().lower().startswith("y")
    start_year = end_year = None
    if apply_filter:
        try:
            start_year = int(input("Enter start year (e.g., 2019): ").strip())
            end_year = int(input("Enter end year (e.g., 2025): ").strip())
            if start_year > end_year:
                start_year, end_year = end_year, start_year
        except Exception:
            print("Invalid input for years; continuing without year filter.")
            apply_filter = False

    # Perform search
    print("\nüîé Searching Semantic Scholar...")
    papers = semantic_scholar_search(topic, limit=SEARCH_LIMIT)
    if not papers:
        print("No papers found.")
        return

    # Feature 2 & 4 & 5: ranking, tags, duplicate detection
    papers = prepare_and_rank(papers, topic, existing_meta)

    # optional year filtering
    if apply_filter:
        papers_filtered = [p for p in papers if p.get("year") and start_year <= int(p["year"]) <= end_year]
        print(f"Filtered: {len(papers_filtered)} out of {len(papers)} match year range {start_year}-{end_year}.")
        papers = papers_filtered
        if not papers:
            print("No papers after filtering by year.")
            return

    # show insights table (feature 3)
    print("\nüìä Results insights:")
    show_insights_table(papers)

    # selection
    selection_str = input("Select paper indices to download (comma-separated, e.g. 0,2,3) or 'all' to attempt all: ").strip()
    if selection_str.lower() == "all":
        selected_indices = list(range(len(papers)))
    else:
        try:
            selected_indices = [int(s.strip()) for s in selection_str.split(",") if s.strip()!=""]
        except Exception:
            print("Invalid selection. Exiting.")
            return

    print(f"Selected indices: {selected_indices}")
    if not input("Proceed with downloads? (y/n): ").strip().lower().startswith("y"):
        print("Cancelled.")
        return

    # download selected
    metadata_new = download_selected_papers(papers, selected_indices, existing_meta)

    # combine metadata (append)
    combined_meta = existing_meta + metadata_new
    save_metadata_list(combined_meta)
    print(f"\nüìÅ Metadata saved to: {METADATA_JSON} and {METADATA_CSV}")
    print(f"üìÇ PDFs saved under: {PDF_DIR}")

    # show summary table
    if metadata_new:
        display(pd.DataFrame(metadata_new))
    else:
        print("No new metadata generated.")

# Run it:
if __name__ == "__main__":
    run_enhanced_flow()


=== Enhanced Milestone-1: Paper Retriever (Your Independent Version) ===

Enter your research topic / query: healthcare

üí° Tip: Consider adding domain or method words (e.g., 'healthcare', 'CNN', 'transformer') for better search results.

Do you want to filter results by year range? (y/n): y
Enter start year (e.g., 2019): 2020
Enter end year (e.g., 2025): 2025

üîé Searching Semantic Scholar...
Filtered: 8 out of 12 match year range 2020-2025.

üìä Results insights:


Unnamed: 0,Index,Title (short),Year,Authors,Access,PDF,Score,Tags,Duplicate
0,0,Global Regulatory Frameworks for the Use of Ar...,2024,"K. Palaniappan, Elaine Yan Ting Lin, Sil...",üîì,üìÑ,2,"health, artificial, sector, intelligence",
1,1,Transformative Potential of AI in Healthcare: ...,2024,"Molly Bekbolatova, Jonathan Mayer, Chiwe...",üîì,üìÑ,2,"health, transformative, navigating, applications",
2,2,The Role of AI in Hospitals and Clinics: Trans...,2024,"Shiva Maleki Varnosfaderani, Mohamad For...",üîì,üìÑ,2,"health, hospitals, century, transforming",
3,3,Generative AI in healthcare: an implementation...,2024,S. Reddy,üîì,üìÑ,2,"health, implementation, generative, translational",
4,4,Balancing Privacy and Progress: A Review of Pr...,2024,"Steven M. Williamson, Victor R. Prybutok",üîì,üìÑ,2,"health, review, systemic, patient, perceptions",
5,5,A Comprehensive Review on Exploring the Impact...,2024,"Pankajkumar A Anawade, Deepak Sharma, Sh...",üîì,üìÑ,2,"health, review, accessibility, exploring, comp...",
6,6,"ChatGPT Utility in Healthcare Education, Resea...",2023,Malik Sallam,üîì,üìÑ,2,"health, review, promising, chatgpt, education",
7,7,Revolutionizing healthcare: the role of artifi...,2023,"Shuroug A. Alowais, Sahar S. Alghamdi, N...",üîì,üìÑ,2,"health, artificial, intelligence, revolutionizing",


Select paper indices to download (comma-separated, e.g. 0,2,3) or 'all' to attempt all: all
Selected indices: [0, 1, 2, 3, 4, 5, 6, 7]
Proceed with downloads? (y/n): y
üì• Downloading [0] Global Regulatory Frameworks for the Use of Artificial Intelligence (AI) in the Healthcare Services Sector
‚ùå Failed to download: Global Regulatory Frameworks for the Use of Artificial Intelligence (AI) in the Healthcare Services Sector
üì• Downloading [1] Transformative Potential of AI in Healthcare: Definitions, Applications, and Navigating the Ethical Landscape and Public Perspectives
‚ùå Failed to download: Transformative Potential of AI in Healthcare: Definitions, Applications, and Navigating the Ethical Landscape and Public Perspectives
üì• Downloading [2] The Role of AI in Hospitals and Clinics: Transforming Healthcare in the 21st Century
‚ùå Failed to download: The Role of AI in Hospitals and Clinics: Transforming Healthcare in the 21st Century
üì• Downloading [3] Generative AI in healthc



‚úÖ Saved to data/pdfs/2024_Generative AI in healthcare an implementation science informed translational path on application, in.pdf
üì• Downloading [4] Balancing Privacy and Progress: A Review of Privacy Challenges, Systemic Oversight, and Patient Perceptions in AI-Driven Healthcare
‚ùå Failed to download: Balancing Privacy and Progress: A Review of Privacy Challenges, Systemic Oversight, and Patient Perceptions in AI-Driven Healthcare
üì• Downloading [5] A Comprehensive Review on Exploring the Impact of Telemedicine on Healthcare Accessibility




‚úÖ Saved to data/pdfs/2024_A Comprehensive Review on Exploring the Impact of Telemedicine on Healthcare Accessibility.pdf
üì• Downloading [6] ChatGPT Utility in Healthcare Education, Research, and Practice: Systematic Review on the Promising Perspectives and Valid Concerns
‚ùå Failed to download: ChatGPT Utility in Healthcare Education, Research, and Practice: Systematic Review on the Promising Perspectives and Valid Concerns
üì• Downloading [7] Revolutionizing healthcare: the role of artificial intelligence in clinical practice


                                                                

‚úÖ Saved to data/pdfs/2023_Revolutionizing healthcare the role of artificial intelligence in clinical practice.pdf

üìÅ Metadata saved to: data/metadata.json and data/metadata.csv
üìÇ PDFs saved under: data/pdfs




Unnamed: 0,index_selected,title,authors,year,source_url,pdf_url,local_path,status,score,tags,is_duplicate
0,0,Global Regulatory Frameworks for the Use of Ar...,"K. Palaniappan, Elaine Yan Ting Lin, Silke Vogel",2024,https://www.semanticscholar.org/paper/2fba8788...,https://www.mdpi.com/2227-9032/12/5/562/pdf?ve...,,failed_download,2,"[health, artificial, sector, intelligence]",False
1,1,Transformative Potential of AI in Healthcare: ...,"Molly Bekbolatova, Jonathan Mayer, Chiwei Ong,...",2024,https://www.semanticscholar.org/paper/467411fd...,https://www.mdpi.com/2227-9032/12/2/125/pdf?ve...,,failed_download,2,"[health, transformative, navigating, applicati...",False
2,2,The Role of AI in Hospitals and Clinics: Trans...,"Shiva Maleki Varnosfaderani, Mohamad Forouzanfar",2024,https://www.semanticscholar.org/paper/a779f5dd...,https://www.mdpi.com/2306-5354/11/4/337/pdf?ve...,,failed_download,2,"[health, hospitals, century, transforming]",False
3,3,Generative AI in healthcare: an implementation...,S. Reddy,2024,https://www.semanticscholar.org/paper/25361e95...,https://implementationscience.biomedcentral.co...,data/pdfs/2024_Generative AI in healthcare an ...,downloaded,2,"[health, implementation, generative, translati...",False
4,4,Balancing Privacy and Progress: A Review of Pr...,"Steven M. Williamson, Victor R. Prybutok",2024,https://www.semanticscholar.org/paper/e98c743f...,https://www.mdpi.com/2076-3417/14/2/675/pdf?ve...,,failed_download,2,"[health, review, systemic, patient, perceptions]",False
5,5,A Comprehensive Review on Exploring the Impact...,"Pankajkumar A Anawade, Deepak Sharma, Shailesh...",2024,https://www.semanticscholar.org/paper/5f11cbbd...,https://assets.cureus.com/uploads/review_artic...,data/pdfs/2024_A Comprehensive Review on Explo...,downloaded,2,"[health, review, accessibility, exploring, com...",False
6,6,"ChatGPT Utility in Healthcare Education, Resea...",Malik Sallam,2023,https://www.semanticscholar.org/paper/dfdf7ff0...,https://www.mdpi.com/2227-9032/11/6/887/pdf?ve...,,failed_download,2,"[health, review, promising, chatgpt, education]",False
7,7,Revolutionizing healthcare: the role of artifi...,"Shuroug A. Alowais, Sahar S. Alghamdi, Nada Al...",2023,https://www.semanticscholar.org/paper/5cde4748...,https://bmcmededuc.biomedcentral.com/counter/p...,data/pdfs/2023_Revolutionizing healthcare the ...,downloaded,2,"[health, artificial, intelligence, revolutioni...",False


In [4]:
!pip install pymupdf nltk scikit-learn


Collecting pymupdf
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m24.1/24.1 MB[0m [31m57.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.7


In [5]:
import os

PDF_DIR = "data/pdfs"

print("Folder exists:", os.path.exists(PDF_DIR))

if os.path.exists(PDF_DIR):
    files = os.listdir(PDF_DIR)
    print("Files inside data/pdfs:", files)
else:
    print("‚ùå data/pdfs folder not found")


Folder exists: True
Files inside data/pdfs: ['2024_Generative AI in healthcare an implementation science informed translational path on application, in.pdf', '2024_A Comprehensive Review on Exploring the Impact of Telemedicine on Healthcare Accessibility.pdf', '2023_Revolutionizing healthcare the role of artificial intelligence in clinical practice.pdf']


In [6]:
## Extract text
import fitz

files = os.listdir(PDF_DIR)

if not files:
    print("‚ùå No PDFs found. Upload or download PDFs first.")
else:
    pdf_path = os.path.join(PDF_DIR, files[0])
    print("Using PDF:", pdf_path)

    doc = fitz.open(pdf_path)
    print("Total pages:", len(doc))

    text = ""
    for page in doc:
        text += page.get_text()

    print("\nüîπ STEP 2 OUTPUT (first 500 chars):\n")
    print(text[:500])
    print("\nTotal characters extracted:", len(text))


Using PDF: data/pdfs/2024_Generative AI in healthcare an implementation science informed translational path on application, in.pdf
Total pages: 15

üîπ STEP 2 OUTPUT (first 500 chars):

Reddy ÔªøImplementation Science           (2024) 19:27  
https://doi.org/10.1186/s13012-024-01357-9
DEBATE
Open Access
¬© The Author(s) 2024. Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which 
permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the 
original author(s) and the source, provide a link to the Creative Commons licence, and indicate if changes were made.

Total characters extracted: 77441


In [7]:
## Split into sections
import re

def split_into_sections(text):
    patterns = {
        "abstract": r"\babstract\b",
        "introduction": r"\bintroduction\b",
        "methodology": r"\b(method|methodology)\b",
        "results": r"\bresults\b",
        "conclusion": r"\b(conclusion|discussion)\b"
    }

    text_l = text.lower()
    indices = {k: re.search(v, text_l).start()
               for k, v in patterns.items() if re.search(v, text_l)}

    sections = {}
    sorted_items = sorted(indices.items(), key=lambda x: x[1])

    for i, (sec, start) in enumerate(sorted_items):
        end = sorted_items[i+1][1] if i+1 < len(sorted_items) else len(text)
        sections[sec] = text[start:end]

    return sections


sections = split_into_sections(text)

print("\nüîπ STEP 3 OUTPUT ‚Äì Sections Found:")
print(sections.keys())



üîπ STEP 3 OUTPUT ‚Äì Sections Found:
dict_keys(['abstract', 'results', 'methodology', 'introduction', 'conclusion'])


In [8]:
# Key findings
import nltk

# Ensure necessary NLTK resources are available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('tokenizers/punkt_tab')
except LookupError:
    nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize

# Function to extract key findings (sentence-wise)
def extract_key_findings(text):
    if not text:
        return []
    # Split text into sentences
    sentences = sent_tokenize(text)
    # Here you can add further filtering or keyword extraction if needed
    return sentences

# Example usage with sections dictionary
# sections = {'Introduction': 'Your text here...', 'Results': 'Your results text...'}
for sec, content in sections.items():
    print(f"\n{sec.upper()}:")
    for s in extract_key_findings(content):
        print("‚Ä¢", s)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



ABSTRACT:
‚Ä¢ Abstract‚ÄÉ
Background‚ÄÇ Artificial intelligence (AI), particularly generative AI, has¬†emerged as¬†a¬†transformative tool in¬†healthcare, 
with¬†the¬†potential to¬†revolutionize clinical decision-making and¬†improve health outcomes.
‚Ä¢ Generative AI, capable 
of¬†generating new data such as¬†text and¬†images, holds promise in¬†enhancing patient care, revolutionizing disease 
diagnosis and¬†expanding treatment options.
‚Ä¢ However, the¬†utility and¬†impact of¬†generative AI in¬†healthcare remain 
poorly understood, with¬†concerns around¬†ethical and¬†medico-legal implications, integration into¬†healthcare ser-
vice delivery and¬†workforce utilisation.
‚Ä¢ Also, there is¬†not¬†a clear pathway to¬†implement and¬†integrate generative AI 
in¬†healthcare delivery.
‚Ä¢ Methods‚ÄÇ This article aims to¬†provide a¬†comprehensive overview of¬†the¬†use of¬†generative AI in¬†healthcare, focus-
ing on¬†the¬†utility of¬†the¬†technology in¬†healthcare and¬†its translational applicati

In [None]:
# Process all PDFs
print("\nüîπ STEP 5 OUTPUT ‚Äì All PDFs")

for pdf in os.listdir(PDF_DIR):
    path = os.path.join(PDF_DIR, pdf)
    text = extract_text_from_pdf(path)
    sections = split_into_sections(text)

    print("\nPDF:", pdf)
    print("Sections:", list(sections.keys()))




üîπ STEP 5 OUTPUT ‚Äì All PDFs

PDF: 2020_Substance Abuse and Mental Health Services Administration.pdf
Sections: ['abstract', 'results', 'introduction', 'methodology', 'conclusion']


In [27]:
# -------------------------------------------------------
# WEEK 5‚Äì6: OFFLINE DRAFT GENERATION + APA REFERENCES
# - Generates Abstract, Methods, Results
# - Synthesizes multiple papers
# - Formats APA references from metadata.csv
# - No OpenAI API required
# -------------------------------------------------------
import os
import pandas as pd

OUTPUT_DIR = "drafts"
os.makedirs(OUTPUT_DIR, exist_ok=True)

DATA_DIR = "data"  # folder containing metadata.csv
METADATA_CSV = os.path.join(DATA_DIR, "metadata.csv")

def generate_apa_references(metadata_file, output_file):
    if not os.path.exists(metadata_file):
        print(f"‚ùå Metadata file '{metadata_file}' not found. Skipping references.")
        return
    df = pd.read_csv(metadata_file)
    references = []
    for _, row in df.iterrows():
        authors = row.get('authors', 'Unknown Author')
        year = row.get('year', 'n.d.')
        title = row.get('title', 'Untitled')
        venue = row.get('venue', 'Unknown Journal')
        references.append(f"{authors} ({year}). {title}. {venue}.")
    with open(os.path.join(OUTPUT_DIR, output_file), "w", encoding="utf-8") as f:
        f.write("\n\n".join(references))
    print(f"‚úÖ APA references saved to {output_file}")

# Generate references
generate_apa_references(METADATA_CSV, "references.txt")


‚úÖ APA references saved to references.txt
