In [None]:
!pip install -U ibm-watsonx-ai sentence-transformers transformers regex


Collecting ibm-watsonx-ai
  Downloading ibm_watsonx_ai-1.4.1-py3-none-any.whl.metadata (3.3 kB)
Collecting regex
  Downloading regex-2025.9.18-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.5/40.5 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting lomond (from ibm-watsonx-ai)
  Downloading lomond-0.3.3-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting ibm-cos-sdk<2.15.0,>=2.12.0 (from ibm-watsonx-ai)
  Downloading ibm_cos_sdk-2.14.3.tar.gz (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.8/58.8 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ibm-cos-sdk-core==2.14.3 (from ibm-cos-sdk<2.15.0,>=2.12.0->ibm-watsonx-ai)
  Downloading ibm_cos_sdk_core-2.14.3.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m18.7 MB/s[0m eta [

In [None]:
# ============================================================
# COLAB: Tweet summary + company & drug extraction (watsonx)
# - Uses local file already in Colab workspace
# - Includes therapy classes like "chemotherapy" and "immunotherapy"
# ============================================================

# 0) Install minimal deps
%pip install -U ibm-watsonx-ai regex

# 1) ---- HARD-CODE YOUR CREDENTIALS & MODEL ----
WATSONX_API_KEY = "B5EQe7EQckE5N1eLMtaAfGi1Gr9rOqGjyyGLN8MMispB"
WATSONX_URL     = "https://us-south.ml.cloud.ibm.com"   # adjust if your region differs
# Use ONE of these. If both are filled, SPACE_ID will be used.
WATSONX_PROJECT_ID = ""  # e.g., "250bddc8-62e9-447d-b4cd-fffba8bfff05"
WATSONX_SPACE_ID   = "1f66bc7c-f805-476f-92f6-1d6ebd561f15"

MODEL_ID = "meta-llama/llama-3-3-70b-instruct"  # chat_model on watsonx

# 2) ---- RUNTIME SETTINGS ----
TEMPERATURE = 0.3
TOP_P = 0.9
MAX_NEW_TOKENS = 300

# 3) ---- IMPORTS & MODEL SETUP ----
import os, json, re, os.path
from typing import List, Dict, Optional
from urllib.parse import urlparse

assert WATSONX_API_KEY and WATSONX_URL and (WATSONX_PROJECT_ID or WATSONX_SPACE_ID), \
    "Missing WATSONX_API_KEY/WATSONX_URL and either PROJECT_ID or SPACE_ID."

os.environ["WATSONX_API_KEY"]  = WATSONX_API_KEY
os.environ["WATSONX_URL"]      = WATSONX_URL
if WATSONX_SPACE_ID:
    os.environ["WATSONX_SPACE_ID"]   = WATSONX_SPACE_ID
    os.environ.pop("WATSONX_PROJECT_ID", None)
else:
    os.environ["WATSONX_PROJECT_ID"] = WATSONX_PROJECT_ID
    os.environ.pop("WATSONX_SPACE_ID", None)

from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import Model

creds = Credentials(url=os.environ["WATSONX_URL"], api_key=os.environ["WATSONX_API_KEY"])
params = {"decoding_method":"sample","temperature":float(TEMPERATURE),"top_p":float(TOP_P),"max_new_tokens":int(MAX_NEW_TOKENS)}

mdl_kwargs = {}
if os.getenv("WATSONX_SPACE_ID"):
    mdl_kwargs["space_id"] = os.environ["WATSONX_SPACE_ID"]
else:
    mdl_kwargs["project_id"] = os.environ["WATSONX_PROJECT_ID"]

wxa_model = Model(model_id=MODEL_ID, credentials=creds, params=params, **mdl_kwargs)

def watsonx_generate(prompt: str) -> str:
    """Try chat; fall back to non-chat generate_text; return plain string."""
    try:
        if hasattr(wxa_model, "start_chat"):
            chat = wxa_model.start_chat()
            resp = chat.send_message(prompt)
            text = getattr(resp, "message", None) or getattr(resp, "generated_text", None)
            if isinstance(text, str) and text.strip():
                return text
            if isinstance(resp, dict):
                return resp.get("generated_text") or resp.get("message") or json.dumps(resp)
    except Exception:
        pass
    out = wxa_model.generate_text(prompt=prompt)
    if isinstance(out, str):
        return out
    if isinstance(out, dict):
        if "results" in out and out["results"]:
            cand = out["results"][0].get("generated_text") or out["results"][0].get("text")
            if cand:
                return cand
        return out.get("generated_text") or json.dumps(out)
    return str(out)

# 4) ---- HELPERS ----
def trim_tweet(text: str, max_chars: int = 280) -> str:
    t = re.sub(r"\s+", " ", text.strip())
    if len(t) <= max_chars:
        return t
    cut = t[:max_chars]
    idx = max(cut.rfind("."), cut.rfind(";"), cut.rfind(","), cut.rfind(" "))
    if idx > max_chars * 0.6:
        return cut[:idx].rstrip()
    return cut.rstrip()

URL_COMPANY_MAP = {
    "gene.com": "Genentech",
    "roche.com": "Roche",
    "novartis.com": "Novartis",
    "pfizer.com": "Pfizer",
    "astrazeneca.com": "AstraZeneca",
    "lilly.com": "Eli Lilly",
    "merck.com": "Merck",
    "sanofi.com": "Sanofi",
    "gilead.com": "Gilead",
    "bms.com": "BMS",
    "amgen.com": "Amgen",
    "gsk.com": "GSK",
    "bayer.com": "Bayer",
    "takeda.com": "Takeda",
    "boehringer-ingelheim.com": "Boehringer Ingelheim",
    "beigene.com": "BeiGene",
    "seagen.com": "Seagen",
    "sermonixpharma.com": "Sermonix Pharma",
}
KNOWN_PHARMA = {
    "Genentech","Roche","Novartis","Pfizer","AstraZeneca","Eli Lilly","Merck",
    "Sanofi","Gilead","BMS","Amgen","GSK","Bayer","Takeda","Boehringer Ingelheim",
    "BeiGene","Seagen","Sermonix Pharma","Sermonix"
}
ORG_SUFFIXES = r"(?: Inc\.?| Corp\.?| Corporation| Ltd\.?| LLC| plc| AG| SA| NV| Co\.?)"

def company_from_url(url: str) -> Optional[str]:
    try:
        host = urlparse(url).netloc.lower()
        for prefix in ("www.", "amp.", "m.", "news."):
            if host.startswith(prefix):
                host = host[len(prefix):]
        if host in URL_COMPANY_MAP:
            return URL_COMPANY_MAP[host]
        parts = host.split(".")
        brand = parts[-2] if len(parts) >= 2 else parts[0]
        brand = brand.replace("-", " ").strip()
        return brand.capitalize() if brand else None
    except Exception:
        return None

def find_companies_from_text(text: str) -> List[str]:
    found = set()
    for k in KNOWN_PHARMA:
        if re.search(rf"\b{k}\b", text, flags=re.I):
            found.add(k)
    for m in re.finditer(rf"\b([A-Z][A-Za-z&\-]+(?:\s+[A-Z][A-Za-z&\-]+){{0,2}})(?:{ORG_SUFFIXES})?\b", text):
        name = m.group(0).strip()
        if name.lower() in {"q", "fda", "phase", "trial", "study", "breast", "cancer"}:
            continue
        found.add(name)
    out = [re.sub(r"\s+", " ", f).strip() for f in found]
    return list({x.lower(): x for x in out}.values())

# Therapy classes we’re allowed to include if they appear verbatim
THERAPY_CLASSES = [
    "chemotherapy",
    "immunotherapy",
    # add more if you want:
    # "endocrine therapy", "hormonal therapy", "targeted therapy",
]

def llm_extract_drugs_strict(headline: str, content: str) -> List[str]:
    """
    Extract drug/product names verbatim (including therapy classes like
    'chemotherapy' and 'immunotherapy' if they appear in the text).
    """
    snippet = (headline + "\n" + content).strip()
    if len(snippet) > 5000:
        snippet = snippet[:5000]

    prompt = f"""
Extract drug, product, or therapy names that appear **verbatim** in the provided text.

RULES:
- Return ONLY names that exist in the text (headline or content).
- If a brand and generic appear like "Itovebi (inavolisib)", return both items separately.
- INCLUDE therapy classes when they appear (e.g., "chemotherapy", "immunotherapy").
- Return a pure JSON array of strings (no commentary).

TEXT:
\"\"\"{snippet}\"\"\"

Return JSON array:
"""
    raw = watsonx_generate(prompt).strip()
    match = re.search(r"\[\s*(?:\".*?\")\s*(?:,.*?)*\]", raw, flags=re.S)
    items = []
    if match:
        try:
            items = json.loads(match.group(0))
        except Exception:
            items = []
    if not isinstance(items, list):
        items = []
    items = [s.strip() for s in items if isinstance(s, str) and s.strip()]

    # Validate: keep only names that literally appear in the text (normalize ™/®)
    low = snippet.lower()
    validated = []
    for name in items:
        norm = re.sub(r"[™®]", "", name).strip()
        if norm and (norm.lower() in low):
            validated.append(name)

    # Ensure therapy classes are included if present in text, even if the LLM missed them
    for cls in THERAPY_CLASSES:
        if cls.lower() in low and cls not in validated:
            validated.append(cls)

    # Dedup preserve order
    seen = set(); final = []
    for x in validated:
        k = x.lower()
        if k not in seen:
            seen.add(k); final.append(x)
    return final

def llm_summary_tweet(headline: str, content: str) -> str:
    snippet = (headline + "\n" + content).strip()
    if len(snippet) > 6000:
        snippet = snippet[:6000]

    prompt = f"""
Write ONE tweet-style summary (<= 280 characters) strictly based on the article text below.
REQUIREMENTS:
- Mention concrete specifics present in the text (e.g., drug name(s), indication, setting/line, and endpoint if mentioned).
- No emojis, no hashtags, no marketing language, no invented details.
- One concise sentence or two short clauses.

TEXT:
\"\"\"{snippet}\"\"\"

Tweet (<=280 chars):
"""
    raw = watsonx_generate(prompt)
    tweet = raw.splitlines()[0].strip() if raw else ""
    return trim_tweet(tweet, max_chars=280)

def process_articles(articles: List[Dict]) -> List[Dict]:
    results = []
    for art in articles:
        headline = (art.get("headline") or "").strip()
        url = (art.get("url") or "").strip()
        content = (art.get("content") or "").strip()
        text_all = (headline + "\n" + content).strip()

        summary = llm_summary_tweet(headline, content)

        companies = []
        from_url = company_from_url(url)
        if from_url:
            companies.append(from_url)
        companies += find_companies_from_text(text_all)

        filtered = []
        for c in companies:
            c_norm = c.strip()
            if not c_norm or c_norm in filtered:
                continue
            if (c_norm in KNOWN_PHARMA) or (c_norm == from_url) or re.search(ORG_SUFFIXES + r"$", c_norm):
                filtered.append(c_norm)

        drugs = llm_extract_drugs_strict(headline, content)

        results.append({
            "headline": headline,
            "url": url,
            "summary_tweet": summary,
            "companies": filtered,
            "drugs": drugs
        })
    return results

print("✅ watsonx model ready:", MODEL_ID)

# 5) ---- LOAD INPUT JSON FROM LOCAL FILE IN COLAB ----
# If your file is already in the working directory as 'breast_cancer_news_content' (with or without .json):
base_name = "breast_cancer_news_content"
candidates = [base_name, base_name + ".json", base_name + ".JSON"]
INPUT_PATH = None
for p in candidates:
    if os.path.exists(p):
        INPUT_PATH = p
        break
if INPUT_PATH is None:
    raise FileNotFoundError("Could not find 'breast_cancer_news_content' (with or without .json/.JSON) in the current folder.")

print("Reading:", INPUT_PATH)
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    articles_in = json.load(f)

# 6) ---- RUN & SAVE OUTPUT ----
processed = process_articles(articles_in)

OUTPUT_PATH = "article_summaries_extractions.json"
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(processed, f, ensure_ascii=False, indent=2)

print("Sample output preview:")
print(json.dumps(processed[:1], ensure_ascii=False, indent=2))

# 7) ---- DOWNLOAD ----
from google.colab import files
files.download(OUTPUT_PATH)






✅ watsonx model ready: meta-llama/llama-3-3-70b-instruct
Reading: breast_cancer_news_content.json
Sample output preview:
[
  {
    "headline": "FDA Approves New Targeted Treatment For Advanced Hormone Receptor-Positive, HER2-Negative Breast Cancer With A PIK3CA Mutation",
    "url": "https://www.gene.com/media/news-features/fda-approves-new-targeted-treatment-for-advanced-hormone-receptor-positive-her2-negative-breast-cancer-with-a-pik3ca-mutation",
    "summary_tweet": "\"The FDA approved Itovebi (inavolisib) in combination with palbociclib and fulvestrant for first-line treatment of HR+, HER2-, PIK3CA-mutated metastatic breast cancer, doubling progression-free survival in the Phase III INAVO120 trial.\"",
    "companies": [
      "Genentech"
    ],
    "drugs": [
      "Itovebi",
      "inavolisib",
      "palbociclib",
      "fulvestrant",
      "targeted therapy"
    ]
  }
]


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Entity Extraction + Summarization Agent

In [2]:
# ============================================================
# COLAB: Article extraction (watsonx) — strict output schema + incremental writes
# - Outputs ONLY requested fields
# - Company from URL only
# - Drug names exclude therapy classes
# - Trial names only proper names (e.g., KEYNOTE-522)
# - Incremental NDJSON + rolling JSON snapshot
# ============================================================

# 0) Install minimal deps
%pip install -U ibm-watsonx-ai regex dateparser

# 1) ---- CREDENTIALS & MODEL ----
WATSONX_API_KEY = "B5EQe7EQckE5N1eLMtaAfGi1Gr9rOqGjyyGLN8MMispB"
WATSONX_URL     = "https://us-south.ml.cloud.ibm.com"
WATSONX_PROJECT_ID = ""
WATSONX_SPACE_ID   = "1f66bc7c-f805-476f-92f6-1d6ebd561f15"
MODEL_ID = "meta-llama/llama-3-3-70b-instruct"

# 2) ---- RUNTIME SETTINGS ----
TEMPERATURE = 0.3
TOP_P = 0.9
MAX_NEW_TOKENS = 300

# 3) ---- IMPORTS & MODEL SETUP ----
import os, json, re, os.path, io
from typing import List, Dict, Optional
from urllib.parse import urlparse
import dateparser

assert WATSONX_API_KEY and WATSONX_URL and (WATSONX_PROJECT_ID or WATSONX_SPACE_ID), \
    "Missing WATSONX_API_KEY/WATSONX_URL and either PROJECT_ID or SPACE_ID."

os.environ["WATSONX_API_KEY"]  = WATSONX_API_KEY
os.environ["WATSONX_URL"]      = WATSONX_URL
if WATSONX_SPACE_ID:
    os.environ["WATSONX_SPACE_ID"]   = WATSONX_SPACE_ID
    os.environ.pop("WATSONX_PROJECT_ID", None)
else:
    os.environ["WATSONX_PROJECT_ID"] = WATSONX_PROJECT_ID
    os.environ.pop("WATSONX_SPACE_ID", None)

from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import Model

creds = Credentials(url=os.environ["WATSONX_URL"], api_key=os.environ["WATSONX_API_KEY"])
params = {"decoding_method":"sample","temperature":float(TEMPERATURE),"top_p":float(TOP_P),"max_new_tokens":int(MAX_NEW_TOKENS)}

mdl_kwargs = {}
if os.getenv("WATSONX_SPACE_ID"):
    mdl_kwargs["space_id"] = os.environ["WATSONX_SPACE_ID"]
else:
    mdl_kwargs["project_id"] = os.environ["WATSONX_PROJECT_ID"]

wxa_model = Model(model_id=MODEL_ID, credentials=creds, params=params, **mdl_kwargs)

def watsonx_generate(prompt: str) -> str:
    """Try chat; fall back to non-chat generate_text; return plain string."""
    try:
        if hasattr(wxa_model, "start_chat"):
            chat = wxa_model.start_chat()
            resp = chat.send_message(prompt)
            text = getattr(resp, "message", None) or getattr(resp, "generated_text", None)
            if isinstance(text, str) and text.strip():
                return text
            if isinstance(resp, dict):
                return resp.get("generated_text") or resp.get("message") or json.dumps(resp)
    except Exception:
        pass
    out = wxa_model.generate_text(prompt=prompt)
    if isinstance(out, str):
        return out
    if isinstance(out, dict):
            if "results" in out and out["results"]:
                cand = out["results"][0].get("generated_text") or out["results"][0].get("text")
                if cand:
                    return cand
            return out.get("generated_text") or json.dumps(out)
    return str(out)

# 4) ---- HELPERS ----
def trim_tweet(text: str, max_chars: int = 280) -> str:
    t = re.sub(r"\s+", " ", text.strip())
    if len(t) <= max_chars:
        return t
    cut = t[:max_chars]
    idx = max(cut.rfind("."), cut.rfind(";"), cut.rfind(","), cut.rfind(" "))
    if idx > max_chars * 0.6:
        return cut[:idx].rstrip()
    return cut.rstrip()

URL_COMPANY_MAP = {
    "gene.com": "Genentech",
    "roche.com": "Roche",
    "novartis.com": "Novartis",
    "pfizer.com": "Pfizer",
    "astrazeneca.com": "AstraZeneca",
    "lilly.com": "Eli Lilly",
    "merck.com": "Merck",
    "sanofi.com": "Sanofi",
    "gilead.com": "Gilead",
    "bms.com": "BMS",
    "amgen.com": "Amgen",
    "gsk.com": "GSK",
    "bayer.com": "Bayer",
    "takeda.com": "Takeda",
    "boehringer-ingelheim.com": "Boehringer Ingelheim",
    "beigene.com": "BeiGene",
    "seagen.com": "Seagen",
    "sermonixpharma.com": "Sermonix Pharma",
    "janssen.com": "Janssen",
    "johnsonandjohnson.com": "Johnson & Johnson",
    "jnj.com": "Johnson & Johnson",
    "abbvie.com": "AbbVie",
    "abbott.com": "Abbott",
    "biogen.com": "Biogen",
    "celgene.com": "Celgene",
    "regeneron.com": "Regeneron",
    "modernatx.com": "Moderna",
}

def company_from_url(url: str) -> Optional[str]:
    try:
        host = urlparse(url).netloc.lower()
        for prefix in ("www.", "amp.", "m.", "news.", "media.", "investor."):
            if host.startswith(prefix):
                host = host[len(prefix):]
        if host in URL_COMPANY_MAP:
            return URL_COMPANY_MAP[host]
        parts = host.split(".")
        brand = parts[-2] if len(parts) >= 2 else parts[0]
        brand = brand.replace("-", " ").strip()
        return brand.capitalize() if brand else None
    except Exception:
        return None

# Indications (keep distinct)
BREAST_CANCER_INDICATIONS = [
    "breast cancer", "metastatic breast cancer", "early breast cancer", "advanced breast cancer",
    "HER2-positive breast cancer", "HER2+ breast cancer", "triple negative breast cancer", "TNBC",
    "hormone receptor positive breast cancer", "HR+ breast cancer", "ER+ breast cancer", "PR+ breast cancer",
    "inflammatory breast cancer", "ductal carcinoma", "lobular carcinoma", "locally advanced breast cancer"
]

def extract_publication_date(content: str, headline: str) -> Optional[str]:
    """
    Extract publication date prioritizing the start of content (plus headline).
    """
    text_for_date = (headline + " " + content[:2000])
    date_patterns = [
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b',
        r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b',
        r'\b\d{4}-\d{2}-\d{2}\b',
        r'\b\d{1,2}/\d{1,2}/\d{4}\b',
        r'\b\d{1,2}-\d{1,2}-\d{4}\b',
        r'\b(?:Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday),\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'
    ]
    for pattern in date_patterns:
        matches = re.findall(pattern, text_for_date, re.IGNORECASE)
        for match in matches:
            parsed_date = dateparser.parse(match)
            if parsed_date:
                return parsed_date.strftime('%Y-%m-%d')
    # LLM fallback
    try:
        prompt = f"""
Extract the publication date from the following text. Return ONLY the date in YYYY-MM-DD format or "Not found" if no clear date.

TEXT:
\"\"\"{text_for_date[:3000]}\"\"\"


Date (YYYY-MM-DD or "Not found"):
"""
        response = watsonx_generate(prompt).strip()
        if response and response != "Not found" and re.match(r'^\d{4}-\d{2}-\d{2}$', response):
            return response
    except Exception:
        pass
    return None

def extract_indication(headline: str, content: str) -> List[str]:
    text_all = (headline + " " + content).lower()
    found = []
    for indication in BREAST_CANCER_INDICATIONS:
        if indication.lower() in text_all:
            found.append(indication)
    # LLM assist for other disease mentions
    try:
        prompt = f"""
Extract all specific disease indications, cancer subtypes, or medical conditions mentioned in this text.
Return ONLY a JSON array of strings with the specific indications found.

TEXT:
\"\"\"{text_all[:4000]}\"\"\"


JSON array:
"""
        response = watsonx_generate(prompt).strip()
        match = re.search(r'\[.*\]', response, flags=re.S)
        if match:
            llm_list = json.loads(match.group(0))
            if isinstance(llm_list, list):
                for ind in llm_list:
                    if isinstance(ind, str) and ind.strip():
                        if any(k in ind.lower() for k in ['cancer','carcinoma','tumor','neoplasm','metastatic','advanced']):
                            found.append(ind.strip())
    except Exception:
        pass
    # distinct, preserve order
    seen=set(); out=[]
    for x in found:
        if x not in seen:
            seen.add(x); out.append(x)
    return out

def llm_extract_drug_names_only(headline: str, content: str) -> List[str]:
    """
    Extract drug/product names that appear verbatim in text.
    EXCLUDES therapy classes (e.g., chemotherapy, immunotherapy, targeted therapy).
    """
    snippet = (headline + "\n" + content).strip()
    if len(snippet) > 5000:
        snippet = snippet[:5000]

    prompt = f"""
Extract drug or product names that appear **verbatim** in the text below.

RULES:
- Return ONLY names that literally appear in the text (headline or content).
- If a brand and its generic appear like "Itovebi (inavolisib)", return BOTH as separate strings.
- EXCLUDE therapy class terms like "chemotherapy", "immunotherapy", "targeted therapy", "radiotherapy", etc.
- Return a pure JSON array of strings (no commentary).

TEXT:
\"\"\"{snippet}\"\"\"


Return JSON array:
"""
    raw = watsonx_generate(prompt).strip()
    match = re.search(r"\[\s*(?:\".*?\")\s*(?:,.*?)*\]", raw, flags=re.S)
    items = []
    if match:
        try:
            items = json.loads(match.group(0))
        except Exception:
            items = []
    if not isinstance(items, list):
        items = []
    items = [s.strip() for s in items if isinstance(s, str) and s.strip()]

    # Validate literal presence (normalize ™/®). No therapy classes allowed.
    low = snippet.lower()
    banned = {"chemotherapy","immunotherapy","endocrine therapy","hormonal therapy","targeted therapy",
              "radiation therapy","radiotherapy","adjuvant therapy","neoadjuvant therapy"}
    validated = []
    for name in items:
        norm = re.sub(r"[™®]", "", name).strip()
        if norm and (norm.lower() in low) and (norm.lower() not in banned):
            validated.append(name)

    # Dedup, preserve order
    seen=set(); out=[]
    for x in validated:
        k=x.lower()
        if k not in seen:
            seen.add(k); out.append(x)
    return out

# ===== Trial phase & study name extraction =====
ROMAN_MAP = {"I":"1","II":"2","III":"3","IV":"4","V":"5"}

def extract_trial_phase(headline: str, content: str) -> List[str]:
    """
    Normalize to forms like 'Phase 1', 'Phase 2/3', 'Phase 3b'.
    """
    text = f"{headline}\n{content}"
    phases = set()
    patterns = [
        r'\b[Pp]hase\s*(I{1,3}V?|V|1|2|3|4)(?:\s*[/\-]\s*(I{1,3}V?|V|1|2|3|4))?\s*([a-dA-D])?\b',
        r'\b[Pp](?:h|H)?\s*(\d)(?:\s*/\s*(\d))?\b',
        r'\b(?:[Pp]hase)?\s*(\d)\s*/\s*(\d)\b',
        r'\b[Pp]hase\s*(I{1,3})([a-dA-D])\b',
        r'\b[Pp](\d)([a-dA-D])\b'
    ]
    for pat in patterns:
        for m in re.finditer(pat, text):
            grp = [g for g in m.groups() if g]
            if not grp:
                continue
            nums=[]; suffix=""
            for g in grp:
                G=g.upper()
                if G in ROMAN_MAP: nums.append(ROMAN_MAP[G])
                elif re.fullmatch(r"[IVX]+", G): nums.append(str(len(G)))  # coarse
                elif re.fullmatch(r"\d", G): nums.append(G)
                elif re.fullmatch(r"[A-D]", G): suffix = G.lower()
            if len(nums)==1:
                phases.add(f"Phase {nums[0]}{suffix}")
            elif len(nums)>=2:
                phases.add(f"Phase {nums[0]}/{nums[1]}{suffix}")

    # LLM fallback
    if not phases:
        try:
            snippet = (headline + "\n" + content)[:4500]
            prompt = f"""
From the text, extract the clinical trial PHASE if mentioned.
Return ONLY a JSON array of normalized strings like "Phase 1", "Phase 2/3", "Phase 3b".
If none, return [].

TEXT:
\"\"\"{snippet}\"\"\"


JSON array:
"""
            resp = watsonx_generate(prompt).strip()
            mm = re.search(r'\[.*\]', resp, flags=re.S)
            if mm:
                arr = json.loads(mm.group(0))
                if isinstance(arr, list):
                    for x in arr:
                        if isinstance(x, str) and x.strip().lower().startswith("phase"):
                            phases.add(x.strip())
        except Exception:
            pass

    out=[]; seen=set()
    for p in phases:
        if p not in seen:
            seen.add(p); out.append(p)
    return out

STUDY_NAME_HINTS = [
    "keynote","tropion","destiny","checkmate","impower","impassion",
    "monarch","olympiad","ascend","clarity","polo","bright","palace","compas","compass"
]

def extract_trial_names(headline: str, content: str) -> List[str]:
    """
    Return only PROPER study names (e.g., 'KEYNOTE-522', 'TROPION-Breast01').
    Never return lone roman numerals or plain 'phase'.
    """
    text = f"{headline}\n{content}"
    names=set()

    # Strict patterns that require letters (prevents capturing plain 'III')
    pats = [
        r'["“]?([A-Z][A-Z0-9]+(?:[-–][A-Za-z0-9]+){0,3})["”]?\s+(?:trial|study)\b',
        r'\b(?:trial|study)\s+["“]?([A-Z][A-Z0-9]+(?:[-–][A-Za-z0-9]+){0,3})["”]?\b',
        r'["“]([A-Za-z][^"”]{2,80})["”]\s+(?:trial|study)\b',
    ]
    for pat in pats:
        for m in re.finditer(pat, text):
            cand = re.sub(r'\s+', ' ', m.group(1).strip())
            # Require at least one letter AND at least one letter or digit after a hyphen to resemble real study codes
            if re.search(r'[A-Za-z]', cand) and (re.search(r'-', cand) or len(cand) >= 4):
                # Exclude pure roman numerals
                if not re.fullmatch(r'[IVX]+', cand, flags=re.I):
                    names.add(cand)

    # Quoted anywhere with window and having letters
    for m in re.finditer(r'["“]([^"”]{3,80})["”]', text):
        span_start, span_end = m.span()
        window = text[max(0, span_start-60):min(len(text), span_end+60)]
        if re.search(r'\b(trial|study)\b', window, flags=re.I):
            cand = re.sub(r'\s+', ' ', m.group(1).strip())
            if re.search(r'[A-Za-z]', cand) and not re.fullmatch(r'[IVX]+', cand, flags=re.I):
                names.add(cand)

    # LLM fallback, but keep STRICT acceptance
    try:
        snippet = text[:4500]
        prompt = f"""
If the text mentions a named clinical study/trial (e.g., KEYNOTE-522, TROPION-Breast01), return ONLY a pure JSON array of those names.
If none, return [].

TEXT:
\"\"\"{snippet}\"\"\"


JSON array:
"""
        resp = watsonx_generate(prompt).strip()
        mm = re.search(r'\[.*\]', resp, flags=re.S)
        if mm:
            arr = json.loads(mm.group(0))
            if isinstance(arr, list):
                for cand in arr:
                    if isinstance(cand, str):
                        c = re.sub(r'\s+', ' ', cand.strip())
                        if c and re.search(r'[A-Za-z]', c) and not re.fullmatch(r'[IVX]+', c, flags=re.I):
                            names.add(c)
    except Exception:
        pass

    out=[]; seen=set()
    for n in names:
        if n not in seen:
            seen.add(n); out.append(n)
    return out

def llm_summary_280(headline: str, content: str) -> str:
    snippet = (headline + "\n" + content).strip()
    if len(snippet) > 6000:
        snippet = snippet[:6000]
    prompt = f"""
Write ONE tweet-style summary (<= 280 characters) strictly based on the article text below.
REQUIREMENTS:
- Mention concrete specifics present in the text (e.g., drug name(s), indication, setting/line, and endpoint if mentioned).
- No emojis, no hashtags, no marketing language, no invented details.
- One concise sentence or two short clauses.

TEXT:
\"\"\"{snippet}\"\"\"


Tweet (<=280 chars):
"""
    raw = watsonx_generate(prompt)
    tweet = raw.splitlines()[0].strip() if raw else ""
    return trim_tweet(tweet, max_chars=280)

# 5) ---- PROCESSOR (STRICT SCHEMA) ----
def process_articles(articles: List[Dict]) -> List[Dict]:
    results = []
    for art in articles:
        headline = (art.get("headline") or "").strip()
        url = (art.get("link") or art.get("url") or "").strip()
        content = (art.get("content") or "").strip()

        # Required fields
        published_date = extract_publication_date(content, headline)
        indications = extract_indication(headline, content)
        drug_names = llm_extract_drug_names_only(headline, content)
        trial_phases = extract_trial_phase(headline, content)
        trial_names = extract_trial_names(headline, content)
        summary = llm_summary_280(headline, content)
        company_name = company_from_url(url)

        # Build EXACTLY the requested output shape
        record = {
            "published_date": published_date,      # 1) date extracted (from start area)
            "content": content,                    # 2) full content
            "entities": {                          # 3) entities bundle
                "drug_names": drug_names,
                "company_name": company_name,
                "trial_phases": trial_phases,
                "trial_names": trial_names,
                "indications": list(dict.fromkeys(indications))  # distinct
            },
            "summary_280": summary,                # 4) summary <= 280 chars
            "url": url,                            # 5) actual url
            "headline": headline                   # 6) headline
        }
        results.append(record)
    return results

print("✅ watsonx model ready:", MODEL_ID)

# 6) ---- LOAD INPUT JSON FROM LOCAL FILE IN COLAB ----
INPUT_FILENAME = "breast_cancer_articles_with_pdf_20251019_203237.json"

import glob, json
if not os.path.exists(INPUT_FILENAME):
    json_files = glob.glob("breast_cancer_articles_with_pdf_*.json")
    if json_files:
        INPUT_FILENAME = json_files[0]
        print(f"Using found file: {INPUT_FILENAME}")
    else:
        raise FileNotFoundError(f"Could not find {INPUT_FILENAME} or similar breast_cancer_articles_with_pdf_*.json files")

print("Reading:", INPUT_FILENAME)
with open(INPUT_FILENAME, "r", encoding="utf-8") as f:
    articles_in = json.load(f)

print(f"Loaded {len(articles_in)} articles")

# 7) ---- RUN with INCREMENTAL WRITES ----
print("Processing articles with incremental writes...")

OUTPUT_JSON = "enhanced_article_analysis.json"   # rolling snapshot (array)
OUTPUT_NDJSON = "enhanced_article_analysis.ndjson"  # one record per line

# Start fresh files
open(OUTPUT_NDJSON, "w", encoding="utf-8").close()
processed_so_far: List[Dict] = []

def write_incremental(current_list: List[Dict], last_record: Dict):
    """Append one line to NDJSON and also overwrite the JSON array file atomically."""
    with open(OUTPUT_NDJSON, "a", encoding="utf-8") as g:
        g.write(json.dumps(last_record, ensure_ascii=False) + "\n")
        g.flush()
        os.fsync(g.fileno())
    tmp = OUTPUT_JSON + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(current_list, f, ensure_ascii=False, indent=2)
        f.flush()
        os.fsync(f.fileno())
    os.replace(tmp, OUTPUT_JSON)

processed_count = 0
for rec in process_articles(articles_in):
    processed_so_far.append(rec)
    write_incremental(processed_so_far, rec)
    processed_count += 1
    if processed_count % 10 == 0:
        print(f"... {processed_count} / {len(articles_in)} done")

print(f"Processed {processed_count} articles")
print("Sample output preview:")
print(json.dumps(processed_so_far[:1], ensure_ascii=False, indent=2))

# 8) ---- STATS (optional)
print("\n" + "="*50)
print("EXTRACTION STATS")
print("="*50)
articles_with_date = sum(1 for a in processed_so_far if a.get('published_date'))
articles_with_drugs = sum(1 for a in processed_so_far if a["entities"].get('drug_names'))
articles_with_company = sum(1 for a in processed_so_far if a["entities"].get('company_name'))
articles_with_trial_phase = sum(1 for a in processed_so_far if a["entities"].get('trial_phases'))
articles_with_trial_name = sum(1 for a in processed_so_far if a["entities"].get('trial_names'))
articles_with_indications = sum(1 for a in processed_so_far if a["entities"].get('indications'))
print(f"Total: {len(processed_so_far)}")
print(f"Published date: {articles_with_date}")
print(f"Drug names: {articles_with_drugs}")
print(f"Company name: {articles_with_company}")
print(f"Trial phases: {articles_with_trial_phase}")
print(f"Trial names: {articles_with_trial_name}")
print(f"Indications: {articles_with_indications}")

# 9) ---- DOWNLOAD ----
from google.colab import files
files.download(OUTPUT_JSON)
files.download(OUTPUT_NDJSON)

print(f"\n✅ Live progress written to {OUTPUT_NDJSON}")
print(f"✅ Cumulative array written to {OUTPUT_JSON} and downloaded")






✅ watsonx model ready: meta-llama/llama-3-3-70b-instruct
Reading: breast_cancer_articles_with_pdf_20251019_203237.json
Loaded 150 articles
Processing articles with incremental writes...
... 10 / 150 done
... 20 / 150 done
... 30 / 150 done
... 40 / 150 done
... 50 / 150 done
... 60 / 150 done
... 70 / 150 done
... 80 / 150 done
... 90 / 150 done
... 100 / 150 done
... 110 / 150 done
... 120 / 150 done
... 130 / 150 done
... 140 / 150 done
... 150 / 150 done
Processed 150 articles
Sample output preview:
[
  {
    "published_date": null,
    "content": "Positive results from the TROPION-Breast02 Phase III trial showedDatroway(datopotamab deruxtecan) demonstrated a statistically significant and clinically meaningful improvement for the dual primary endpoints of overall survival (OS) and progression-free survival (PFS) compared to investigator's choice of chemotherapy as 1st-line treatment for patients with locally recurrent inoperable or metastatic triple-negative breast cancer (TNBC) for

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


✅ Live progress written to enhanced_article_analysis.ndjson
✅ Cumulative array written to enhanced_article_analysis.json and downloaded
