In [3]:
# ✅ ONE-CELL REFERRAL LABELING PIPELINE (LLaMA 3.2 + KEYWORDS, 10 ROWS)

!pip install transformers fpdf pytrends beautifulsoup4 requests selenium haystack-ai -q

import pandas as pd, torch, os
from google.colab import files
from haystack.components.generators import HuggingFaceLocalGenerator

# 🔁 Upload file
uploaded = files.upload()
if not uploaded:
    # A cancelled upload dialog yields an empty dict; fail with a readable message
    # instead of the bare StopIteration that next(iter({})) would raise.
    raise ValueError("No file was uploaded; re-run this cell and choose a CSV file.")

import io  # only needed here, to wrap the uploaded bytes for pandas

# Parse the bytes that were just uploaded instead of re-opening a file by name:
# Colab renames duplicate uploads ("name (2).csv"), so opening by name may not
# read the copy selected in this run.
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name])).head(10)  # Only process 10 rows

# The labeling stages below read exactly these two columns; fail early if absent.
missing_columns = {"section_text", "dialogue"} - set(df.columns)
if missing_columns:
    raise ValueError(f"{uploaded_name} is missing required column(s): {sorted(missing_columns)}")

# 🔐 Hugging Face token
# Never hardcode credentials in a notebook: the token that used to be written here
# is exposed to everyone with access to this file and must be revoked at
# https://huggingface.co/settings/tokens.
# The token is taken from the HF_TOKEN environment variable when it is already set;
# otherwise it is requested interactively (input is not echoed or saved in the notebook).
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass

    os.environ["HF_TOKEN"] = getpass("Hugging Face access token: ").strip()

# 🚀 Load LLaMA 3.2 Instruct
generator = HuggingFaceLocalGenerator(
    model="meta-llama/Llama-3.2-3B-Instruct",
    huggingface_pipeline_kwargs={"device_map": "auto", "torch_dtype": torch.bfloat16},
    # Greedy decoding (do_sample=False) makes the labels reproducible. `temperature`
    # only applies to sampling, so passing it together with do_sample=False has no
    # effect and makes transformers warn about an inconsistent generation config.
    generation_kwargs={"max_new_tokens": 100, "do_sample": False},
)
# Loads the model weights; must be called before the first generator.run().
generator.warm_up()

# 🩺 Specialist mapping
# Maps each specialist label to the lowercase symptom/condition phrases that suggest it.
# A phrase may appear under more than one specialist (e.g. "redness", "fatigue"),
# in which case a text mentioning it is labeled with every matching specialist.
specialist_keywords = {
    "Cardiologist": [
        "chest pain", "shortness of breath", "palpitations",
        "hypertension", "heart attack", "high blood pressure",
    ],
    "Neurologist": [
        "headache", "dizziness", "numbness", "memory loss",
        "seizure", "tremors", "stroke", "fainting",
    ],
    "Pulmonologist": [
        "cough", "asthma", "wheezing", "copd", "bronchitis",
    ],
    "Dermatologist": [
        "rash", "itching", "acne", "eczema",
        "skin lesion", "mole", "psoriasis", "redness",
    ],
    "Gastroenterologist": [
        "nausea", "vomiting", "diarrhea", "abdominal pain",
        "bloating", "constipation", "indigestion",
    ],
    "Urologist": [
        "painful urination", "frequent urination", "blood in urine",
        "prostate", "kidney stone",
    ],
    "Endocrinologist": [
        "diabetes", "thyroid", "hormonal imbalance",
        "weight gain", "weight loss", "fatigue",
    ],
    "Psychiatrist": [
        "depression", "anxiety", "hallucinations",
        "mood swings", "suicidal thoughts",
    ],
    "ENT Specialist": [
        "sore throat", "ear pain", "nasal congestion",
        "sinus", "hearing loss", "vertigo",
    ],
    "Orthopedic": [
        "joint pain", "back pain", "fracture",
        "sprain", "stiffness", "mobility issue",
    ],
    "Ophthalmologist": [
        "blurred vision", "eye pain", "redness",
        "eye discharge", "vision loss",
    ],
    "Gynecologist": [
        "irregular periods", "pregnancy", "pelvic pain", "vaginal discharge",
    ],
    "General Practitioner": [
        "fever", "fatigue", "cold", "flu", "checkup", "general",
    ],
}

# 🔍 Keyword + LLaMA prediction helpers
def keyword_match(text, keywords=None):
    """Return the specialists whose keyword phrases occur in ``text``.

    Phrases are matched case-insensitively as whole words, optionally followed by a
    simple inflection suffix (``s``, ``es``, ``d``, ``ed``, ``ing``). Plain substring
    matching produced false positives such as "flu" in "fluid", "rash" in "crash"
    or "mole" in "molecule".

    Args:
        text: Value to scan; converted with ``str``. ``None`` and NaN are treated as
            missing text rather than being scanned as the words "none"/"nan".
        keywords: Optional mapping of specialist name -> list of keyword phrases.
            Defaults to the module-level ``specialist_keywords``.

    Returns:
        The matching specialist names in mapping order, or ``["Uncertain"]`` when
        the text is missing or no phrase matches.
    """
    import re  # local import keeps this helper self-contained

    if keywords is None:
        keywords = specialist_keywords
    # NaN is the only float that compares unequal to itself (pandas uses it for blanks).
    if text is None or (isinstance(text, float) and text != text):
        return ["Uncertain"]
    text = str(text).lower()

    def mentions(phrase):
        # \b on both sides anchors the phrase to word boundaries; the optional suffix
        # keeps inflected forms such as "headaches" or "coughing" matching.
        pattern = rf"\b{re.escape(phrase.lower())}(?:s|es|d|ed|ing)?\b"
        return re.search(pattern, text) is not None

    matched = [spec for spec, kws in keywords.items() if any(mentions(k) for k in kws)]
    return matched or ["Uncertain"]

def llama_predict(text):
    """Ask the loaded LLM which specialist the patient described in ``text`` should see.

    Args:
        text: Clinical note or dialogue. It is interpolated into the prompt as-is.

    Returns:
        The model's first reply with surrounding whitespace stripped, or
        ``"Uncertain"`` when the text is missing/blank or generation fails.
    """
    # Don't spend a generation (or invite a made-up answer) on missing input;
    # NaN is the only float that compares unequal to itself.
    if text is None or (isinstance(text, float) and text != text) or not str(text).strip():
        return "Uncertain"

    prompt = f"Which medical specialist should this patient see based on the text below?\n\n{text}\n\nAnswer ONLY with the specialist's title."
    try:
        return generator.run(prompt)["replies"][0].strip()
    except Exception as exc:
        # Best effort: one failed row should not abort the run. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit stop the cell, and
        # reporting the error keeps failures from being silently turned into labels.
        print(f"llama_predict: generation failed ({exc!r}); labeling as 'Uncertain'")
        return "Uncertain"

# 🔄 Stage 1: section_text
# Label every section_text twice: once by keyword lookup, once by the LLM.
section_notes = df["section_text"]
df["section_text_label_keyword"] = section_notes.apply(keyword_match)
df["section_text_label_llm"] = section_notes.apply(llama_predict)

def resolve_stage1(row):
    """Return the stage-1 label for one row: a specialist both labelers agree on.

    The LLM reply is free text, so it is compared case-insensitively and only its
    first non-blank line is considered. An exact-equality check would reject
    replies such as "Cardiologist." or "cardiologist" and mark almost every row
    "Uncertain".

    Args:
        row: Mapping with ``section_text_label_llm`` (str) and
            ``section_text_label_keyword`` (list of specialist names).

    Returns:
        The first keyword-matched specialist that the LLM reply names, in its
        canonical spelling, or ``"Uncertain"`` when the two labelers do not agree.
    """
    reply = str(row["section_text_label_llm"])
    # Anything after the first non-blank line is treated as explanation, not the answer.
    first_line = next((line for line in reply.splitlines() if line.strip()), "").casefold()
    for specialist in row["section_text_label_keyword"]:
        if specialist != "Uncertain" and specialist.casefold() in first_line:
            return specialist
    return "Uncertain"
# Combine the two stage-1 labels into a single verdict per row.
df["final_label_stage1"] = df.apply(resolve_stage1, axis=1)

# 🔄 Stage 2: dialogue for uncertain cases
# Only rows that stage 1 could not settle are re-labeled from the dialogue;
# all other rows are left without a value in the dialogue_* columns.
mask = df["final_label_stage1"].eq("Uncertain")
for column, labeler in (
    ("dialogue_label_keyword", keyword_match),
    ("dialogue_label_llm", llama_predict),
):
    df.loc[mask, column] = df.loc[mask, "dialogue"].apply(labeler)

def resolve_final(row):
    """Return the final label for one row, falling back to the dialogue labels.

    A settled stage-1 label always wins. Otherwise the dialogue labels are used
    when the LLM reply names one of the keyword-matched specialists. The reply is
    free text, so it is compared case-insensitively on its first non-blank line;
    an exact-equality check would reject "Cardiologist." or "cardiologist".

    Args:
        row: Mapping with ``final_label_stage1`` and, for rows processed in stage 2,
            ``dialogue_label_keyword`` (list) and ``dialogue_label_llm`` (str).

    Returns:
        The agreed specialist in its canonical spelling, or ``"Uncertain"``.
    """
    stage1_label = row["final_label_stage1"]
    if stage1_label != "Uncertain":
        return stage1_label

    candidates = row.get("dialogue_label_keyword")
    # Rows that never went through stage 2 hold a non-list placeholder here.
    if not isinstance(candidates, list):
        return "Uncertain"

    reply = str(row.get("dialogue_label_llm"))
    # Anything after the first non-blank line is treated as explanation, not the answer.
    first_line = next((line for line in reply.splitlines() if line.strip()), "").casefold()
    for specialist in candidates:
        if specialist != "Uncertain" and specialist.casefold() in first_line:
            return specialist
    return "Uncertain"
# One final verdict per row, combining stage 1 with the dialogue fallback.
df["final_label"] = df.apply(resolve_final, axis=1)

# 💾 Save + download
output_csv = "labeled_referral_output_llama_real.csv"
df.to_csv(output_csv, index=False)
files.download(output_csv)


Saving MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv to MTS-Dialog-TestSet-1-MEDIQA-Chat-2023 (2).csv


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# (removed) This cell contained a Hugging Face access token pasted in plain text.
# Revoke that token at https://huggingface.co/settings/tokens and never store
# secrets in notebook cells or outputs; supply them via the environment instead.