In [1]:
# ✅ Install Needed Libraries
!pip install keybert sentence-transformers transformers

# ✅ Imports
import pandas as pd
import torch
import json
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
from transformers import pipeline


Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Col

FileNotFoundError: [Errno 2] No such file or directory: '/content/MTS-Dialog-TrainingSet.csv'

In [4]:

# ✅ Step 1: Load dataset
df = pd.read_csv(filepath_or_buffer="/content/MTS-Dialog-TrainingSet.csv")
print("✅ Data loaded:", df.shape)

# ✅ Step 2: Load Medical KeyBERT model
# KeyBERT is handed an explicit sentence encoder instead of relying on its default.
medical_encoder = SentenceTransformer(
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
)
kw_model = KeyBERT(model=medical_encoder)

# ✅ Step 3: Define professional specialist list
# Order matters: predict_specialist returns the first entry it finds in the answer.
specialist_list = [
    "Cardiologist",
    "Neurologist",
    "Gastroenterologist",
    "Dermatologist",
    "Orthopedic Surgeon",
    "Psychiatrist",
    "Pulmonologist",
    "Endocrinologist",
    "Nephrologist",
    "Hematologist",
    "Oncologist",
    "Rheumatologist",
    "Urologist",
    "Ophthalmologist",
    "ENT Specialist",
    "General Surgeon",
    "Infectious Disease Specialist",
    "Pediatrician",
    "Gynecologist",
    "Allergist",
]

# ✅ Step 4: Extract key phrases
def extract_key_phrases(text, top_n=5):
    """Return up to ``top_n`` key phrases extracted from ``text`` by ``kw_model``.

    Args:
        text: Raw section text. Any value that is not a non-blank string
            (``None``, ``NaN``, numbers, lists, ...) yields an empty list.
        top_n: Maximum number of phrases requested from the extractor.

    Returns:
        list: The first element of every item returned by
        ``kw_model.extract_keywords`` (the phrase), or ``[]`` when there is
        nothing to extract from.
    """
    # Test the type first: pd.isna() on a list/array returns an array whose
    # truth value is ambiguous (ValueError). None and NaN are not strings, so
    # this single check also covers missing values.
    if not isinstance(text, str) or not text.strip():
        return []
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),  # candidate phrases of one to three words
        stop_words='english',
        top_n=top_n,
    )
    return [item[0] for item in keywords]

# Key phrases for every section; Series.apply forwards top_n to the extractor.
df['key_phrases'] = df['section_text'].apply(extract_key_phrases, top_n=5)

# ✅ Step 5: Load the text2text LLM (google/flan-t5-base)
# Device 0 is used when CUDA is available, otherwise -1.
llm = pipeline(
    task="text2text-generation",
    model="google/flan-t5-base",
    device=-1 if not torch.cuda.is_available() else 0,
)

# ✅ Step 6: Predict specialist from key phrases
def predict_specialist(key_phrases):
    """Map a record's key phrases to one title from ``specialist_list``.

    Args:
        key_phrases: List of key-phrase strings for one record.

    Returns:
        str: ``"Unknown"`` when ``key_phrases`` is empty or when no list entry
        occurs in the model's answer; otherwise the first entry of
        ``specialist_list`` contained (case-insensitively) in that answer.
    """
    if not key_phrases:
        return "Unknown"

    keyword_text = ', '.join(key_phrases)
    allowed_titles = ', '.join(specialist_list)
    prompt = f"""
You are a medical classification expert.

From these medical keywords:
{keyword_text}

Pick ONLY one specialist title from this list:
{allowed_titles}

Respond with only the specialist title, nothing else.
    """
    generated = llm(prompt, max_new_tokens=30, do_sample=False)[0]['generated_text']
    answer = generated.strip().lower()

    # Constrain the free-text answer to the allowed titles (first hit wins).
    return next(
        (title for title in specialist_list if title.lower() in answer),
        "Unknown",
    )

# Label every row from its extracted key phrases
df['specialist'] = df['key_phrases'].map(predict_specialist)




✅ Data loaded: (1201, 4)


config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Device set to use cuda:0


KeyError: "['id'] not in index"

In [5]:
# ✅ Step 7: Save cleaned output
# Only the ID, header, text and predicted specialist columns are written out.
final_df = df.loc[:, ['ID', 'section_header', 'section_text', 'specialist']]
final_df.to_csv(path_or_buf="/content/MTS-Dialog-TrainingSet_LABELED_FIXED.csv", index=False)

print("✅ Done! Saved as /content/MTS-Dialog-TrainingSet_LABELED_FIXED.csv 🎯")

✅ Done! Saved as /content/MTS-Dialog-TrainingSet_LABELED_FIXED.csv 🎯


In [8]:
# # ✅ Install needed libraries first
# !pip install -q transformers sentence-transformers keybert faiss-cpu

# ✅ Imports
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
import numpy as np

# ✅ Step 1: Load your dataset
df = pd.read_csv(filepath_or_buffer="/content/MTS-Dialog-TrainingSet.csv")
print("✅ Data loaded:", df.shape)

# ✅ Step 2: Load Medical Sentence Encoder for Key Phrases
encoder = SentenceTransformer(
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
)
# The encoder is passed to KeyBERT explicitly rather than using its default model.
kw_model = KeyBERT(model=encoder)

# ✅ Step 3: Extract Key Phrases
def extract_key_phrases(text, top_n=5):
    """Extract at most ``top_n`` key phrases from ``text`` using ``kw_model``.

    Args:
        text: Section text to analyse. Non-string values (``None``, ``NaN``,
            numbers, lists, ...) and blank strings produce an empty list.
        top_n: Upper bound on the number of phrases to request.

    Returns:
        list: Element 0 (the phrase) of each item returned by
        ``kw_model.extract_keywords``; ``[]`` for unusable input.
    """
    # isinstance() must come before any NaN test: pd.isna() applied to a
    # list/array yields an array, and evaluating that as a bool raises
    # ValueError. Missing values are never str, so they are rejected here too.
    if not isinstance(text, str) or not text.strip():
        return []
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),  # 1- to 3-word candidate phrases
        stop_words='english',
        top_n=top_n,
    )
    return [item[0] for item in keywords]

# Key phrases for every section; Series.apply forwards top_n to the extractor.
df['key_phrases'] = df['section_text'].apply(extract_key_phrases, top_n=5)

# ✅ Step 4: Load the causal LLM checkpoint "Writer/camel-5b-hf"
tokenizer = AutoTokenizer.from_pretrained("Writer/camel-5b-hf")
model = AutoModelForCausalLM.from_pretrained(
    "Writer/camel-5b-hf",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
# NOTE: with device_map="auto" the pipeline must be created WITHOUT a `device=`
# argument; passing one raises "cannot be moved to a specific device".



✅ Data loaded: (1201, 4)


tokenizer_config.json:   0%|          | 0.00/748 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/787 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

ValueError: The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please discard the `device` argument when creating your pipeline object.

In [9]:
# No `device=` here: the model was loaded with device_map="auto".
llm = pipeline(task="text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [10]:
# ✅ Step 5: Define Specialist List
# Order matters: predict_specialist returns the first entry it finds in the answer.
specialist_list = [
    "Cardiologist",
    "Neurologist",
    "Gastroenterologist",
    "Dermatologist",
    "Pulmonologist",
    "Oncologist",
    "Ophthalmologist",
    "Psychiatrist",
    "Endocrinologist",
    "ENT Specialist",
    "Nephrologist",
    "Rheumatologist",
    "Hematologist",
    "Orthopedic Surgeon",
    "Urologist",
    "General Practitioner",
    "Pediatrician",
    "Allergist",
    "Gynecologist",
    "Infectious Disease Specialist",
]

# ✅ Step 6: Generate Specialist Prediction
def predict_specialist(key_phrases):
    """Ask the text-generation LLM which specialist fits the given key phrases.

    Args:
        key_phrases: List of key-phrase strings for one record.

    Returns:
        str: ``"Unknown"`` when ``key_phrases`` is empty; otherwise the first
        entry of ``specialist_list`` found (case-insensitively) in the model's
        completion, or ``"General Practitioner"`` when none is found.
    """
    if not key_phrases:
        return "Unknown"
    prompt = f"""
    Given the following medical keywords: {', '.join(key_phrases)}

    Choose the most appropriate specialist from the list below:
    {', '.join(specialist_list)}

    Only output ONE specialist name from the list.
    """
    # A text-generation pipeline returns prompt + completion by default. The
    # prompt itself enumerates every specialist, so scanning the full text
    # would always match the first list entry. Request the completion only.
    output = llm(prompt, max_new_tokens=20, do_sample=False, return_full_text=False)
    response = output[0]['generated_text']
    # Defensive: if the prompt was echoed back anyway, cut it off before matching.
    if response.startswith(prompt):
        response = response[len(prompt):]
    response = response.strip().lower()

    for specialist in specialist_list:
        if specialist.lower() in response:
            return specialist
    return "General Practitioner"  # fallback

# Label every row from its extracted key phrases
df['specialist'] = df['key_phrases'].map(predict_specialist)

# ✅ Step 7: Save Final Output
# Written columns: ID, header, text, dialogue and the predicted specialist.
final_df = df.loc[:, ['ID', 'section_header', 'section_text', 'dialogue', 'specialist']]
final_df.to_csv(path_or_buf="/content/MTS-Dialog-TrainingSet_LABELED_FIXED.csv", index=False)
print("✅ Saved final labeled file!")


✅ Saved final labeled file!


In [11]:
# 📥 Step 1: Load Dataset
import pandas as pd
df = pd.read_csv(filepath_or_buffer="/content/MTS-Dialog-TrainingSet.csv")
print(f"✅ Data loaded: {df.shape}")

# 📚 Step 2: Load Biomedical Sentence Encoder (for key phrases)
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer

bio_encoder = SentenceTransformer(
    "pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb"
)
# The encoder is passed to KeyBERT explicitly rather than using its default model.
kw_model = KeyBERT(model=bio_encoder)

# 🔑 Step 3: Extract Key Phrases Function
def extract_key_phrases(text, top_n=5):
    """Return the top key phrases that ``kw_model`` finds in ``text``.

    Args:
        text: Section text. Values that are not non-blank strings (``None``,
            ``NaN``, numbers, lists, ...) give an empty result.
        top_n: Maximum number of phrases to ask the extractor for.

    Returns:
        list: Element 0 (the phrase) of each item returned by
        ``kw_model.extract_keywords``; ``[]`` when the input is unusable.
    """
    # Type check first. Calling pd.isna() on a list/array gives an array, and
    # using it in an `if` raises ValueError; NaN/None fail isinstance() anyway.
    if not isinstance(text, str) or not text.strip():
        return []
    keywords = kw_model.extract_keywords(
        text,
        keyphrase_ngram_range=(1, 3),  # phrases of one to three words
        stop_words='english',
        top_n=top_n,
    )
    return [item[0] for item in keywords]

# Key phrases for every section; Series.apply forwards top_n to the extractor.
df['key_phrases'] = df['section_text'].apply(extract_key_phrases, top_n=5)

# 🤖 Step 4: Load the seq2seq LLM (google/flan-t5-base)
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/flan-t5-base",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# No `device=` argument: placement is already decided by device_map="auto".
llm = pipeline(task="text2text-generation", model=model, tokenizer=tokenizer)

# 👨‍⚕️ Step 5: Define Professional Specialist List
# Order matters: predict_specialist returns the first entry it finds in the answer.
specialist_list = [
    "Cardiologist",
    "Neurologist",
    "Dermatologist",
    "Gastroenterologist",
    "Pulmonologist",
    "ENT",
    "Psychiatrist",
    "Endocrinologist",
    "Nephrologist",
    "Orthopedic",
    "Oncologist",
    "General Physician",
]

# 🏷️ Step 6: Predict Specialist Based on Key Phrases
def predict_specialist(key_phrases):
    """Pick one entry of ``specialist_list`` for a record's key phrases.

    Args:
        key_phrases: List of key-phrase strings for one record.

    Returns:
        str: ``"General Physician"`` when ``key_phrases`` is empty or when no
        list entry appears as a whole word in the model's answer; otherwise
        the first entry of ``specialist_list`` that does.
    """
    import re  # local import so the block does not depend on a top-of-cell import

    if not key_phrases:
        return "General Physician"

    prompt = f"""
You are a medical triage assistant.

Given the following medical key phrases: {', '.join(key_phrases)}

Pick the most appropriate medical specialist from the following list only:
{', '.join(specialist_list)}.

If unsure, return "General Physician".

Answer with ONLY the specialist name.
"""

    response = llm(prompt, max_new_tokens=20, do_sample=False)[0]['generated_text'].strip()
    # Sanitize: force into list if model misbehaves.
    # Match whole words only (an optional plural "s" is tolerated). A raw
    # substring test lets short labels such as "ENT" fire inside unrelated
    # words like "Dentist", "patient" or "treatment".
    for specialist in specialist_list:
        pattern = rf"(?<!\w){re.escape(specialist)}s?(?!\w)"
        if re.search(pattern, response, flags=re.IGNORECASE):
            return specialist
    return "General Physician"

# ⚡ Label every row from its extracted key phrases
df['specialist'] = df['key_phrases'].map(predict_specialist)




✅ Data loaded: (1201, 4)


Device set to use cuda:0


KeyError: "['id'] not in index"

In [12]:
# 📦 Step 7: Save the new labeled dataset
# The destination is defined once so the confirmation message cannot drift from
# the file that is actually written (it previously reported a different name).
output_path = "/content/MTS-Dialog-TrainingSet_LABELED_FIXED3.csv"
final_df = df[['ID', 'section_header', 'section_text', 'specialist']]
final_df.to_csv(output_path, index=False)

print(f"✅ Successfully saved as: {output_path}")

✅ Successfully saved as: MTS-Dialog-TrainingSet_LABELED_FIXED.csv
