In [None]:
!pip install -q kaggle transformers datasets sentence-transformers faiss-cpu peft accelerate evaluate scikit-learn


In [None]:
import random
import pandas as pd

# --- Templates ---

# Sentence templates per risk tier. Placeholders ({age_statement}, {cancer},
# {gene}, {early_age}) are filled in by the generation loop via str.format.

# Tier 0: nothing in the family history suggests a hereditary pattern.
low_risk_templates = [
    "{age_statement}, no significant family history of cancer or inherited disorders.",
    "{age_statement}, one distant relative with late-onset cancer, no other risk factors.",
    "{age_statement}, parents and siblings healthy, no signs of hereditary disease.",
    "{age_statement}, no cancer cases on either side of family.",
]

# Tier 1: a single early-onset case or a few affected relatives.
moderate_risk_templates = [
    "{age_statement}, one first-degree relative with early-onset {cancer}.",
    "{age_statement}, two second-degree relatives with {cancer}.",
    "{age_statement}, one parent diagnosed with {cancer} before age 50.",
    "{age_statement}, sibling diagnosed with {cancer} at {early_age}.",
]

# Tier 2: clustering, several early-onset cases, or a known mutation.
# The last two entries are the breast/ovarian-specific scenarios.
high_risk_templates = [
    "{age_statement}, multiple first-degree relatives with early-onset {cancer}.",
    "{age_statement}, strong family clustering of {cancer}, affecting several generations.",
    "{age_statement}, known hereditary mutation ({gene}) in mother and sibling.",
    "{age_statement}, parent and grandparent both diagnosed with {cancer} before age 45.",
    "{age_statement}, mother had breast cancer before 40 and a maternal aunt had ovarian cancer.",
    "{age_statement}, multiple cases of breast and ovarian cancer on the maternal side, including first-degree relatives.",
]

# Values substituted for the {cancer} placeholder.
cancers = [
    "breast cancer",
    "ovarian cancer",
    "colorectal cancer",
    "thyroid cancer",
    "pancreatic cancer",
    "stomach cancer",
    "melanoma",
    "prostate cancer",
]

# Values substituted for the {gene} placeholder.
genes = "BRCA1 BRCA2 MLH1 MSH2 TP53 PALB2".split()

def random_age():
    """Draw a patient age uniformly from 20 to 70 years, both bounds included."""
    youngest, oldest = 20, 70
    # randrange excludes its upper bound, hence the +1.
    return random.randrange(youngest, oldest + 1)

def random_early_age():
    """Draw an early-onset diagnosis age uniformly from 20 to 45, both bounds included."""
    earliest, latest = 20, 45
    # randrange excludes its upper bound, hence the +1.
    return random.randrange(earliest, latest + 1)

# Seed the RNG so that Restart & Run All regenerates exactly the same CSV
# (and therefore the same train/validation split and metrics downstream).
random.seed(42)

data = []
n_samples = 1000

# Per-tier settings: (template pool, numeric label, category name, rationale).
risk_profiles = {
    "low": (
        low_risk_templates,
        0,
        "low_risk",
        "No signs of hereditary clustering or early-onset cases.",
    ),
    "moderate": (
        moderate_risk_templates,
        1,
        "moderate_risk",
        "Some early-onset cases or first-degree relatives suggest increased risk.",
    ),
    "high": (
        high_risk_templates,
        2,
        "high_risk",
        "Multiple early-onset cancers, strong clustering, or known mutation.",
    ),
}

for _ in range(n_samples):
    # Sample the risk tier with a 40/35/25 class balance.
    case_type = random.choices(
        ["low", "moderate", "high"],
        weights=[0.4, 0.35, 0.25],
        k=1
    )[0]

    # Draw every placeholder value up front; templates that do not use a given
    # placeholder simply ignore the extra keyword passed to str.format.
    age_statement = f"Patient {random_age()} years old"
    cancer = random.choice(cancers)
    gene = random.choice(genes)
    early_age = random_early_age()

    templates, label, category, reason = risk_profiles[case_type]
    template = random.choice(templates)

    case_text = template.format(
        age_statement=age_statement,
        cancer=cancer,
        gene=gene,
        early_age=early_age
    )

    data.append([case_text, label, category, reason])

# Persist the synthetic cases so later cells can reload them from disk.
df = pd.DataFrame(data, columns=["case_text", "label", "category", "reason"])
df.to_csv("genetic_cases.csv", index=False)

df.head(), df.shape

(                                           case_text  label       category  \
 0  Patient 21 years old, two second-degree relati...      1  moderate_risk   
 1  Patient 63 years old, one first-degree relativ...      1  moderate_risk   
 2  Patient 33 years old, no significant family hi...      0       low_risk   
 3  Patient 65 years old, one parent diagnosed wit...      1  moderate_risk   
 4  Patient 20 years old, known hereditary mutatio...      2      high_risk   
 
                                               reason  
 0  Some early-onset cases or first-degree relativ...  
 1  Some early-onset cases or first-degree relativ...  
 2  No signs of hereditary clustering or early-ons...  
 3  Some early-onset cases or first-degree relativ...  
 4  Multiple early-onset cancers, strong clusterin...  ,
 (1000, 4))

In [None]:
import os, pandas as pd

# Sanity check: genetic_cases.csv should be listed in the working directory.
print(os.listdir())
df = pd.read_csv("genetic_cases.csv")

# Only the LAST bare expression of a cell is rendered, so show the preview and
# the shape together instead of silently discarding df.head().
df.head(), df.shape


['.config', 'wandb', 'medtext_lora_results', 'genetic_lora_model', 'genetic_cases.csv', 'sample_data']


(1000, 4)

In [None]:
import pandas as pd

# Read the synthetic genetic triage cases back from disk and preview them.
csv_path = "genetic_cases.csv"
df = pd.read_csv(csv_path)

preview = df.head()
preview, df.shape


(                                           case_text  label       category  \
 0  Patient 21 years old, two second-degree relati...      1  moderate_risk   
 1  Patient 63 years old, one first-degree relativ...      1  moderate_risk   
 2  Patient 33 years old, no significant family hi...      0       low_risk   
 3  Patient 65 years old, one parent diagnosed wit...      1  moderate_risk   
 4  Patient 20 years old, known hereditary mutatio...      2      high_risk   
 
                                               reason  
 0  Some early-onset cases or first-degree relativ...  
 1  Some early-onset cases or first-degree relativ...  
 2  No signs of hereditary clustering or early-ons...  
 3  Some early-onset cases or first-degree relativ...  
 4  Multiple early-onset cancers, strong clusterin...  ,
 (1000, 4))

In [None]:
from datasets import Dataset, DatasetDict


# Triage classes, indexed by the numeric label stored in the CSV.
id2label = dict(enumerate([
    "no_counseling",
    "counseling_recommended",
    "high_risk_counseling",
]))
label2id = {name: class_id for class_id, name in id2label.items()}
label_names = [id2label[class_id] for class_id in sorted(id2label)]

for caption, mapping in (
    ("id2label:", id2label),
    ("label2id:", label2id),
    ("label_names:", label_names),
):
    print(caption, mapping)

# Keep only what the model needs and give the columns model-facing names.
column_renames = {"case_text": "text", "label": "label_id"}
df_model = df[list(column_renames)].rename(columns=column_renames)

# Shuffle once, then carve out 20% of the rows for validation (fixed seeds).
dataset = Dataset.from_pandas(df_model).shuffle(seed=42)
train_test = dataset.train_test_split(test_size=0.2, seed=42)

splits = {"train": train_test["train"], "validation": train_test["test"]}
ds = DatasetDict(splits)

ds


id2label: {0: 'no_counseling', 1: 'counseling_recommended', 2: 'high_risk_counseling'}
label2id: {'no_counseling': 0, 'counseling_recommended': 1, 'high_risk_counseling': 2}
label_names: ['no_counseling', 'counseling_recommended', 'high_risk_counseling']


DatasetDict({
    train: Dataset({
        features: ['text', 'label_id'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['text', 'label_id'],
        num_rows: 200
    })
})

In [None]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
import numpy as np
import evaluate

from peft import LoraConfig, get_peft_model




In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [None]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(example):
    """Tokenize the "text" field, padding/truncating every sequence to 128 tokens."""
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(example["text"], **encoding_options)


# Tokenize both splits, then reshape the columns into what Trainer expects:
# no raw text, and the target stored under "labels".
tokenized_ds = (
    ds.map(tokenize_function, batched=True)
    .remove_columns(["text"])
    .rename_column("label_id", "labels")
)
tokenized_ds.set_format("torch")

tokenized_ds


Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 800
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
})

In [None]:
# Classification head size and label maps are passed to the model config.
num_labels = len(label_names)
classifier_options = {
    "num_labels": num_labels,
    "id2label": id2label,
    "label2id": label2id,
}

base_model = AutoModelForSequenceClassification.from_pretrained(
    model_name, **classifier_options
)

base_model.to(device)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [None]:
# LoRA hyper-parameters; adapters are attached to DistilBERT's query and
# value projections (q_lin / v_lin).
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["q_lin", "v_lin"],
    "lora_dropout": 0.1,
    "bias": "none",
    "task_type": "SEQ_CLS",
}
lora_config = LoraConfig(**lora_settings)

# Wrap the base classifier with the adapters and move it to the device.
model = get_peft_model(base_model, lora_config)
model.to(device)

# Report how many parameters are actually trainable.
model.print_trainable_parameters()


trainable params: 740,355 || all params: 67,696,134 || trainable%: 1.0936


In [None]:
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Turn (logits, labels) into accuracy and weighted-F1 for the Trainer."""
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit.
    predicted_ids = np.argmax(logits, axis=-1)

    accuracy_result = accuracy_metric.compute(
        predictions=predicted_ids, references=labels
    )
    f1_result = f1_metric.compute(
        predictions=predicted_ids, references=labels, average="weighted"
    )
    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}


In [None]:
import transformers

# Per-device batch size shared by training and evaluation.
batch_size = 8

# Record the library version next to the results for reproducibility.
print(f"Transformers version: {transformers.__version__}")


Transformers version: 4.57.3


In [None]:
# Hyper-parameters for the LoRA fine-tuning run.
training_config = {
    "output_dir": "./medtext_lora_results",
    "learning_rate": 1e-4,
    "per_device_train_batch_size": batch_size,
    "per_device_eval_batch_size": batch_size,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
    "logging_steps": 10,
}
training_args = transformers.TrainingArguments(**training_config)

In [None]:
from transformers import Trainer

# `processing_class` is the current name of Trainer's former `tokenizer`
# argument; passing `tokenizer=` is deprecated and emits a FutureWarning.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# No evaluation strategy is configured in TrainingArguments, so evaluate
# explicitly on the validation split once training has finished.
eval_results = trainer.evaluate()
print("\n=== Evaluation results ===")
print(eval_results)


  trainer = Trainer(


Step,Training Loss
10,1.0252
20,0.954
30,0.9584
40,0.8315
50,0.6956
60,0.5894
70,0.3753
80,0.3311
90,0.2635
100,0.1596



=== Evaluation results ===
{'eval_loss': 0.0022435197606682777, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 0.8897, 'eval_samples_per_second': 224.782, 'eval_steps_per_second': 28.098, 'epoch': 3.0}


In [None]:
# Write the fine-tuned model and its tokenizer to the same directory.
export_dir = "genetic_lora_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)


('genetic_lora_model/tokenizer_config.json',
 'genetic_lora_model/special_tokens_map.json',
 'genetic_lora_model/vocab.txt',
 'genetic_lora_model/added_tokens.json',
 'genetic_lora_model/tokenizer.json')

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Run the trained model over the validation split.
predictions = trainer.predict(tokenized_ds["validation"])

y_logits, y_true = predictions.predictions, predictions.label_ids
# Predicted class = index of the largest logit.
y_pred = np.argmax(y_logits, axis=-1)

# Human-readable class names, ordered by class id.
class_names = [id2label[class_id] for class_id in sorted(id2label)]
report = classification_report(y_true, y_pred, target_names=class_names)
matrix = confusion_matrix(y_true, y_pred)

print("=== Classification report ===")
print(report)

print("=== Confusion matrix ===")
print(matrix)


=== Classification report ===
                        precision    recall  f1-score   support

         no_counseling       1.00      1.00      1.00        76
counseling_recommended       1.00      1.00      1.00        78
  high_risk_counseling       1.00      1.00      1.00        46

              accuracy                           1.00       200
             macro avg       1.00      1.00      1.00       200
          weighted avg       1.00      1.00      1.00       200

=== Confusion matrix ===
[[76  0  0]
 [ 0 78  0]
 [ 0  0 46]]


In [None]:
def rule_based_override(text: str, model_pred: int) -> int:
    """
    Apply hand-written escalation rules on top of the model prediction.

    The rules can only escalate a case to class 2 (high_risk_counseling);
    when no rule fires the model prediction is returned unchanged.

    Args:
        text: Free-text case description (matched case-insensitively).
        model_pred: Class id predicted by the model.

    Returns:
        2 if any rule fires, otherwise ``model_pred``.
    """
    import re  # local import keeps this cell runnable on its own

    t = text.lower()

    # Rule 1: mother + early-onset breast cancer (< 40) + ovarian cancer / aunt.
    # The age is parsed from the phrase that follows "breast cancer" instead of
    # searching for the bare substrings "38"/"39" anywhere in the text, which
    # also fired on the patient's own age ("Patient 38 years old, ...").
    diagnosis_age = re.compile(
        r"breast cancer\b[^.;\n]*?\b(at|before)\s+(?:the\s+)?(?:age\s+(?:of\s+)?)?(\d{1,3})\b"
    )
    early_breast_cancer = False
    for match in diagnosis_age.finditer(t):
        qualifier, age = match.group(1), int(match.group(2))
        # "at 39" means diagnosed at 39; "before 40" means strictly younger than 40.
        if (qualifier == "at" and age < 40) or (qualifier == "before" and age <= 40):
            early_breast_cancer = True
            break

    if "mother" in t and early_breast_cancer and ("ovarian cancer" in t or "aunt" in t):
        return 2  # high_risk_counseling

    # Rule 2: a known hereditary-cancer gene is mentioned. PALB2 is included
    # because the synthetic training cases use it as a high-risk mutation.
    # NOTE(review): negated mentions ("no BRCA1 mutation") still trigger this rule.
    known_genes = ("brca1", "brca2", "tp53", "mlh1", "msh2", "pms2", "palb2")
    if any(gene in t for gene in known_genes):
        return 2

    # Rule 3: multiple cancers in the family / among relatives -> high risk.
    if "multiple" in t and "cancer" in t and ("family" in t or "relatives" in t):
        return 2

    return model_pred


In [None]:
# Clear description of genetic counseling triage classes

label_desc = {
    "no_counseling": "No genetic counseling recommended (in this prototype).",
    "counseling_recommended": "Genetic counseling recommended (in this prototype).",
    "high_risk_counseling": "High-risk case: genetic counseling strongly recommended (in this prototype)."
}

def predict_case(text: str):
    """
    Classify one genetic case description and apply the rule-based overrides.

    Args:
        text: Free-text description of the case.

    Returns:
        dict with the following keys:
          - "text": the input text.
          - "pred_id" / "pred_label": final class after the rule overrides.
          - "confidence": softmax probability the model assigned to the FINAL
            class. When a rule overrode the model this can be low, so check
            "rule_override" before interpreting it.
          - "explanation": human-readable description of the final class.
          - "model_pred_id" / "model_pred_label" / "model_confidence": the raw
            model decision before any rule was applied.
          - "rule_override": True when a rule changed the model's prediction.
    """
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        # Single input, so take row 0 of the batch.
        probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()

    # Raw prediction of the model.
    model_pred_id = int(probs.argmax())

    # Apply business rules; these may replace the model's class.
    pred_id = rule_based_override(text, model_pred_id)

    # Final label and the probability the model gave to that label.
    pred_label = id2label[pred_id]
    confidence = float(probs[pred_id])

    result = {
        "text": text,
        "pred_id": pred_id,
        "pred_label": pred_label,
        "confidence": confidence,
        "explanation": label_desc[pred_label],
        # Keep the raw model decision visible so an override is never silent.
        "model_pred_id": model_pred_id,
        "model_pred_label": id2label[model_pred_id],
        "model_confidence": float(probs[model_pred_id]),
        "rule_override": pred_id != model_pred_id,
    }
    return result



In [None]:
# Smoke test: early maternal breast cancer plus an aunt with ovarian cancer.
test_text = (
    "\n"
    "Patient 33 years old, mother had breast cancer at 39 and maternal aunt ovarian cancer at 52.\n"
    "The patient is worried about hereditary risk.\n"
)

predict_case(test_text)



{'text': '\nPatient 33 years old, mother had breast cancer at 39 and maternal aunt ovarian cancer at 52.\nThe patient is worried about hereditary risk.\n',
 'pred_id': 2,
 'pred_label': 'high_risk_counseling',
 'confidence': 0.9805808067321777,
 'explanation': 'High-risk case: genetic counseling strongly recommended (in this prototype).'}

In [None]:
# Made-up genetic counseling guidelines (educational only, not real clinical
# guidance). Each entry is one standalone sentence.

guidelines = [
    "Multiple first-degree relatives with early-onset breast or ovarian cancer may indicate a hereditary cancer syndrome and justify genetic counseling.",
    "Early-onset colorectal cancer in a patient or close relatives, especially under age 50, can suggest a hereditary colorectal cancer syndrome.",
    "Known pathogenic variants in genes such as BRCA1, BRCA2, or mismatch repair genes in close relatives strongly support referral for genetic counseling.",
    "Clusters of similar cancers across generations in the same side of the family may indicate an inherited cancer risk.",
    "In the absence of strong family history or early-onset disease, genetic counseling may be less urgent but can still be considered if the patient is anxious.",
    "Any recommendation for genetic testing or counseling must ultimately be confirmed by a qualified genetics professional. This system is only an educational prototype."
]

# Show how many guidelines there are, followed by the full list.
len(guidelines), guidelines



(6,
 ['Multiple first-degree relatives with early-onset breast or ovarian cancer may indicate a hereditary cancer syndrome and justify genetic counseling.',
  'Early-onset colorectal cancer in a patient or close relatives, especially under age 50, can suggest a hereditary colorectal cancer syndrome.',
  'Known pathogenic variants in genes such as BRCA1, BRCA2, or mismatch repair genes in close relatives strongly support referral for genetic counseling.',
  'Clusters of similar cancers across generations in the same side of the family may indicate an inherited cancer risk.',
  'In the absence of strong family history or early-onset disease, genetic counseling may be less urgent but can still be considered if the patient is anxious.',
  'Any recommendation for genetic testing or counseling must ultimately be confirmed by a qualified genetics professional. This system is only an educational prototype.'])

In [None]:
from sentence_transformers import SentenceTransformer
import faiss

# Sentence encoder used for both the guidelines and incoming case texts;
# it runs on the same device as the classifier.
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name, device=device)

# One embedding row per guideline sentence.
guideline_embeddings = embed_model.encode(guidelines, convert_to_numpy=True)

# Exact (brute-force) L2 index sized to the embedding dimensionality.
_, d = guideline_embeddings.shape
index = faiss.IndexFlatL2(d)
index.add(guideline_embeddings)

index.ntotal


6

In [None]:
def retrieve_rules(case_text: str, k: int = 3):
    """
    Return the guidelines closest to the case text, nearest first.

    Args:
        case_text: Free-text case description to embed and look up.
        k: Maximum number of guidelines to return. Values larger than the
            number of indexed guidelines are clamped; k <= 0 yields [].

    Returns:
        List of dicts with keys "guideline" (the guideline sentence) and
        "distance" (the distance reported by the FAISS index, as a float).
    """
    # Never ask FAISS for more neighbours than the index holds: it pads the
    # result with id -1, and guidelines[-1] would silently return the LAST
    # guideline as if it were a genuine match.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    case_embedding = embed_model.encode([case_text], convert_to_numpy=True)
    distances, indices = index.search(case_embedding, k)

    retrieved = []
    for dist, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # Defensive: skip padding entries if the index still returns any.
            continue
        retrieved.append({
            "guideline": guidelines[int(idx)],
            "distance": float(dist)
        })
    return retrieved


In [None]:
def explain_case(case_text: str):
    """
    Run the full demo pipeline for one case description.

    The case is classified with ``predict_case``, the three nearest guideline
    sentences are fetched with ``retrieve_rules``, and both are combined into
    an English narrative: summary, guideline bullets, disclaimer.

    Returns:
        dict with keys "prediction" (output of ``predict_case``), "rules"
        (output of ``retrieve_rules``) and "explanation" (the narrative text).
    """
    prediction = predict_case(case_text)
    matched_rules = retrieve_rules(case_text, k=3)

    # Headline: predicted class and the reported confidence.
    summary = (
        "According to this prototype, the case is classified as: {}. "
        "Model confidence: {:.2f}.\n"
        "This classification estimates the priority level for genetic counseling based on the described family history."
    ).format(prediction["pred_label"], prediction["confidence"])

    # One bullet per retrieved guideline, in retrieval order.
    bullet_lines = [f"- {rule['guideline']}" for rule in matched_rules]
    guideline_section = "Most relevant genetic guidelines found:\n" + "\n".join(bullet_lines)

    # Always close with the educational-use disclaimer.
    disclaimer = (
        "\n\n⚠️ IMPORTANT: This tool is for educational purposes only, built on synthetic cases. "
        "It does NOT replace real medical advice. All decisions must be made by qualified "
        "healthcare professionals specialized in genetics."
    )

    return {
        "prediction": prediction,
        "rules": matched_rules,
        "explanation": "".join([summary, "\n\n", guideline_section, disclaimer]),
    }



In [None]:
# End-to-end demo on a single free-text case.
case_example = (
    "Patient 29 years old. Mother diagnosed with breast cancer at 38 and "
    "maternal grandmother with ovarian cancer at 52. "
    "The patient is asking whether genetic counseling is needed.\n"
)

result = explain_case(case_example)

print("=== PREDICTION ===", result["prediction"], sep="\n")

print("\n=== EXPLANATION ===\n", result["explanation"], sep="\n")



=== PREDICTION ===
{'text': 'Patient 29 years old. Mother diagnosed with breast cancer at 38 and maternal grandmother with ovarian cancer at 52. The patient is asking whether genetic counseling is needed.\n', 'pred_id': 2, 'pred_label': 'high_risk_counseling', 'confidence': 0.6286334991455078, 'explanation': 'High-risk case: genetic counseling strongly recommended (in this prototype).'}

=== EXPLANATION ===

According to this prototype, the case is classified as: high_risk_counseling. Model confidence: 0.63.
This classification estimates the priority level for genetic counseling based on the described family history.

Most relevant genetic guidelines found:
- Multiple first-degree relatives with early-onset breast or ovarian cancer may indicate a hereditary cancer syndrome and justify genetic counseling.
- In the absence of strong family history or early-onset disease, genetic counseling may be less urgent but can still be considered if the patient is anxious.
- Any recommendation for 