# Load libraries

In [None]:
# Install necessary libraries
! pip install datasets
! pip install huggingface_hub
! pip install -U sentence-transformers
! pip install peft
! pip install wandb
# Connect to hugging face and wandb
! huggingface-cli login --token YOUR_HF_TOKEN
! wandb login YOUR_WANDB_TOKEN

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
from datasets import load_dataset

from sentence_transformers import SentenceTransformer, SentenceTransformerTrainingArguments, SentenceTransformerTrainer
from transformers import EarlyStoppingCallback
from sentence_transformers.losses import MultipleNegativesRankingLoss, TripletLoss
from sentence_transformers.training_args import BatchSamplers # to remove duplicates while training
from sentence_transformers.evaluation import TripletEvaluator, SequentialEvaluator, SentenceEvaluator

from peft import LoraConfig, IA3Config, get_peft_model

import os
import logging

# For custom evaluator
import torch.nn.functional as F
import torch
import csv
from contextlib import nullcontext

In [None]:
# Weights & Biases settings read by the trainer integration:
# the run is logged to the "medical" project and every saved checkpoint is uploaded as an artifact.
os.environ.update(
    {
        "WANDB_PROJECT": "medical",
        "WANDB_LOG_MODEL": "checkpoint",
    }
)

# Load triplet dataset

In [None]:
# Fetch the medical triplet dataset from the Hugging Face Hub (train / val / test splits).
dataset = load_dataset("bebeyondo/medical-triplet")

README.md:   0%|          | 0.00/666 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/44.9M [00:00<?, ?B/s]

val-00000-of-00001.parquet:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/15.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/52838 [00:00<?, ? examples/s]

Generating val split:   0%|          | 0/17613 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17613 [00:00<?, ? examples/s]

In [None]:
# Display the splits, column names and row counts of the loaded DatasetDict.
dataset

DatasetDict({
    train: Dataset({
        features: ['pairs_unique_id', 'idx', 'anchor', 'positive', 'negative'],
        num_rows: 52838
    })
    val: Dataset({
        features: ['pairs_unique_id', 'idx', 'anchor', 'positive', 'negative'],
        num_rows: 17613
    })
    test: Dataset({
        features: ['pairs_unique_id', 'idx', 'anchor', 'positive', 'negative'],
        num_rows: 17613
    })
})

In [None]:
# Drop the "idx" bookkeeping column from every split; it is not one of the anchor/positive/negative text columns.
dataset = dataset.remove_columns("idx")

In [None]:
# Drop the "pairs_unique_id" bookkeeping column from every split; it is not one of the anchor/positive/negative text columns.
dataset = dataset.remove_columns("pairs_unique_id")

In [None]:
# Display the DatasetDict again to check which columns are left in each split.
dataset

DatasetDict({
    train: Dataset({
        features: ['anchor', 'positive', 'negative'],
        num_rows: 52838
    })
    val: Dataset({
        features: ['anchor', 'positive', 'negative'],
        num_rows: 17613
    })
    test: Dataset({
        features: ['anchor', 'positive', 'negative'],
        num_rows: 17613
    })
})

In [None]:
# Bind each split to its own name: "train" for fitting, "val" for evaluation during training, "test" held out.
train_dataset, eval_dataset, test_dataset = dataset["train"], dataset["val"], dataset["test"]

# Model

In [None]:
# Load the pretrained all-MiniLM-L6-v2 sentence encoder; device="cuda" places it on the GPU, so a CUDA runtime is required.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")



## LoRA/DoRA/IA3

In [None]:
# Freeze every weight of the base encoder; only the adapter weights injected below stay trainable.
model.requires_grad_(False)

# LoRA adapter on the attention query/value projections (use_dora=True would switch it to DoRA).
config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.01,
    bias="none",
    use_dora=False,
    target_modules=["value", "query"],
)

# IA3 alternative. It is only constructed here: the call below wraps the model with `config`,
# so pass `peft_config` to get_peft_model instead to train with IA3.
peft_config = IA3Config(
    feedforward_modules=["intermediate.dense"],
    target_modules=["key", "value", "intermediate.dense"],
)

# Wrap the sentence encoder with the LoRA adapter.
model = get_peft_model(model, config)

### Check number of trainable params

In [None]:
# Print how many parameters are trainable versus the total parameter count of the wrapped model.
model.print_trainable_parameters()

trainable params: 147,456 || all params: 22,860,672 || trainable%: 0.6450


In [None]:
# List every parameter with its shape and requires_grad flag to check which weights will be updated.
for name, param in model.named_parameters():
    print(f"Name: {name}, Size: {param.size()}, Requires Grad: {param.requires_grad}")

Name: base_model.model.0.auto_model.embeddings.word_embeddings.weight, Size: torch.Size([30522, 384]), Requires Grad: False
Name: base_model.model.0.auto_model.embeddings.position_embeddings.weight, Size: torch.Size([512, 384]), Requires Grad: False
Name: base_model.model.0.auto_model.embeddings.token_type_embeddings.weight, Size: torch.Size([2, 384]), Requires Grad: False
Name: base_model.model.0.auto_model.embeddings.LayerNorm.weight, Size: torch.Size([384]), Requires Grad: False
Name: base_model.model.0.auto_model.embeddings.LayerNorm.bias, Size: torch.Size([384]), Requires Grad: False
Name: base_model.model.0.auto_model.encoder.layer.0.attention.self.query.base_layer.weight, Size: torch.Size([384, 384]), Requires Grad: False
Name: base_model.model.0.auto_model.encoder.layer.0.attention.self.query.base_layer.bias, Size: torch.Size([384]), Requires Grad: False
Name: base_model.model.0.auto_model.encoder.layer.0.attention.self.query.lora_A.default.weight, Size: torch.Size([16, 384]), 

In [None]:
# Display the module tree of the PEFT-wrapped model.
model

PeftModel(
  (base_model): LoraModel(
    (model): SentenceTransformer(
      (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
      (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
      (2): Normalize()
    )
  )
)

# Custom evaluator

In [None]:
# Module-level logger used by the custom evaluator defined below.
logger = logging.getLogger(__name__)

In [None]:
class UniformityAlignmentEvaluator(SentenceEvaluator):
    """
    Evaluator for Alignment and Uniformity metrics on (anchor, positive, negative) triplets.

    Calling the evaluator encodes the three sentence lists with the model and reports:

    - ``alignment``: mean squared L2 distance between normalized anchor and positive embeddings.
    - ``alignment_negative``: the same distance between anchors and negatives.
    - ``uniformity``: ``log(mean(exp(-2 * d^2)))`` over all pairwise distances ``d`` inside one
      embedding set, averaged over the anchor, positive and negative sets.
    - ``accuracy_cos``: share of triplets whose anchor is more cosine-similar to the positive
      than to the negative.

    Args:
        anchors: Anchor sentences.
        positives: Positive sentences, index-aligned with ``anchors``.
        negatives: Negative sentences, index-aligned with ``anchors``.
        name: Label used in the log message, in the CSV file name and as metric-key prefix.
        batch_size: Batch size passed to ``model.encode``.
        show_progress_bar: Whether ``model.encode`` shows a progress bar. ``None`` enables it
            only when the logger level is INFO or DEBUG.
        write_csv: Whether to write the results to a CSV file when ``output_path`` is given.
        truncate_dim: If set, embeddings are truncated to this dimension during evaluation.
    """

    def __init__(
        self,
        anchors: list[str],
        positives: list[str],
        negatives: list[str],
        name: str = "",
        batch_size: int = 16,
        show_progress_bar: bool = True,
        write_csv: bool = True,
        truncate_dim: int | None = None,
    ):
        super().__init__()
        self.anchors = anchors
        self.positives = positives
        self.negatives = negatives
        self.name = name
        self.truncate_dim = truncate_dim

        # The three lists are compared element-wise, so they must have the same length.
        assert len(self.anchors) == len(self.positives), "anchors and positives must have the same length"
        assert len(self.anchors) == len(self.negatives), "anchors and negatives must have the same length"

        self.batch_size = batch_size
        if show_progress_bar is None:
            show_progress_bar = (
                logger.getEffectiveLevel() == logging.INFO or logger.getEffectiveLevel() == logging.DEBUG
            )
        self.show_progress_bar = show_progress_bar
        self.write_csv = write_csv
        self.primary_metric = "alignment"

        self.csv_file = "uniformity_alignment_evaluation" + ("_" + name if name else "") + "_results.csv"
        # One header per value written in __call__ (the accuracy column was previously missing).
        self.csv_headers = ["epoch", "steps", "alignment", "uniformity", "alignment_negative", "accuracy_cos"]

    def align_loss(self, x, y, alpha=2):
        """
        Return the mean over rows of ``||x_i - y_i||_2 ** alpha``.

        The inputs are used as given; callers are expected to L2-normalize them first
        (see https://github.com/princeton-nlp/SimCSE/issues/41).
        """
        return (x - y).norm(p=2, dim=1).pow(alpha).mean()

    def uniform_loss(self, x, t=2):
        """
        Return ``log(mean(exp(-t * d^2)))`` over all pairwise L2 distances ``d`` between rows of ``x``.

        The input is used as given; callers are expected to L2-normalize it first.
        """
        return torch.pdist(x, p=2).pow(2).mul(-t).exp().mean().log()

    def calculate_alignment_and_uniformity(self, embeddings_anchors, embeddings_positives, embeddings_negatives):
        """
        Compute the alignment and uniformity metrics for one set of triplet embeddings.

        Returns:
            Tuple ``(alignment, alignment_negative, uniformity)`` of scalar tensors.
        """
        # Normalize embeddings before computing alignment and uniformity
        embeddings_anchors = F.normalize(embeddings_anchors, dim=-1)
        embeddings_positives = F.normalize(embeddings_positives, dim=-1)
        embeddings_negatives = F.normalize(embeddings_negatives, dim=-1)

        # Alignment of the anchors with their positives and, for reference, with their negatives
        alignment = self.align_loss(embeddings_anchors, embeddings_positives)
        alignment_negative = self.align_loss(embeddings_anchors, embeddings_negatives)

        # Uniformity is computed per embedding set and then averaged over the three sets
        uniform_anchors = self.uniform_loss(embeddings_anchors)
        uniform_positives = self.uniform_loss(embeddings_positives)
        uniform_negatives = self.uniform_loss(embeddings_negatives)
        uniformity = (uniform_anchors + uniform_positives + uniform_negatives) / 3

        return alignment, alignment_negative, uniformity

    def calculate_accuracy_cos(self, embeddings_anchors, embeddings_positives, embeddings_negatives):
        """Return the fraction of triplets with cos(anchor, positive) > cos(anchor, negative)."""
        # Cosine similarity (higher is more similar)
        pos_cos_similarity = F.cosine_similarity(embeddings_anchors, embeddings_positives)
        neg_cos_similarity = F.cosine_similarity(embeddings_anchors, embeddings_negatives)

        # Count triplet accuracy based on cosine similarity
        num_triplets = len(pos_cos_similarity)
        num_correct_cos_triplets = torch.sum(pos_cos_similarity > neg_cos_similarity).item()

        accuracy_cos = num_correct_cos_triplets / num_triplets
        return accuracy_cos

    def _encode(self, model, sentences: list[str]) -> torch.Tensor:
        """Encode ``sentences`` into a single tensor, moved to the GPU only when one is available."""
        embeddings = model.encode(
            sentences,
            batch_size=self.batch_size,
            show_progress_bar=self.show_progress_bar,
            convert_to_tensor=True,
        )
        # Previously an unconditional .to("cuda"), which raised on CPU-only machines.
        if torch.cuda.is_available():
            embeddings = embeddings.to("cuda")
        return embeddings

    def __call__(
        self, model: SentenceTransformer, output_path: str | None = None, epoch: int = -1, steps: int = -1
    ) -> dict[str, float]:
        """
        Encode the triplets with ``model`` and compute all metrics.

        Args:
            model: Model used to encode the sentences.
            output_path: Directory for the CSV results file; nothing is written when ``None``
                or when ``write_csv`` is False.
            epoch: Epoch number used in the log message and the CSV row (-1 if not applicable).
            steps: Step number used in the log message and the CSV row (-1 if not applicable).

        Returns:
            Dict with the keys ``alignment``, ``uniformity``, ``alignment_negative`` and
            ``accuracy_cos``, passed through ``prefix_name_to_metrics`` with ``self.name``.
        """
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        if self.truncate_dim is not None:
            out_txt += f" (truncated to {self.truncate_dim})"

        logger.info(f"UniformityAlignmentEvaluator: Evaluating the model on the {self.name} dataset{out_txt}:")

        with nullcontext() if self.truncate_dim is None else model.truncate_sentence_embeddings(self.truncate_dim):
            embeddings_anchors = self._encode(model, self.anchors)
            embeddings_positives = self._encode(model, self.positives)
            embeddings_negatives = self._encode(model, self.negatives)

        # Compute accuracy based on cosine similarity
        accuracy_cos = float(
            self.calculate_accuracy_cos(embeddings_anchors, embeddings_positives, embeddings_negatives)
        )

        # Computed on the full embedding matrices at once (torch.pdist builds all pairwise distances).
        alignment, alignment_negative, uniformity = self.calculate_alignment_and_uniformity(
            embeddings_anchors, embeddings_positives, embeddings_negatives
        )
        # Convert the scalar tensors to plain floats so that the CSV holds numbers
        # rather than tensor reprs such as "tensor(0.78, device='cuda:0')".
        alignment = float(alignment)
        alignment_negative = float(alignment_negative)
        uniformity = float(uniformity)

        logger.info(f"Alignment: {alignment:.6f}")
        logger.info(f"Uniformity: {uniformity:.6f}")
        logger.info(f"Alignment negative: {alignment_negative:.6f}")
        logger.info(f"Accuracy Cosine Distance:   \t{accuracy_cos * 100:.2f}%")

        if output_path is not None and self.write_csv:
            os.makedirs(output_path, exist_ok=True)
            csv_path = os.path.join(output_path, self.csv_file)
            # Write the header only when the file is created; later calls append rows.
            is_new_file = not os.path.isfile(csv_path)
            with open(csv_path, newline="", mode="w" if is_new_file else "a", encoding="utf-8") as f:
                writer = csv.writer(f)
                if is_new_file:
                    writer.writerow(self.csv_headers)
                writer.writerow([epoch, steps, alignment, uniformity, alignment_negative, accuracy_cos])

        metrics = {
            "alignment": alignment,
            "uniformity": uniformity,
            "alignment_negative": alignment_negative,
            "accuracy_cos": accuracy_cos,
        }
        metrics = self.prefix_name_to_metrics(metrics, self.name)
        self.store_metrics_in_model_card_data(model, metrics)
        return metrics

    @property
    def description(self) -> str:
        """Human-readable name of this evaluator."""
        return "Uniformity and Alignment Evaluation"


# Train

InfoNCE loss is synonymous with MultipleNegativesRankingLoss (source: https://github.com/UKPLab/sentence-transformers/issues/1305).

In [None]:
# 4. Define a loss function
# MultipleNegativesRankingLoss is the active choice; the commented-out TripletLoss line is the alternative.
#loss = TripletLoss(model)
loss = MultipleNegativesRankingLoss(model)

In [None]:
# 5. (Optional) Specify training arguments
# See the documentation for the full list of parameters:
# https://sbert.net/docs/package_reference/sentence_transformer/training_args.html
args = SentenceTransformerTrainingArguments(
    # Required parameter:
    output_dir="models/med",
    # Optional training parameters:
    num_train_epochs=10, # initially 5
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    learning_rate=2e-3, # initially 2e-5
    warmup_ratio=0.1,
    fp16=True,  # Set to False if you get an error that your GPU can't run on FP16
    bf16=False,  # Set to True if you have a GPU that supports BF16
    batch_sampler=BatchSamplers.NO_DUPLICATES,  # losses that use "in-batch negatives" benefit from no duplicates
    # Optional tracking/debugging parameters:
    eval_strategy="epoch", # evaluate once per epoch (was "steps")
    #eval_steps=500,
    save_strategy="epoch", # save a checkpoint once per epoch (was "steps")
    #load_best_model_at_end=True,
    #save_steps=500,
    save_total_limit=2,
    logging_steps=500, # how often (in steps) the training loss is logged; may be raised
    run_name="lora-lr2-3-bs-128",  # wandb, name of the current run
    seed=42
)

### Checking evaluator

In [None]:
# Build the custom evaluator on the validation triplets.
# `name` becomes the prefix of every reported metric key (e.g. "esg-embedding-dev_alignment").
dev_evaluator = UniformityAlignmentEvaluator(
    anchors=eval_dataset["anchor"],
    positives=eval_dataset["positive"],
    negatives=eval_dataset["negative"],
    name="esg-embedding-dev",
)

In [None]:
# Run the evaluator once before training to check that it works and to record the pre-training metrics.
dev_evaluator(model)

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

{'esg-embedding-dev_alignment': 0.7817179560661316,
 'esg-embedding-dev_uniformity': -2.971881866455078,
 'esg-embedding-dev_alignment_negative': 0.9833359122276306,
 'esg-embedding-dev_accuracy_cos': 0.6634304207119741}

## Run

In [None]:
# 7. Create a trainer & train
# At every evaluation the trainer reports the validation loss on eval_dataset
# together with the metrics returned by dev_evaluator.
trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    loss=loss,
    evaluator=dev_evaluator,
)
trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marina-zemchik[0m ([33mesg-x-ml[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Esg-embedding-dev Alignment,Esg-embedding-dev Uniformity,Esg-embedding-dev Alignment Negative,Esg-embedding-dev Accuracy Cos
1,No log,1.708326,0.831963,-3.049641,1.166945,0.792426
2,1.861000,1.630616,0.929821,-3.284262,1.279409,0.798671
3,1.614800,1.558068,0.864562,-3.250219,1.214203,0.79112
4,1.507000,1.531311,0.893688,-3.327269,1.250639,0.796741
5,1.419200,1.502824,0.901476,-3.388221,1.263765,0.797536
6,1.419200,1.478977,0.886646,-3.376331,1.252486,0.800999
7,1.334400,1.46139,0.882336,-3.397685,1.242909,0.798217
8,1.271100,1.444876,0.895984,-3.45711,1.265929,0.798388
9,1.207400,1.436888,0.918717,-3.506334,1.29232,0.80117
10,1.157900,1.433611,0.930934,-3.522459,1.30767,0.803838


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-413)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-826)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-1239)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-1652)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-2065)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-2478)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-2891)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-3304)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-3717)... Done. 0.0s
[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-4130)... Done. 0.0s


Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

Batches:   0%|          | 0/1101 [00:00<?, ?it/s]

[34m[1mwandb[0m: Adding directory to artifact (./models/med/checkpoint-4130)... Done. 0.0s
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


TrainOutput(global_step=4130, training_loss=1.4130903156271282, metrics={'train_runtime': 4395.5721, 'train_samples_per_second': 120.207, 'train_steps_per_second': 0.94, 'total_flos': 0.0, 'train_loss': 1.4130903156271282, 'epoch': 10.0})

In [None]:
# (Optional) Evaluate the trained model on the test set
#test_evaluator = UniformityAlignmentEvaluator(
#    anchors=test_dataset["anchor"],
#    positives=test_dataset["positive"],
#    negatives=test_dataset["negative"],
#    name="esg-embedding-test",
#)
#test_evaluator(model)

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import numpy as np
# Fresh copy of the pretrained encoder without any adapter.
# NOTE(review): presumably meant as a baseline for the fine-tuned `model`; it is not referenced in the cells visible here.
base_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device="cuda")



In [None]:
# Load the medical-transcription dataset and keep only the columns needed for classification.
# NOTE: this rebinds `dataset`, which previously held the triplet dataset.
dataset = load_dataset("DataFog/medical-transcription-instruct")
dataset['train'] = dataset['train'].remove_columns(['instruction', 'task_output'])
df = pd.DataFrame(dataset['train'])
# Whitespace-separated word count of each description.
df['num_tokens'] = df['description'].apply(lambda text: len(text.split()))
# De-duplicate by transcription text and then by sample name. The second pass must start from
# df_cleaned: calling it on df again (as before) silently discarded the first de-duplication.
df_cleaned = df.drop_duplicates(subset=['transcription'])
df_cleaned = df_cleaned.drop_duplicates(subset=['sample_name'])
# Keep only descriptions of at least 50 characters.
train_df= df_cleaned[df_cleaned['description'].str.len() >= 50]


README.md:   0%|          | 0.00/3.16k [00:00<?, ?B/s]

(…)tafog-medical-transcription-instruct.csv:   0%|          | 0.00/138M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/38924 [00:00<?, ? examples/s]

In [None]:
from sklearn.model_selection import train_test_split
# Features are the free-text descriptions; the label is the medical specialty.
X = train_df['description']
y = train_df['medical_specialty']

# Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Embed the descriptions with `model`, i.e. the PEFT-wrapped encoder that was passed to the trainer
# (not `base_model`).
print("Generating embeddings for training data...")
X_train_embeddings = model.encode(X_train.to_list(), batch_size=64, show_progress_bar=True)

print("Generating embeddings for test data...")
X_test_embeddings = model.encode(X_test.to_list(), batch_size=64, show_progress_bar=True)

Generating embeddings for training data...


Batches:   0%|          | 0/26 [00:00<?, ?it/s]

Generating embeddings for test data...


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
# Fit a logistic-regression classifier on the sentence embeddings to predict the medical specialty.
clf = LogisticRegression(random_state=42, max_iter=100)
clf.fit(X_train_embeddings, y_train)

# Predict the specialty of the held-out descriptions.
y_pred = clf.predict(X_test_embeddings)

# Overall accuracy on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1.
# NOTE(review): the warnings emitted by this call presumably come from classes that never get predicted — confirm.
print(classification_report(y_test, y_pred))

Accuracy: 0.6143
                                precision    recall  f1-score   support

    Cardiovascular / Pulmonary       0.00      0.00      0.00         3
    Consult - History and Phy.       0.75      0.19      0.30        16
                   Dermatology       0.00      0.00      0.00         1
             Discharge Summary       0.00      0.00      0.00         4
        Emergency Room Reports       0.00      0.00      0.00         2
              Gastroenterology       0.00      0.00      0.00         3
              General Medicine       0.41      0.50      0.45        30
         Hematology - Oncology       0.00      0.00      0.00         5
     Hospice - Palliative Care       0.00      0.00      0.00         1
        IME-QME-Work Comp etc.       0.00      0.00      0.00         3
                       Letters       0.00      0.00      0.00         1
                    Nephrology       1.00      0.25      0.40         4
                     Neurology       0.40     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
