## **Text Summarization on anonymized data**

In [None]:
!pip install --upgrade --no-cache-dir transformers datasets accelerate

In [None]:
!pip install evaluate

In [None]:
!pip install rouge_score

#### **TRAINING CLEAR (training anon + test clear)**

In [4]:
# Hugging Face Transformers
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import LEDTokenizer, LEDForConditionalGeneration


# Loading datasets
dataset = load_dataset("csv", data_files="anonymized_dataset_sample.csv", sep=',', split='train')
print(dataset.column_names)

dataset_clear = dataset.map(lambda x: {"full_note": x["full_note"], "summary": x["note"]})
dataset_anon = dataset.map(lambda x: {"full_note": x["anonym_full_note"], "summary": x["anonym_note"]})

dataset_clear_split = dataset_clear.train_test_split(test_size=0.2, seed=42)
train_data_clear = dataset_clear_split["train"]
test_data_clear = dataset_clear_split["test"]

dataset_anon_split = dataset_anon.train_test_split(test_size=0.2, seed=42)
train_data_anon = dataset_anon_split["train"]
test_data_anon = dataset_anon_split["test"]

# Tokenization
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

def preprocess(example):
    inputs = tokenizer(example["full_note"], truncation=True, padding="max_length", max_length=4096)
    targets = tokenizer(example["summary"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_data_clear = train_data_clear.map(preprocess, batched=True)
train_data_anon = train_data_anon.map(preprocess, batched=True)
test_data_clear = test_data_clear.map(preprocess, batched=True)
test_data_anon = test_data_anon.map(preprocess, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

['index', 'note', 'anonym_note', 'sensitive_entity_note', 'full_note', 'anonym_full_note', 'sensitive_entity_full_note']


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [5]:
# Training Arguments
from transformers import LongformerForSequenceClassification
model_clear = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Enable gradient checkpointing on the model
model_clear.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results_clear_anonymds",        # folder for checkpoints
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,                # effectively larger batch size
    fp16=True,                                    # mixed precision training
    save_strategy="epoch",                        # save checkpoint at every epoch
    num_train_epochs=2,

    # Logging
    logging_dir="./logs",
    logging_steps=50,                             # more frequent logging
    report_to=["tensorboard","wandb"],            # log to TensorBoard and W&B

    # Learning rate and optimization
    learning_rate=2e-5,                           # more stable LR for small batches
    weight_decay=0.01,                            # regularization
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    warmup_steps=100                              # warmup during first steps
)

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [6]:
# Trainer
trainer_clear = Trainer(
    model=model_clear,
    args=training_args,
    train_dataset=train_data_anon,
    eval_dataset=test_data_clear,
)
trainer_clear.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraffaeleaurucci[0m ([33mraffaeleaurucci-universit-degli-studi-di-salerno[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.5113
100,0.228
150,0.1722
200,0.1312
250,0.1267
300,0.1085
350,0.112
400,0.1124
450,0.1069
500,0.1025


TrainOutput(global_step=500, training_loss=0.17115892028808594, metrics={'train_runtime': 8893.051, 'train_samples_per_second': 0.9, 'train_steps_per_second': 0.056, 'total_flos': 2.1601638678528e+16, 'train_loss': 0.17115892028808594, 'epoch': 2.0})

In [8]:
!zip -r results_clear_anonymds.zip results_clear_anonymds/checkpoint-500/

  adding: results_clear_anonymds/checkpoint-500/ (stored 0%)
  adding: results_clear_anonymds/checkpoint-500/scaler.pt (deflated 60%)
  adding: results_clear_anonymds/checkpoint-500/generation_config.json (deflated 33%)
  adding: results_clear_anonymds/checkpoint-500/trainer_state.json (deflated 70%)
  adding: results_clear_anonymds/checkpoint-500/config.json (deflated 61%)
  adding: results_clear_anonymds/checkpoint-500/training_args.bin (deflated 51%)
  adding: results_clear_anonymds/checkpoint-500/scheduler.pt (deflated 56%)
  adding: results_clear_anonymds/checkpoint-500/rng_state.pth (deflated 25%)
  adding: results_clear_anonymds/checkpoint-500/model.safetensors (deflated 11%)
  adding: results_clear_anonymds/checkpoint-500/optimizer.pt (deflated 14%)


In [10]:
losses = [x["loss"] for x in trainer_clear.state.log_history if "loss" in x]

for step, loss in enumerate(losses, 1):
    print(f"Step {step} - Training Loss: {loss}")

Step 1 - Training Loss: 0.5113
Step 2 - Training Loss: 0.228
Step 3 - Training Loss: 0.1722
Step 4 - Training Loss: 0.1312
Step 5 - Training Loss: 0.1267
Step 6 - Training Loss: 0.1085
Step 7 - Training Loss: 0.112
Step 8 - Training Loss: 0.1124
Step 9 - Training Loss: 0.1069
Step 10 - Training Loss: 0.1025


In [12]:
from transformers import pipeline, logging
from tqdm import tqdm

# Disable warnings
logging.set_verbosity_error()

# Disable gradient checkpointing
model_clear.gradient_checkpointing_disable()

# Summarization pipeline
summarizer_clear = pipeline("summarization", model=model_clear, tokenizer=tokenizer, device=0)

# Test set
test_texts = [ex["full_note"] for ex in test_data_clear]

# List to store the generated summaries
generated_clear = []

# Progress bar with tqdm
for text in tqdm(test_texts, desc="Generating summary"):
    summary = summarizer_clear(
        text,
        max_length=512,
        min_length=400,
        truncation=True
    )[0]["summary_text"]
    generated_clear.append(summary)

# Number of inferences
num_inferenze = len(generated_clear)
print(f"Number of inferences: {num_inferenze}")

# Reference summaries
references = [ex["summary"] for ex in test_data_anon]

Generating summary: 100%|██████████| 1000/1000 [57:35<00:00,  3.46s/it]


Number of inferences: 1000


In [18]:
generated_clear[0]

'The patient was a 65-year-old male who presented with a swelling of the posterior surface of the proximal third of his left leg. His prior history included a prolonged stay in hospital for treatment of bacterial endocarditis, when two mitral valve replacement operations were performed. He had also previously undergone two abdominal operations to treat an intestinal tumor and one varicose veins surgery.\r\nPhysical examination revealed a pulsating mass in the posterior region of the proximal third of the left leg. Femoral, popliteal, and dorsal pedal pulses were palpable and normal in both lower limbs. The posterior tibial artery pulse was absent, whereas the posterior tibial artery pulse was palpable in the right lower limb.\r\nMagnetic resonance angiography showed a saccular dilatation in the tibioperoneal trunk with a 4.4 cm diameter, at the level of the origin of the posterior tibial artery. The posterior tibial artery was also occluded ( ). Investigation was supplemented with labo

In [14]:
# Evaluations
import evaluate

# ROUGE
rouge = evaluate.load("rouge")

results_clear = rouge.compute(predictions=generated_clear, references=references)
print("ROUGE - Model on clear data:")
for k, v in results_clear.items():
    print(f"{k}: {v:.4f}")

ROUGE - Model on clear data:
rouge1: 0.5549
rouge2: 0.4063
rougeL: 0.5169
rougeLsum: 0.5382


In [16]:
# BLEU
bleu = evaluate.load("bleu")

references_bleu = [[ref] for ref in references]

results_bleu = bleu.compute(predictions=generated_clear, references=references_bleu)

print(f"BLEU score: {results_bleu['bleu']:.4f}")

BLEU score: 0.1900


In [17]:
from statistics import mean

texts = [ex["summary"] for ex in train_data_clear]
token_lengths = [len(tokenizer(text)["input_ids"]) for text in texts]
avg_tokens = mean(token_lengths)

print(f"Average token per entry in train set: {avg_tokens:.2f}")

Average token per entry in train set: 444.89


#### **TRAINING ANON (training anon + test anon)**

In [12]:
from transformers import LongformerForSequenceClassification
model_anon = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Enable gradient checkpointing on the model
model_anon.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results_anon_anonymds",        # folder for checkpoints
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,                # effectively larger batch size
    fp16=True,                                    # mixed precision training
    save_strategy="epoch",                        # save checkpoint at every epoch
    num_train_epochs=2,

    # Logging
    logging_dir="./logs",
    logging_steps=50,                             # more frequent logging
    report_to=["tensorboard","wandb"],            # log to TensorBoard and W&B

    # Learning rate and optimization
    learning_rate=2e-5,                           # more stable LR for small batches
    weight_decay=0.01,                            # regularization
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    warmup_steps=100                              # warmup during first steps
)

# Trainer
trainer_anon = Trainer(
    model=model_anon,
    args=training_args,
    train_dataset=train_data_anon,
    eval_dataset=test_data_anon,
)
trainer_anon.train()

Step,Training Loss
50,0.5112
100,0.2279
150,0.1721
200,0.1319
250,0.1268
300,0.1085
350,0.1118
400,0.1123
450,0.1066
500,0.1022


TrainOutput(global_step=500, training_loss=0.17113373374938964, metrics={'train_runtime': 8806.7409, 'train_samples_per_second': 0.908, 'train_steps_per_second': 0.057, 'total_flos': 2.1601638678528e+16, 'train_loss': 0.17113373374938964, 'epoch': 2.0})

In [13]:
!zip -r results_anon_anonymds.zip results_anon_anonymds/checkpoint-500/

  adding: results_anon_anonymds/checkpoint-500/ (stored 0%)
  adding: results_anon_anonymds/checkpoint-500/rng_state.pth (deflated 25%)
  adding: results_anon_anonymds/checkpoint-500/optimizer.pt (deflated 14%)
  adding: results_anon_anonymds/checkpoint-500/scaler.pt (deflated 60%)
  adding: results_anon_anonymds/checkpoint-500/config.json (deflated 61%)
  adding: results_anon_anonymds/checkpoint-500/trainer_state.json (deflated 71%)
  adding: results_anon_anonymds/checkpoint-500/model.safetensors (deflated 11%)
  adding: results_anon_anonymds/checkpoint-500/scheduler.pt (deflated 56%)
  adding: results_anon_anonymds/checkpoint-500/generation_config.json (deflated 33%)
  adding: results_anon_anonymds/checkpoint-500/training_args.bin (deflated 51%)


In [14]:
losses = [x["loss"] for x in trainer_anon.state.log_history if "loss" in x]

for step, loss in enumerate(losses, 1):
    print(f"Step {step} - Training Loss: {loss}")

Step 1 - Training Loss: 0.5112
Step 2 - Training Loss: 0.2279
Step 3 - Training Loss: 0.1721
Step 4 - Training Loss: 0.1319
Step 5 - Training Loss: 0.1268
Step 6 - Training Loss: 0.1085
Step 7 - Training Loss: 0.1118
Step 8 - Training Loss: 0.1123
Step 9 - Training Loss: 0.1066
Step 10 - Training Loss: 0.1022


In [None]:
from transformers import pipeline, logging
from tqdm import tqdm

# Disable warnings
logging.set_verbosity_error()

# Disable gradient checkpointing
model_anon.gradient_checkpointing_disable()

# Pipeline for summary generation
summarizer_anon = pipeline("summarization", model=model_clear, tokenizer=tokenizer, device=0)

# Test set
test_texts = [ex["full_note"] for ex in test_data_anon]

# List to store the generated summaries
generated_anon = []

# Progress bar with tqdm
for text in tqdm(test_texts, desc="Generating summaries"):
    summary = summarizer_anon(
        text,
        max_length=512,
        min_length=400,
        truncation=True
    )[0]["summary_text"]
    generated_anon.append(summary)

# Number of inferences
num_inferenze = len(generated_anon)
print(f"Number of inferences: {num_inferenze}")

# Reference summaries
references = [ex["summary"] for ex in test_data_anon]

In [19]:
generated_anon[0]

NameError: name 'generated_anon' is not defined

In [24]:
# Evaluations
import evaluate

# ROUGE
rouge = evaluate.load("rouge")

results_anon = rouge.compute(predictions=generated_anon, references=references)
print("ROUGE - Model on anonymized data:")
for k, v in results_anon.items():
    print(f"{k}: {v:.4f}")

ROUGE - Model on anonymized data:
rouge1: 0.5178
rouge2: 0.4427
rougeL: 0.4634
rougeLsum: 0.4859


In [23]:
# BLEU
bleu = evaluate.load("bleu")

references_bleu = [[ref] for ref in references]

results_bleu = bleu.compute(predictions=generated_anon, references=references_bleu)

print(f"BLEU score: {results_bleu['bleu']:.4f}")

BLEU score: 0.2360
