### **Text Summarization on encrypted data**

In [6]:
!pip install --upgrade --no-cache-dir transformers datasets accelerate



In [7]:
!pip install evaluate



In [8]:
!pip install rouge_score



In [None]:
!pip install bert-score

#### **TRAINING**

In [14]:
# Hugging Face Transformers
from transformers import Trainer, TrainingArguments
from datasets import load_dataset
from transformers import LEDTokenizer, LEDForConditionalGeneration


# Loading datasets
dataset = load_dataset("csv", data_files="encrypted_dataset_sample.csv", sep=',', split='train')
print(dataset.column_names)

dataset = dataset.map(lambda x: {"full_note": x["full_note"], "summary": x["encrypt_note"]})
dataset_split = dataset.train_test_split(test_size=0.2, seed=42)
train_data = dataset_split["train"]
test_data = dataset_split["test"]

# Tokenization
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

def preprocess(example):
    inputs = tokenizer(example["full_note"], truncation=True, padding="max_length", max_length=4096)
    targets = tokenizer(example["summary"], truncation=True, padding="max_length", max_length=512)
    inputs["labels"] = targets["input_ids"]
    return inputs

train_data = train_data.map(preprocess, batched=True)
test_data = test_data.map(preprocess, batched=True)

Generating train split: 0 examples [00:00, ? examples/s]

['index', 'note', 'encrypt_note', 'sensitive_entity_note', 'full_note', 'encrypt_full_note', 'sensitive_entity_full_note']


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
# Training Arguments
from transformers import LongformerForSequenceClassification
model_clear = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# Enable gradient checkpointing on the model
model_clear.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./results_encryption_summary",        # folder for checkpoints
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,                # effectively larger batch size
    fp16=True,                                    # mixed precision training
    save_strategy="epoch",                        # save checkpoint at every epoch
    num_train_epochs=2,

    # Logging
    logging_dir="./logs",
    logging_steps=50,                             # more frequent logging
    report_to=["tensorboard","wandb"],            # log to TensorBoard and W&B

    # Learning rate and optimization
    learning_rate=2e-5,                           # more stable LR for small batches
    weight_decay=0.01,                            # regularization
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-8,
    lr_scheduler_type="linear",
    warmup_steps=100                              # warmup during first steps
)

pytorch_model.bin:   0%|          | 0.00/648M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/648M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

In [11]:
# Trainer
trainer = Trainer(
    model=model_clear,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
)
trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mraffaeleaurucci[0m ([33mraffaeleaurucci-universit-degli-studi-di-salerno[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,5.4711
100,4.1306
150,3.4148
200,3.0944
250,2.8735
300,2.7092
350,2.6064
400,2.538
450,2.4986
500,2.4872


TrainOutput(global_step=500, training_loss=3.182392425537109, metrics={'train_runtime': 8766.6658, 'train_samples_per_second': 0.913, 'train_steps_per_second': 0.057, 'total_flos': 2.1601638678528e+16, 'train_loss': 3.182392425537109, 'epoch': 2.0})

In [12]:
!zip -r results_encryption_summary.zip results_encryption_summary/checkpoint-500/

  adding: results_encryption_summary/checkpoint-500/ (stored 0%)
  adding: results_encryption_summary/checkpoint-500/trainer_state.json (deflated 70%)
  adding: results_encryption_summary/checkpoint-500/optimizer.pt (deflated 13%)
  adding: results_encryption_summary/checkpoint-500/config.json (deflated 61%)
  adding: results_encryption_summary/checkpoint-500/scheduler.pt (deflated 62%)
  adding: results_encryption_summary/checkpoint-500/generation_config.json (deflated 33%)
  adding: results_encryption_summary/checkpoint-500/model.safetensors (deflated 11%)
  adding: results_encryption_summary/checkpoint-500/scaler.pt (deflated 64%)
  adding: results_encryption_summary/checkpoint-500/rng_state.pth (deflated 26%)
  adding: results_encryption_summary/checkpoint-500/training_args.bin (deflated 53%)


In [14]:
losses = [x["loss"] for x in trainer.state.log_history if "loss" in x]

for step, loss in enumerate(losses, 1):
    print(f"Step {step} - Training Loss: {loss}")

Step 1 - Training Loss: 5.4711
Step 2 - Training Loss: 4.1306
Step 3 - Training Loss: 3.4148
Step 4 - Training Loss: 3.0944
Step 5 - Training Loss: 2.8735
Step 6 - Training Loss: 2.7092
Step 7 - Training Loss: 2.6064
Step 8 - Training Loss: 2.538
Step 9 - Training Loss: 2.4986
Step 10 - Training Loss: 2.4872


In [1]:
!unzip results_encryption_summary.zip -d /content/results_encryption_summary

Archive:  results_encryption_summary.zip
   creating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/trainer_state.json  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/optimizer.pt  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/config.json  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/scheduler.pt  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/generation_config.json  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/model.safetensors  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/scaler.pt  
  inflating: /content/results_encryption_summary/results_encryption_summary/checkpoint-500/rng_state.pth  
 

In [11]:
from transformers import AutoModelForSeq2SeqLM, LEDTokenizer

# Percorso locale (non repo HF)
model_path = "/content/results_encryption_summary/results_encryption_summary/checkpoint-500"

# Caricamento modello e tokenizer da locale
model_clear = AutoModelForSeq2SeqLM.from_pretrained(model_path, local_files_only=True)
tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")

print("✅ Modello locale caricato correttamente!")

tokenizer_config.json:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

✅ Modello locale caricato correttamente!


In [15]:
from transformers import pipeline, logging
from tqdm import tqdm

# Disable warnings
logging.set_verbosity_error()

# Disable gradient checkpointing
model_clear.gradient_checkpointing_disable()

# Summarization pipeline
summarizer_clear = pipeline("summarization", model=model_clear, tokenizer=tokenizer, device=0)

# Test set
test_texts = [ex["full_note"] for ex in test_data]

# List to store the generated summaries
generated_summaries = []

# Progress bar with tqdm
for text in tqdm(test_texts, desc="Generating summary"):
    summary = summarizer_clear(
        text,
        max_length=512,
        min_length=400,
        truncation=True
    )[0]["summary_text"]
    generated_summaries.append(summary)

# Number of inferences
num_inferenze = len(generated_summaries)
print(f"Number of inferences: {num_inferenze}")

# Reference summaries
references = [ex["summary"] for ex in test_data]

Generating summary: 100%|██████████| 1000/1000 [42:27<00:00,  2.55s/it]


Number of inferences: 1000


In [16]:
generated_summaries[0]

'The patient was a xUiOoi4x5l-Seor8VO9yQ-zR9BxL4QEb ZEt8W9lqk_ who presented with nMubxnGZ3r TUWycJ-ka0 of the proximal third of ZdpKlCZbNf left leg. ZdpKlCZbNf prior history included a prolonged stay in hospital for iqUcsjCc6B O8mbpBC7UG 0NRdXTHwsy iqUcsjCc6B of ZdpKlCZbNf left leg. ZdpKlCZbNf prior history included a prolonged stay in hospital for iqUcsjCc6B O8mbpBC7UG TUWycJ-ka0, when ah_zfXvr91 xUiOoi4xlc yerD4tQZHH xUiOoi4xlc was performed. y_BtX_qN0T had also previously undergone ah_zfXvr91 xUiOoi4xlc xUi'

In [17]:
# Evaluations
import evaluate

# ROUGE
rouge = evaluate.load("rouge")

results_clear = rouge.compute(predictions=generated_summaries, references=references)
print("ROUGE - Model on anonymized data:")
for k, v in results_clear.items():
    print(f"{k}: {v:.4f}")

Downloading builder script: 0.00B [00:00, ?B/s]

ROUGE - Model on anonymized data:
rouge1: 0.2222
rouge2: 0.1599
rougeL: 0.1994
rougeLsum: 0.1993


In [18]:
# BLEU
bleu = evaluate.load("bleu")

references_bleu = [[ref] for ref in references]

results_bleu = bleu.compute(predictions=generated_summaries, references=references_bleu)

print(f"BLEU score: {results_bleu['bleu']:.4f}")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

BLEU score: 0.0099


In [19]:
from bert_score import score

# BERTScore
P, R, F1 = score(generated_summaries, references, lang="en", verbose=True)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall: {R.mean().item():.4f}")
print(f"F1: {F1.mean().item():.4f}")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/32 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/16 [00:00<?, ?it/s]

done in 107.30 seconds, 9.32 sentences/sec
Precision: 0.8639
Recall: 0.8352
F1: 0.8492


In [17]:
import csv
from google.colab import files

with open("../datasets/summaries/summaries_encryption.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["full_note", "summary"])
    for full_note, summary in zip(test_texts, generated_summaries):
        writer.writerow([full_note, summary])

# Download file CSV
files.download("summaries_encryption.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from statistics import mean

texts = [ex["summary"] for ex in train_data]
token_lengths = [len(tokenizer(text)["input_ids"]) for text in texts]
avg_tokens = mean(token_lengths)

print(f"Average token per entry in train set: {avg_tokens:.2f}")