# Fine-tune ProphetNet on XSum

## Libraries and environment preparation

In [None]:
#Install essential packages
%%capture
!pip install datasets transformers rouge-score nltk wandb
!apt install git-lfs

In [None]:
#Colab Environment Check for GPU and RAM
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

#GPU check
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Your runtime has 13.6 gigabytes of available RAM

Tue Feb 22 18:18:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-------------------------------------

In [None]:
# The code below expects Transformers 4.11.0 or newer; print the installed
# version so it can be checked before running the remaining cells.
import datasets
import transformers
print(transformers.__version__)

4.16.2


In [None]:
from transformers import AutoTokenizer    
# Huggingface Automodel class
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Hub identifier of the pretrained checkpoint; the tokenizer and the model
# used in this notebook are both loaded from it.
model_checkpoint = "microsoft/prophetnet-large-uncased"

In [None]:
# Configure Weights & Biases credentials without storing a secret in the notebook.
# SECURITY: an API key used to be hard-coded in this cell. Any key that was
# ever committed or shared this way should be revoked and regenerated.
import os
from getpass import getpass

import wandb

# Prefer a key already present in the environment; otherwise prompt for it
# interactively (getpass does not echo the input into the cell output).
API_KEY = os.environ.get("WANDB_API_KEY") or getpass("Weights & Biases API key: ")
os.environ["WANDB_API_KEY"] = API_KEY

## Load the dataset

In [None]:
# Load the "xsum" dataset; the result is a DatasetDict whose splits are
# indexed by name further down (e.g. raw_datasets["train"]).
raw_datasets = datasets.load_dataset("xsum")

Downloading:   0%|          | 0.00/2.05k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/954 [00:00<?, ?B/s]

Using custom data configuration default


Downloading and preparing dataset xsum/default (download: 245.38 MiB, generated: 507.60 MiB, post-processed: Unknown size, total: 752.98 MiB) to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934...


  0%|          | 0/2 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/255M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.00M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset xsum downloaded and prepared to /root/.cache/huggingface/datasets/xsum/default/1.2.0/32c23220eadddb1149b16ed2e9430a05293768cfffbdfd151058697d4c11f934. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Display the available splits with their columns and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

## Preprocess the data

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/141 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

In [None]:
# Tokenize documents and summaries into model-ready features

max_input_length = 512
max_target_length = 64

def preprocess_function(examples):
    """Tokenize one batch of examples.

    `examples` maps column names to lists of values. The "document" column is
    encoded as the model input (truncated to `max_input_length`) and the
    "summary" column is encoded as the target (truncated to
    `max_target_length`). Returns the input encoding with the target token ids
    stored under the "labels" key.
    """
    documents = list(examples["document"])
    encoded = tokenizer(documents, max_length=max_input_length, truncation=True)

    # Summaries are encoded inside the tokenizer's target-side context
    with tokenizer.as_target_tokenizer():
        target_ids = tokenizer(
            examples["summary"], max_length=max_target_length, truncation=True
        )["input_ids"]

    encoded["labels"] = target_ids
    return encoded

In [None]:
# Tokenize the first 100000 training examples in batches and drop the raw
# text/id columns so only the tokenizer outputs and labels remain.
tokenized_train = (
    raw_datasets["train"]
    .select(range(100000))
    .map(
        preprocess_function,
        batched=True,
        remove_columns=["document", "summary", "id"],
    )
)

  0%|          | 0/100 [00:00<?, ?ba/s]

In [None]:
# Tokenize the first 5000 validation examples in batches and drop the raw
# text/id columns so only the tokenizer outputs and labels remain.
tokenized_val = (
    raw_datasets["validation"]
    .select(range(5000))
    .map(
        preprocess_function,
        batched=True,
        remove_columns=["document", "summary", "id"],
    )
)

  0%|          | 0/5 [00:00<?, ?ba/s]

In [None]:
# Display the columns and row count of the tokenized training subset
tokenized_train

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 100000
})

## Fine-tuning the model

In [None]:
# Load the pretrained sequence-to-sequence model from the checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


Downloading:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

In [None]:
# Start a Weights & Biases run in the "prophetnet" project to track training
wandb.init(project="prophetnet")

[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


In [None]:
# Define training args, batch size and epochs
# batch size max 8 for input length 1024 on Colab Pro
# NOTE(review): max_input_length is set to 512 in the preprocessing cell, so
# the 1024 figure above may be stale - confirm.

batch_size = 4
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    # Output directory for checkpoints, named after the base model
    f"{model_name}-finetuned-xsum",
    # `load_best_model_at_end` is a boolean flag; the metric that defines
    # "best" is passed separately through `metric_for_best_model`.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    logging_steps=1000,
    # Checkpoints are written at the same cadence as evaluations
    save_steps=2500,
    eval_steps=2500,
    save_total_limit=3,
    num_train_epochs=epochs,
    # Generate summaries during evaluation so text metrics can be computed
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

PyTorch: setting up devices


In [None]:
import nltk
import numpy as np
# Download NLTK's 'punkt' data package (presumably what nltk.sent_tokenize,
# used in compute_metrics below, relies on - TODO confirm).
nltk.download('punkt')

# ROUGE metric object; compute_metrics calls metric.compute(...) on it
metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    """Decode generated ids and reference labels and score them with ROUGE.

    Args:
        eval_pred: pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids; only that element is used.

    Returns:
        dict mapping every key returned by ``metric.compute`` to its mid
        F-measure scaled to 0-100, plus ``"gen_len"`` (mean number of non-pad
        tokens per prediction). All values are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a padding/ignore marker that cannot be decoded; map it to the
    # pad token in both arrays (the original only sanitised the labels).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of each score, as a percentage
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Mean generated length, counted on the sanitised ids so that -100
    # entries are not mistaken for real tokens
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Downloading:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

In [None]:
# Wire the model, arguments, datasets, collator and metrics into the trainer

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Using amp half precision backend


In [25]:
# Run fine-tuning; the recorded run below took roughly 39,800 s (about 11 hours)
trainer.train()

The following columns in the training set  don't have a corresponding argument in `ProphetNetForConditionalGeneration.forward` and have been ignored: token_type_ids.
***** Running training *****
  Num examples = 100000
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 25000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
2500,3.1309,2.831381,35.247,12.8134,27.8442,27.8862,24.8726
5000,2.9296,2.748818,36.3622,13.5287,28.6504,28.708,25.6688
7500,2.8701,2.678171,37.1285,14.4059,29.4593,29.5038,25.8164
10000,2.8052,2.608081,37.7995,15.0595,30.0573,30.0983,26.0204
12500,2.7609,2.546456,38.3448,15.5935,30.6442,30.6952,26.2554
15000,2.6705,2.498819,38.8711,15.8656,30.9536,31.0076,25.9098


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incom

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
2500,3.1309,2.831381,35.247,12.8134,27.8442,27.8862,24.8726
5000,2.9296,2.748818,36.3622,13.5287,28.6504,28.708,25.6688
7500,2.8701,2.678171,37.1285,14.4059,29.4593,29.5038,25.8164
10000,2.8052,2.608081,37.7995,15.0595,30.0573,30.0983,26.0204
12500,2.7609,2.546456,38.3448,15.5935,30.6442,30.6952,26.2554
15000,2.6705,2.498819,38.8711,15.8656,30.9536,31.0076,25.9098
17500,2.6384,2.44313,39.4108,16.5877,31.6027,31.6585,26.1794
20000,2.5823,2.399606,40.2374,17.1949,32.3489,32.4082,25.9412
22500,2.541,2.359188,40.7812,17.731,32.852,32.898,26.4892
25000,2.4758,2.33843,41.0736,17.9814,33.0565,33.0895,26.2212


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`use_cache=True` is incom

TrainOutput(global_step=25000, training_loss=2.756292294921875, metrics={'train_runtime': 39834.2241, 'train_samples_per_second': 2.51, 'train_steps_per_second': 0.628, 'total_flos': 1.0582492322906112e+17, 'train_loss': 2.756292294921875, 'epoch': 1.0})

In [26]:
# Close the current W&B run now that training is done
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
eval/gen_len,▁▄▅▆▇▅▇▆█▇
eval/loss,█▇▆▅▄▃▂▂▁▁
eval/rouge1,▁▂▃▄▅▅▆▇██
eval/rouge2,▁▂▃▄▅▅▆▇██
eval/rougeL,▁▂▃▄▅▅▆▇██
eval/rougeLsum,▁▂▃▄▅▅▆▇██
eval/runtime,▁▃▃▄▆▇▇▅▇█
eval/samples_per_second,█▆▆▅▃▂▂▄▂▁
eval/steps_per_second,█▆▆▅▃▂▂▄▂▁
train/epoch,▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇▇████

0,1
eval/gen_len,26.2212
eval/loss,2.33843
eval/rouge1,41.0736
eval/rouge2,17.9814
eval/rougeL,33.0565
eval/rougeLsum,33.0895
eval/runtime,1847.0324
eval/samples_per_second,2.707
eval/steps_per_second,0.677
train/epoch,1.0


In [36]:
# List the working directory with human-readable sizes.
# NOTE(review): the recorded output already shows the .zip produced by the
# zip cell below, so this cell appears to have been run after it.
!ls -lh 

total 4.0G
drwx------ 5 root root 4.0K Feb 22 18:17 drive
drwxr-xr-x 5 root root 4.0K Feb 23 06:28 prophetnet-large-uncased-finetuned-xsum
-rw-r--r-- 1 root root 4.0G Feb 23 06:34 prophetnet-large-uncased-finetuned-xsum.zip
drwxr-xr-x 1 root root 4.0K Feb 18 14:33 sample_data
drwxr-xr-x 3 root root 4.0K Feb 22 18:43 wandb


In [35]:
# Archive the checkpoint-25000 directory (recursively) into a single zip
# file under /content.
!zip -r /content/prophetnet-large-uncased-finetuned-xsum.zip /content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/

  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/ (stored 0%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/special_tokens_map.json (deflated 37%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/pytorch_model.bin (deflated 7%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/optimizer.pt (deflated 9%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/config.json (deflated 60%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/trainer_state.json (deflated 80%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/rng_state.pth (deflated 27%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/tokenizer_config.json (deflated 40%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/checkpoint-25000/scheduler.pt (deflated 49%)
  adding: content/prophetnet-large-uncased-finetuned-xsum/c

In [37]:
!cp prophetnet-large-uncased-finetuned-xsum.zip '/content/drive/My Drive/weights/'

## Try with a smaller dataset

In [None]:
# Keep only the first 5000 training and first 500 validation examples
small_train = raw_datasets['train'].select(range(5000))
small_val = raw_datasets['validation'].select(range(500))
small_train

In [None]:
# Tokenize the small subsets in batches.
# NOTE: this rebinds `tokenized_train` / `tokenized_val`, replacing the larger
# tokenized subsets created earlier in the notebook under the same names.
tokenized_train = small_train.map(preprocess_function, batched=True)
tokenized_val = small_val.map(preprocess_function, batched=True)
tokenized_train

In [None]:
# Load a fresh copy of the pretrained model for the small-dataset run
# (per the original note, served from the local download cache)
model_small = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# data collator: pad the inputs and labels during each batch to save space
# NOTE: this rebinds `data_collator`, now tied to `model_small`.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_small)

In [None]:
# Track the small-dataset run with W&B.
# NOTE(review): the project is spelled "Prophetnet" here but "prophetnet" in
# the earlier wandb.init call - confirm whether both runs should share one project.
wandb.init(project="Prophetnet")

In [None]:
# Define training args, batch size and epochs for the small-dataset run
# batch size max 8 for input length 1024 on Colab Pro

batch_size = 8
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args_small = Seq2SeqTrainingArguments(
    # Separate output directory so this run's checkpoints (and the
    # save_total_limit rotation) cannot touch those of the full run.
    f"{model_name}-finetuned-xsum-small",
    # `load_best_model_at_end` is a boolean flag; the metric that defines
    # "best" is passed separately through `metric_for_best_model`.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # 5000 examples at batch size 8 give about 625 optimisation steps on a
    # single device, so the 1000/2500-step cadence of the full run would never
    # log, evaluate or save. Use intervals that fire within this run.
    logging_steps=50,
    save_steps=250,
    eval_steps=250,
    save_total_limit=3,
    num_train_epochs=epochs,
    # Generate summaries during evaluation so text metrics can be computed
    predict_with_generate=True,
    fp16=True,
    report_to="wandb",
)

In [None]:
# Build the trainer for the small-dataset run.
# The datasets must be the tokenized ones produced by `preprocess_function`
# (`tokenized_train` / `tokenized_val`, rebuilt from the small subsets above).
# The raw `small_train` / `small_val` hold only the text columns
# (document, summary, id) and none of the tokenized features.

trainer_small = Seq2SeqTrainer(
    model=model_small,
    args=args_small,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning on the small subset
trainer_small.train()