##Libraries and environment preparation

In [1]:
#GPU check
!nvidia-smi

Wed Mar 30 23:51:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   48C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
#Install essential packages
%%capture
!pip install datasets transformers rouge-score nltk wandb
!apt install git-lfs

In [3]:
# Make sure your version of Transformers is at least 4.11.0 
# to run the following code correctly:
import datasets
import transformers
print(transformers.__version__)

4.17.0


In [4]:
from transformers import AutoTokenizer    
# Huggingface Automodel class
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

model_checkpoint = "t5-small"

In [5]:
# Import Wandb 
import os
import wandb
API_KEY = '39991c538626bee25c64d4f8a4c3403dd635537c'
os.environ["WANDB_API_KEY"] = API_KEY

##Load the dataset

In [6]:
# import dataset
raw_datasets = datasets.load_dataset("wikihow", "all", "/content/drive/MyDrive/dataset")

Using custom data configuration all-6f5101161f12f62f
Reusing dataset wikihow (/root/.cache/huggingface/datasets/wikihow/all-6f5101161f12f62f/1.2.0/5343fc81d685acaa086c9cc19eb8706206cd1f8b315792b04c1d7b92091c305e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 157252
    })
    validation: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 5599
    })
    test: Dataset({
        features: ['text', 'headline', 'title'],
        num_rows: 5577
    })
})

##Preprocess the data

In [8]:
# Import tokenizer from model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

In [10]:
# tokenlize inputs into map
prefix = "summarize: "
max_input_length = 512
max_target_length = 128

def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["headline"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [11]:
tokenized_train = raw_datasets["train"].map(preprocess_function, 
                                      batched=True,
                                      remove_columns=["text", "headline", "title"])

  0%|          | 0/158 [00:00<?, ?ba/s]

In [12]:
tokenized_val = raw_datasets["validation"].map(preprocess_function, 
                                      batched=True,
                                      remove_columns=["text", "headline", "title"])

  0%|          | 0/6 [00:00<?, ?ba/s]

In [13]:
tokenized_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 157252
})

##Fine-tuning the model

In [14]:
# Import tokenizer from model checkpoint and print detail
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# data collator: pad the inputs and labels during each batch to save space
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

In [15]:
# Define traing args, batch size and epoch
# batch size max 8 for input length 1024 on Colab Pro

batch_size = 8
epochs = 1
model_name = model_checkpoint.split("/")[-1]
args = Seq2SeqTrainingArguments(
    f"{model_name}-finetuned-wiki",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy = "epoch",
    save_total_limit=3,
    num_train_epochs=epochs,
    predict_with_generate=True,
    fp16=True,
    #report_to="wandb",
)

In [16]:
import nltk
import numpy as np
nltk.download('punkt')

metric = datasets.load_metric("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    
    return {k: round(v, 4) for k, v in result.items()}

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [17]:
# Pass into the trainer

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

Using amp half precision backend


In [18]:
trainer.train()

***** Running training *****
  Num examples = 157252
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 19657
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mshusunny[0m (use `wandb login --relogin` to force relogin)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.8446,2.645127,25.2403,8.773,21.2579,24.6054,18.3408


***** Running Evaluation *****
  Num examples = 5599
  Batch size = 8
Saving model checkpoint to t5-small-finetuned-wiki/checkpoint-19657
Configuration saved in t5-small-finetuned-wiki/checkpoint-19657/config.json
Model weights saved in t5-small-finetuned-wiki/checkpoint-19657/pytorch_model.bin
tokenizer config file saved in t5-small-finetuned-wiki/checkpoint-19657/tokenizer_config.json
Special tokens file saved in t5-small-finetuned-wiki/checkpoint-19657/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=19657, training_loss=2.8989225520409394, metrics={'train_runtime': 5096.9757, 'train_samples_per_second': 30.852, 'train_steps_per_second': 3.857, 'total_flos': 2.1266718273306624e+16, 'train_loss': 2.8989225520409394, 'epoch': 1.0})

In [19]:
wandb.finish()




VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/gen_len,▁
eval/loss,▁
eval/rouge1,▁
eval/rouge2,▁
eval/rougeL,▁
eval/rougeLsum,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████

0,1
eval/gen_len,18.3408
eval/loss,2.64513
eval/rouge1,25.2403
eval/rouge2,8.773
eval/rougeL,21.2579
eval/rougeLsum,24.6054
eval/runtime,265.9926
eval/samples_per_second,21.049
eval/steps_per_second,2.632
train/epoch,1.0


In [20]:
!zip -r t5-small-finetuned-wiki.zip /content/t5-small-finetuned-wiki/checkpoint-19657/

  adding: content/t5-small-finetuned-wiki/checkpoint-19657/ (stored 0%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/scaler.pt (deflated 55%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/scheduler.pt (deflated 49%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/tokenizer_config.json (deflated 80%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/optimizer.pt (deflated 7%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/config.json (deflated 62%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/pytorch_model.bin (deflated 8%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/tokenizer.json (deflated 74%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/special_tokens_map.json (deflated 83%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/rng_state.pth (deflated 27%)
  adding: content/t5-small-finetuned-wiki/checkpoint-19657/trainer_state.json (deflated 78%)
  adding: content/t5-small-fin

In [21]:
!cp t5-small-finetuned-wiki.zip '/content/drive/My Drive/weights/'