In [1]:
import torch
import pandas as pd

from tqdm import tqdm
from pynvml import *
from transformers import Trainer

from datasets import load_dataset, load_metric
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def print_gpu_utilization(device_index=0):
    """Print the amount of memory currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, which
            is the device this helper always queried before the parameter
            was added.

    NVML is initialised for the duration of the call and shut down again in
    a ``finally`` block, so every ``nvmlInit()`` is paired with an
    ``nvmlShutdown()`` even when the query raises.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
        # `info.used` is integer-divided by 1024**2 before being reported as MB.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        nvmlShutdown()

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# A default-constructed Pegasus configuration; no other visible cell reads it.
configuration = PegasusConfig()

# Start a Weights & Biases run with the default settings.
wandb.init()

[34m[1mwandb[0m: Currently logged in as: [33mwsd_[0m (use `wandb login --relogin` to force relogin)


In [17]:
# Cleaned training examples; the loaded rows are later accessed under the 'train' key.
dataset_train = load_dataset(
    'csv',
    data_files="../data/cleaned/train_clean.csv",
)

Using custom data configuration default-210c1916e129b6c8


Downloading and preparing dataset csv/default to /home/ubuntu/.cache/huggingface/datasets/csv/default-210c1916e129b6c8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files: 100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 4378.19it/s]
Extracting data files: 100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 580.93it/s]


Dataset csv downloaded and prepared to /home/ubuntu/.cache/huggingface/datasets/csv/default-210c1916e129b6c8/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


100%|██████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 466.81it/s]


In [5]:
# Cleaned development (validation) examples; also accessed under the 'train' key later on.
dataset_val = load_dataset(
    'csv',
    data_files="../data/cleaned/dev_clean.csv",
)

Using custom data configuration default-bdc8392c98cfd197
Reusing dataset csv (/home/ubuntu/.cache/huggingface/datasets/csv/default-bdc8392c98cfd197/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)
100%|██████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 596.21it/s]


In [4]:
# Hub checkpoint shared by the tokenizer here and the model loaded in a later cell.
model_name = "google/pegasus-xsum"

# Tokenizer that matches the checkpoint above.
tokenizer = PegasusTokenizer.from_pretrained(
    model_name,
)

In [18]:
max_input_length = 1024
max_target_length = 128

def preprocess_function(examples):
    """Tokenize a batch of transcripts and their reference descriptions.

    Args:
        examples: a batch from ``Dataset.map(..., batched=True)`` exposing the
            ``'transcript'`` and ``'episode_description'`` columns.

    Returns:
        The tokenizer output for the transcripts with an extra ``"labels"``
        entry holding the token ids of the descriptions.

    Sequences are truncated to ``max_input_length`` / ``max_target_length``,
    each capped at ``tokenizer.model_max_length`` so a limit larger than the
    checkpoint supports is never requested. No padding is applied here: the
    ``DataCollatorForSeq2Seq`` passed to the trainer pads each training batch
    dynamically, which avoids padding whole map batches to their longest
    member and keeps pad-token ids out of the stored labels.
    """
    input_limit = min(max_input_length, tokenizer.model_max_length)
    target_limit = min(max_target_length, tokenizer.model_max_length)

    model_inputs = tokenizer(
        examples['transcript'], max_length=input_limit, truncation=True)

    # Set up the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples['episode_description'], max_length=target_limit, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [19]:
# Tokenize the training data in batches.
train_inp = dataset_train.map(
    preprocess_function,
    batched=True,
)

100%|█████████████████████████████████████████████████████████████████████| 53/53 [05:33<00:00,  6.29s/ba]


In [21]:
# Tokenize the development data with the same preprocessing as the training data.
dev_inp = dataset_val.map(
    preprocess_function,
    batched=True,
)

Loading cached processed dataset at /home/ubuntu/.cache/huggingface/datasets/csv/default-bdc8392c98cfd197/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-052964da48516ce8.arrow


In [22]:
# GPU memory reading taken, in notebook order, before the model is loaded onto the device.
print_gpu_utilization()

GPU memory occupied: 226 MB.


In [5]:
# Load the pretrained Pegasus weights, then move the module to the selected device.
model = PegasusForConditionalGeneration.from_pretrained(model_name)
model = model.to(device)

In [24]:
# GPU memory reading taken, in notebook order, after the model has been moved to `device`.
print_gpu_utilization()

GPU memory occupied: 3443 MB.


In [25]:
# Batch collator for sequence-to-sequence training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [26]:
import nltk
import numpy as np

# nltk.sent_tokenize needs the "punkt" data; fetch it only when it is missing.
try:
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt", quiet=True)

# ROUGE scorer used by compute_metrics. Without this assignment the function
# fails with a NameError on `metric`.
metric = load_metric("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and mean generated length for an evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` is
            assumed to hold generated token ids (for example from a trainer
            configured with ``predict_with_generate=True``) -- TODO confirm
            against the trainer configuration. ``labels`` holds the reference
            token ids, with -100 marking ignored positions.

    Returns:
        A dict mapping each ROUGE key to its mid F-measure scaled by 100, plus
        ``"gen_len"`` (mean count of non-pad prediction tokens), all rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs alongside the ids; keep only the first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Extract a few results
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [32]:
# Fine-tuning hyper-parameters, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    # Bookkeeping
    output_dir="./results",
    run_name="pegasus_first_itr",
    # Schedule
    num_train_epochs=10,
    evaluation_strategy="epoch",
    # Optimiser
    optim="adafactor",
    learning_rate=1e-5,
    weight_decay=0.01,
    # Batching: 8 examples per device step, accumulated over 4 steps
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=500,
    # Memory / speed trade-offs
    gradient_checkpointing=True,
    fp16=True,
)

# Both CSV-backed datasets keep their rows under the 'train' key, including the dev data.
trainer = Seq2SeqTrainer(
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=dev_inp['train'],
    train_dataset=train_inp['train'],
    args=training_args,
    model=model,
)

# Launch fine-tuning.
trainer.train()

Error in callback <function _WandbInit._resume_backend at 0x7f15e7f07430> (for pre_run_cell):


Exception: The wandb backend process has shutdown

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp half precision backend
The following columns in the training set  don't have a corresponding argument in `PegasusForConditionalGeneration.forward` and have been ignored: episode_description, transcript. If episode_description, transcript are not expected by `PegasusForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 52381
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 8185
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Exception: The wandb backend process has shutdown

Error in callback <function _WandbInit._pause_backend at 0x7f15cf4d3550> (for post_run_cell):


Exception: The wandb backend process has shutdown

In [9]:
# Load the cleaned test split; summaries for it are generated in a later cell.
df = pd.read_csv(
    "../data/cleaned/test_clean.csv",
)

# Preview the first five rows.
df.head(5)

Unnamed: 0,transcript,episode_description
0,welcome back to another episode of tuxedo time...,today on the podcast we go on a journey we tal...
1,what s up guys this episode of the podcast is ...,ever wanted a podcast from your three favorite...
2,you are listening to irish illustrate insider ...,the irish illustrated insider crew discusses n...
3,you have tuned into irish illustrated insider ...,irish illustrated insider tackles nfl combine ...
4,what s up everybody welcome to the in the dome...,breaking down a classic calgary flames comebac...


In [None]:
# Generate one summary per test transcript and write them to generated.txt,
# one summary per line, in the same order as the rows of `df`.

# Put the model in evaluation mode so dropout is disabled while generating.
model.eval()

# The `with` block guarantees the file is flushed and closed even if
# generation fails part-way through.
with open("generated.txt", "w", encoding="utf-8") as f:
    # Only the transcript column is needed, so iterate over it directly;
    # tqdm reports progress over what can be a long-running loop.
    for src_text in tqdm(df['transcript'], desc="generating"):
        batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device)
        translated = model.generate(**batch)
        tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
        f.write(str(tgt_text[0]))
        f.write("\n")

Error in callback <function _WandbInit._resume_backend at 0x7f83904f4700> (for pre_run_cell):


Exception: The wandb backend process has shutdown

Exception in thread ChkStopThr:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/threading.py", line 973, in _bootstrap_inner
    self.run()
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/threading.py", line 910, in run
    self._target(*self._args, **self._kwargs)
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/site-packages/wandb/sdk/wandb_run.py", line 170, in check_status
    status_response = self._interface.communicate_stop_status()
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/site-packages/wandb/sdk/interface/interface.py", line 114, in communicate_stop_status
    resp = self._communicate_stop_status(status)
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 395, in _communicate_stop_status
    resp = self._communicate(req, local=True)
  File "/home/ubuntu/anaconda3/envs/adl/lib/python3.9/site-packages/wandb/sdk/interface/interface_shared.py", line 226, i