# Install Dependancies [Including Setting up DQ] + Add Imports

In [13]:
import os

from transformers import T5Tokenizer, T5ForConditionalGeneration, DataCollatorForSeq2Seq, get_linear_schedule_with_warmup
from datasets import load_dataset, Dataset, DatasetDict
import numpy as np
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

# from dataquality.integrations.seq2seq.hf import watch
from functools import partial

from transformers import GenerationConfig
from transformers import AutoTokenizer, Adafactor

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [37]:
import sys
for x in [m for m in sys.modules if 'dataq' in m]:
    del sys.modules[x]

import dataquality as dq

# Log into DQ
The default environment is Dev

In [38]:
os.environ["GALILEO_USERNAME"]="galileo@rungalileo.io"
os.environ["GALILEO_PASSWORD"]="A11a1una!"
dq.set_console_url("https://console.dev.rungalileo.io")
dq.login()

📡 https://console.dev.rungalileo.io
🔭 Logging you into Galileo

🚀 You're logged in to Galileo as galileo@rungalileo.io!


# Set Notebook Parameters
- Dataset Args
- Training Hyper-Parameters
- Generation Parameters

In [39]:
# Dataset params
# DATASET = "xsum"
# SUBSET = "english"
# # Note that if you are processing the data columns this should match
# # the final input_col used!
# INPUT_COL = "document"
# TARGET_COL = "summary"

DATASET = "DEP-LLMs/Semantic_Similarity-cnn_dailymail-Col_highlights-percent_15-SubsetPercent_10-Seed_8"
SUBSET = None
INPUT_COL = "article"
TARGET_COL = "highlights"

# Use this column to train the model, and use INPUT_COL to log
# to dq. This way we avoid having things like the "task" prepended
MODEL_INPUT_COL = f"model_{INPUT_COL}"
# Flag indicating that we want to re-format the data columns.
# This happens after loading the data
FORMAT_DATA = True

# Sometimes you want to combine a comlumn and make that the new input
# Have the ability to pass a pre_processing function that we
# map the dataset with. This should happen later!

TRAIN_DS_SIZE = 20 # DEfault: 10_000
VAL_DS_SIZE = 10 # DEfault: 1_000
VAL_SPLIT = "test"

MODEL = "t5-small"

GENERATE_ON_TRAIN = False

# Tokenization params
MAX_INPUT_LENGTH = 32
MAX_TARGET_LENGTH = 16  # This is a bit of a bug but for now we cannot shorten the max length < that of the model

# Training Params
LR = 1e-4 # Default: 5e-4
NUM_EPOCHS = 1
ACCUMULATION_STEPS = 4
BATCH_SIZE = 8

# Generation params
# The default values in Generation Config
# Check more out here: https://huggingface.co/docs/transformers/v4.30.0/en/main_classes/text_generation#transformers.GenerationConfig
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.2 # Keep this low for now
TOP_P = 1
TOP_K = 50

# IMPORTANT!
readible_name = DATASET.replace("/", "_")
PROJECT_NAME = "Seq2Seq-newDEP"
RUN_NAME = f"{readible_name}_Inputs-{INPUT_COL}_Targets-{TARGET_COL}_Model-{MODEL}"

# Load Dataset + Subsample if necessary

In [40]:
ds = load_dataset(DATASET, name=SUBSET, token = "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")  # use our Galileo token for some internal datasets

# Check if we need to downsample the data
if TRAIN_DS_SIZE < len(ds['train']) or VAL_DS_SIZE < len(ds[VAL_SPLIT]):
  print (f"Subsampling Datasets to train_size = {TRAIN_DS_SIZE}, val_size = {VAL_DS_SIZE}")
  ds = ds.shuffle(seed=8)
  ds_train = ds['train'][:TRAIN_DS_SIZE]
  ds_val = ds[VAL_SPLIT][:VAL_DS_SIZE]

  ds = DatasetDict({
      'train': Dataset.from_dict(ds_train),
      VAL_SPLIT: Dataset.from_dict(ds_val)
  })



# # Add ids to each ds split
ds = ds.map(lambda _, idx: {"id": idx}, with_indices=True)

ds_train = ds['train']
ds_val = ds[VAL_SPLIT]

Subsampling Datasets to train_size = 20, val_size = 10


Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

# Data Preprocessing
Only use if you want to update the default columns

In [41]:
def add_model_input(row, original_inputs, model_inputs):
  row[model_inputs] = row[original_inputs]
  return row

def format_task_prefix(row, model_inputs, task_prefix="summarize"):
  row[model_inputs] = f"{task_prefix}: {row[model_inputs]}"
  return row

def combine_summary_test(row, text_col="text", summary_col="summary", new_col="summary-text"):
  row[new_col] = f"""Generate a title for the following Text and Summary.

Text: {row[text_col]}

Summary: {row[summary_col]}
""".strip()
  return row

In [42]:
# Pre-process the dataset if we want to e.g. combine some columns.
# There are some pre-set function in the Helpers above, but you may
# need to write your own function.


# For compatability create the MODEL_INPUT_COL
ds = ds.map(lambda x: add_model_input(x, INPUT_COL, MODEL_INPUT_COL))

# How do I want to do it
format_func = format_task_prefix

if FORMAT_DATA:
  # This is just a bit messy right now
  ds = ds.map(lambda x: format_task_prefix(x, MODEL_INPUT_COL),
                  desc="Pre-processing Data")

# Because of the way dq works, unless INPUT_COL = text, we need to make
# sure there is no actual `text` column
remove_cols = []
if INPUT_COL != 'text' and 'text' in ds.column_names['train']:
  remove_cols = ['text']

ds = ds.remove_columns(remove_cols)

# Make sure that the input and target columns exist
assert INPUT_COL in ds['train'].column_names and TARGET_COL in ds['train'].column_names, "Make sure that the columns we want for training exist"

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Pre-processing Data:   0%|          | 0/20 [00:00<?, ? examples/s]

Pre-processing Data:   0%|          | 0/10 [00:00<?, ? examples/s]

# DQ Setup!

In [43]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, model_max_length=MAX_INPUT_LENGTH)
# Make sure to reload the model each time!
model = T5ForConditionalGeneration.from_pretrained(MODEL)

generation_config = GenerationConfig(
    max_new_tokens=MAX_NEW_TOKENS,
    # Whether we use multinomial sampling
    do_sample=TEMPERATURE >= 1e-5,
    temperature=TEMPERATURE,
    top_p=TOP_P,
    top_k=TOP_K,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

dq.init("seq2seq", project_name=PROJECT_NAME, run_name=RUN_NAME)
dq.set_tokenizer(tokenizer, max_input_length=MAX_INPUT_LENGTH)
# watch(
#     model,
#     generation_config,
#     generate_training_data=GENERATE_ON_TRAIN
# )

# Log datasets with dq
def _log_dataset(_ds, split, input_col, target_col):
    print(_ds)
    dq.log_dataset(
        _ds,
        text=input_col,
        label=target_col,
        split=split
    )

log_dataset = partial(_log_dataset, input_col=INPUT_COL, target_col=TARGET_COL)


# Log just for training
log_dataset(ds['train'], split="training")
log_dataset(ds[VAL_SPLIT], split=VAL_SPLIT)

✨ Initializing existing public project 'Seq2Seq-newDEP'
🏃‍♂️ Fetching existing run 'DEP-LLMs_Semantic_Similarity-cnn_dailymail-Col_highlights-percent_15-SubsetPercent_10-Seed_8_Inputs-article_Targets-highlights_Model-t5-small'
🛰 Connected to existing project 'Seq2Seq-newDEP', and existing run 'DEP-LLMs_Semantic_Similarity-cnn_dailymail-Col_highlights-percent_15-SubsetPercent_10-Seed_8_Inputs-article_Targets-highlights_Model-t5-small'.


  warn(
Token indices sequence length is longer than the specified maximum sequence length for this model (88 > 32). Running this sequence through the model will result in indexing errors


Dataset({
    features: ['article', 'highlights', 'id', 'Corrupted_Output', 'Key', 'model_article'],
    num_rows: 20
})


Aligning characters with tokens:   0%|          | 0/20 [00:00<?, ?it/s]

Logging 20 samples [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
 Dataset({
    features: ['article', 'highlights', 'id', 'Corrupted_Output', 'Key', 'model_article'],
    num_rows: 10
})


Aligning characters with tokens:   0%|          | 0/10 [00:00<?, ?it/s]

Logging 10 samples [########################################] 100.00% elapsed time  :     0.01s =  0.0m =  0.0h
 

# Dataset Tokenization and Setup

In [44]:
def tokenize(row, input_col, target_col, max_input_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH):
  """Tokenize the input and outputs

  Creates the following columns

  Inputs:
    - input_ids
    - attention_mask

  Outputs:
    - labels
  """
  model_inputs = tokenizer(
        row[input_col],
        truncation=True,
        max_length=max_input_length,
        padding=False,
        return_tensors=None,
    )
  labels = tokenizer(
        row[target_col],
        truncation=True,
        max_length=max_target_length,
        padding=False,
        return_tensors=None,
    ).input_ids

  model_inputs['labels'] = labels
  model_inputs['id'] = row['id']
  return model_inputs


# Setup model training
# Tokenize the text
# NOTE here we use the MODEL_INPUT!
ds_tokenized = ds.map(lambda x: tokenize(x, input_col=MODEL_INPUT_COL, target_col=TARGET_COL, max_input_length=MAX_INPUT_LENGTH, max_target_length=MAX_TARGET_LENGTH),
                      remove_columns=ds['train'].column_names,
                      batched=True,
                      desc="Running tokenizer on dataset")

Running tokenizer on dataset:   0%|          | 0/20 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/10 [00:00<?, ? examples/s]

In [45]:
# Setup the dataloader
data_collator = DataCollatorForSeq2Seq(
    tokenizer, return_tensors="pt", padding=True
)

train_dataset = ds_tokenized["train"]
eval_dataset = ds_tokenized[VAL_SPLIT]

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=data_collator, batch_size=BATCH_SIZE, pin_memory=True
)
eval_dataloader = DataLoader(eval_dataset, shuffle=False, collate_fn=data_collator, batch_size=BATCH_SIZE, pin_memory=True)

# Model Training
Open this to view tensorboard and / or if you want to update the optimizer. By default we use `Adafactor`, as suggested for the `T5` model [here](https://discuss.huggingface.co/t/t5-finetuning-tips/684/4)

In [48]:
# training and evaluation
model = model.to(device)

optimizer = Adafactor(model.parameters(), lr=LR, scale_parameter=False, relative_step=False)

for epoch in range(NUM_EPOCHS):
    dq.set_epoch_and_split(split="training", epoch=epoch)
    model.train()
    train_epoch_loss = 0.
    for step, batch in enumerate(tqdm(train_dataloader)):
      ids = batch['id']
      batch = {k: v.to(device) for k, v in batch.items() if k != 'id'}

      outputs = model(**batch)
      # DQ logging!
      logits = outputs.logits  # Shape - [bs, bs_seq_ln, vocab]
      dq.log_model_outputs(
        logits = logits,
        ids = ids
      )

      loss = outputs.loss / ACCUMULATION_STEPS

      loss.backward()
      # Grad Accumulation
      if ((step + 1) % ACCUMULATION_STEPS == 0) \
          or ((step + 1) == len(train_dataloader)):
        optimizer.step()
        optimizer.zero_grad()

      step_loss = loss.detach().cpu().item()
      train_epoch_loss += step_loss
      step_perplexity = torch.exp(torch.Tensor([step_loss])).item()

    train_epoch_loss = train_epoch_loss / len(train_dataloader)
    train_ppl = torch.exp(torch.Tensor([train_epoch_loss])).float()

    model.eval()
    dq.set_epoch_and_split(split=VAL_SPLIT, epoch=epoch)
    eval_epoch_loss = 0
    #eval_preds = []
    with torch.no_grad():
      for step, batch in enumerate(tqdm(eval_dataloader)):
          ids = batch['id']
          batch = {k: v.to(device) for k, v in batch.items() if k != 'id'}

          outputs = model(**batch)
          # DQ logging!
          logits = outputs.logits  # Shape - [bs, bs_seq_ln, vocab]
          dq.log_model_outputs(
            logits = logits,
            ids = ids
          )

          loss = outputs.loss
          eval_step_loss = loss.cpu().item()
          eval_epoch_loss += eval_step_loss

      # Look just at the loss in aggregate!
      eval_epoch_loss = eval_epoch_loss / len(eval_dataloader)
      eval_ppl = torch.exp(torch.Tensor([eval_epoch_loss])).item()

    # Perplexity for a probabilisitic sequence - exp(cross_entropy_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")


100%|██████████| 3/3 [01:20<00:00, 26.69s/it]
100%|██████████| 2/2 [00:00<00:00, 10.39it/s]


epoch=0: train_ppl=tensor([3.4658]) train_epoch_loss=1.2429347435633342 eval_ppl=141.41412353515625 eval_epoch_loss=4.951692581176758


In [35]:
dq.finish()

☁️ Uploading Data
CuML libraries not found, running standard process. For faster Galileo processing, consider installing
`pip install 'dataquality[cuda]' --extra-index-url=https://pypi.nvidia.com/`


training:   0%|          | 0/1 [00:00<?, ?it/s]

training (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/167k [00:00<?, ?B/s]

test:   0%|          | 0/1 [00:00<?, ?it/s]

test (epoch=0):   0%|          | 0/3 [00:00<?, ?it/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Uploading data to Galileo:   0%|          | 0.00/84.4k [00:00<?, ?B/s]

Job default successfully submitted. Results will be available soon at https://console.dev.rungalileo.io/insights?projectId=192763fb-9656-4a46-a96b-8d2afd58252a&runId=a8d1ac27-6063-48cd-ab05-a67cc44ecd57&taskType=8&split=training
Waiting for job (you can safely close this window)...
	[training] 👀 Looking for data anomalies
Done! Job finished with status completed
🧹 Cleaning up
🧹 Cleaning up


'https://console.dev.rungalileo.io/insights?projectId=192763fb-9656-4a46-a96b-8d2afd58252a&runId=a8d1ac27-6063-48cd-ab05-a67cc44ecd57&taskType=8&split=training'

In [None]:
def get_gpu_memory():
    t = torch.cuda.get_device_properties(0).total_memory
    r = torch.cuda.memory_reserved(0)
    a = torch.cuda.memory_allocated(0)
    f = r-a  # free inside reserved
    return t, r, a, f

total, reserved, allocated, free = get_gpu_memory()
print('Total Memory: ', total / 2**30)
print('Reserved Memory: ', reserved / 2**30)
print('Allocated Memory: ', allocated / 2**30)
print('Free Memory: ', free / 2**30)

del optimizer
del batch
del logits
del loss

torch.cuda.empty_cache()

import gc
gc.collect()