In [1]:
import os
# Expose only GPU index 0 to this process.
# NOTE(review): this cell sits before the cell that imports torch; presumably
# the ordering is deliberate so the setting is in place before CUDA is
# initialised -- confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

In [7]:
import torch
import math

# Report what the CUDA runtime exposes to this process: availability, number
# of visible devices, the currently selected device index and its name.
print(torch.cuda.is_available())
print(torch.cuda.device_count())
if torch.cuda.is_available():
    # These two queries initialise CUDA and raise when no usable device is
    # present, so they are only issued on a machine that actually has one.
    print(torch.cuda.current_device())
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; skipping per-device queries.")

True
1
0
NVIDIA GeForce RTX 3090


In [3]:
from __future__ import annotations
import functools
import typing as tp
import datasets
import transformers
from transformers import (
    DataCollatorForSeq2Seq,
    PreTrainedTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)


# Toy "increment" corpus: every spelled-out odd number is paired with the even
# number that follows it. The five base records are repeated 100 times
# (500 rows in total); the repeats are references to the same five dicts.
increment_en = 100 * [
    {"input": source, "target": target}
    for source, target in (
        ("One", "Two"),
        ("Three", "Four"),
        ("Five", "Six"),
        ("Seven", "Eight"),
        ("Nine", "Ten"),
    )
]


def lod_to_dol(list_of_dicts: tp.List[tp.Dict[str, tp.Any]]) -> tp.Dict[str, list]:
    """Convert a list of dicts ("rows") into a dict of lists ("columns").

    The keys of the first dict decide which columns are produced; each column
    holds the value found under that key in every dict, in input order.

    Args:
        list_of_dicts: Row-oriented records. Every record must contain all
            keys of the first record; extra keys in later records are ignored.

    Returns:
        A mapping from each key of the first record to the list of values for
        that key. An empty input yields an empty dict.

    Raises:
        KeyError: If a later record lacks a key present in the first record.
    """
    # Without this guard, indexing the first record raised IndexError on [].
    if not list_of_dicts:
        return {}

    column_names = list_of_dicts[0]
    return {
        name: [record[name] for record in list_of_dicts] for name in column_names
    }


# Pivot the toy records from row-oriented to column-oriented form.
# NOTE(review): this rebinds `increment_en` from a list of dicts to a dict of
# lists, so the cell is not safe to re-run on its own without re-creating the
# original list first.
increment_en = lod_to_dol(list_of_dicts=increment_en)


def preprocess_function_(
    examples,
    tokenizer: PreTrainedTokenizer,
    max_input_length: int,
    max_target_length: int,
    input_key: str = "input",
    target_key: str = "target",
):
    """Tokenize a batch of source/target text pairs for seq2seq training.

    Args:
        examples: Batch mapping that holds the source texts under `input_key`
            and the target texts under `target_key`.
        tokenizer: Tokenizer called once for the sources and once for the
            targets.
        max_input_length: Truncation length applied to the source texts.
        max_target_length: Truncation length applied to the target texts.
        input_key: Key of the source texts in `examples`.
        target_key: Key of the target texts in `examples`.

    Returns:
        The tokenizer output for the sources, with an added "labels" entry
        holding the "input_ids" of the tokenized targets.
    """
    model_inputs = tokenizer(
        examples[input_key], max_length=max_input_length, truncation=True
    )

    # Targets are tokenized through the `text_target` argument, which replaces
    # the deprecated `as_target_tokenizer()` context manager. A separate call
    # is kept so that sources and targets can have different length limits.
    target_encoding = tokenizer(
        text_target=examples[target_key],
        max_length=max_target_length,
        truncation=True,
    )

    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs


def preprocess_function2_(
    examples,
    tokenizer: PreTrainedTokenizer,
    max_input_length: int,
    max_target_length: int,
):
    """Tokenize a batch whose texts are stored under "inputs" and "labels".

    Same processing as `preprocess_function_`, only the column names differ.
    The returned "labels" entry contains token ids, whereas the incoming
    "labels" entry contains the raw target texts.

    Args:
        examples: Batch mapping with source texts under "inputs" and target
            texts under "labels".
        tokenizer: Tokenizer used for both sources and targets.
        max_input_length: Truncation length applied to the source texts.
        max_target_length: Truncation length applied to the target texts.

    Returns:
        The tokenizer output for the sources plus a "labels" entry with the
        target token ids.
    """
    return preprocess_function_(
        examples,
        tokenizer=tokenizer,
        max_input_length=max_input_length,
        max_target_length=max_target_length,
        input_key="inputs",
        target_key="labels",
    )


Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
For effortless bug reporting copy-paste your error into this form: https://docs.google.com/forms/d/e/1FAIpQLScPB8emS3Thkp66nvqwmjTEgxp8Y9ufuWTzFyr9kJ5AoI47dQ/viewform?usp=sf_link
CUDA SETUP: CUDA runtime path found: /home/sherman/miniconda3/envs/fn_env/lib/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 118
CUDA SETUP: Loading binary /home/sherman/miniconda3/envs/fn_env/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda118.so...
[2023-07-29 00:15:01,496] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [8]:
# --- Model and tokenizer ----------------------------------------------------
MODEL_NAME = "google/flan-t5-small"
tokenizer = transformers.T5TokenizerFast.from_pretrained(MODEL_NAME)
model = transformers.T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

# --- Training configuration -------------------------------------------------
batch_size = 4
# NOTE(review): 24363 is presumably the number of training examples, which
# would make `max_steps` roughly one pass over the data -- confirm against
# the dataset on disk.
NUM_TRAIN_EXAMPLES = 24363
args = Seq2SeqTrainingArguments(
    output_dir="script_debug",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=False,
    push_to_hub=False,
    max_steps=math.ceil(NUM_TRAIN_EXAMPLES / batch_size),
    logging_steps=1000,
    save_steps=5000,
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# --- Data -------------------------------------------------------------------
# NOTE(review): both splits are read from the same path, so the "test" split
# is identical to the training data.
DATASET_PATH = "templates_paraphrase_dev.dataset"
dataset = datasets.DatasetDict(
    {split: datasets.load_from_disk(DATASET_PATH) for split in ("train", "test")}
)

# Bind the tokenizer and length limits so the function only takes a batch.
preprocess_function2 = functools.partial(
    preprocess_function2_,
    tokenizer=tokenizer,
    max_input_length=512,
    max_target_length=512,
)

processed_ds2 = dataset.map(preprocess_function2, batched=True)
# Only the tensor columns listed here are exposed, formatted as torch tensors.
processed_ds2.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# --- Training ---------------------------------------------------------------
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=processed_ds2["train"],
    eval_dataset=processed_ds2["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
1000,1.6317
2000,1.4416
3000,1.3848
4000,1.3645
5000,1.3316
6000,1.3395


TrainOutput(global_step=6091, training_loss=1.4135024553448319, metrics={'train_runtime': 556.9771, 'train_samples_per_second': 43.743, 'train_steps_per_second': 10.936, 'total_flos': 331636284807168.0, 'train_loss': 1.4135024553448319, 'epoch': 1.0})

In [12]:
# Quick smoke test: encode the prompt "One", move it to the model's device and
# generate. The cell displays the raw generated token ids (not decoded text).
prompt_encoding = tokenizer("One", return_tensors="pt").to(model.device)
model.generate(**prompt_encoding)

tensor([[   0, 2759,    1]], device='cuda:0')

In [9]:
# Wrap the model and tokenizer in a text-to-text generation pipeline that runs
# on the same device as the model.
pipe = transformers.pipelines.Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=model.device,
    batch_size=4,
)

In [10]:
# Run the pipeline on a single system utterance with default generation
# settings. Sampling (do_sample=True, eta_cutoff=3e-4) was an alternative that
# is currently disabled.
# NOTE(review): the single quotes inside the string are part of the prompt
# text sent to the model -- confirm that is intended.
system_utterance = "'SYSTEM | How about Searching for Sugar Man? The movie has an average rating of 8.2.'"
pipe(system_utterance)

[{'generated_text': 'How about Searching for Sugar Man? It has an average rating of 8.2?'}]