In [None]:
from datasets import load_dataset
from transformers import (  # GPT2LMHeadModel,
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    T5ForConditionalGeneration,
    T5Tokenizer,
)

In [None]:
class Config:
    """Central place for the knobs used by the rest of the notebook.

    All settings are plain class attributes, so they can be read either from
    the class itself or from an instance (e.g. ``Config().seed``).
    """

    # --- reproducibility / data subset ---
    seed = 42  # passed to `dataset.shuffle`
    dataset_size = 250_000  # number of rows selected from the full corpus

    # --- tokenization limits (in tokens) ---
    max_source_length = 128
    max_target_length = 128

    # --- optimisation ---
    learning_rate = 2e-5
    weight_decay = 0.01
    num_train_epochs = 3
    fp16 = True

    # --- batching ---
    per_device_train_batch_size = 128
    per_device_eval_batch_size = 128

    # --- checkpointing ---
    save_total_limit = 3

In [None]:
# Instantiate the settings container; later cells read values as `cnfg.<name>`.
cnfg = Config()

In [None]:
# Load the C4_200M sentence pairs (columns: "input", "output").
# `trust_remote_code=True` is passed explicitly: the `datasets` message emitted by
# this load states that the flag will be mandatory for this dataset from the next
# major release. Review the dataset's loading code before trusting it.
# A streaming alternative that avoids materialising every shard:
#   load_dataset("liweili/c4_200m", split="train", streaming=True, trust_remote_code=True)
dataset = load_dataset("liweili/c4_200m", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Loading dataset shards:   0%|          | 0/101 [00:00<?, ?it/s]

In [None]:
# Display the loaded DatasetDict (splits, feature names, row counts).
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 183894319
    })
})

In [None]:
# Shuffle first so the subset below is a random sample rather than the first
# `dataset_size` rows in storage order; the fixed seed keeps the sample reproducible.
shuffled_dataset = dataset.shuffle(seed=cnfg.seed)

# Select from the *shuffled* split. Previously the unshuffled `dataset` was indexed
# here, which left `shuffled_dataset` unused and the subset non-random.
dataset_new = shuffled_dataset["train"].select(range(cnfg.dataset_size))

In [None]:
# Hold out 10% of the subset for validation. The split is seeded so that the
# train/validation membership is identical on every Restart & Run All.
# Note: this rebinds `dataset_new` from a Dataset to a DatasetDict with
# "train" and "test" keys, so the cell is not meant to be re-run on its own.
dataset_new = dataset_new.train_test_split(test_size=0.1, seed=cnfg.seed)

In [None]:
# Pull both splits out of the DatasetDict; the "test" split is used as the validation set.
dataset_new_train, dataset_new_val = dataset_new["train"], dataset_new["test"]

In [None]:
# Display the training split (feature names and row count).
dataset_new_train

Dataset({
    features: ['input', 'output'],
    num_rows: 225000
})

In [None]:
# Peek at the first training example: a dict with an "input" and an "output" string.
print(next(iter(dataset_new_train)))

{'input': 'The tees are also interesting- rather than individual tee boxes, the tee system is more like a mini fairway, so various tees can practically move back or forward depending on the conditions.', 'output': 'The tees are also interesting- rather than individual tee boxes, the tee system is more like a mini fairway, so various tees can easily be moved back or forward depending on the wind conditions.'}


## Models

In [None]:
# Tokenizer and model are loaded from the same T5-small checkpoint.
# (An earlier experiment used "gpt2" with AutoModelForCausalLM / AutoTokenizer.)
checkpoint = "google-t5/t5-small"

tokenizer = T5Tokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Display the configuration of the loaded checkpoint.
model.config

T5Config {
  "_name_or_path": "google-t5/t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_en_to_de": {
      "early_stopping": true,
      "max_length": 

In [None]:
# Display the module tree of the loaded model.
model

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:
# tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Display the training split again before tokenization (feature names and row count).
dataset_new_train

Dataset({
    features: ['input', 'output'],
    num_rows: 225000
})

In [None]:
task_prefix = "correct english to english: "


def tokenize_data(inputs):
    """Tokenize a batch of sentence pairs for seq2seq fine-tuning.

    Intended to be used with ``Dataset.map(..., batched=True)``.

    Args:
        inputs: Batch mapping with an ``"input"`` list (source sentences) and an
            ``"output"`` list (target sentences) of equal length.

    Returns:
        dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``; each value
        is a list holding one unpadded token-id list per example, truncated to
        ``cnfg.max_source_length`` / ``cnfg.max_target_length``.
    """
    # Every source sentence is prefixed with the task prompt.
    source_texts = [task_prefix + sequence for sequence in inputs["input"]]

    # No padding and no tensor conversion here. The notebook's
    # DataCollatorForSeq2Seq pads each training batch to its own longest sequence
    # and fills label padding with -100 (its default `label_pad_token_id`), so
    # padding per map-batch would only store extra pad tokens, and
    # `return_tensors="pt"` is discarded when `map` writes the rows back.
    encoding = tokenizer(
        source_texts,
        max_length=cnfg.max_source_length,
        truncation=True,
    )
    target_encoding = tokenizer(
        inputs["output"],
        max_length=cnfg.max_target_length,
        truncation=True,
    )

    return {
        "input_ids": encoding["input_ids"],
        "attention_mask": encoding["attention_mask"],
        "labels": target_encoding["input_ids"],
    }

In [None]:
# Tokenize both splits batch-wise; each split is processed in chunks equal to
# its configured per-device batch size.
tokenized_train = dataset_new_train.map(
    tokenize_data,
    batched=True,
    batch_size=cnfg.per_device_train_batch_size,
)
tokenized_val = dataset_new_val.map(
    tokenize_data,
    batched=True,
    batch_size=cnfg.per_device_eval_batch_size,
)

Map:   0%|          | 0/225000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

In [None]:
# Batch collator for the trainer, built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# tmp = next(iter(tokenized_train))

In [None]:
# len(tmp["labels"]), len(tmp["attention_mask"]), len(tmp["input_ids"])

In [None]:
# tmp

In [None]:
# Training configuration; every tunable value comes from `cnfg`.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` - confirm against the installed version before upgrading.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints go, and how many are kept
    output_dir="EN_GRAMMAR_CORRECTOR",
    save_total_limit=cnfg.save_total_limit,
    # schedule
    num_train_epochs=cnfg.num_train_epochs,
    evaluation_strategy="epoch",
    # optimisation
    learning_rate=cnfg.learning_rate,
    weight_decay=cnfg.weight_decay,
    fp16=cnfg.fp16,
    # batching
    per_device_train_batch_size=cnfg.per_device_train_batch_size,
    per_device_eval_batch_size=cnfg.per_device_eval_batch_size,
    # evaluation behaviour
    predict_with_generate=True,
)

In [None]:
# Assemble the trainer from the model, arguments, tokenized splits and collator.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # `compute_metrics` must be a callable (EvalPrediction -> dict of metrics) or None.
    # The previous value `True` is not callable and would raise a TypeError at the
    # first epoch-end evaluation. With None, evaluation reports the validation loss
    # only; plug in a metric function here when one is available.
    compute_metrics=None,
)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

Epoch,Training Loss,Validation Loss
