In [None]:
from datasets import load_dataset
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    GPT2LMHeadModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

In [None]:
class Config:
    """Central place for the tunable settings used by this notebook.

    Most attribute names mirror the keyword arguments accepted by
    ``Seq2SeqTrainingArguments`` further down; ``seed`` is also used when
    shuffling the dataset and ``max_length`` when tokenizing.
    """

    # --- reproducibility ---
    seed = 42

    # --- tokenization ---
    max_length = 512

    # --- optimization ---
    learning_rate = 2e-5
    weight_decay = 0.01
    num_train_epochs = 10
    fp16 = True

    # --- batching ---
    per_device_train_batch_size = 16
    per_device_eval_batch_size = 16

    # --- checkpointing ---
    save_total_limit = 3

In [None]:
# Shared settings object; later cells read e.g. cnfg.seed and cnfg.max_length.
cnfg = Config()

In [None]:
# Load the "liweili/c4_200m" dataset from the Hugging Face Hub.
# No `split` is requested, so the result is keyed by split name.
# The download is large (~15 GB); as an alternative, passing
# split="train", streaming=True iterates lazily instead of downloading first.
#
# trust_remote_code=True explicitly opts in to running the dataset's loading
# script. Without it the library warns/prompts, and the flag becomes mandatory
# in the next major `datasets` release. Review the script on the Hub before
# enabling this for a dataset you do not trust.
dataset = load_dataset("liweili/c4_200m", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading data:   0%|          | 0.00/14.9G [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the loaded dataset object's repr to inspect what was loaded.
dataset

In [None]:
# Shuffle with the configured seed so the sampled subset is reproducible.
shuffled_dataset = dataset.shuffle(seed=cnfg.seed)

# Keep at most one million examples from the *shuffled* training split.
# - `select` expects an iterable of row indices; slice syntax inside a list
#   literal (`[0:1000000]`) is a SyntaxError.
# - `select` is a method of a single split, while `load_dataset` without a
#   `split` argument returns a dict of splits, hence the ["train"] lookup.
# - `min(...)` guards against a split with fewer rows than requested.
train_split = shuffled_dataset["train"]
dataset_new = train_split.select(range(min(1_000_000, len(train_split))))

In [None]:
# Print the first record of the sampled subset to check its fields.
print(next(iter(dataset_new)))

## Models

In [None]:
# Load the pretrained GPT-2 checkpoint (causal-LM head) and its tokenizer.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The GPT-2 tokenizer defines no padding token (only bos/eos/unk, all
# "<|endoftext|>"), so padding a batch would raise. Reuse the end-of-text
# token for padding and record its id on the model config as well.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Display the loaded model's configuration.
model.config

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.1",
  "use_cache": true,
  "vocab_size": 50257
}

In [None]:
# Display the tokenizer's repr (vocabulary size, max length, special tokens).
tokenizer

GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}

In [None]:
prefix = "Incorrect: "


def tokenize_data(inputs):
    """Tokenize one batch of source/target sentence pairs.

    Written for ``Dataset.map(..., batched=True)``, which passes each batch as
    a mapping from column name to a list of values (not a list of row dicts).

    Args:
        inputs: Batch mapping with an ``"input"`` column and an ``"output"``
            column, each a list with one entry per example.

    Returns:
        Whatever the module-level ``tokenizer`` returns when called with the
        prompts ``"Incorrect: <input> Correct: "`` as the text and the
        ``"output"`` values as ``text_target``.
    """
    # Index by column name: iterating the batch mapping itself yields the
    # column names (plain strings), so `inp['input']` would raise TypeError.
    processed_inputs = [f"{prefix}{source} Correct: " for source in inputs["input"]]
    labels = [f"{target}" for target in inputs["output"]]

    # NOTE(review): prompts and targets are tokenized separately, so the
    # resulting input ids and labels generally differ in length. Confirm the
    # model/trainer used downstream accepts that; a decoder-only LM head
    # typically expects labels aligned position-by-position with its inputs.
    tokenized_data = tokenizer(
        processed_inputs,
        text_target=labels,
        max_length=cnfg.max_length,
        # Request truncation explicitly instead of relying on the tokenizer's
        # fallback (and warning) when only max_length is given.
        truncation=True,
    )

    return tokenized_data

In [None]:
# Tokenize the sampled subset (`dataset_new`) rather than the full corpus,
# which would otherwise leave the sampling step unused.
# batched=True passes tokenize_data a mapping of column name -> list of values.
tokenized_train = dataset_new.map(tokenize_data, batched=True)

In [None]:
# Batch collator built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

In [None]:
# Training configuration. Hyperparameters are read from `cnfg` so the Config
# class stays the single source of truth; previously the values were repeated
# here as literals and num_train_epochs (3) disagreed with Config (10).
training_args = Seq2SeqTrainingArguments(
    output_dir="EN_GRAMMAR_CORRECTOR",
    evaluation_strategy="epoch",
    learning_rate=cnfg.learning_rate,
    per_device_train_batch_size=cnfg.per_device_train_batch_size,
    per_device_eval_batch_size=cnfg.per_device_eval_batch_size,
    weight_decay=cnfg.weight_decay,
    save_total_limit=cnfg.save_total_limit,
    num_train_epochs=cnfg.num_train_epochs,
    predict_with_generate=True,
    fp16=cnfg.fp16,
    # Use the same seed as the dataset shuffle for a reproducible run.
    seed=cnfg.seed,
    # report_to="wandb",
    # run_name="text_summary_gpt2-medium"
)