In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

# Load only the first 5,000 rows of the "eli5_category" train split to keep the run small.
# NOTE(review): trust_remote_code=True is understood to let the dataset's own loading
# script execute locally — confirm the source is trusted before running.
eli5 = load_dataset("eli5_category", split="train[:5000]", trust_remote_code=True)

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the shuffle, and
# therefore the train/test membership, identical on every Restart & Run All.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [None]:
# Inspect the first training example before the columns are flattened.
eli5["train"][0]

In [None]:
# Flatten the dataset's columns; the preprocessing cell below reads the
# resulting "answers.text" column.
eli5 = eli5.flatten()

In [None]:
# Inspect the same example again to see the column layout after flattening.
eli5["train"][0]

In [None]:


# Tokenizer for the same "distilbert/distilgpt2" checkpoint that is loaded as the model later on.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")

def preprocess_function(examples):
    """Tokenize a batch after merging each row's answer texts into one string.

    Args:
        examples: Batch mapping that provides an ``"answers.text"`` column, where
            every entry is an iterable of strings.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the list of joined strings.
    """
    joined_answers = []
    for answer_texts in examples["answers.text"]:
        # Every answer of a row is glued together with single spaces.
        joined_answers.append(" ".join(answer_texts))
    return tokenizer(joined_answers)

# Tokenize every split in batches. All original columns (names taken from the
# train split) are removed, so only the tokenizer's outputs remain.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=eli5["train"].column_names,
)

In [None]:
# Display the tokenized dataset to check which columns are left.
tokenized_eli5

In [None]:
# Number of tokens per chunk produced by group_texts.
block_size = 128

In [None]:
def group_texts(examples, chunk_size=None):
    """Concatenate every column of a tokenized batch and re-split it into fixed-size blocks.

    Args:
        examples: Batch mapping of column name -> list of token lists (one inner
            list per row). Must contain an ``"input_ids"`` column.
        chunk_size: Length of each output block. When ``None`` (the default) the
            module-level ``block_size`` is used, so existing one-argument calls
            behave exactly as before.

    Returns:
        dict with the same columns, each split into consecutive slices of the
        block length, plus a ``"labels"`` column that is a shallow copy of the
        ``"input_ids"`` column.

    Raises:
        ValueError: If the effective block length is not positive.
    """
    size = block_size if chunk_size is None else chunk_size
    if size <= 0:
        raise ValueError(f"block size must be a positive integer, got {size!r}")

    # Flatten each column in a single pass. `sum(lists, [])` rebuilds the
    # accumulated list on every addition, which is quadratic in the batch size.
    concatenated_examples = {
        k: [token for row in examples[k] for token in row] for k in examples.keys()
    }
    # The length is taken from the first column only.
    # NOTE(review): assumes every column flattens to the same length — confirm.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the trailing remainder so that every block is full. When the whole
    # batch is shorter than one block nothing is truncated, and a single short
    # block is emitted instead of an empty result.
    if total_length >= size:
        total_length = (total_length // size) * size
    # Split by chunks of the block length.
    result = {
        k: [t[i : i + size] for i in range(0, total_length, size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Regroup the tokenized splits into fixed-size chunks, one batch at a time;
# group_texts also adds the "labels" column.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=1)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because this tokenizer has no pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Collator with masked-language-modelling switched off (mlm=False), asked to return TensorFlow tensors.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")

In [None]:
# from transformers import AdamW
from transformers import AdamWeightDecay

# Optimizer handed to model.compile below: learning rate 2e-5, weight decay rate 0.01.
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)

In [None]:
from transformers import TFAutoModelForCausalLM

# Load the pretrained checkpoint as a TensorFlow causal language model; this is the model that gets fine-tuned.
model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")

In [None]:
# Training input: shuffled, batches of 16 assembled by the language-modelling collator.
tf_train_set = model.prepare_tf_dataset(
    lm_dataset["train"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=True,
)

# Evaluation input: same batching and collator, but kept in dataset order.
tf_test_set = model.prepare_tf_dataset(
    lm_dataset["test"],
    collate_fn=data_collator,
    batch_size=16,
    shuffle=False,
)

In [None]:
import tensorflow as tf

# Only the optimizer is supplied here; the loss is deliberately left unset.
# NOTE(review): presumably the model then falls back to its built-in loss — confirm.
model.compile(optimizer=optimizer)  # No loss argument!

In [None]:
# from transformers.keras_callbacks import PushToHubCallback

# callback = PushToHubCallback(
#     output_dir="my_awesome_eli5_clm-model",
#     # tokenizer=tokenizer,
# )

In [None]:
# Fine-tune for 3 epochs on the training batches, validating against the held-out test batches.
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3)

In [None]:
# Write the fine-tuned model and the tokenizer into the same local folder.
local_model_dir = r"C:\Users\shafi\Documents\Shafique\Learning\HugginFace\code\chapter1(transformer_model)\my_local_model"
model.save_pretrained(local_model_dir)
tokenizer.save_pretrained(local_model_dir)

In [6]:
# Text that the generation pipeline below is asked to continue.
prompt = "Somatic hypermutation allows the immune system to"

In [10]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer
# Folder that the fine-tuned model was saved to by the save_pretrained cell.
model_path = r"C:\Users\shafi\Documents\Shafique\Learning\HugginFace\code\chapter1(transformer_model)\my_local_model"
# Reload the fine-tuned weights from disk; from_pt=False requests no PyTorch-checkpoint conversion.
model = TFAutoModelForCausalLM.from_pretrained(model_path, from_pt=False)

# NOTE(review): the tokenizer is loaded from the hub id, not from model_path, even though
# tokenizer.save_pretrained wrote to that folder — confirm this is intentional.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
# tokenizer = AutoTokenizer.from_pretrained(model_path)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at C:\Users\shafi\Documents\Shafique\Learning\HugginFace\code\chapter1(transformer_model)\my_local_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [11]:
from transformers import pipeline
# Text-generation pipeline built from the reloaded model and tokenizer.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [14]:
# Generate a continuation of the prompt and print the pipeline's raw return value.
print(generator(prompt))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'Somatic hypermutation allows the immune system to "keep" the virus from taking over the body. There are many different methods of getting immune cells to behave like normal, if not super-normal, but the vast majority of people are in good'}]
