In [1]:
from datasets import load_dataset

# Load only the first 5,000 rows of the "train" split of `eli5_category`
# (the `train[:5000]` slice), presumably to keep the experiment small.
eli5 = load_dataset("eli5_category", split="train[:5000]")

In [2]:
# Hold out 20% of the rows as the "test" split. The fixed seed makes the
# shuffle deterministic, so a Restart & Run All reproduces the same split
# (and therefore comparable eval losses) instead of a new random one each time.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [3]:
# Inspect one raw training record; as the output below shows, `answers` is a
# nested dict holding parallel `a_id` and `text` lists.
eli5["train"][0]

{'q_id': '5m1z49',
 'title': 'Why does the military insist on saluting?',
 'selftext': '',
 'category': 'Other',
 'subreddit': 'explainlikeimfive',
 'answers': {'a_id': ['dc0ge8e', 'dc05x5e', 'dc05zb8'],
  'text': ["Saluting has a long history that is actually not written in stone. However one explanation is that it was originally used as a greeting between two knights to show their face and thus not be seen as an enemy. Additionally it would take away from their master hand to hold their weapon. This has progressed through to when officers would remove their headgear in the presence of a superior, and then became just a grabbing of the hat. This turned into the modern salute which now has enlisted men salute officers and junior officers saluting superior officers. The salute itself is in recognition of the Commander-in-Chief, for the Commonwealth the salute goes to the Queen, as recognised by the crown on the cap badge. Really though it's just a longheld tradition and the military lov

In [4]:
from transformers import AutoTokenizer

# Tokenizer for the "distilbert/distilgpt2" checkpoint, the same checkpoint
# name the model is loaded from later in this notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")



In [5]:
# Flatten the nested `answers` field into top-level columns named
# `answers.a_id` and `answers.text`; `preprocess_function` reads the latter.
eli5 = eli5.flatten()
# Show the same record again to confirm the new dotted column names.
eli5["train"][0]

{'q_id': '5m1z49',
 'title': 'Why does the military insist on saluting?',
 'selftext': '',
 'category': 'Other',
 'subreddit': 'explainlikeimfive',
 'answers.a_id': ['dc0ge8e', 'dc05x5e', 'dc05zb8'],
 'answers.text': ["Saluting has a long history that is actually not written in stone. However one explanation is that it was originally used as a greeting between two knights to show their face and thus not be seen as an enemy. Additionally it would take away from their master hand to hold their weapon. This has progressed through to when officers would remove their headgear in the presence of a superior, and then became just a grabbing of the hat. This turned into the modern salute which now has enlisted men salute officers and junior officers saluting superior officers. The salute itself is in recognition of the Commander-in-Chief, for the Commonwealth the salute goes to the Queen, as recognised by the crown on the cap badge. Really though it's just a longheld tradition and the military 

In [6]:
def preprocess_function(examples):
    """Tokenize a batch after merging each example's answers into one string.

    Args:
        examples: Batch mapping that contains an "answers.text" column, where
            each entry is a list of answer strings for one question.

    Returns:
        Whatever the module-level `tokenizer` returns for the list of merged
        strings (one space-joined string per example).
    """
    merged_answers = []
    for answer_list in examples["answers.text"]:
        # All answers to one question become a single space-separated text.
        merged_answers.append(" ".join(answer_list))
    return tokenizer(merged_answers)

In [7]:
# Tokenize both splits in batches across 4 worker processes. Every original
# column is removed, so only the tokenizer's outputs remain in the result.
# NOTE: `preprocess_function` calls the tokenizer without truncation, which is
# why the output below warns about sequences longer than the model maximum
# (1024). `group_texts` later re-slices the token stream into `block_size`
# pieces, so those over-long sequences are not fed to the model as-is.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (8350 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1040 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1186 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1027 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1314 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1583 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1521 > 1024). Running this sequence through the model will result in indexing errors


In [8]:
from itertools import chain

# Number of tokens in every training chunk produced by `group_texts`.
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size chunks.

    Args:
        examples: Mapping from column name to a list of per-example token
            lists. Must contain "input_ids"; every column is sliced using the
            total length of the first column.

    Returns:
        A dict with the same keys as `examples` plus "labels". Each value is a
        list of chunks of `block_size` items. "labels" is a (shallow) copy of
        the "input_ids" chunk list.

        If the batch holds at least `block_size` tokens, the trailing remainder
        that does not fill a whole chunk is dropped. If it holds fewer, the
        tokens are kept as one short chunk rather than discarding the batch.
    """
    # Flatten each column in a single linear pass. The previous
    # `sum(lists, [])` re-copied the growing accumulator on every addition,
    # which is quadratic in the number of tokens per batch.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are the inputs themselves; nothing in this function shifts them.
    result["labels"] = result["input_ids"].copy()
    return result

In [9]:
# Re-chunk both tokenized splits with `group_texts`, batch by batch, across
# 4 worker processes.
lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token (presumably because
# this tokenizer ships without a pad token of its own — confirm).
# Note this mutates the shared `tokenizer` object in place.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False turns off masked-language-modeling in the collator, matching the
# causal LM (`AutoModelForCausalLM`) that is trained later in this notebook.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

2024-10-07 00:56:08.237411: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-10-07 00:56:08.320395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-07 00:56:08.352368: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-07 00:56:08.362877: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-07 00:56:08.422338: I tensorflow/core/platform/cpu_feature_guar

In [11]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Pretrained causal-LM weights from the same checkpoint name as the tokenizer.
model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")


In [12]:
# Training configuration. Anything not listed here (number of epochs, batch
# size, seed, ...) is left at the library default.
training_args = TrainingArguments(
    output_dir="my_awesome_eli5_clm-model",  # local directory; the generation cell loads the model from this path
    eval_strategy="epoch",  # evaluate once per epoch (see the eval_loss lines at epochs 1.0, 2.0, 3.0 in the log)
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True,  # NOTE(review): uploading needs Hub credentials; no login step is visible in this notebook — confirm
)

# Wire the model, both chunked splits, and the collator into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Run fine-tuning. As the cell's last expression, the returned TrainOutput is
# displayed below the training log.
trainer.train()


  0%|          | 0/3918 [00:00<?, ?it/s]

{'loss': 3.9853, 'grad_norm': 4.597394943237305, 'learning_rate': 1.7447677386421644e-05, 'epoch': 0.38}
{'loss': 3.9478, 'grad_norm': 3.9575085639953613, 'learning_rate': 1.4895354772843289e-05, 'epoch': 0.77}


  0%|          | 0/330 [00:00<?, ?it/s]

{'eval_loss': 3.8348867893218994, 'eval_runtime': 11.9856, 'eval_samples_per_second': 219.763, 'eval_steps_per_second': 27.533, 'epoch': 1.0}
{'loss': 3.9019, 'grad_norm': 4.543058395385742, 'learning_rate': 1.2343032159264931e-05, 'epoch': 1.15}
{'loss': 3.8641, 'grad_norm': 4.304403305053711, 'learning_rate': 9.790709545686576e-06, 'epoch': 1.53}
{'loss': 3.8473, 'grad_norm': 4.118569374084473, 'learning_rate': 7.238386932108219e-06, 'epoch': 1.91}


  0%|          | 0/330 [00:00<?, ?it/s]

{'eval_loss': 3.823927402496338, 'eval_runtime': 12.1836, 'eval_samples_per_second': 216.193, 'eval_steps_per_second': 27.086, 'epoch': 2.0}
{'loss': 3.8203, 'grad_norm': 4.049162864685059, 'learning_rate': 4.686064318529863e-06, 'epoch': 2.3}
{'loss': 3.8182, 'grad_norm': 4.359309673309326, 'learning_rate': 2.133741704951506e-06, 'epoch': 2.68}


  0%|          | 0/330 [00:00<?, ?it/s]

{'eval_loss': 3.82340931892395, 'eval_runtime': 11.8276, 'eval_samples_per_second': 222.7, 'eval_steps_per_second': 27.901, 'epoch': 3.0}
{'train_runtime': 529.9857, 'train_samples_per_second': 59.136, 'train_steps_per_second': 7.393, 'train_loss': 3.8758947149474383, 'epoch': 3.0}


TrainOutput(global_step=3918, training_loss=3.8758947149474383, metrics={'train_runtime': 529.9857, 'train_samples_per_second': 59.136, 'train_steps_per_second': 7.393, 'total_flos': 1023662682537984.0, 'train_loss': 3.8758947149474383, 'epoch': 3.0})

In [13]:
import math

# Evaluate on the Trainer's eval dataset and report perplexity, computed as
# e raised to the evaluation loss.
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

  0%|          | 0/330 [00:00<?, ?it/s]

Perplexity: 45.76


In [31]:
# Seed text that the fine-tuned model is asked to continue in the next cell.
prompt = "who is the president of kenya"

In [32]:
from transformers import pipeline

# Build a text-generation pipeline from the local training output directory
# (the same path given as `output_dir` in the TrainingArguments cell).
# NOTE(review): no `device` argument is passed, so, per the warning in the
# output below, the model runs on CPU even though a GPU is available.
generator = pipeline("text-generation", model="./my_awesome_eli5_clm-model/")
# Last expression of the cell: the generated continuation is displayed.
generator(prompt)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': "who is the president of kenya). This would probably be true for both males and females since the male president would likely have appointed someone who has had a lot at hand but wasn't personally selected by the administration or the US in any way,"}]