In [None]:
# Install the libraries into the kernel's environment (hence %pip rather than !pip).
# NOTE(review): versions are unpinned, and `evaluate` is not referenced by any
# visible cell — consider pinning versions and dropping unused packages.
%pip install transformers datasets evaluate

In [None]:
from huggingface_hub import notebook_login

In [None]:
# Log in to the Hugging Face Hub from the notebook. The training cell sets
# push_to_hub=True and the final cell calls trainer.push_to_hub(), so this
# presumably needs a token with write access — confirm.
notebook_login()

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset identified by '_training-data'. Whether this resolves to a
# local directory or a Hub repository cannot be told from here — TODO confirm.
# The result is indexed by split name below (sandra['train']).
sandra = load_dataset('_training-data')

In [None]:
# Hold out 15% of the 'train' split as a test set. The fixed seed makes the
# shuffle deterministic, so Restart Kernel -> Run All yields the same
# train/test membership (and therefore comparable evaluation numbers).
data = sandra['train'].train_test_split(test_size=0.15, seed=42)

In [None]:
# Display the first training example to inspect the raw record layout.
data['train'][0]

In [None]:
# Tokenizers already loaded in this process, keyed by checkpoint name, so the
# checkpoint is read once per process instead of once per batch.
_TOKENIZER_CACHE = {}


def preprocess_function(examples):
    """Tokenize one batch of examples.

    Args:
        examples: batch mapping with a 'message' column, as passed by
            `map(..., batched=True)`.

    Returns:
        Whatever the tokenizer returns for the list of joined messages
        (one encoded entry per example in the batch).
    """
    checkpoint = 'decapoda-research/llama-7b-hf'
    tokenizer = _TOKENIZER_CACHE.get(checkpoint)
    if tokenizer is None:
        # Imported lazily (as before) so defining this function does not
        # require transformers to be importable yet.
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _TOKENIZER_CACHE[checkpoint] = tokenizer
    # NOTE(review): " ".join(x) assumes each 'message' is a list of strings.
    # If 'message' is a plain string, this inserts a space between every
    # character — verify against the dataset schema.
    return tokenizer([" ".join(x) for x in examples['message']])

In [None]:
# Tokenize every split in batches using 4 worker processes. All original
# columns are dropped, so only the tokenizer's outputs remain.
tokenized = data.map(
    preprocess_function,
    remove_columns=data['train'].column_names,
    num_proc=4,
    batched=True,
)

In [None]:
# def preprocess_function(examples):
#     from transformers import AutoTokenizer
#     tokenizer = AutoTokenizer.from_pretrained('decapoda-research/llama-7b-hf')

#     tokenizer.pad_token = tokenizer.eos_token

#     return tokenizer.batch_encode_plus(
#         [x for x in examples['message']],
#         padding=True,
#         truncation=True,
#         max_length=512,
#         return_tensors='pt',
#         pad_to_max_length=True
#     )

# tokenized = data.map(
#     preprocess_function,
#     batched=True,
#     num_proc=4,
#     remove_columns=data["train"].column_names,
# )

In [None]:
def group_texts(examples, block_size=128):
    """Concatenate a tokenized batch and re-split it into fixed-size blocks.

    Every column of `examples` (a mapping of column name -> list of token
    lists) is flattened into one long sequence and cut into consecutive
    chunks of `block_size` items. When the flattened length is at least
    `block_size`, the trailing remainder that does not fill a whole block is
    dropped; when it is shorter, a single short block is returned.

    Args:
        examples: batch mapping; must contain an 'input_ids' column.
        block_size: number of tokens per output block (default 128).

    Returns:
        dict with the same columns re-chunked, plus a 'labels' column that is
        a shallow copy of the chunked 'input_ids'.

    Raises:
        ValueError: if `block_size` is not positive.
    """
    from itertools import chain

    if block_size <= 0:
        raise ValueError(f"block_size must be a positive integer, got {block_size}")

    # chain.from_iterable flattens in linear time; sum(lists, []) re-copies
    # the accumulated list on every addition and is quadratic in batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # The first column's flattened length is used for all columns.
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    if total_length >= block_size:
        # Round down to a whole number of blocks (drops the remainder).
        total_length = (total_length // block_size) * block_size

    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the inputs for causal language modelling.
    result['labels'] = result['input_ids'].copy()
    return result

# Re-chunk the tokenized splits into fixed-size blocks, batch by batch,
# using 4 worker processes.
lm_dataset = tokenized.map(
    group_texts,
    num_proc=4,
    batched=True,
)

In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling

# `tokenizer` was previously only a local variable inside preprocess_function,
# so this cell raised NameError on a fresh kernel. Load it here explicitly,
# from the same checkpoint used for tokenization and for the model.
tokenizer = AutoTokenizer.from_pretrained('decapoda-research/llama-7b-hf')

# Reuse the end-of-sequence token as the padding token so the collator can
# pad batches.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the causal-LM checkpoint to fine-tune; this is the same checkpoint name
# that preprocess_function passes to AutoTokenizer.
model = AutoModelForCausalLM.from_pretrained('decapoda-research/llama-7b-hf')

In [None]:
# Fine-tuning configuration: outputs go to 'be-right-back', evaluation runs
# once per epoch, and the result is pushed to the Hub.
training_args = TrainingArguments(
    output_dir='be-right-back',
    push_to_hub=True,
    evaluation_strategy='epoch',
    weight_decay=0.01,
    learning_rate=2e-5,
)

# Wire the model, the block-chunked splits and the causal-LM collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    eval_dataset=lm_dataset['test'],
    train_dataset=lm_dataset['train'],
)

trainer.train()

In [None]:
import math

# Evaluate on the held-out split and report perplexity = exp(eval loss).
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Upload the trainer's results to the Hugging Face Hub (requires the login
# performed at the top of the notebook).
trainer.push_to_hub()