In [None]:
!pip install datasets
!pip install transformers -U
!pip install accelerate -U
!pip install trl


In [None]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
from datasets import load_dataset
# Hub identifier of the dataset used for fine-tuning.
DATASET_NAME = "mlabonne/guanaco-llama2-1k"

# No split is requested here; the "train" split is picked out in a later cell.
dataset = load_dataset(path=DATASET_NAME)

In [None]:
# Show a summary of the object returned by load_dataset.
print(dataset)

In [None]:
# Pick the "train" split; this is the data later handed to the trainer.
training_dataset = dataset["train"]
print(training_dataset)

In [None]:
# Inspect one randomly chosen training example.
# The upper bound is taken from the dataset itself rather than a hard-coded 1000, so the
# index is always valid and every row stays reachable if the dataset size changes.
random_number = torch.randint(0, len(training_dataset), (1,)).item()
print(f"Random number: {random_number}")
training_dataset[random_number]

In [None]:
MODEL_NAME = "distilgpt2"
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the pretrained causal language model and enable the key/value cache for generation.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
model.config.use_cache = True

# trust_remote_code is left at its default (False): this checkpoint uses a stock tokenizer,
# so there is no reason to allow repository-provided code to run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token so batches can be padded (on the right).
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# configure generator
generation_configuration = model.generation_config
generation_configuration.pad_token_id = tokenizer.eos_token_id
generation_configuration.eos_token_id = tokenizer.eos_token_id
# Prompt tokens + new tokens must fit inside the model's position window. Asking for the
# whole window (1024) as *new* tokens overflows it for any non-empty prompt, so cap the
# budget at half the window and leave the rest for the prompt.
generation_configuration.max_new_tokens = min(1024, model.config.max_position_embeddings // 2)
# The three knobs below only take effect when sampling, so switch sampling on here to keep
# the configuration self-consistent.
generation_configuration.do_sample = True
generation_configuration.temperature = .7 # logits are divided by this before softmax; < 1 sharpens the distribution
generation_configuration.top_p = .9 # nucleus sampling: keep the smallest token set with cumulative probability >= 0.9
generation_configuration.top_k = 20 # only the 20 most likely tokens are candidates




In [None]:
def generate(prompt):
  """Sample a continuation of `prompt` from `model` and print the decoded text.

  Args:
    prompt: Text the model should continue.

  Settings not passed explicitly here (length limit, temperature, ...) are taken from
  the model's generation config. Nothing is returned; the text is printed.
  """
  # Calling the tokenizer (rather than .encode) also produces the attention mask. It is
  # passed to generate() explicitly because the pad token is the same as EOS, so the mask
  # cannot be inferred from the input ids.
  # Inputs go to the model's own device, which is where device_map="auto" placed the weights.
  inputs = tokenizer(prompt, add_special_tokens = True, return_tensors='pt').to(model.device)
  out = model.generate(**inputs, repetition_penalty = 1.2, do_sample = True) #multinomial sampling
  string_decoded = tokenizer.decode(out[0].tolist(), clean_up_tokenization_spaces =True)
  print(string_decoded)

In [None]:
# Sample a continuation of a short prompt; the result is printed by generate().
generate('this is')

In [None]:
# Sample a reply to a question-style prompt; the result is printed by generate().
generate('how are you?')

In [None]:
from trl import SFTConfig, SFTTrainer
from transformers import TrainingArguments

# set up your training arguments
training_args = SFTConfig(
    output_dir="logs",
    num_train_epochs=5,
    gradient_accumulation_steps=1,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    # fp16 mixed precision requires a CUDA device. Enable it only when one is available so
    # the CPU fallback selected at the top of the notebook keeps working.
    fp16=torch.cuda.is_available(),
    group_by_length=True,
    max_length=512,
)

# Assemble the supervised fine-tuning trainer from the model, tokenizer, training split
# and training arguments defined in the earlier cells.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=training_dataset,
)

In [None]:
# Turn sampling on in the model's generation config before training starts.
# NOTE(review): presumably needed because temperature/top_p/top_k were set earlier and the
# config may be validated when checkpoints are written — TODO confirm.
generation_configuration.do_sample = True
# Run fine-tuning with the trainer built in the previous cell.
trainer.train()

In [None]:

def generate(prompt, max_new_tokens=100, repetition_penalty=1.1):
    """Sample an answer to `prompt` from the fine-tuned `model` and print it.

    Args:
        prompt: The user instruction, without any template tags.
        max_new_tokens: Upper bound on the number of generated tokens.
        repetition_penalty: Penalty applied to already-generated tokens; values much
            above 1 tend to degrade the output.

    Nothing is returned; the decoded sequence (prompt included, special tokens kept)
    is printed.
    """
    # Wrap the prompt in the tags the model saw during training
    formatted_prompt = f"<s>[INST] {prompt} [/INST]"

    # Calling the tokenizer (rather than .encode) also yields the attention mask, which
    # is passed explicitly because the pad token equals EOS and cannot be inferred.
    inputs = tokenizer(formatted_prompt, add_special_tokens=True, return_tensors='pt').to(model.device)

    # Training leaves the model in train mode; switch to eval so dropout is disabled
    # while sampling.
    model.eval()

    out = model.generate(
        **inputs,
        repetition_penalty=repetition_penalty,
        do_sample=True,
        max_new_tokens=max_new_tokens
    )

    string_decoded = tokenizer.decode(out[0], skip_special_tokens=False)
    print(string_decoded)

In [None]:
# Try the model on an instruction-style question after fine-tuning.
generate('What is your name?')