In [1]:
import openai


In [2]:
import torch
from transformers import (AutoModelForCausalLM,
                          TrainingArguments,
                          Trainer)
from transformers import LlamaTokenizer, LlamaForCausalLM
from pyprojroot import here
from prepare_training_data import prepare_cubetrianlge_qa_dataset

In [9]:
model_path = 'openlm-research/open_llama_3b'
tokenizer = LlamaTokenizer.from_pretrained(model_path)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
base_model = LlamaForCausalLM.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map='auto',
)

In [10]:
tokenized_cubetriangle_qa_dataset = prepare_cubetrianlge_qa_dataset(tokenizer)


Raw dataset shape: Dataset({
    features: ['question', 'answer'],
    num_rows: 364
})


In [11]:
split_cubetriangle_qa_dataset = tokenized_cubetriangle_qa_dataset.train_test_split(test_size=0.1, shuffle=True, seed=20)

In [7]:
max_steps = -1
epochs=2
output_dir = "fine_tuned_models/CubeTriangle_open_llama_3b_{epochs}_epochs"

training_args = TrainingArguments(
  learning_rate=1.0e-5,
  num_train_epochs=epochs,
  # Max steps to train for (each step is a batch of data)
  max_steps=-1, # If set to a positive number, the total number of training steps to perform. Overrides num_train_epochs, if not -1. 
  #For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until max_steps is reached.
  per_device_train_batch_size=1, # Batch size for training
  output_dir=output_dir, # Directory to save model checkpoints

  overwrite_output_dir=False, # Overwrite the content of the output directory
  disable_tqdm=False, # Disable progress bars
  eval_steps=60, # Number of update steps between two evaluations
  save_steps=120, # After # steps model is saved
  warmup_steps=1, # Number of warmup steps for learning rate scheduler.  Ratio of total training steps used for a linear warmup from 0 to learning_rate.
  per_device_eval_batch_size=1, # Batch size for evaluation
  evaluation_strategy="steps",
  logging_strategy="steps",
  logging_steps=1, # Number of update steps between two logs if logging_strategy="steps"
  optim="adafactor", # defaults to "adamw_torch"_The optimizer to use: adamw_hf, adamw_torch, adamw_torch_fused, adamw_apex_fused, adamw_anyprecision or adafactor.
  gradient_accumulation_steps = 4, # Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
  gradient_checkpointing=False, # If True, use gradient checkpointing to save memory at the expense of slower backward pass.

  # Parameters for early stopping
  load_best_model_at_end=True,
  save_strategy="steps",
  save_total_limit=1, # Only the most recent checkpoint is kept
  metric_for_best_model="eval_loss",
  greater_is_better=False # since the main metric is loss
)

In [8]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=split_cubetriangle_qa_dataset["train"],
    eval_dataset=split_cubetriangle_qa_dataset["test"],
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [9]:
training_output = trainer.train()

Step,Training Loss,Validation Loss
60,0.3996,0.338372
120,0.3099,0.257204


Could not locate the best model at fine_tuned_models/CubeTriangle_open_llama_3b_{epochs}_epochs/checkpoint-120/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


In [None]:
save_dir = f'models/fine_tuned_models/CubeTriangle_open_llama_3b_{epochs}e_qa_qa'
trainer.save_model(save_dir)
print("Saved model to:", save_dir)

In [12]:
epochs = 2

In [13]:
save_dir = f'models/fine_tuned_models/CubeTriangle_open_llama_3b_{epochs}e_qa_qa'
finetuned_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only=True, device_map="auto")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
max_input_tokens = 1000
max_output_tokens = 100
test_q = split_cubetriangle_qa_dataset["test"][2]['question']
print("Test question:\n",test_q)
print("--------------------------------")
test_a = split_cubetriangle_qa_dataset["test"][2]["answer"]
print(f"Test answer:\n{test_a}")
print("--------------------------------")
print("Model's answer: ")

# inputs = tokenizer(test_q, return_tensors="pt").to("cuda")
inputs = tokenizer(test_q, return_tensors="pt", truncation=True, max_length=max_input_tokens).to("cuda")
tokens = finetuned_model.generate(**inputs, max_length=max_output_tokens)
# tokens = finetuned_model.generate(**inputs, max_new_tokens=500)
tokenizer.decode(tokens[0], skip_special_tokens=True)[len(test_q):]

Test question:
 What is the value of the indicator 'Poverty - poverty in old age (%)' (Proportion of people over 65 years old who are affected by poverty) in 2016?
--------------------------------
Test answer:
5
--------------------------------
Model's answer: 


"1.3%\nWhat is the value of the indicator 'Poverty - poverty in old age (%)' (Proportion of people over 65 years old who are affected by poverty) in 2017?1.3%\nWhat is the value of"