# Llama 2 Fine-Tuning

This notebook is based on the following tutorials:
* https://huggingface.co/blog/llama2#fine-tuning-with-peft
* https://huggingface.co/docs/trl/sft_trainer

**It is incomplete and not working yet.**

In [None]:
import os
import pandas as pd

# Root of the project checkout; every other path in this cell hangs off it.
base_dir = './praktikum-ise-2023-patrick-zierahn'
data_dir = os.path.join(base_dir, 'data')

# Input files: both live in the software-architecture data folder.
software_architecture_dir = os.path.join(data_dir, 'software_architecture')
bibtext_file = os.path.join(software_architecture_dir, 'bib-text.csv')
taxonomy_file = os.path.join(software_architecture_dir, 'taxonomy_explanation.json')

# Output folders for reports and for the generated train/test splits.
# They are created up front so later cells can write into them directly.
reports_dir = os.path.join(base_dir, 'reports', 'llama2_fine_tune')
llama_data_dir = os.path.join(data_dir, 'llama2_fine_tune')
for output_folder in (reports_dir, llama_data_dir):
    os.makedirs(output_folder, exist_ok=True)

In [None]:
# Load the bibliography CSV; later cells read its 'title', 'abstract' and
# 'classes' columns when building prompts.
df = pd.read_csv(filepath_or_buffer=bibtext_file)

## Create dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
from names_generator import generate_name

# Random run name; it tags the exported splits and, in later cells, the model
# directory and the metrics file.
model_name = generate_name()
print('Model name:', model_name)

# 80/20 train/test split with a fixed seed so the split is reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Persist both splits next to each other, tagged with the run name.
train_file = os.path.join(llama_data_dir, f"{model_name}_train.csv")
test_file = os.path.join(llama_data_dir, f"{model_name}_test.csv")
for split_df, split_file in ((train_df, train_file), (test_df, test_file)):
    split_df.to_csv(split_file, index=False)

# Wrap the splits as Hugging Face datasets for the trainer.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(test_df)

In [None]:
from typing import List
from src.taxonomy.utils import parse_taxonomy


def get_research_objects(meta: str) -> List[str]:
    """Return the tags of the nodes directly below "Research Object".

    Args:
        meta: Taxonomy description, passed unchanged to ``parse_taxonomy``.

    Returns:
        The ``tag`` of every child of the "Research Object" node, in the
        order the parsed tree yields them.
    """
    taxonomy_tree = parse_taxonomy(meta)
    return [node.tag for node in taxonomy_tree.children("Research Object")]

In [None]:
from src.utils.utils_json import read_json

taxonomy_explanation = read_json(taxonomy_file)

# One research-object name per line (each followed by a newline); this list is
# embedded in the instruction part of every prompt.
research_objects_all = "".join(
    f"{name}\n" for name in taxonomy_explanation["Research Object"]
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

import torch
import json

# Base checkpoint to fine-tune. For the larger variant use
# "meta-llama/Llama-2-13b-hf" instead.
pretrained_model = "meta-llama/Llama-2-7b-hf"

# The Llama 2 checkpoints are gated on the Hugging Face Hub, so an access token
# is required. Reuse `hf_token` if an earlier cell already defined it; otherwise
# read it from the environment instead of failing with a NameError. If neither
# is set the value is None, which presumably lets the library fall back to a
# cached `huggingface-cli login` token -- verify for the installed version.
hf_token = (
    globals().get("hf_token")
    or os.environ.get("HF_TOKEN")
    or os.environ.get("HUGGING_FACE_HUB_TOKEN")
)

tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model,
    use_auth_token=hf_token,
)

# Weights are loaded in half precision.
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model,
    use_auth_token=hf_token,
    torch_dtype=torch.float16,
)

# Batching needs a padding token. If the tokenizer has none, register one and
# grow the embedding matrix so the new token id has a row of its own.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [None]:
def format_prompts(item):
    """Build the full training prompt (instruction, paper, response) for one row.

    Args:
        item: Mapping with the keys 'classes', 'title' and 'abstract'.
            'classes' is handed to ``get_research_objects`` to obtain the
            expected labels.

    Returns:
        A single string: the instruction block listing all research objects,
        the paper's title and abstract, the expected labels (one per line)
        under "### Response:", and a closing "### End" marker.
    """
    labels = get_research_objects(item['classes'])
    completion = "".join(f"{label}\n" for label in labels)

    # Static instruction text interleaved with the per-paper fields.
    sections = [
        "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
        f"### Instruction:\nCategories the paper into the following research objects:\n{research_objects_all}\n\n",
        "### Paper:\n",
        f"Title:\n{item['title']}\n",
        f"Abstract:\n{item['abstract']}\n\n",
        f"### Response:\n{completion}\n",
        "### End",
    ]
    return "".join(sections)


def formatting_prompts_func(example):
    """Build one training prompt per row of a batched dataset slice.

    The prompt text itself comes from ``format_prompts`` so that the template
    exists in exactly one place and the preview and the training data cannot
    drift apart.

    Args:
        example: Batch in column layout: 'classes', 'title' and 'abstract'
            each map to a sequence with one entry per row.

    Returns:
        A list of prompt strings, one per row, in row order.
    """
    rows = zip(example['classes'], example['title'], example['abstract'])
    return [
        format_prompts({'classes': classes, 'title': title, 'abstract': abstract})
        for classes, title, abstract in rows
    ]


# Marker separating the prompt from the completion. It has to match the
# "### Response:" header that the prompt-formatting functions emit; otherwise
# the collator cannot locate the completion and no token contributes to the loss.
response_template = "### Response:"

# Llama's SentencePiece tokenizer encodes "###" differently at the start of a
# string than after a newline. In the prompts the marker always follows a
# newline, so encode it in that context and drop the leading tokens that belong
# to the newline (assumes the Llama tokenizer emits two such tokens -- as in the
# TRL documentation example; re-check when switching tokenizers).
response_template_ids = tokenizer.encode(
    "\n" + response_template, add_special_tokens=False
)[2:]
collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)

In [None]:
# Sanity check: show the first raw training row and the prompt built from it.
first_example = train_dataset[0]
print(first_example)
test = format_prompts(first_example)
print(test)

In [None]:
# Hyper-parameters for the fine-tuning run, grouped by the component they
# configure. This cell only defines values; later cells pass them on.

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored;
# the random run name keeps separate runs apart
model_dir = "models/llama-2-7b-" + model_name

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 mixed-precision training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 2

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs); -1 presumably leaves
# num_train_epochs in control -- confirm against the TrainingArguments docs
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 0

# Log every X update steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

In [None]:
# Show where checkpoints and the final model of this run will be written.
print("Model directory:", model_dir)

In [None]:
from transformers import TrainingArguments

# Bundle the hyper-parameters from the configuration cell for the trainer.
# The evaluation batch size and the gradient-checkpointing switch are passed
# explicitly; without them the values chosen in the configuration cell would
# be silently ignored in favour of the library defaults.
training_arguments = TrainingArguments(
    output_dir=model_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
)

In [None]:
# Everything the supervised fine-tuning trainer needs besides the model itself.
sft_kwargs = dict(
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Turns each batch of rows into full prompt strings.
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    tokenizer=tokenizer,
    packing=False,
    args=training_arguments,
    max_seq_length=4096,
    dataset_batch_size=1,
)
trainer = SFTTrainer(model, **sft_kwargs)

# Fine-tune, then store the resulting model in the run's model directory.
trainer.train()
trainer.save_model(model_dir)

In [None]:
# Evaluate the trained model and dump the returned metrics, one "key = value"
# line per metric, into the run's report file.
metrics = trainer.evaluate()
metrics_path = os.path.join(reports_dir, f"metrics_{model_name}.txt")
with open(metrics_path, "w") as f:
    f.writelines(f"{key} = {value}\n" for key, value in metrics.items())