The following notebook is based on:
https://github.com/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2024-with-trl.ipynb


In [1]:
# # Install Pytorch & other libraries
# !pip install "torch==2.1.2" tensorboard
# 
# # Install Hugging Face libraries
# !pip install  --upgrade \
# "transformers==4.36.2" \
# "datasets==2.16.1" \
# "accelerate==0.26.1" \
# "evaluate==0.4.1" \
# "bitsandbytes==0.42.0" \
#     # "trl==0.7.10" # \
# # "peft==0.7.1" \
# 
# # install peft & trl from github
# !pip install git+https://github.com/huggingface/trl@a3c5b7178ac4f65569975efadc97db2f3749c65e --upgrade
# !pip install git+https://github.com/huggingface/peft@4a1559582281fc3c9283892caea8ccef1d6f5a4f --upgrade

In [2]:
# !pip install python-dotenv
# !pip install wandb
# !pip install ipywidgets

In [4]:
import os

# Expose only GPU index 3 to this process (machine-specific; change as needed).
os.environ['CUDA_VISIBLE_DEVICES'] = '3'  # Must be before importing torch.

In [4]:
import torch

# Stop early unless the visible CUDA device reports a compute-capability major version >= 8;
# the assertion message ties this requirement to Flash Attention support.
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

# # install flash-attn
# !pip install ninja packaging
# !MAX_JOBS=4 pip install flash-attn --no-build-isolation

In [1]:
import pandas as pd
from os.path import join as pj

from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig, PeftConfig, AutoPeftModelForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import setup_chat_format
from datasets import load_dataset, Dataset

from dotenv import load_dotenv
from pathlib import Path

from typing import Dict, List, Tuple

from huggingface_hub import login as hf_login
import wandb
from transformers.models.auto.tokenization_auto import PreTrainedTokenizerFast
from data_datasets.AlexanderStreet import alexander_street

from tqdm.notebook import tqdm

import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables (HF_TOKEN and WANDB_TOKEN are read in the following cells) from ../.env.
load_dotenv(Path('../.env'))

True

In [7]:
# Authenticate with the Hugging Face Hub using the token stored in the HF_TOKEN environment variable.
hf_login(token=os.getenv('HF_TOKEN'))

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/stavyo/.cache/huggingface/token
Login successful


In [8]:
# Log in to Weights & Biases using the key stored in the WANDB_TOKEN environment variable.
wandb.login(key=os.getenv('WANDB_TOKEN'))

[34m[1mwandb[0m: Currently logged in as: [33mstav-dev95[0m ([33mstav_nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/stavyo/.netrc


True

In [9]:
# noinspection SpellCheckingInspection
def load_miti(file_path=None) -> Dict[str, List[Tuple[bool, str]]]:
    """Load the MITI transcript CSV and return each conversation as a list of speaker turns.

    The CSV needs a ``Filename`` and an ``Only Text`` column. Only rows whose filename contains
    the literal substring ``.txt`` are used, and only the first row of each filename.
    Every line of ``Only Text`` is expected to carry a ``T:`` (therapist) or ``C:`` (patient)
    label. Timecodes such as ``(at 0:12:30)`` and tabs are replaced by a single space, blank
    lines are dropped, and consecutive lines of the same speaker are merged into one turn.

    :param file_path: path of the CSV to read (str or path-like). ``None`` (the default) falls
        back to the module-level ``MITI_FILE_PATH``.
    :return: ``{filename: [(is_therapist, text), ...]}``.
    :raises AssertionError: if a non-empty line has neither a therapist nor a patient label.
    """
    df = pd.read_csv(MITI_FILE_PATH if file_path is None else file_path)
    # regex=False: '.txt' must be matched literally ('.' is a wildcard in a regex).
    # na=False: rows with a missing filename are filtered out instead of breaking the boolean mask.
    df = df[df['Filename'].str.contains('.txt', regex=False, na=False)]

    timecode_pattern = re.compile(r"\[?\(?(inaudible )?(at )?\(?[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\)?\]?")
    spaces_pattern = re.compile(r" +")

    def fix_and_clean(line: str) -> str:
        """Replace speaker labels with role tags, then remove timecodes, tabs and repeated spaces."""
        # NOTE(review): the labels are replaced wherever they occur in the line, not only at its
        # start -- unchanged from the previous behaviour; confirm this suits the transcripts.
        line = line.replace('T:', '[THERAPIST]').replace('C:', '[PATIENT]').strip()
        line = timecode_pattern.sub(' ', line)
        line = line.replace('\t', ' ')
        # Final strip: removing a leading timecode leaves a space behind that would otherwise
        # defeat the role-prefix comparison used for merging below.
        return spaces_pattern.sub(' ', line).strip()

    data = {}
    for row in df[['Filename', 'Only Text']].to_dict('records'):
        filename = row['Filename']
        if filename in data:
            continue  # only the first row of each file is used

        cleaned_lines = (fix_and_clean(line) for line in row['Only Text'].split('\n'))
        # Blank lines (e.g. a trailing newline or a timecode-only line) carry no utterance.
        conv = [line for line in cleaned_lines if line]

        # Walk backwards so that popping an element never shifts the indices still to be visited.
        for i in range(len(conv) - 1, 0, -1):
            # If the first 9 characters are the same, then it's a continuation of the previous line.
            # 9 == len('[PATIENT]'), the shorter of the two role tags.
            if conv[i][:9] == conv[i - 1][:9]:
                txt = conv[i].replace('[THERAPIST]', '').replace('[PATIENT]', '').strip()
                conv[i - 1] = conv[i - 1].strip() + ' ' + txt
                conv.pop(i)

        # Convert conversation to tuples, where the first element is the role and the second is the text
        conv_tup = []
        for line in conv:
            if '[THERAPIST]' in line:
                conv_tup.append((True, line.replace('[THERAPIST]', '').strip()))
            elif '[PATIENT]' in line:
                conv_tup.append((False, line.replace('[PATIENT]', '').strip()))
            else:
                # Explicit raise instead of `assert False`, which is removed under `python -O`.
                raise AssertionError(f'Unknown role in {line}')

        data[filename] = conv_tup

    return data


def load_alexander_street() -> Dict[str, List[Tuple[bool, str]]]:
    """Load the Alexander Street dataset and return each conversation as a list of speaker turns.

    The conversations of the ``volume_ctrn`` and ``volume_psyc`` volumes are concatenated and
    keyed by their running index (as a string). An utterance containing ``Therapist:`` becomes
    ``(True, text)``, one containing ``Patient:`` becomes ``(False, text)``; the label is removed
    from the text.

    :return: ``{"0": [(is_therapist, text), ...], "1": [...], ...}``.
    :raises AssertionError: if an utterance carries neither label.
    """
    as_dataset = alexander_street.AlexanderStreetDataset.load_dataset(json_file=ALEXANDER_STREET_FILE_PATH)
    conversations = as_dataset.volume_ctrn.conversations + as_dataset.volume_psyc.conversations

    data = {}
    for k, conv in enumerate(conversations):
        conv_tup = []
        for utterance in conv.utterances:
            if 'Therapist:' in utterance:
                conv_tup.append((True, utterance.replace('Therapist:', '').strip()))
            elif 'Patient:' in utterance:
                conv_tup.append((False, utterance.replace('Patient:', '').strip()))
            else:
                # Explicit raise instead of `assert False`: assertions are removed under
                # `python -O`, which would silently drop malformed utterances.
                raise AssertionError(f'Unknown role in {utterance}')

        data[str(k)] = conv_tup
    return data


def create_prompts(data: List[List[Tuple[bool, str]]], system_message: str, utterances_count: int,
                   tokenizer: PreTrainedTokenizerFast,
                   max_prompt_length: int) -> List[Dict[str, List[Dict[str, str]]]]:
    """Build chat prompts from sliding windows over each conversation.

    Utterances are buffered per conversation. Whenever a therapist utterance arrives and at least
    ``utterances_count`` utterances are buffered, the newest ``utterances_count`` of them form one
    prompt: a system message followed by the window, with therapist turns as ``assistant`` and
    patient turns as ``user``. The prompt is kept only if the chat template tokenises it into at
    most ``max_prompt_length`` tokens. Windows the chat template rejects are printed and skipped.

    :param data: conversations, each an iterable of ``(is_therapist, text)`` tuples.
    :param system_message: content of the leading system message of every prompt.
    :param utterances_count: window size; must be a positive even number.
    :param tokenizer: tokenizer whose ``apply_chat_template`` is used to measure prompt length.
    :param max_prompt_length: maximum number of tokens a kept prompt may have.
    :return: list of ``{"messages": [{"role": ..., "content": ...}, ...]}`` records.
    :raises AssertionError: if ``utterances_count`` is not a positive even number.
    """
    assert utterances_count % 2 == 0, 'utterances_count must be even'
    # 0 is even but would produce system-only prompts and then pop from an empty window.
    assert utterances_count > 0, 'utterances_count must be positive'

    prompts = []
    for conversation in tqdm(data):
        queue = []
        for utt in conversation:
            queue.append(utt)

            # A prompt must end with a therapist (assistant) turn and needs a full window.
            if len(queue) < utterances_count or not queue[-1][0]:
                continue

            # Keep only the newest `utterances_count` utterances.
            del queue[:-utterances_count]

            messages = [{"role": "system", "content": system_message}]
            messages += [{"role": 'assistant' if item[0] else "user", "content": item[1]}
                         for item in queue]

            # noinspection PyBroadException
            try:
                tokens = tokenizer.apply_chat_template(messages, tokenize=True)
            except Exception:  # not a bare `except:`, so KeyboardInterrupt/SystemExit still propagate
                print(messages)
                continue

            if len(tokens) <= max_prompt_length:
                prompts.append({"messages": messages})

            # Drop the oldest utterance so the next window starts later in the conversation.
            queue.pop(0)

    return prompts


def create_dataset(train_dataset_size: int, prompts: List[dict],
                   train_json_path: str, test_json_path: str):
    """Shuffle the prompts, split them into train/test sets and write both as JSON records.

    The split uses absolute sample counts: exactly ``train_dataset_size`` prompts go to the train
    set and the remaining ``len(prompts) - train_dataset_size`` to the test set. (Passing a float
    fraction instead is subject to rounding and can yield a train set one sample short.)

    :param train_dataset_size: number of prompts in the train set; must satisfy
        ``0 < train_dataset_size < len(prompts)``.
    :param prompts: prompt records to split.
    :param train_json_path: output path of the train split.
    :param test_json_path: output path of the test split.
    :raises ValueError: if ``train_dataset_size`` leaves either split empty.
    """
    test_dataset_size = len(prompts) - train_dataset_size
    if train_dataset_size <= 0 or test_dataset_size <= 0:
        raise ValueError(
            f'train_dataset_size must satisfy 0 < train_dataset_size < len(prompts); '
            f'got {train_dataset_size} with {len(prompts)} prompts'
        )

    dataset = Dataset.from_list(prompts)
    dataset = dataset.shuffle(seed=42)
    # Integer sizes are interpreted as absolute sample counts, so the split is exact.
    dataset = dataset.train_test_split(train_size=train_dataset_size, test_size=test_dataset_size, seed=42)

    dataset["train"].to_json(train_json_path, orient="records")
    dataset["test"].to_json(test_json_path, orient="records")

In [10]:
# Run configuration: model, dataset selection and all derived paths.

# Embedded in the dataset file names and the output directory below;
# presumably also the utterances_count passed to create_prompts (the call is not shown here).
UTTERANCES_COUNT = 6
# Short name -> Hugging Face Hub repository id.
MODEL_IDS = {
    'Llama-2-7b-chat-hf': 'meta-llama/Llama-2-7b-chat-hf',
    'Llama-2-13b-chat-hf': 'meta-llama/Llama-2-13b-chat-hf'
}
MODEL_ID = MODEL_IDS['Llama-2-13b-chat-hf']
# Must be one of the DATASETS_PATHS keys; also used as a sub-directory of the output path.
TRAINING_FOLDER = 'miti'
# wandb.init(name=f'{TRAINING_FOLDER} - {MODEL_ID}')
FOLDER_FINETUNING = 'finetuning'
# '/' in the hub id is replaced by '-' so the id becomes a single directory name.
OUTPUT_DIR = pj(FOLDER_FINETUNING, TRAINING_FOLDER, f'{MODEL_ID.replace("/", "-")}-{UTTERANCES_COUNT}')

# Train/test prompt files (JSON) for every dataset variant, keyed by dataset name.
DATASETS_PATHS = {
    'miti': {
        'train': pj(FOLDER_FINETUNING, f'prompts_miti_{UTTERANCES_COUNT}_train.json'),
        'test': pj(FOLDER_FINETUNING, f'prompts_miti_{UTTERANCES_COUNT}_test.json')
    },
    'alexander_street_small': {
        'train': pj(FOLDER_FINETUNING, f'prompts_alexander_{UTTERANCES_COUNT}_train_small.json'),
        'test': pj(FOLDER_FINETUNING, f'prompts_alexander_{UTTERANCES_COUNT}_test_small.json')
    },
    'alexander_street_large': {
        'train': pj(FOLDER_FINETUNING, f'prompts_alexander_{UTTERANCES_COUNT}_train_large.json'),
        'test': pj(FOLDER_FINETUNING, f'prompts_alexander_{UTTERANCES_COUNT}_test_large.json')
    },
    'miti_alexander_street': {
        'train': pj(FOLDER_FINETUNING, f'prompts_miti_alexander_{UTTERANCES_COUNT}_train.json'),
        'test': pj(FOLDER_FINETUNING, f'prompts_miti_alexander_{UTTERANCES_COUNT}_test.json')
    }
}

# Files of the dataset selected by TRAINING_FOLDER.
TRAIN_DATASET_PATH = DATASETS_PATHS[TRAINING_FOLDER]['train']
TEST_DATASET_PATH = DATASETS_PATHS[TRAINING_FOLDER]['test']

# Raw inputs read by load_miti() and load_alexander_street().
MITI_FILE_PATH = pj('data_datasets', 'MITI', 'dataset', 'global_mitis.csv')
ALEXANDER_STREET_FILE_PATH = pj('dataset', 'alexander_street_dataset.json')

# System prompt; the sample records displayed further down start with this text as their system message.
SYSTEM_MESSAGE = 'You are a motivational interviewing counselor. ' \
                 'You partner with the patient to understand his problems. ' \
                 'You are empathetic towards him and help the patient ' \
                 'explore their ambivalence regarding behavioral change. ' \
                 'You are non-judgmental while encouraging the patient to change'

# Refuse to continue when the output directory already exists
# (presumably to avoid overwriting the results of an earlier run).
assert not os.path.exists(OUTPUT_DIR), f'Output directory {OUTPUT_DIR} already exists'

AssertionError: Output directory finetuning/miti/meta-llama-Llama-2-13b-chat-hf-6 already exists

In [None]:
# Tokenizer of the model being fine-tuned; passed to setup_chat_format and SFTTrainer further down.
_tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
_tokenizer.padding_side = 'right'  # to prevent warnings

In [11]:
# Model/tokenizer pair used by the token-statistics cells below.
# NOTE(review): the 7B checkpoint is loaded on CPU regardless of MODEL_ID, while the tokenizer
# comes from MODEL_ID. The visible cells that follow only use TOKENIZER -- confirm the model
# load is needed for anything other than setup_chat_format.
MODEL = AutoModelForCausalLM.from_pretrained(
    'meta-llama/Llama-2-7b-chat-hf',
    device_map="cpu",
    torch_dtype=torch.float16
)
# NOTE(review): this overwrites the OUTPUT_DIR built in the configuration cell with the hub
# model id; TrainingArguments(output_dir=OUTPUT_DIR) further down will see this value.
# Confirm this is intended.
OUTPUT_DIR = MODEL_ID

TOKENIZER = AutoTokenizer.from_pretrained(MODEL_ID)
# Apply TRL's chat format; the returned (model, tokenizer) pair replaces the originals.
MODEL, TOKENIZER = setup_chat_format(model=MODEL, tokenizer=TOKENIZER)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [20]:
# For every configured dataset, tokenise each training prompt with the chat template
# and store the list of token counts next to the dataset's paths under 'prompts_length'.
for _k in DATASETS_PATHS.keys():
    _train_dataset = load_dataset('json', data_files=DATASETS_PATHS[_k]['train'], split="train")
    _arr = []  # token count of every prompt in this dataset

    # Iterate all the training dataset
    for _idx in tqdm(range(len(_train_dataset))):
        _prompt = TOKENIZER.apply_chat_template(conversation=_train_dataset[_idx]["messages"],
                                                tokenize=True,
                                                add_generation_prompt=True)

        _arr.append(len(_prompt))

    # Adds a new key to the inner dict only, so iterating DATASETS_PATHS.keys() stays valid.
    DATASETS_PATHS[_k]['prompts_length'] = _arr

  0%|          | 0/5999 [00:00<?, ?it/s]

  0%|          | 0/6000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [21]:
# Per-dataset token statistics: number of prompts, mean tokens per prompt
# and total token count (reported in millions).
for _k in DATASETS_PATHS:
    _arr = DATASETS_PATHS[_k]['prompts_length']
    _prompts_count = len(_arr)
    _sum = sum(_arr)
    _avg = _sum / _prompts_count

    print(
        f'{_k}:\n'
        f'\tPrompts count - {_prompts_count}\n'
        f'\tAverage tokens per prompt - {_avg}\n'
        f'\tTotal tokens in dataset - {_sum / 10 ** 6} (Millions)\n'
    )

miti:
	Prompts count - 5999
	Average tokens per prompt - 290.72395399233204
	Total tokens in dataset - 1.744053 (Millions)

alexander_street_small:
	Prompts count - 6000
	Average tokens per prompt - 323.452
	Total tokens in dataset - 1.940712 (Millions)

alexander_street_large:
	Prompts count - 20000
	Average tokens per prompt - 323.29435
	Total tokens in dataset - 6.465887 (Millions)

miti_alexander_street:
	Prompts count - 10000
	Average tokens per prompt - 308.3635
	Total tokens in dataset - 3.083635 (Millions)


In [22]:
# Load the training prompts of the dataset selected by TRAINING_FOLDER and display a summary.
_train_dataset = load_dataset('json', data_files=TRAIN_DATASET_PATH, split="train")
_train_dataset

Dataset({
    features: ['messages'],
    num_rows: 5999
})

In [23]:
# Load the held-out test prompts and display a summary.
# split="train" is used here as well -- presumably because the json loader exposes a single
# data file under the "train" split (TODO confirm).
_test_dataset = load_dataset('json', data_files=TEST_DATASET_PATH, split="train")
_test_dataset

Dataset({
    features: ['messages'],
    num_rows: 196
})

In [24]:
# Inspect the first training record.
_train_dataset[0]

{'messages': [{'content': 'You are a motivational interviewing counselor. You partner with the patient to understand his problems. You are empathetic towards him and help the patient explore their ambivalence regarding behavioral change. You are non-judgmental while encouraging the patient to change',
   'role': 'system'},
  {'content': 'Um hmm.', 'role': 'user'},
  {'content': "And so when you switch to this pattern I'm suggesting that maybe you pick two or three days and you just do breakfast meals for dinner... not only are breakfast meals usually pretty easily digested, not too heavy, but they are typically mid-range moderate.",
   'role': 'assistant'},
  {'content': 'Um hmm.', 'role': 'user'},
  {'content': 'If you notice your anxiety go up, get the pen and paper out.',
   'role': 'assistant'},
  {'content': 'Ok.', 'role': 'user'},
  {'content': "And really try to give a voice to that anxiety. Try to see, if it's one of the inner younger kids that are starting to talk but try to j

In [25]:
# Inspect the second test record.
_test_dataset[1]

{'messages': [{'content': 'You are a motivational interviewing counselor. You partner with the patient to understand his problems. You are empathetic towards him and help the patient explore their ambivalence regarding behavioral change. You are non-judgmental while encouraging the patient to change',
   'role': 'system'},
  {'content': "Yeah, cause then they smile, and they laugh and then they're happy and that's good.",
   'role': 'user'},
  {'content': 'Yeah.', 'role': 'assistant'},
  {'content': "Because then they're not mad at me", 'role': 'user'},
  {'content': 'So, in life in general and maybe with your friends and with your family, being able to do things that make people happy is very important. Yeah. And your friends. So, you love to play games with them and hang out.',
   'role': 'assistant'},
  {'content': "Oh, we worked together too but they don't work there either Now, I mean all of us had to leave and so I mean We don't get to see each other like we used to, and it's lon

In [26]:
# Render one training conversation through the chat template as plain text (no generation prompt).
# The replace() calls put two backslashes at the start of every new line and escape underscores --
# presumably so the output can be pasted into LaTeX/Markdown (TODO confirm).
print(TOKENIZER.apply_chat_template(conversation=_train_dataset[3]['messages'],
                                                tokenize=False,
                                                add_generation_prompt=False).replace('\n', '\n\\\\').replace('_','\\_'))

<|im\_start|>system
\\You are a motivational interviewing counselor. You partner with the patient to understand his problems. You are empathetic towards him and help the patient explore their ambivalence regarding behavioral change. You are non-judgmental while encouraging the patient to change<|im\_end|>
\\<|im\_start|>user
\\I would say maybe a seven<|im\_end|>
\\<|im\_start|>assistant
\\that's surprising, it's not a one or two, but it's a seven you say it's very good. Why would you say it's not a lower number?<|im\_end|>
\\<|im\_start|>user
\\Well, I don't want her you know, I don't want to see her struggling with this and maybe me being a drinker around her could potentially influence her I want her to be smart and want to be that role model for her you know<|im\_end|>
\\<|im\_start|>assistant
\\I understand. what would you say needs what would you say needs to happen to make it a nine for you, instead of a seven for you to really want to go<|im\_end|>
\\<|im\_start|>user
\\If all 

In [None]:
# BitsAndBytesConfig int-4 config: 4-bit NF4 quantization with double quantization,
# computing in bfloat16.
_bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# LoRA config based on QLoRA paper & Sebastian Raschka experiment
_peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias="none",
    target_modules="all-linear",
    task_type="CAUSAL_LM"
)

_training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,  # directory where checkpoints and the final model are written
    num_train_epochs=2,  # number of training epochs
    per_device_train_batch_size=3,  # batch size per device during training
    gradient_accumulation_steps=2,  # accumulate gradients over 2 batches before each optimizer update
    gradient_checkpointing=True,  # use gradient checkpointing to save memory
    optim="adamw_torch_fused",  # use fused adamw optimizer
    logging_steps=1,  # log every step
    save_strategy="epoch",  # save checkpoint every epoch
    learning_rate=2e-4,  # learning rate, based on QLoRA paper
    bf16=True,  # use bfloat16 precision
    tf32=True,  # use tf32 precision
    max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant",  # use constant learning rate scheduler
    push_to_hub=False,  # do not push the model to the hub
    report_to=["wandb", "tensorboard"],  # report metrics to W&B and TensorBoard
)


In [None]:
# Load the base model for training: quantized with _bnb_config, bfloat16 dtype,
# Flash Attention 2, and device placement chosen automatically.
_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map='auto',
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
    quantization_config=_bnb_config
)

# Set the chat template to OAI ChatML; remove this if you start from an already fine-tuned model.
# Uses the _tokenizer created in the tokenizer cell above and rebinds both names to the returned pair.
_model, _tokenizer = setup_chat_format(model=_model, tokenizer=_tokenizer)

In [None]:
_max_seq_length = 3072  # max sequence length for model and packing of the dataset

# Supervised fine-tuning trainer: wraps the quantized model with the LoRA config and trains on
# the chat-formatted training set, with packing enabled and sequences capped at _max_seq_length.
_trainer = SFTTrainer(
    model=_model,
    args=_training_args,
    train_dataset=_train_dataset,
    peft_config=_peft_config,
    max_seq_length=_max_seq_length,
    tokenizer=_tokenizer,
    packing=True,
    dataset_kwargs={
        "add_special_tokens": False,  # We template with special tokens
        "append_concat_token": False,  # No need to add additional separator token
    }
)

In [None]:
# Start training. With save_strategy="epoch" in the training arguments,
# a checkpoint is written to the output directory after every epoch.
_trainer.train()

In [None]:
# Save the trained model via the trainer (no explicit path is given here).
_trainer.save_model()

In [None]:
# Free the memory again: drop the references to the training model and trainer,
# then release the cached CUDA memory.
del _model
del _trainer
torch.cuda.empty_cache()

In [None]:
# Load PEFT model on CPU.
# AutoPeftModelForCausalLM loads the base model referenced by the adapter config in output_dir
# itself and applies the adapter to it. The previous separate
# AutoModelForCausalLM.from_pretrained(...) + resize_token_embeddings(...) result was rebound
# before it was ever used, so it only cost load time and peak RAM and has been removed.
_config = PeftConfig.from_pretrained(_training_args.output_dir)
_tokenizer = AutoTokenizer.from_pretrained(_training_args.output_dir)
_model = AutoPeftModelForCausalLM.from_pretrained(
    _training_args.output_dir,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
)

# Merge LoRA and base model and save the merged weights (safetensors, shards of at most 2GB)
# into the same output directory.
_merged_model = _model.merge_and_unload()
_merged_model.save_pretrained(_training_args.output_dir, safe_serialization=True, max_shard_size="2GB")