In [1]:
# check if the GPU is detected

import torch
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce RTX 3070


In [2]:
# download TinyLlama 1.1B

from huggingface_hub import hf_hub_download
hf_hub_download(repo_id="TinyLlama/TinyLlama-1.1B-Chat-v1.0", filename="config.json")

'/home/reggie/.cache/huggingface/hub/models--TinyLlama--TinyLlama-1.1B-Chat-v1.0/snapshots/fe8a4ea1ffedaf415f4da2f062534de366a451e6/config.json'

### Load and Format Data

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

raw_data = pd.read_csv('swim_dataset.csv')
raw_data

# split to train and eval
train_data, eval_data = train_test_split(raw_data, test_size=0.1, random_state=42)
len(train_data), len(eval_data)

(675, 76)

In [4]:
# define data format
def preprocess(example):
    return {
        "text": f"<|system|> You are Swim Instructor helping athletes <|user|> {example['Question']} <|assistant|> {example['Answer']}"
    }

In [5]:
# load dataset
from datasets import Dataset

# load dataset/preprocess
train_dataset = Dataset.from_pandas(train_data)
eval_dataset = Dataset.from_pandas(eval_data)
train_data = train_dataset.map(preprocess)
eval_data = eval_dataset.map(preprocess)

Map:   0%|          | 0/675 [00:00<?, ? examples/s]

Map:   0%|          | 0/76 [00:00<?, ? examples/s]

In [6]:
train_data['text'][0]

"<|system|> You are Swim Instructor helping athletes <|user|> What is the best time to do dryland training in relation to swim practice? <|assistant|> It's often best to do dryland workouts after swimming or on separate days to avoid fatigue affecting swim performance."

In [7]:
import wandb

wandb.init(
    project="swim_coach_llama",
    name="run2-10epochs",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mreggienwade[0m ([33mreggienwade-boise-state-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


### Define Model and Generation Function

In [8]:
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

In [16]:
from transformers import AutoTokenizer, AutoProcessor, AutoModelForCausalLM, BitsAndBytesConfig, EarlyStoppingCallback

def get_model_and_tokenizer(model_id):
    tokenizer = AutoProcessor.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    # store the weights (parameters) of the model in 4-bit to save v-ram
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype="float16",
        bnb_4bit_use_double_quant=True
    )  
    model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")    
    model.config.use_cache = False
    # keep tensors on one device
    model.config.pretraining_tp = 1
    return model, tokenizer

model, tokenizer = get_model_and_tokenizer(model_id)

In [10]:
def generate(model, prompt, max_length=300, temperature=0.7, top_p=0.9, do_sample=True):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        temperature=temperature,
        top_p=top_p,
        do_sample=do_sample,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

### Evaluate Baseline using BLEU

In [11]:
from tqdm.notebook import tqdm
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
def calculateBLEU(eval_set, model):
    references = []
    hypotheses = []
    
    for row in tqdm(eval_data):
        curr_query = ('<|system|> You are Swim Instructor helping athletes <|user|> ' + row['Question'])
        reference = row['text'].split()
        hypothesis = generate(model, curr_query).split()

        # skip over if LLM failed to generate
        if len(hypothesis) < 9:
            continue
    
        references.append(reference)
        hypotheses.append(hypothesis)
    
    corpus_bleu_score = corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method1)
    return corpus_bleu_score

In [12]:
print(f"Corpus BLEU Score: {calculateBLEU(eval_data, model):.4f}")

  0%|          | 0/76 [00:00<?, ?it/s]

Corpus BLEU Score: 0.0001


### Evaluate Baseline using ROUGE

In [34]:
from rouge import Rouge

def calculateROUGE(eval_set, model):
    references = []
    hypotheses = []

    for row in tqdm(eval_data):
        curr_query = ('<|system|> You are Swim Instructor helping athletes <|user|> ' + row['Question'])
        reference = row['text']
        hypothesis = generate(model, curr_query)

        # skip over if LLM failed to generate
        if len(hypothesis) < 9:
            continue

        references.append(reference)
        hypotheses.append(hypothesis)
    rouge = Rouge()
    scores = rouge.get_scores(hypotheses, references, avg=True)
    return scores

In [36]:
print(f"Corpus ROUGE Score: {calculateROUGE(eval_data, model)}")

  0%|          | 0/76 [00:00<?, ?it/s]

Corpus ROUGE Score: {'rouge-1': {'r': 0.5666637803067068, 'p': 0.6384504535271867, 'f': 0.5271197903493715}, 'rouge-2': {'r': 0.4862746078980227, 'p': 0.5805493675324449, 'f': 0.4401218056640395}, 'rouge-l': {'r': 0.5647033568292554, 'p': 0.6376540773603455, 'f': 0.5259933085835715}}


### Test QA on Base Model

In [21]:
response = generate(model, "<|system|> You are Swim Instructor helping athletes <|user|> How can I improve my distance per stroke?", max_length=200)
print(response)

<|system|> You are Swim Instructor helping athletes <|user|> How can I improve my distance per stroke? Answer according to: Can you suggest some exercises to improve my distance per stroke? Answer according to: The longer you swim, the more you’ll improve your distance per stroke. The more distance you cover, the more you’ll build strength and endurance. But even if you’re not a swimmer, you can still improve your distance by doing the exercises below. First, you’ll want to make sure you’re in a good state of form before you start doing these exercises. If you’re not sure, take a break from swimming and do some plyometric exercises to get your heart rate up.
First, you’ll want to make sure you’re in a good state of form before you start doing these exercises. If you’re not


### Define LoRs Config and Training Regimine

In [11]:
from peft import LoraConfig, PeftModel

# LoRa config
peft_config = LoraConfig(
    r=64,    # size of low rank matricies
    lora_alpha=32,    # scaling factor
    lora_dropout=0.08,  # Regularization via dropout
    bias='none',    # don't edit the bias of original model
    task_type="CAUSAL_LM",    # tells Lora this is a generation model
)

In [12]:
from trl import SFTTrainer, SFTConfig

sft_config = SFTConfig(
    output_dir="swim_coach_llama_64",
    dataset_text_field="text",
    packing=False,  # Set to True if you want data packing
    max_seq_length=1024,    # max token length accepted, anything longer will be truncated
    per_device_train_batch_size=16,    # batch size (essentially 64 because of next line)
    gradient_accumulation_steps=4,    # accumulate gradients across 4 batches before back-prop
    optim="paged_adamw_32bit",    # define optimization function
    learning_rate=2e-4,    # learning rate
    lr_scheduler_type="cosine",    # learning rate scheduler
    save_strategy="epoch",   # defines where to save checkpoints
    save_steps=10,    # how often to save checkpoint
    logging_steps=1,     # how often to log info
    num_train_epochs=10,    # number of epochs
    max_steps=200,    # max number of training steps
    fp16=True,    # train on mixed precision (16bit floats)
    eval_strategy="steps",  # Evaluate regularly
    eval_steps=10,    # how often to evaluate
    save_total_limit=10,           # Keep only last N checkpoints
    metric_for_best_model="eval_loss",  # Use eval_loss to find the best model
    greater_is_better=False,      # Lower eval_loss is better
)
# Trainer with validation dataset
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=sft_config,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=10)],  
)

Converting train dataset to ChatML:   0%|          | 0/675 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/675 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/675 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/675 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/76 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/76 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/76 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/76 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [11]:
trainer.train()

Using EarlyStoppingCallback without load_best_model_at_end=True. Once training is finished, the best model will not be loaded automatically.


Step,Training Loss,Validation Loss
10,1.9215,1.814044
20,1.3682,1.352496
30,1.1564,1.233001
40,1.2296,1.188865
50,1.1753,1.156014
60,1.1388,1.131532
70,1.0838,1.114419
80,1.0649,1.098348
90,1.0406,1.086983
100,1.0543,1.078204


TrainOutput(global_step=200, training_loss=1.1353641149401665, metrics={'train_runtime': 781.6939, 'train_samples_per_second': 16.375, 'train_steps_per_second': 0.256, 'total_flos': 6124924109438976.0, 'train_loss': 1.1353641149401665})

In [22]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    torch_dtype=torch.float16, 
    load_in_8bit=False, 
    device_map="auto", 
    trust_remote_code=True,
    use_flash_attention_2=False,
)

#modify the folder according to which checkpoint has the best eval_score! (lower is better)
model_path = "./swim_coach_llama_64/checkpoint-198"

peft_model = PeftModel.from_pretrained(model, model_path, from_transformers=True, device_map="auto")
model = peft_model.merge_and_unload()

In [29]:
response = generate(model, "<|system|> You are Swim Instructor helping athletes <|user|> How can I develop an early vertical forearm?", max_length=350)
print(response)

<|system|> You are Swim Instructor helping athletes <|user|> How can I develop an early vertical forearm? <|assistant|> Start with a light kick, catch your breath, and gradually work your way up.


### Calculate BLEU Score post fine-tuning

In [62]:
print(f"Corpus BLEU Score: {calculateBLEU(eval_data, model):.4f}")

  0%|          | 0/76 [00:00<?, ?it/s]

Corpus BLEU Score: 0.0002


In [38]:
print(f"Corpus ROUGE Score: {calculateROUGE(eval_data, model)}")

  0%|          | 0/76 [00:00<?, ?it/s]

Corpus ROUGE Score: {'rouge-1': {'r': 0.6414817724953342, 'p': 0.6464909690526827, 'f': 0.6419120388185374}, 'rouge-2': {'r': 0.5414242616589967, 'p': 0.536292097271519, 'f': 0.5364554575505489}, 'rouge-l': {'r': 0.6285719485328878, 'p': 0.6338201443484348, 'f': 0.6291532247547832}}
