In [1]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets
!pip install -q trl
!pip install -q evaluate rouge_score
!pip install -q torch
!pip install -q tensorboard

In [2]:
import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer
import evaluate

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# The model that you want to train from the Hugging Face hub
model_name = "meta-llama/Llama-2-7b-chat-hf"

# The instruction dataset to use
dataset_name = "amaydle/npc-dialogue"

# Fine-tuned model name
new_model = "llama-2-7b-npc"

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 25

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [15]:
dataset = load_dataset(dataset_name, split = "train")

In [17]:
dataset

Dataset({
    features: ['Name', 'Biography', 'Query', 'Response', 'Emotion'],
    num_rows: 1723
})

In [7]:
prefix = 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n'
delimeter = '###'
instruction = ' Instruction:\n'
input = ' Input:\n'
name = 'Name: '
bio = ', Biography: '
response = ' Response: '

In [19]:
temp_list = [prefix + delimeter + instruction + q + '\n\n' + delimeter for q in dataset['Query']]
temp_list_2 = [input + name + n + bio + b + '\n\n' + delimeter for (n, b) in zip(dataset['Name'], dataset['Biography'])]
temp_list_3 = [response + r for r in dataset['Response']]

In [20]:
text = [x + y + z for x, y, z in zip(temp_list, temp_list_2, temp_list_3)]

In [21]:
dataset = dataset.add_column('text', text)

In [22]:
dataset['text'][0]

'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is your opinion on friendship?\n\n### Input:\nName: Bikram, Biography: Bikram is a rough and tough smuggler from the streets of Calcutta, India.\n\n### Response: Friendship is a bond stronger than blood.'

In [5]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# # Load LoRA configuration
# peft_config = LoraConfig(
#     lora_alpha=lora_alpha,
#     lora_dropout=lora_dropout,
#     r=lora_r,
#     bias="none",
#     task_type="CAUSAL_LM",
# )

# # Set training parameters
# training_arguments = TrainingArguments(
#     output_dir=output_dir,
#     num_train_epochs=num_train_epochs,
#     per_device_train_batch_size=per_device_train_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     optim=optim,
#     save_steps=save_steps,
#     logging_steps=logging_steps,
#     learning_rate=learning_rate,
#     weight_decay=weight_decay,
#     fp16=fp16,
#     bf16=bf16,
#     max_grad_norm=max_grad_norm,
#     max_steps=max_steps,
#     warmup_ratio=warmup_ratio,
#     group_by_length=group_by_length,
#     lr_scheduler_type=lr_scheduler_type,
#     report_to="tensorboard"
# )

# # Set supervised fine-tuning parameters
# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,
#     peft_config=peft_config,
#     dataset_text_field="text",
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )

# # Train model
# trainer.train()

# # Save trained model
# trainer.model.save_pretrained(new_model)

Your GPU supports bfloat16: accelerate training with bf16=True


Downloading (…)lve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [6]:
test_data = load_dataset(dataset_name, split = "test")

Downloading readme:   0%|          | 0.00/579 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/166k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1723 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/192 [00:00<?, ? examples/s]

In [8]:
test_list = [prefix + delimeter + instruction + q + '\n\n' + delimeter for q in test_data['Query']]
test_list_2 = [input + name + n + bio + b for (n, b) in zip(test_data['Name'], test_data['Biography'])]

In [9]:
text_2 = [x + y for x, y in zip(test_list, test_list_2)]

In [10]:
test_data = test_data.add_column('text', text_2)

In [11]:
print(test_data['text'][0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is the biggest challenge you face as a teacher?

### Input:
Name: Naina Mathur, Biography: Naina Mathur is a determined and passionate teacher who has a stutter.


In [14]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
# prompt = "Instruction:\nWhere is the magic key?\n\n ### Input:\n Name: Bikram, Biography: Bikram is a rough and tough smuggler from the streets of Calcutta, India."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=250)
responses = pipe(test_data['text'])

In [23]:
responses[0]

[{'generated_text': 'Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat is the biggest challenge you face as a teacher?\n\n### Input:\nName: Naina Mathur, Biography: Naina Mathur is a determined and passionate teacher who has a stutter. She is determined to overcome her stutter and become a more confident and effective teacher.\n\n### Response:\nNaina, I think the biggest challenge you face as a teacher is your stutter. It can be difficult to communicate effectively with your students, especially when you are trying to convey complex ideas or answer questions in class. However, I want to encourage you that there are many strategies and techniques that can help you manage your stutter and communicate more confidently. Have you considered seeking out speech therapy or using techniques like slowing down your speech or using filler words to help you find your

In [43]:
responses[2][0]['generated_text'].split('### Instruction:\n')[1].split('\n\n### Input')[0]

'Can you describe yourself in three words?'

In [44]:
output = {'instruction': [], 'responses': []}
for x in responses:
    i = x[0]['generated_text'].split('### Instruction:\n')[1].split('\n\n### Input')[0]
    r = x[0]['generated_text'].split('\n\n')[-1]

    output['instruction'].append(i)
    output['responses'].append(r)

In [26]:
import pandas as pd

output = pd.DataFrame(data = output)

# output.to_csv('Question_Response_Output.csv')

In [45]:
# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [46]:
model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

pytorch_model-00001-of-00002.bin:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

pytorch_model-00002-of-00002.bin:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/davezack/llama-2-7b-npc/commit/f29e7c6f5acdd14b4d8108910c35cd4173442705', commit_message='Upload tokenizer', commit_description='', oid='f29e7c6f5acdd14b4d8108910c35cd4173442705', pr_url=None, pr_revision=None, pr_num=None)

In [46]:
rouge = evaluate.load('rouge')
predictions = [x[0] for x in output['responses']]
references = test_data['Response']
results = rouge.compute(predictions=predictions,
                      references=references)

In [47]:
results

{'rouge1': 0.002989130434782609,
 'rouge2': 0.0,
 'rougeL': 0.0033375139353400223,
 'rougeLsum': 0.0031249999999999997}

In [6]:
!pip install -q torchinfo

In [7]:
import torchinfo

In [8]:
torchinfo.summary(model)

Layer (type:depth-idx)                                  Param #
LlamaForCausalLM                                        --
├─LlamaModel: 1-1                                       --
│    └─Embedding: 2-1                                   131,072,000
│    └─ModuleList: 2-2                                  --
│    │    └─LlamaDecoderLayer: 3-1                      101,195,776
│    │    └─LlamaDecoderLayer: 3-2                      101,195,776
│    │    └─LlamaDecoderLayer: 3-3                      101,195,776
│    │    └─LlamaDecoderLayer: 3-4                      101,195,776
│    │    └─LlamaDecoderLayer: 3-5                      101,195,776
│    │    └─LlamaDecoderLayer: 3-6                      101,195,776
│    │    └─LlamaDecoderLayer: 3-7                      101,195,776
│    │    └─LlamaDecoderLayer: 3-8                      101,195,776
│    │    └─LlamaDecoderLayer: 3-9                      101,195,776
│    │    └─LlamaDecoderLayer: 3-10                     101,195,776
│    │    └

In [9]:
del model