In [1]:
# !pip uninstall -y transformers bitsandbytes accelerate

In [2]:
# !pip install transformers==4.37.2 bitsandbytes==0.42.0 accelerate==0.27.2

In [3]:
# from huggingface_hub import notebook_login
# notebook_login()

In [4]:
# Change the working directory to the directory two levels above the one the
# kernel was started in, and make that directory importable.
import sys
import os

# Remember where the kernel started. Without this, every re-run of the cell
# would climb two more levels (os.getcwd() has already been changed) and
# append yet another entry to sys.path.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()

cwd = os.path.abspath(os.path.join(_NOTEBOOK_START_DIR, '../..'))
if cwd not in sys.path:
    sys.path.append(cwd)
os.chdir(cwd)

In [5]:
import transformers
import bitsandbytes
import accelerate

# Report the installed versions of the three libraries the notebook relies on,
# one per line, emitted with a single print call.
print(
    f"Transformers version: {transformers.__version__}",
    f"BitsAndBytes version: {bitsandbytes.__version__}",
    f"Accelerate version: {accelerate.__version__}",
    sep="\n",
)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'
Transformers version: 4.37.2
BitsAndBytes version: 0.42.0
Accelerate version: 0.27.2


In [6]:
import json

# Directory (relative to the current working directory) that holds the dataset files.
data_dir = 'llm_datasets/book_crossing'

# Load the JSON file.
# The encoding is given explicitly: JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding and could mis-decode
# (or fail on) non-ASCII book titles and author names.
with open(os.path.join(data_dir, "train.json"), 'r', encoding='utf-8') as f:
    training_data = json.load(f)

# Print a sample to verify the format
print("Sample data point:")
print(training_data[0])

Sample data point:
{'instruction': 'Given the user\'s preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".', 'input': 'User Preference: "The End of Enemies (Briggs Tanner Novels)" written by Grant Blackwood, "Q Is for Quarry" written by Sue Grafton\nUser Unpreference: "ICEFIRE" written by Judith Reeves-Stevens\nWhether the user will like the target book "Specter of the Past: Star Wars (Star Wars (Bantam Books (Firm) : Unnumbered).)" written by Timothy Zahn?', 'output': 'Yes.'}


In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model ID
model_id = "meta-llama/Llama-2-7b-hf"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the EOS token as the padding token and pad on the right side of the sequence.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Configure model loading based on device
if device == "cuda":
    # 8-bit quantized loading.
    # `load_in_8bit` is the only switch needed here. The previously passed
    # `bnb_8bit_quant_type` / `bnb_8bit_compute_dtype` are not parameters of
    # BitsAndBytesConfig (only the 4-bit path has quant_type / compute_dtype
    # options), so they had no effect and merely suggested a configuration
    # that was never applied.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16
    )
else:
    # CPU fallback: full-precision weights, loaded with reduced peak host memory.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32
    )

Using device: cpu




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
def tokenize_function(examples, max_length=512):
    """Convert one instruction record into fixed-length causal-LM training features.

    The record is rendered into an Alpaca-style prompt, followed by the response
    and a closing EOS token. Prompt positions and padding positions are set to
    -100 in ``labels`` so that only the response (and its EOS) contributes to the
    loss. Uses the module-level ``tokenizer``.

    Args:
        examples: A single record (mapping) with the string keys
            ``'instruction'``, ``'input'`` and ``'output'``.
        max_length: Exact length of every returned list. Longer sequences are cut
            to ``max_length - 1`` tokens plus a closing EOS; shorter ones are
            right-padded. This is a training-time budget chosen for memory and
            speed, not a limit of the model. Defaults to 512, the value that was
            previously hard-coded.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``; each value is
        a list of ``max_length`` ints.

    Raises:
        ValueError: If ``max_length`` is smaller than 1.
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    # Combine instruction and input into the prompt; the output is the response
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{examples['instruction']}

### Input:
{examples['input']}

### Response:
"""

    response = examples['output']

    # Tokenize prompt and response separately so the prompt length (needed for
    # label masking) is known. No special tokens are inserted by the tokenizer;
    # the closing EOS is appended explicitly below.
    prompt_ids = tokenizer(prompt, truncation=False, add_special_tokens=False)["input_ids"]
    response_ids = tokenizer(response, truncation=False, add_special_tokens=False)["input_ids"]
    eos_id = tokenizer.eos_token_id

    input_ids = prompt_ids + response_ids + [eos_id]

    # Labels mirror input_ids, except that every prompt position is replaced by
    # -100, the ignore_index used by the loss, so the prompt is not supervised.
    labels = [-100] * len(prompt_ids) + response_ids + [eos_id]

    # Truncate both sequences in lock-step so they always stay aligned; the last
    # kept slot is overwritten with EOS so the sequence still terminates.
    if len(input_ids) > max_length:
        input_ids = input_ids[:max_length - 1] + [eos_id]
        labels = labels[:max_length - 1] + [eos_id]

    # Real tokens are attended to (1) ...
    attention_mask = [1] * len(input_ids)

    # ... and everything is right-padded up to max_length
    padding_length = max_length - len(input_ids)
    if padding_length > 0:
        # e.g. [23, 45, 67, 89] + [2, 2, 2] (if pad_token_id is 2)
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length

        # 0 means: "ignore this padding token"
        attention_mask = attention_mask + [0] * padding_length

        # Padding is excluded from the loss just like the prompt:
        # [-100, -100, 67, 89, 12, 4, 2] + [-100, -100, -100]
        labels = labels + [-100] * padding_length

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [None]:
# Configure LoRA
lora_config = LoraConfig(
    r=8,  # Rank of the LoRA adaptation matrices
    lora_alpha=16,  # Scaling factor for the LoRA layers
    # Adapt only the attention query and value projections
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.05,  # Dropout probability for LoRA layers
    bias="none",  # Bias terms are not trained
    task_type="CAUSAL_LM"  # Autoregressive/generative task
)

# Prepare model for training (only needed for the quantized model loaded on GPU)
if device == "cuda":
    model = prepare_model_for_kbit_training(model)

# Create PEFT model
peft_model = get_peft_model(model, lora_config)

# Training arguments based on device
training_args = TrainingArguments(
    output_dir="./alpaca-tuned-model",
    num_train_epochs=3,
    # Smaller per-step batch on CPU, compensated by more accumulation steps
    per_device_train_batch_size=1 if device == "cpu" else 4,
    gradient_accumulation_steps=8 if device == "cpu" else 4,
    learning_rate=2e-4,
    save_strategy="epoch",
    logging_steps=10,
    evaluation_strategy="no",  # No evaluation set is passed to the Trainer
    # Keep every column produced by tokenize_function
    remove_unused_columns=False,
    fp16=device == "cuda",
    # `use_cpu` replaces the deprecated `no_cuda` flag
    use_cpu=device == "cpu"
)

In [None]:
# Wrap the list of JSON records in a Hugging Face Dataset, then tokenize each
# record; the original text columns are dropped so only the model features remain.
dataset = Dataset.from_list(training_data)
tokenized_dataset = dataset.map(tokenize_function, remove_columns=dataset.column_names)

# Summarise the effective training setup
print(
    f"Training will be performed on: {device}",
    f"Batch size: {training_args.per_device_train_batch_size}",
    f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}",
    sep="\n",
)

In [None]:
# Echo the raw (un-tokenized) dataset object so the notebook displays its repr.
dataset

In [None]:
# Build the Trainer around the LoRA-wrapped model and the tokenized training set
trainer = Trainer(peft_model, training_args, train_dataset=tokenized_dataset)

# Run fine-tuning; as the last expression, the training result is shown as cell output
trainer.train()

In [None]:
# Save the trained LoRA adapter into the run's output directory. The path is
# taken from training_args instead of repeating the literal, so the final
# adapter and the per-epoch checkpoints cannot drift apart.
peft_model.save_pretrained(training_args.output_dir)
# Save the tokenizer alongside it: its pad token / padding side were changed
# after loading, and inference should reuse the same settings.
tokenizer.save_pretrained(training_args.output_dir)