In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers accelerate datasets



In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import json
import os

# data_dir = 'llm_datasets/book_crossing'
data_dir = '/content/drive/MyDrive/colab_data/book_crossing'

# Load the JSON file
with open(os.path.join(data_dir, "train.json"), 'r') as f:
    training_data = json.load(f)

# Print a sample to verify the format
print("Sample data point:")
print(training_data[0])

Sample data point:
{'instruction': 'Given the user\'s preference and unpreference, identify whether the user will like the target book by answering "Yes." or "No.".', 'input': 'User Preference: "Ordinary Resurrections: Children in the Years of Hope" written by Jonathan Kozol, "Resistance and Representation: Rethinking Childhood Education" written by Janice A. Jipson, "The Lovely Bones: A Novel" written by Alice Sebold\nUser Unpreference: "The Girls\' Guide to Hunting and Fishing" written by Melissa Bank, "There Are No Children Here: The Story of Two Boys Growing Up in the Other America" written by Alex Kotlowitz\nWhether the user will like the target book "The Skin We\'re In : Teaching Our Teens To Be Emotionally Strong, Socially Smart, and Spiritually Connected" written by Janie Victoria Ward?', 'output': 'No.'}


In [6]:
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# from datasets import Dataset
# from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# # Check if GPU is available
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Using device: {device}")

# # Model ID
# model_id = "meta-llama/Llama-2-7b-hf"

# # Load tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"  #This tells the tokenizer to add padding tokens to the right side of the sequence

# # Configure model loading based on device
# if device == "cuda":
#     bnb_config = BitsAndBytesConfig(
#         load_in_8bit=True,
#         bnb_8bit_quant_type="nf8",
#         bnb_8bit_compute_dtype=torch.float16
#     )

#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         quantization_config=bnb_config,
#         device_map="auto",
#         torch_dtype=torch.float16
#     )
# else:
#     model = AutoModelForCausalLM.from_pretrained(
#         model_id,
#         low_cpu_mem_usage=True,
#         torch_dtype=torch.float32
#     )



import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Model ID
model_id = "meta-llama/Llama-2-7b-hf"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # This tells the tokenizer to add padding tokens to the right side of the sequence

# Configure model loading based on device
if device == "cuda":
    # Use native 16-bit precision instead of 8-bit quantization
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,  # Using half precision
        low_cpu_mem_usage=True
    )
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32
    )

Using device: cuda


model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [8]:
def tokenize_function(examples):
    # Combine instruction, input, and output
    prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{examples['instruction']}

### Input:
{examples['input']}

### Response:
"""

    response = examples['output']

    # Tokenize prompt and response
    prompt_ids = tokenizer(prompt, truncation=False, add_special_tokens=False)["input_ids"]
    response_ids = tokenizer(response, truncation=False, add_special_tokens=False)["input_ids"]

    # Combine them and truncate if needed
    input_ids = prompt_ids + response_ids + [tokenizer.eos_token_id]

    ## 1. Most transformer models have a maximum sequence length
    # For LLaMA-2, it's typically 512 tokens
    #Consider this as managing sequence length to fit the model's maximum context window
    if len(input_ids) > 512:
        input_ids = input_ids[:511] + [tokenizer.eos_token_id]

    # Create attention mask
    attention_mask = [1] * len(input_ids)

    # Create labels (same as input_ids for causal LM)
    # -100 is hardcoded in PyTorch and the transformers library as a special "ignore_index" value when calculating loss
    labels = [-100] * len(prompt_ids) + response_ids + [tokenizer.eos_token_id]
    if len(labels) > 512:
        labels = labels[:511] + [tokenizer.eos_token_id]

    # Pad everything to max_length
    padding_length = 512 - len(input_ids)
    if padding_length > 0:
        # Add padding to input_ids
        input_ids = input_ids + [tokenizer.pad_token_id] * padding_length # [23, 45, 67, 89] + [2, 2, 2] (if pad_token_id is 2)

        # Add 0s to attention mask for paddin
        attention_mask = attention_mask + [0] * padding_length
        ## [1, 1, 1, 1] + [0, 0, 0] # 0 means: "ignore this padding token"

        # Add -100 to labels for padding
        labels = labels + [-100] * padding_length
        ## [-100, -100, 67, 89, 12, 4, 2] + [-100, -100, -100]
        # -100 for pad tokens too
        #We only want model to learn to predict the response, not:
        #The prompt (first -100s)
        #The padding (last -100s)

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

In [10]:
# Configure LoRA
lora_config = LoraConfig(
    r=8, #Rank of the LoRA adaptation matrices
    lora_alpha=16, #Scaling factor for the LoRA layers
    target_modules=['q_proj', 'v_proj'], ## For LLaMA models, typically targets attention layers: # 'q_proj': Query projection,  'v_proj': Value projection
    lora_dropout=0.05, #Dropout probability for LoRA layers
    bias="none", #How to handle bias terms
    task_type="CAUSAL_LM" ## "CAUSAL_LM": For autoregressive/generative tasks, # Other options like "SEQ_2_SEQ_LM" for different architectures
)

# Prepare model for training
if device == "cuda":
    model = prepare_model_for_kbit_training(model)

# Create PEFT model
peft_model = get_peft_model(model, lora_config)

# # Training arguments based on device
# training_args = TrainingArguments(
#     output_dir="/content/drive/MyDrive/your-project-folder/alpaca-tuned-model", #"./alpaca-tuned-model",
#     num_train_epochs=3,
#     per_device_train_batch_size=1 if device == "cpu" else 4,
#     gradient_accumulation_steps=8 if device == "cpu" else 4,
#     learning_rate=2e-4,
#     save_strategy="epoch",
#     logging_steps=10,
#     evaluation_strategy="no",  # Changed from "epoch" to "no"
#     remove_unused_columns=False,
#     fp16=device == "cuda",
#     no_cuda=device == "cpu"
# )


# Training arguments with extreme memory optimization
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/your-project-folder/alpaca-tuned-model",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,  # Very high gradient accumulation
    learning_rate=2e-4,
    save_strategy="no",  # Only save at the end to save memory
    logging_steps=50,  # Log less frequently
    remove_unused_columns=False,
    fp16=True,
    dataloader_num_workers=0,
    optim="adamw_torch",
    max_grad_norm=0.3,  # Limit gradient values
)

In [11]:
# Create and process dataset => Creates a Hugging Face dataset from a list of dictionaries (JSON format data)
dataset = Dataset.from_list(training_data)
tokenized_dataset = dataset.map(
    tokenize_function,
    remove_columns=dataset.column_names
)

# Print training device information
print(f"Training will be performed on: {device}")
print(f"Batch size: {training_args.per_device_train_batch_size}")
print(f"Gradient accumulation steps: {training_args.gradient_accumulation_steps}")

Map:   0%|          | 0/19414 [00:00<?, ? examples/s]

Training will be performed on: cuda
Batch size: 1
Gradient accumulation steps: 32


In [12]:
dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 19414
})

In [13]:
# Create Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msmazzyhitting[0m ([33msmazzyhitting-786[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 50.88 MiB is free. Process 83814 has 39.50 GiB memory in use. Of the allocated memory 38.89 GiB is allocated by PyTorch, and 117.77 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [14]:
# Import necessary libraries
import os
import gc
import torch

# Set memory allocation configuration
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Check device and load model
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load tokenizer
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Load model with extreme memory optimization
if device == "cuda":
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True
    )
    # Enable gradient checkpointing
    model.gradient_checkpointing_enable()
else:
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float32
    )

# Clear memory
gc.collect()
torch.cuda.empty_cache()

# Configure LoRA with minimal parameters
lora_config = LoraConfig(
    r=4,  # Reduced rank
    lora_alpha=8,  # Lower alpha
    target_modules=['q_proj', 'v_proj'],  # Only essential modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Prepare model for training
if device == "cuda":
    model = prepare_model_for_kbit_training(model)

# Create PEFT model
peft_model = get_peft_model(model, lora_config)

# Training arguments with extreme memory optimization
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/your-project-folder/alpaca-tuned-model",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=32,  # Very high gradient accumulation
    learning_rate=2e-4,
    save_strategy="no",  # Only save at the end to save memory
    logging_steps=50,  # Log less frequently
    remove_unused_columns=False,
    fp16=True,
    dataloader_num_workers=0,
    optim="adamw_torch",
    max_grad_norm=0.3,  # Limit gradient values
)

# Clear memory again before trainer
gc.collect()
torch.cuda.empty_cache()

# Create Trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True)
)

# Start training
trainer.train()

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 50.88 MiB is free. Process 83814 has 39.50 GiB memory in use. Of the allocated memory 38.89 GiB is allocated by PyTorch, and 117.90 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Save the trained model
peft_model.save_pretrained("/content/drive/MyDrive/your-project-folder/alpaca-tuned-model")