In [1]:
!pip install bitsandbytes
!pip install peft
!pip install evaluate

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1
Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?2

In [2]:
import os
import sys
import json
import evaluate
import math
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig

# from transformers.integrations.bitsandbytes import BitsAndBytesConfig
from datasets import load_dataset
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
)

from kaggle_secrets import UserSecretsClient
import wandb

In [3]:
# wandb login
user_secrets = UserSecretsClient()
wandb_key = user_secrets.get_secret('wandb-key')
wandb.login(key=wandb_key)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
# Set random seed for reproducibility
RANDOM_SEED = 1234
transformers.set_seed(RANDOM_SEED)

# Fit into Kaggle T4*2
MICRO_BATCH_SIZE = 4
BATCH_SIZE = 128
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
EPOCHS = 1  # One epoch takes ~6 hours, and 2 epochs may exceed Kaggle 12-hour limit 
LEARNING_RATE = 2e-5  # Following stanford_alpaca
CUTOFF_LEN = 256  # 256 accounts for about 96% of the data. Shorter input, faster training/less VRAM
LORA_R = 8  # Some LoRA parameters
LORA_ALPHA = 16
LORA_DROPOUT = 0.05
VAL_SET_SIZE = 0
TARGET_MODULES = [
    'q_proj',
    'v_prol',
]
#DATA_PATH = 'final_dataset.json'
OUTPUT_DIR = '/kaggle/working/llama_7b_tuned_1'  # Save the model in Kaggle output dir.

# DDP setting
device_map = 'auto'
world_size = int(os.environ.get('WORLD_SIZE', 1))
ddp = (world_size != 1)  # If more than one GPU, then DDP
if ddp:
    device_map = {'': int(os.environ.get('LOCAL_RANK') or 0)}
    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size

In [5]:
# Read LLaMA model
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
)

model = LlamaForCausalLM.from_pretrained(
    'baffo32/decapoda-research-llama-7B-hf',
    quantization_config=quantization_config,   # 8-bit to save VRAM
    device_map=device_map,
)
tokenizer = LlamaTokenizer.from_pretrained(
    'baffo32/decapoda-research-llama-7B-hf', add_eos_token=True
)
tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token
model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/428 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/33 [00:00<?, ?it/s]

pytorch_model-00001-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00002-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00003-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00004-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00005-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00006-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00007-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00008-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00009-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00010-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00011-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00012-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00013-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00014-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00015-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00016-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00017-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00018-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00019-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00020-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00021-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00022-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00023-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00024-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00025-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00026-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00027-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00028-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00029-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00030-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00031-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00032-of-00033.bin:   0%|          | 0.00/405M [00:00<?, ?B/s]

pytorch_model-00033-of-00033.bin:   0%|          | 0.00/524M [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/142 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [6]:
# LoRA config.
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias='none',
    task_type='CAUSAL_LM',
)
model = get_peft_model(model, config)

In [7]:
data = load_dataset('kunchum/capstone_1')
# data = load_dataset('json', data_files='/kaggle/input/qa-data-1/qa_df.json')
data = data.shuffle(seed=RANDOM_SEED)  # Shuffle dataset here

# Select a sample of 5000 records for fine-tuning
sample_size = 20000
data_sample = data['train'].select(range(sample_size))

# Create a DatasetDict with the sampled data
from datasets import DatasetDict

# Create a new DatasetDict to retain the original structure
sampled_data_dict = DatasetDict({
    'train': data_sample  # You can add validation or test splits if needed
})

qa_data.json:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/36612 [00:00<?, ? examples/s]

In [8]:
data

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'context_cleaned', 'response', 'response_cleaned', 'tag'],
        num_rows: 36612
    })
})

In [9]:
sampled_data_dict

DatasetDict({
    train: Dataset({
        features: ['instruction', 'context', 'context_cleaned', 'response', 'response_cleaned', 'tag'],
        num_rows: 20000
    })
})

In [10]:
def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: str: Input text
    """
    # Samples with additional context into.
    if data_point['context_cleaned'] != "":
        text = 'Below is an instruction that describes a task, paired with an input that provides' \
               ' further context. Write a response that appropriately completes the request.\n\n'
        text += f'### Instruction:\n{data_point["instruction"]}\n\n'
        text += f'### Input:\n{data_point["context_cleaned"]}\n\n'
        text += f'### Response:\n{data_point["response_cleaned"]}'
        return text

    # Without
    else:
        text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'
        text += f'### Instruction:\n{data_point["instruction"]}\n\n'
        text += f'### Response:\n{data_point["response_cleaned"]}'
        return text

In [11]:
def tokenize(prompt):
    """Tokenise the input

    :param prompt: str: Input text
    :return: dict: {'tokenised input text': list, 'mask': list}
    """
    result = tokenizer(prompt, truncation=True, max_length=CUTOFF_LEN + 1, padding='max_length')
    return {
        'input_ids': result['input_ids'][:-1],
        'attention_mask': result['attention_mask'][:-1],
    }

In [12]:
def generate_and_tokenize_prompt(data_point):
    """This function masks out the labels for the input, so that our loss is computed only on the
    response."""
    if data_point['context_cleaned'] != "":
        user_prompt = 'Below is an instruction that describes a task, paired with an input that ' \
                      'provides further context. Write a response that appropriately completes ' \
                      'the request.\n\n'
        user_prompt += f'### Instruction:\n{data_point["instruction"]}\n\n'
        user_prompt += f'### Input:\n{data_point["context_cleaned"]}\n\n'
        user_prompt += f'### Response:\n'
    else:
        user_prompt = 'Below is an instruction that describes a task. Write a response that ' \
                      'appropriately completes the request.'
        user_prompt += f'### Instruction:\n{data_point["instruction"]}\n\n'
        user_prompt += f'### Response:\n'

    # Count the length of prompt tokens
    len_user_prompt_tokens = len(tokenizer(user_prompt,
                                           truncation=True,
                                           max_length=CUTOFF_LEN + 1,
                                           padding='max_length')['input_ids'])
    len_user_prompt_tokens -= 1  # Minus 1 (one) for eos token

    # Tokenise the input, both prompt and output
    full_tokens = tokenizer(
        user_prompt + data_point['response_cleaned'],
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding='max_length',
    )['input_ids'][:-1]
    return {
        'input_ids': full_tokens,
        'labels': [-100] * len_user_prompt_tokens + full_tokens[len_user_prompt_tokens:],
        'attention_mask': [1] * (len(full_tokens)),
    }

In [13]:
# Train/val split
if VAL_SET_SIZE > 0:
    train_val = sampled_data_dict['train'].train_test_split(
        test_size=VAL_SET_SIZE, shuffle=False, seed=RANDOM_SEED
    )
    train_data = train_val['train'].map(generate_and_tokenize_prompt)
    val_data = train_val['test'].map(generate_and_tokenize_prompt)
else:
    train_data = sampled_data_dict['train'].map(generate_and_tokenize_prompt)
    val_data = None

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
# # Loading the perplexity metric from evaluate
# perplexity_metric = evaluate.load("perplexity")

# # Function to compute perplexity based on the model's loss
# def compute_metrics(eval_pred):
#     loss = eval_pred["eval_loss"]
#     perplexity = math.exp(loss) if loss < 100 else float("inf")  # Handle overflow for large losses
#     return {"perplexity": perplexity}

from transformers import TrainerCallback
import math

# Custom callback to log and print perplexity at each logging step
class PerplexityCallback(TrainerCallback):
    def on_log(self, args, state, control, logs=None, **kwargs):
        if logs and "loss" in logs:
            # Calculate perplexity from the training loss
            perplexity = math.exp(logs["loss"]) if logs["loss"] < 100 else float("inf")
            logs["perplexity"] = perplexity  # Update logs dictionary with perplexity
            print(f"Step {state.global_step} - Training Loss: {logs['loss']:.4f} - Perplexity: {perplexity:.4f}")

In [15]:
# HuggingFace Trainer
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
#     eval_dataset=val_data,
    args=transformers.TrainingArguments(
        seed=RANDOM_SEED,  # Reproducibility
        data_seed=RANDOM_SEED,
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=100,
        num_train_epochs=EPOCHS,
        learning_rate=LEARNING_RATE,
        fp16=True,
        logging_steps=20,
#         evaluation_strategy='steps' if VAL_SET_SIZE > 0 else 'no',
        save_strategy='steps',
        save_steps=50,
#         eval_steps=50 if VAL_SET_SIZE > 0 else None,
        output_dir=OUTPUT_DIR,
        save_total_limit=3,
        load_best_model_at_end=True if VAL_SET_SIZE > 0 else False,
        ddp_find_unused_parameters=False if ddp else None,
        report_to="wandb",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
#     compute_metrics=compute_metrics,
    callbacks=[PerplexityCallback()]
)
model.config.use_cache = False

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [16]:
# # Without evaluation
# # HuggingFace Trainer

# # Setting environment variable to reduce fragmentation
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# trainer = transformers.Trainer(
#     model=model,
#     train_dataset=train_data,
#     args=transformers.TrainingArguments(
#         seed=RANDOM_SEED,  # Reproducibility
#         data_seed=RANDOM_SEED,
#         per_device_train_batch_size=MICRO_BATCH_SIZE,
#         gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
#         warmup_steps=100,
#         num_train_epochs=EPOCHS,
#         learning_rate=LEARNING_RATE,
#         fp16=False,
#         bf16=True,
#         logging_steps=3,
#         eval_strategy='no',  # No evaluation
#         save_strategy='steps',
#         save_steps=6,
#         output_dir=OUTPUT_DIR,
#         save_total_limit=3,
#         load_best_model_at_end=False,  # Remove to avoid extra memory usage
#         ddp_find_unused_parameters=False if ddp else None,
#     ),
#     data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
# )

# model.config.use_cache = False
# torch.cuda.empty_cache()
# model.gradient_checkpointing_enable()

In [17]:
# PEFT setup
old_state_dict = model.state_dict
model.state_dict = (
    lambda self, *_, **__: get_peft_model_state_dict(self, old_state_dict())
).__get__(model, type(model))

# Use the latest PyTorch 2.0 if possible
if torch.__version__ >= '2' and sys.platform != 'win32':
    model = torch.compile(model)

In [18]:
# Train
trainer.train()
wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mkunchum[0m ([33mcs7150_rakshak[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.18.3
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20241028_050443-k6h0gsph[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/llama_7b_tuned_1[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/cs7150_rakshak/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/cs7150_rakshak/huggingface/runs/k6h0gsph[0m
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
20,2.2559
40,2.2683
60,2.2485
80,2.2315
100,2.1924
120,2.1564
140,2.1055


Step 20 - Training Loss: 2.2559 - Perplexity: 9.5439
Step 40 - Training Loss: 2.2683 - Perplexity: 9.6630


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 60 - Training Loss: 2.2485 - Perplexity: 9.4735
Step 80 - Training Loss: 2.2315 - Perplexity: 9.3138
Step 100 - Training Loss: 2.1924 - Perplexity: 8.9567


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step 120 - Training Loss: 2.1564 - Perplexity: 8.6400
Step 140 - Training Loss: 2.1055 - Perplexity: 8.2112


  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:         train/epoch ▁▂▃▄▅▆▇█
[34m[1mwandb[0m:   train/global_step ▁▂▃▄▅▆▇█
[34m[1mwandb[0m:     train/grad_norm ▁▂▅▅▆▇█
[34m[1mwandb[0m: train/learning_rate ▁▃▅▆█▅▂
[34m[1mwandb[0m:          train/loss ▇█▇▆▅▃▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:               total_flos 2.0271715316033126e+17
[34m[1mwandb[0m:              train/epoch 0.9984
[34m[1mwandb[0m:        train/global_step 156
[34m[1mwandb[0m:          train/grad_norm 0.17387
[34m[1mwandb[0m:      train/learning_rate 1e-05
[34m[1mwandb[0m:               train/loss 2.1055
[34m[1mwandb[0m:               train_loss 2.19462
[34m[1mwand

In [19]:
# Save the fine-tuned model
model.save_pretrained(OUTPUT_DIR)

model.push_to_hub("kunchum/capstone_1")

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-671f9293-59ca87504fc5e3fa3677153e;7288c054-1c7c-4692-bc3a-4d61fc73e00a)

Invalid username or password.