In [None]:
# !pip install -U torch torchvision transformers datasets peft accelerate bitsandbytes wandb matplotlib sentencepiece huggingface_hub dotenv nbformat optuna --no-cache-dir

In [None]:
import os, huggingface_hub
from dotenv import load_dotenv
import torch
import wandb
from datasets import load_dataset
import bitsandbytes
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

use this for running in kaggle cli

In [None]:
load_dotenv()

hf_token = os.getenv("huggingface_token")
wandb_key = os.getenv("wandb_key")

huggingface_hub.login(token = hf_token)
wandb.login(key = wandb_key)

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: [wandb.login()] Using explicit session credentials for https://api.wandb.ai.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/sanja/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msanjayashrestha777[0m ([33msanjayashrestha777-thapathali-campus[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# from google.colab import drive
# drive.mount("/content/drive")

this are some of the dependencies that we need for this notebook

settings for this notebook ,especially the path of different directories for this notebook


In [20]:
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
DATA_PATH = "../Datasets/train (1).jsonl"
OUTPUT_DIR = "llama2-python-lora"
WANDB_PROJECT = "llama2-python-codegen"

Hyperparameters for this training process.here we will have actual bath size 16

In [8]:
EPOCHS = 1
BATCH_SIZE = 2
GRAD_ACCUM = 8
LR = 2e-4
MAX_LENGTH = 512

initialize WandB .here change the name as you are running the notebook for which epoch

In [10]:
wandb.init(
    project=WANDB_PROJECT,
    config={
        "model": "Llama-fine-tuned",
        "name": "lora-r8-lr2e-4-epoch1",
        "epochs": EPOCHS,
        "lr": LR,
        "batch_size": BATCH_SIZE,
        "quantization": "auto",
        "lora_r": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.1
    }
)

code for quantization using bitsandbytes

In [11]:
if torch.cuda.is_available():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16
    )
    device_map = "auto"
else:
    bnb_config = None
    device_map = {"": "cpu"}

load tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token


load the model and prepare for lora or 4-bit training

In [13]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map=device_map,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
model = prepare_model_for_kbit_training(model)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading weights: 100%|██████████| 291/291 [00:23<00:00, 12.32it/s, Materializing param=model.norm.weight]                              


configuration for lora

In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


load the datasets for trainig

In [16]:
dataset = load_dataset("json", data_files=DATA_PATH)

Generating train split: 16736 examples [00:00, 25172.81 examples/s]


tokenizer function

In [17]:
def tokenize(example):
    text = f"{example["text"]}"
    return tokenizer(
        text,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )


remove the correct columns like instruction and completion which may be like instruction and code also with respect to data

In [18]:
dataset = dataset.map(tokenize, remove_columns=["text"])

Map: 100%|██████████| 16736/16736 [00:18<00:00, 912.65 examples/s] 


trainign arguments and training

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    fp16=torch.cuda.is_available(),
    logging_steps=5,
    save_strategy="epoch",
    save_total_limit=3,
    report_to="wandb",
    optim="paged_adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
    lr_scheduler_type="cosine",
    warmup_ratio=0.05
)

trainer = Trainer(
    model=model,
    train_dataset=dataset['train'][:100],
    eval_dataset = dataset['train'][100:10],
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()


warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


  return fn(*args, **kwargs)


Step,Training Loss


save model and tokenizer

In [None]:
trainer.save_model(f"{OUTPUT_DIR}/final")
tokenizer.save_pretrained(f"{OUTPUT_DIR}/final")
wandb.finish()