In [None]:
!pip install -U \
  torch transformers==4.37.2 datasets peft accelerate bitsandbytes==0.41.3 \
  wandb matplotlib sentencepiece huggingface_hub==0.20.3

In [None]:
# Log in to Weights & Biases through the wandb command-line tool.
!wandb login

Use this cell instead when running from the Kaggle CLI.

In [None]:
import os
from getpass import getpass

import wandb

# Never hardcode the API key in the notebook: take it from the environment
# (e.g. a Kaggle secret exported as WANDB_API_KEY) and only prompt when it is missing.
if not os.environ.get("WANDB_API_KEY"):
    os.environ["WANDB_API_KEY"] = getpass("W&B API key: ")

# Only authenticate here. The training run itself is started later by the
# wandb.init(...) call that carries the full config; calling wandb.init here as
# well would create an extra, empty run in the project.
wandb.login()

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Log in to the Hugging Face Hub through the huggingface-cli tool.
!huggingface-cli login

Use this cell instead when running from the Kaggle CLI.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Never hardcode the access token in the notebook: read it from the HF_TOKEN
# environment variable and only prompt for it when the variable is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
import pandas as pd

# Read the JSON-lines file (one record per line) into a DataFrame.
df = pd.read_json("dataset.jsonl", lines=True)

# Quick sanity check: first rows, then the column index.
for preview in (df.head(), df.columns):
    print(preview)

In [None]:
# Summarize the DataFrame: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Removed: This cell was causing a dependency conflict by upgrading bitsandbytes.
!pip install -U bitsandbytes

In [None]:
# Show which bitsandbytes version the kernel actually imports.
import bitsandbytes as bnb
print("bitsandbytes version:", bnb.__version__)

These are some of the dependencies that we need for this notebook.

In [None]:
import os
import torch
import wandb
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

Settings for this notebook, in particular the model name and the data and output paths.


In [None]:
# Hub id of the base model; passed to the tokenizer and model from_pretrained calls.
MODEL_NAME = "meta-llama/Llama-2-7b-hf"
# JSON-lines training file read by load_dataset.
DATA_PATH = "/content/dataset.jsonl"
# Where the Trainer writes checkpoints and where the final model/tokenizer are saved.
OUTPUT_DIR = "/content/drive/MyDrive/llama2-python-lora"
# W&B project the training run is logged to.
WANDB_PROJECT = "llama2-python-codegen"

Hyperparameters for this training run. The effective batch size is 16 (`BATCH_SIZE` × `GRAD_ACCUM` = 2 × 8).

In [None]:
# Number of training epochs (TrainingArguments.num_train_epochs).
EPOCHS = 1
# Per-device batch size for each forward/backward pass.
BATCH_SIZE = 2
# Gradient accumulation steps; BATCH_SIZE * GRAD_ACCUM = 16 samples per optimizer step per device.
GRAD_ACCUM = 8
# Learning rate passed to the Trainer.
LR = 2e-4
# Maximum tokenized sequence length; longer samples are truncated, shorter ones padded.
MAX_LENGTH = 512

Initialize W&B. Change the run name here to match the epoch you are running the notebook for.

In [None]:
# Start the W&B run that tracks this training session.
# run_name labels the run in the W&B UI; change it to match the epoch/experiment being run.
run_name = "lora-r8-lr2e-4-epoch1"

wandb.init(
    project=WANDB_PROJECT,
    name=run_name,
    config={
        "model": "Llama-fine-tuned",
        # Kept in the config as well so existing dashboards that read config["name"] still work.
        "name": run_name,
        "epochs": EPOCHS,
        "lr": LR,
        "batch_size": BATCH_SIZE,
        "quantization": "auto",
        "lora_r": 8,
        "lora_alpha": 32,
        "lora_dropout": 0.1,
    },
)

Quantization configuration using bitsandbytes (4-bit NF4 when a GPU is available, no quantization on CPU).

In [None]:
# CPU defaults: no quantization, everything placed on the CPU.
bnb_config = None
device_map = {"": "cpu"}

# With a GPU available, switch to 4-bit NF4 quantization with double quantization
# and fp16 compute, and let the loader place the weights automatically.
if torch.cuda.is_available():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.float16,
    )
    device_map = "auto"

Load the tokenizer.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Llama-2 ships without a pad token. Do NOT reuse EOS for padding:
# DataCollatorForLanguageModeling sets every label equal to pad_token_id to -100,
# which would also mask the real end-of-sequence token and the model would never
# be trained to stop generating. <unk> already exists in the vocabulary, so no
# embedding resize is needed. EOS remains the fallback for tokenizers without <unk>.
tokenizer.pad_token = tokenizer.unk_token or tokenizer.eos_token


Load the model and prepare it for LoRA / 4-bit training.

In [None]:
# fp16 weights on GPU, fp32 on CPU.
load_dtype = torch.float32
if torch.cuda.is_available():
    load_dtype = torch.float16

# Load the base causal-LM with the quantization/device settings chosen above.
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=load_dtype,
    device_map=device_map,
    quantization_config=bnb_config,
)

# Apply PEFT's k-bit training preparation before attaching adapters.
model = prepare_model_for_kbit_training(base_model)

Configuration for LoRA.

In [None]:
# LoRA adapters (rank 8, alpha 32, dropout 0.1) on the attention projections only;
# bias terms are left untouched.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
)

# Wrap the prepared model with the adapters and report how many parameters will train.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Load the dataset for training.

In [None]:
# Load the JSON(-lines) file at DATA_PATH; the "train" split of the result is used for training below.
dataset = load_dataset("json", data_files=DATA_PATH)

Tokenization function.

In [None]:
def tokenize(example):
    """Format one record as a Llama-2 instruction prompt and tokenize it.

    Args:
        example: Mapping with the keys ``"instruction"`` and ``"completion"``.

    Returns:
        The output of the module-level ``tokenizer`` for the formatted text,
        truncated and padded to ``MAX_LENGTH`` tokens.
    """
    text = f"<s>[INST] {example['instruction']} [/INST]\n{example['completion']}</s>"
    return tokenizer(
        text,
        # The template above already spells out <s> and </s>; letting the tokenizer
        # add its own special tokens as well would produce a duplicated BOS token.
        add_special_tokens=False,
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH
    )


Remove the original text columns after tokenization. They are `instruction` and `completion` here, but may be named differently (e.g. `instruction` and `code`) depending on your data.

In [None]:
# Tokenize every record and drop the raw text columns so only the tokenizer outputs remain.
# NOTE: this rebinds `dataset`, so re-running the cell fails because the columns are already gone.
dataset = dataset.map(tokenize, remove_columns=["instruction", "completion"])

Training arguments and training.

In [None]:
# GPU-only features (fp16 mixed precision, paged 8-bit AdamW) are switched on
# only when CUDA is present; on CPU the plain torch AdamW is used.
on_gpu = torch.cuda.is_available()
optimizer_name = "paged_adamw_8bit" if on_gpu else "adamw_torch"

training_args = TrainingArguments(
    # Where and how often to save / log.
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    save_total_limit=3,
    logging_steps=10,
    report_to="wandb",
    # Schedule and batch geometry.
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    # Optimization.
    learning_rate=LR,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    optim=optimizer_name,
    fp16=on_gpu,
)

# Causal-LM collator (mlm=False): labels are derived from the input ids.
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=causal_lm_collator,
)
trainer.train()


Save the model and tokenizer.

In [None]:
# Write the trained model and its tokenizer side by side, then close the W&B run.
final_dir = f"{OUTPUT_DIR}/final"
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)
wandb.finish()