<a href="https://colab.research.google.com/github/shankarsubramony/GenAI/blob/main/TCS_E2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install huggingface_hub
!pip -q install numpy

In [2]:
!pip -q install peft
!pip -q install datasets


In [3]:
!pip -q install trl

In [4]:
!pip -q install wandb
!pip install -i https://pypi.org/simple/ bitsandbytes


Looking in indexes: https://pypi.org/simple/


In [5]:
!pip -q install accelerate


In [6]:
from transformers import AutoModelForSeq2SeqLM,AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,HfArgumentParser,TrainingArguments,pipeline, logging, TextStreamer
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training, get_peft_model
from wandb import wandb
import os, torch, platform, warnings
from trl import SFTTrainer
from huggingface_hub import notebook_login


In [8]:
#load model and tokenizer

from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=False,
   bnb_4bit_compute_dtype=torch.bfloat16
)
huggingface_token = "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"

model_name_ta = "microsoft/phi-1_5"
#model_name_ta = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer = AutoTokenizer.from_pretrained(model_name_ta, quantization_config=nf4_config, padding=True)
# Set the pad token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training
model = AutoModelForCausalLM.from_pretrained(model_name_ta, quantization_config=nf4_config)



`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [9]:
#load dataset

from huggingface_hub import login
huggingface_token = "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"
login(token = huggingface_token)
from datasets import load_dataset
dataset_name = "aboonaji/wiki_medical_terms_llam2_format"
instruct_tune_dataset = load_dataset(dataset_name)
print(instruct_tune_dataset)
instruct_tune_dataset["train"] = instruct_tune_dataset["train"].select(range(2000))


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful
DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 6861
    })
})


In [10]:
# Prepare model for fine tuning using lora

from peft import LoraConfig, get_peft_model, TaskType


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM"
)

#prepare model for kbit training

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
print_trainable_parameters(model)


trainable params: 44040192 || all params: 858331136 || trainable%: 5.1309092904675895


In [11]:
#Set hyperparameter

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Number of training epochs
num_train_epochs = 10

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 4

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = 10 #100 was changed to 10

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 25

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

'''args = TrainingArguments(
  output_dir = "llama_instruct_generation",
  num_train_epochs=5,
  max_steps = 1,
  per_device_train_batch_size = 4,
    warmup_steps = 0.03,
  logging_steps=10,
  optim="paged_adamw_8bit",
  save_strategy="epoch",
  #evaluation_strategy="epoch",
  #evaluation_strategy="steps",
  #eval_steps=20,
  learning_rate=2e-4,
  bf16=False,
  lr_scheduler_type='constant',
  report_to="wandb"
)'''

# Set training parameters
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
)

In [12]:
max_seq_length = 250

#def create_prompt(sample):
#    bos_token = "<s>"
#    #original_system_message = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
#    #system_message = "Use the provided input to answer any response of the elements in the claim json."
#    response = sample["IP_CLM_ID"]
#    input = sample["IP_CLM_ID"]
#    eos_token = "</s>"

#    full_prompt = ""
#    full_prompt += bos_token
#    #full_prompt += "### Instruction:"
#    #full_prompt += "\n" + system_message
#    full_prompt += "\n\n### Input:"
#    full_prompt += "\n" + input
#    full_prompt += "\n\n### Response:"
#    full_prompt += "\n" + response
#    full_prompt += eos_token

#    print(full_prompt)

 #   return full_prompt

trainer = SFTTrainer(
  model=model,
  peft_config=peft_config,
  max_seq_length=max_seq_length,
  tokenizer=tokenizer,
  packing=True,
  #formatting_func=create_prompt,
  dataset_text_field="text",
  args=args,
  train_dataset=instruct_tune_dataset["train"]
)

max_steps is given, it will override any value given in num_train_epochs


In [13]:
import time
import accelerate
start = time.time()
trainer.train()
# Save the fine-tuned model
#trainer.model.save_pretrained(new_model)
wandb.finish()
model.config.use_cache = True
model.eval()
print(time.time()- start)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


36.28129005432129


In [None]:
# Check GPU compatibility with bfloat16
bnb_4bit_compute_dtype = "float16"
use_4bit = True
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)
    else:
        print("Bummer")
