In [50]:
%autosave 300
%load_ext autoreload
%autoreload 2
%reload_ext autoreload
%config Completer.use_jedi = False

Autosaving every 300 seconds


In [1]:
import torch

torch.cuda.is_available()

True

#### Load base Model

In [2]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
import torch.nn as nn
import bitsandbytes as bnb
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [3]:
# loading the model without any quantization
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-3b",
    torch_dtype=torch.float16,
    device_map="auto",
)

tokenizer = AutoTokenizer.from_pretrained("bigscience/tokenizer")

In [4]:
# Model casting to fp32 and gradient checkpointing
for param in model.parameters():
    param.requires_grad = False  # freeze the model - train adapters later
    if param.ndim == 1:
        # cast the small parameters (e.g. layernorm) to fp32 for stability
        param.data = param.data.to(torch.float32)

model.gradient_checkpointing_enable()  # reduce number of stored activations
model.enable_input_require_grads()


class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)


model.lm_head = CastOutputToFloat(model.lm_head)

In [5]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )
    return (trainable_params / all_param) * 100

In [6]:
print_trainable_parameters(model)

trainable params: 0 || all params: 3002557440 || trainable%: 0.0


0.0

##### LORA MODEL using LORA CONFIG

In [7]:
from peft import LoraConfig, get_peft_model

```python
class LoraConfig(PeftConfig):
    """
    Configuration class for the LoraModel.

    Args:
        r (int): LoRA attention dimension (rank).
        target_modules (Optional[Union[List[str], str]]): Modules to apply the adapter to.
        lora_alpha (int): Alpha parameter for LoRA scaling.
        lora_dropout (float): Dropout probability for LoRA layers.
        fan_in_fan_out (bool): Set True if the layer stores weights as (fan_in, fan_out).
        bias (str): Bias type ('none', 'all', 'lora_only').
        use_rslora (bool): Use Rank-Stabilized LoRA (defaults to False).
        modules_to_save (List[str]): Modules to be set as trainable and saved.
        init_lora_weights (Union[bool, Literal]): Initialization method for LoRA weights.
        layers_to_transform (Union[List[int], int]): Layer indices to apply transformations to.
        layers_pattern (str): Name pattern for layers to transform.
        rank_pattern (dict): Layer-to-rank mapping.
        alpha_pattern (dict): Layer-to-alpha mapping.
        megatron_config (Optional[dict]): TransformerConfig for Megatron.
        megatron_core (Optional[str]): Core module for Megatron (default: 'megatron.core').
        loftq_config (Optional[LoftQConfig]): Configuration for LoftQ quantization.
        use_dora (bool): Enable Weight-Decomposed Low-Rank Adaptation (DoRA).
        layer_replication (List[Tuple[int, int]]): Layer replication ranges for model expansion.
        runtime_config (LoraRuntimeConfig): Runtime configurations (not saved or restored).
    """
```

In [8]:
config = LoraConfig(
    r=8,  # rank of the low-rank approximation
    lora_alpha=16,  # alpha parameter of LoRA
    target_modules=["query_key_value"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

Wrap the base model and peft_config with the get_peft_model() function to create a PeftModel.

In [9]:
model = get_peft_model(model, config)
traininable_params = print_trainable_parameters(model)
print(
    f"For rank {config.r} and alpha {config.lora_alpha}, trainable params: {traininable_params} in percentage"
)

trainable params: 2457600 || all params: 3005015040 || trainable%: 0.08178328451893539
For rank 8 and alpha 16, trainable params: 0.08178328451893538 in percentage


#### loading some dataset

In [10]:
from datasets import load_dataset

qa_dataset = load_dataset("squad_v2")

In [11]:
qa_dataset["train"][0]

{'id': '56be85543aeaaa14008c9063',
 'title': 'Beyoncé',
 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".',
 'question': 'When did Beyonce start becoming popular?',
 'answers': {'text': ['in the late 1990s'], 'answer_start': [269]}}

The data is structured as follows:
### CONTEXT
{context}

### QUESTION
{question}

### ANSWER
{answer}</s>

In [12]:
def create_prompt(context, question, answer):
    """
    Create a prompt for the model to answer the question
    """
    if len(answer["text"]) < 1:
        answer = "Cannot Find Answer"
    else:
        answer = answer["text"][0]
    prompt_template = f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n{answer}</s>"
    return prompt_template

In [13]:
# mapping the dataset to the model tokenizer and creating the prompt
mapped_qa_dataset = qa_dataset.map(
    lambda samples: tokenizer(
        create_prompt(samples["context"], samples["question"], samples["answers"])
    )
)

Train LORA

In [23]:
import transformers
from transformers.integrations import TensorBoardCallback

```python
## TrainingArguments Class

The `TrainingArguments` class is used to configure the settings for training a model using Hugging Face's `Trainer` API. This class handles parameters such as batch size, learning rate, output directories, and many other important hyperparameters required to train, evaluate, and log the model training.

### Parameters

- **output_dir** (`str`):  
  The directory where the model predictions and checkpoints will be saved.

- **overwrite_output_dir** (`bool`, *optional*, defaults to `False`):  
  If `True`, the content of the output directory will be overwritten. Use caution when setting this to `True`, as it will replace any existing models and checkpoints.

- **do_train** (`bool`, *optional*, defaults to `False`):  
  Whether to run the training loop.

- **do_eval** (`bool`, *optional*, defaults to `False`):  
  Whether to evaluate the model on a validation dataset at the end of each epoch.

- **evaluation_strategy** (`str`, *optional*, defaults to `"no"`):  
  The evaluation strategy to adopt during training. Options include:
  - `"no"`: No evaluation during training.
  - `"steps"`: Evaluation is done every few steps.
  - `"epoch"`: Evaluation is done at the end of each epoch.

- **per_device_train_batch_size** (`int`, *optional*, defaults to `8`):  
  The batch size per device during training.

- **per_device_eval_batch_size** (`int`, *optional*, defaults to `8`):  
  The batch size per device during evaluation.

- **learning_rate** (`float`, *optional*, defaults to `5e-5`):  
  The initial learning rate for the Adam optimizer.

- **weight_decay** (`float`, *optional*, defaults to `0`):  
  The weight decay (L2 penalty) to apply during optimization.

- **num_train_epochs** (`float`, *optional*, defaults to `3.0`):  
  The total number of training epochs.

- **logging_dir** (`str`, *optional*, defaults to `None`):  
  Directory where the training logs will be written to.

- **logging_steps** (`int`, *optional*, defaults to `500`):  
  The number of update steps before logging training metrics.

- **save_steps** (`int`, *optional*, defaults to `500`):  
  The number of steps after which a checkpoint is saved.

### Additional Parameters

- **seed** (`int`, *optional*, defaults to `42`):  
  A seed to ensure reproducibility of the training results.

- **fp16** (`bool`, *optional*, defaults to `False`):  
  Whether to use 16-bit (mixed) precision training for faster training on GPUs that support it.

- **no_cuda** (`bool`, *optional*, defaults to `False`):  
  Whether to prevent the use of CUDA (i.e., GPU acceleration) even if it is available.

- **save_total_limit** (`int`, *optional*):  
  The maximum number of checkpoints to keep. Older checkpoints will be deleted. If `None`, all checkpoints are kept.

```

In [24]:
train_args = transformers.TrainingArguments(
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=25,
    max_steps=50,
    learning_rate=1e-3,
    fp16=True,
    logging_steps=1,
    output_dir="outputs",
    seed=42,
)

In [25]:
import os

os.environ["DISABLE_MLFLOW_INTEGRATION"] = "TRUE"

In [26]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=mapped_qa_dataset["train"],
    args=train_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Exception ignored in: <function MLflowCallback.__del__ at 0x7fcfaef928c0>
Traceback (most recent call last):
  File "/anaconda/envs/finetune_env/lib/python3.10/site-packages/transformers/integrations/integration_utils.py", line 1105, in __del__
    self._auto_end_run
AttributeError: 'MLflowCallback' object has no attribute '_auto_end_run'
Exception ignored in: <function MLflowCallback.__del__ at 0x7fcfaef928c0>
Traceback (most recent call last):
  File "/anaconda/envs/finetune_env/lib/python3.10/site-packages/transformers/integrations/integration_utils.py", line 1105, in __del__
    self._auto_end_run
AttributeError: 'MLflowCallback' object has no attribute '_auto_end_run'


Step,Training Loss
1,2.7324
2,2.8416
3,2.9437
4,2.8856
5,2.8187
6,2.9036
7,2.952
8,2.8253
9,2.7614
10,2.7308


TrainOutput(global_step=50, training_loss=2.576432580947876, metrics={'train_runtime': 269.898, 'train_samples_per_second': 2.964, 'train_steps_per_second': 0.185, 'total_flos': 3025677716152320.0, 'train_loss': 2.576432580947876, 'epoch': 0.01})

In [31]:
HUGGING_FACE_USER_NAME = "soutrik"
model_name = "soutrik/opt-6.7b-lora"

In [32]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
model.push_to_hub(model_name, use_auth_token=True)



adapter_model.safetensors:   0%|          | 0.00/9.84M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/soutrik/opt-6.7b-lora/commit/d6f5bf4c5fac0c161662c9daf109f0b8f95cda46', commit_message='Upload model', commit_description='', oid='d6f5bf4c5fac0c161662c9daf109f0b8f95cda46', pr_url=None, pr_revision=None, pr_num=None)

In [34]:
##### Loading the model from the hub

import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [37]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer

In [42]:
config = PeftConfig.from_pretrained("soutrik/opt-6.7b-lora")
model = AutoModelForCausalLM.from_pretrained(
    config.base_model_name_or_path,
    return_dict=True,
    load_in_8bit=False,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

# Load the Lora model
qa_model = PeftModel.from_pretrained(model, "soutrik/opt-6.7b-lora")



adapter_model.safetensors:   0%|          | 0.00/9.84M [00:00<?, ?B/s]

Inferenencing

In [43]:
from IPython.display import display, Markdown


def make_inference(context, question):
    batch = tokenizer(
        f"### CONTEXT\n{context}\n\n### QUESTION\n{question}\n\n### ANSWER\n",
        return_tensors="pt",
    )

    with torch.cuda.amp.autocast():
        output_tokens = qa_model.generate(**batch, max_new_tokens=200)

    display(Markdown((tokenizer.decode(output_tokens[0], skip_special_tokens=True))))

In [44]:
context = "Cheese is the best food."
question = "What is the best food?"

make_inference(context, question)

### CONTEXT
Cheese is the best food.

### QUESTION
What is the best food?

### ANSWER
Cheese

In [47]:
context = "Cheese is the best food."
question = "How far away is the Moon from the Earth?"

make_inference(context, question)

### CONTEXT
Cheese is the best food.

### QUESTION
How far away is the Moon from the Earth?

### ANSWER
1,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,000,

In [46]:
context = "The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration."
question = "At what distance does the Moon orbit the Earth?"

make_inference(context, question)

### CONTEXT
The Moon orbits Earth at an average distance of 384,400 km (238,900 mi), or about 30 times Earth's diameter. Its gravitational influence is the main driver of Earth's tides and very slowly lengthens Earth's day. The Moon's orbit around Earth has a sidereal period of 27.3 days. During each synodic period of 29.5 days, the amount of visible surface illuminated by the Sun varies from none up to 100%, resulting in lunar phases that form the basis for the months of a lunar calendar. The Moon is tidally locked to Earth, which means that the length of a full rotation of the Moon on its own axis causes its same side (the near side) to always face Earth, and the somewhat longer lunar day is the same as the synodic period. However, 59% of the total lunar surface can be seen from Earth through cyclical shifts in perspective known as libration.

### QUESTION
At what distance does the Moon orbit the Earth?

### ANSWER
30 times Earth's diameter