In [3]:
#Installing Required Libraries for Fine tuning
!pip install -q -U bitsandbytes peft trl accelerate datasets transformers

In [4]:
#Installing Flas-attn Library
!pip install flash-attn --no-build-isolation



In [5]:
pip install --upgrade huggingface_hub



In [6]:
#Importing the libraries
import os
import transformers
import torch
from google.colab import userdata #optional for adding API Tokens
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig,get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,pipeline

In [7]:
#Loading Tokenizer & Model from Hugging Face using BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

In [8]:
from huggingface_hub import login

#Hugging Face token
login(token="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")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [9]:
import os

# Setting your Hugging Face token
os.environ['HF_TOKEN'] = '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'

In [10]:
model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
# Printing to ensure it's not empty
print(f"Loading model with ID: {model_id}")

# Ensureing model ID is valid and not empty
if not model_id:
    raise ValueError("Model ID cannot be empty.")

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=os.environ['HF_TOKEN'],
    padding_side="left",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             token=os.environ['HF_TOKEN'],
                                             quantization_config=bnb_config,
                                             device_map={"":0},
                                             attn_implementation="eager"
                                             )

Loading model with ID: meta-llama/Llama-3.2-11B-Vision-Instruct


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
#Testing whether Model is working or or not
text = "As an Machine Learning Engineer, can you explain the concept of crossvalidation in machine learning?"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=100,temperature=0.7,top_k=50,top_p=0.95,num_return_sequences=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

As an Machine Learning Engineer, can you explain the concept of crossvalidation in machine learning? Cross-validation is a technique used to evaluate the performance of a machine learning model on unseen data. It involves splitting the available data into training and testing sets, and then training the model on multiple subsets of the data while using the remaining subset as a test set. This process is repeated multiple times, and the performance of the model is evaluated on each test set.

There are several types of cross-validation, including:

1.  **K-Fold Cross-Validation**: This is a widely used method where


In [11]:
from datasets import load_dataset, concatenate_datasets

# Loading all datasets
piqa = load_dataset('piqa', split='train')
boolqa = load_dataset('boolq', split='train')
winograd = load_dataset('winograd_wsc', 'wsc285', split='test')
arc = load_dataset('ai2_arc', 'ARC-Challenge', split='train')  # or 'ARC-Easy'



In [10]:
# # Example formatting functions for each dataset
# def format_piqa(example):
#     text = f"Goal: {example['goal']}\nSolution 1: {example['sol1']}\nSolution 2: {example['sol2']}"
#     # Convert ClassLabel to int
#     label = int(example["label"])
#     return {"input_text": text, "label": label}

# def format_boolqa(example):
#     text = f"Question: {example['question']}\nPassage: {example['passage']}"
#     label = 1 if example["answer"] else 0  # Convert True/False to 1/0
#     return {"input_text": text, "label": label}

# def format_winograd(example):
#     text = f"Text: {example['text']}"
#     # Convert ClassLabel to int
#     label = int(example["label"])
#     return {"input_text": text, "label": label}

# def format_arc(example):
#     choices_text = " ".join(example['choices']['text'])
#     answer_key = example["answerKey"]
#     answer_index = ord(answer_key) - ord('A')  # Convert 'A' to 0, 'B' to 1, etc.
#     return {"input_text": f"Question: {example['question']}\nChoices: {choices_text}", "label": answer_index}

In [36]:
def rename_labels(batch):
    if 'label' in batch:  # If 'label' exists in the batch
        batch['labels'] = batch.pop('label')  # Rename 'label' to 'labels'
    return batch

# Applying renaming to all datasets
piqa = piqa.map(rename_labels)
boolqa = boolqa.map(rename_labels)
winograd = winograd.map(rename_labels)
arc = arc.map(rename_labels)

Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

Map:   0%|          | 0/9427 [00:00<?, ? examples/s]

Map:   0%|          | 0/285 [00:00<?, ? examples/s]

Map:   0%|          | 0/1119 [00:00<?, ? examples/s]

In [37]:
# Define max_length for tokenization
max_length = 96  # Adjusted with my datasets

# Formatting functions for each dataset type
def formatting_func_data_distribution(example):
    if 'goal' in example:  # PIQA dataset
        return f"Goal: {example['goal']}\nSolution 1: {example['sol1']}\nSolution 2: {example['sol2']}"
    elif 'question' in example and 'passage' in example:  # BoolQA dataset
        return f"Question: {example['question']}\nPassage: {example['passage']}"
    elif 'text' in example:  # Winograd dataset
        return f"Text: {example['text']}"
    elif 'question' in example and 'choices' in example:  # ARC dataset
        choices_text = " ".join(example['choices']['text'])
        return f"Question: {example['question']}\nChoices: {choices_text}"
    else:
        return "Unknown format"

# Tokenization and prompt generation function
def generate_and_tokenize_prompt2(example):
    formatted_input = formatting_func_data_distribution(example)

    # Applying tokenization, truncation, and padding
    result = tokenizer(
        formatted_input,
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )

    # Setting labels for the task (for supervised fine-tuning)
    result["labels"] = result["input_ids"].copy()
    return result

# Tokenize in batches
def tokenize_in_batches(dataset, batch_size=1000):
    tokenized_batches = []
    total_examples = len(dataset)

    for i in range(0, total_examples, batch_size):
        # Process a batch of examples
        batch = dataset.select(range(i, min(i + batch_size, total_examples)))

        # Applying tokenization without batching (processing one example at a time)
        tokenized_batch = batch.map(generate_and_tokenize_prompt2)

        # Directly append tokenized_batch as it is already a Dataset
        tokenized_batches.append(tokenized_batch)

        # Clearing memory
        del tokenized_batch
        torch.cuda.empty_cache()  # Clearing GPU memory
        print(f"Processed batch {i} to {min(i + batch_size, total_examples)}")

    return tokenized_batches

# Applying tokenization to all datasets in small batches
tokenized_piqa = tokenize_in_batches(piqa, batch_size=1000)
tokenized_boolqa = tokenize_in_batches(boolqa, batch_size=1000)
tokenized_winograd = tokenize_in_batches(winograd, batch_size=500)  # Smaller batch due to size
tokenized_arc = tokenize_in_batches(arc, batch_size=1000)

# Combining tokenized datasets into a list for sequential training
tokenized_datasets = [tokenized_piqa, tokenized_boolqa, tokenized_winograd, tokenized_arc]

# Customizing formatting function for the trainer
def formatting_func(example):
    if 'goal' in example:  # For PIQA dataset
        return f"Goal: {example['goal']}\nSolution 1: {example['sol1']}\nSolution 2: {example['sol2']}"
    elif 'question' in example and 'passage' in example:  # For BoolQA dataset
        return f"Question: {example['question']}\nPassage: {example['passage']}"
    elif 'text' in example:  # For Winograd dataset
        return f"Text: {example['text']}"
    elif 'question' in example and 'choices' in example:  # For ARC dataset
        choices_text = " ".join(example['choices']['text'])
        return f"Question: {example['question']}\nChoices: {choices_text}"
    else:
        return "Unknown format"


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 0 to 1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 1000 to 2000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 2000 to 3000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 3000 to 4000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 4000 to 5000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 5000 to 6000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 6000 to 7000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 7000 to 8000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 8000 to 9000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 9000 to 10000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 10000 to 11000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 11000 to 12000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 12000 to 13000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 13000 to 14000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 14000 to 15000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 15000 to 16000


Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Processed batch 16000 to 16113


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 0 to 1000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 1000 to 2000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 2000 to 3000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 3000 to 4000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 4000 to 5000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 5000 to 6000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 6000 to 7000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 7000 to 8000


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 8000 to 9000


Map:   0%|          | 0/427 [00:00<?, ? examples/s]

Processed batch 9000 to 9427


Map:   0%|          | 0/285 [00:00<?, ? examples/s]

Processed batch 0 to 285


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Processed batch 0 to 1000


Map:   0%|          | 0/119 [00:00<?, ? examples/s]

Processed batch 1000 to 1119


In [38]:
#Setup Training Configuration
lora_config = LoraConfig(
    r = 8,
    target_modules = ["q_proj", "o_proj", "k_proj", "v_proj",
                      "gate_proj", "up_proj", "down_proj"],
    task_type = "CAUSAL_LM",
)

In [39]:
#Checking the changes in model
model = get_peft_model(model, lora_config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): PeftModelForCausalLM(
          (base_model): LoraModel(
            (model): PeftModelForCausalLM(
              (base_model): LoraModel(
                (model): MllamaForCausalLM(
                  (model): MllamaTextModel(
                    (embed_tokens): Embedding(128264, 4096, padding_idx=128004)
                    (layers): ModuleList(
                      (0-2): 3 x MllamaSelfAttentionDecoderLayer(
                        (self_attn): MllamaTextSelfAttention(
                          (q_proj): lora.Linear4bit(
                            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                            (lora_dropout): ModuleDict(
                              (default): Identity()
                            )
                            (lora_A): ModuleDict(
                              (default): Linear(

In [40]:
training_args = transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="results",
        optim="paged_adamw_8bit"
    )

In [None]:
#model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"

# Printing to ensure it's not empty
print(f"Loading model with ID: {model_id}")

# Ensuring model ID is valid and not empty
if not model_id:
    raise ValueError("Model ID cannot be empty.")

#print(data[0])

Loading model with ID: meta-llama/Llama-3.2-11B-Vision-Instruct
{'goal': "When boiling butter, when it's ready, you can", 'sol1': 'Pour it onto a plate', 'sol2': 'Pour it into a jar', 'label': 1, 'input_ids': [128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128009, 128000, 41092, 25, 3277, 50937, 14432, 11, 994, 433, 596, 5644, 11, 499, 649, 198, 37942, 220, 16, 25, 27058, 433, 8800, 264, 12235, 198, 37942, 220, 17, 25, 27058, 433, 1139, 264, 30695], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [None]:
# Printing out model details before training
print(f"Model configuration: {model.config}")

# Checking training arguments
print(f"Training arguments: {training_args}")


Model configuration: MllamaTextConfig {
  "bos_token_id": 128000,
  "cross_attention_layers": [
    3,
    8,
    13,
    18,
    23,
    28,
    33,
    38
  ],
  "dropout": 0,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 131072,
  "model_type": "mllama_text_model",
  "num_attention_heads": 32,
  "num_hidden_layers": 40,
  "num_key_value_heads": 8,
  "pad_token_id": 128004,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": false,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandby

In [41]:
# Function to train the model on each tokenized batch
from transformers import TrainingArguments

tokenizer.pad_token = tokenizer.eos_token

# Function to train the model on each tokenized batch
def train_on_batches(tokenized_batches):
    for tokenized_batch in tokenized_batches:
        trainer = SFTTrainer(
            model=model,
            tokenizer=tokenizer,
            train_dataset=tokenized_batch,
            args=training_args,
            peft_config=lora_config,
            formatting_func=formatting_func,
            max_seq_length=max_length
        )
        trainer.train()

        # Clearing memory after each batch training
        del tokenized_batch
        torch.cuda.empty_cache()

# Sequentially training on each tokenized dataset
for tokenized_dataset in tokenized_datasets:
    train_on_batches(tokenized_dataset)  # Passing each tokenized dataset correctly



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,3.6187
2,3.2345
3,2.6483
4,2.7686
5,2.5362
6,4.131
7,3.4216
8,3.7154
9,2.4382
10,2.7654



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.679
2,2.9932
3,2.2198
4,2.6355
5,2.1222
6,2.4754
7,2.7623
8,2.9641
9,2.1023
10,2.3509



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.9203
2,1.9796
3,2.0109
4,2.2447
5,2.3498
6,2.0037
7,2.4016
8,2.2464
9,2.4604
10,2.1553



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.2537
2,1.9571
3,1.96
4,2.0224
5,2.4424
6,2.0272
7,2.2398
8,2.2297
9,1.7433
10,2.3951



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.355
2,2.2044
3,2.6425
4,2.0806
5,1.9908
6,2.2843
7,2.4334
8,2.4067
9,1.8001
10,1.9174



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.7312
2,2.0019
3,1.8479
4,1.8569
5,1.9844
6,1.915
7,1.8657
8,1.875
9,1.6848
10,1.9948



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.9066
2,1.6815
3,2.0058
4,1.5223
5,2.2784
6,1.7823
7,1.6182
8,1.8808
9,1.8084
10,1.5262



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.7344
2,1.8201
3,1.5651
4,1.6896
5,1.6795
6,1.9208
7,1.5001
8,1.7586
9,1.6452
10,1.3207



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.6024
2,1.4891
3,1.7132
4,1.7304
5,1.391
6,1.5996
7,1.6306
8,1.9797
9,1.777
10,1.7134



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.5718
2,1.8732
3,1.2889
4,1.7211
5,1.5663
6,1.8551
7,1.7241
8,2.074
9,1.6176
10,1.7228



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.6306
2,1.8572
3,1.6195
4,1.8128
5,1.7393
6,1.6812
7,1.7341
8,1.5482
9,1.5068
10,1.7595



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.1372
2,1.6459
3,1.0181
4,1.6988
5,1.6344
6,2.3191
7,1.4573
8,1.9458
9,1.928
10,1.4781



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.5023
2,1.5583
3,1.5756
4,1.7611
5,1.4109
6,1.4573
7,1.534
8,1.5771
9,1.7423
10,1.507



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.426
2,1.0693
3,1.4254
4,1.4865
5,1.4765
6,1.3687
7,1.4915
8,1.7676
9,1.6284
10,1.5901



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.1752
2,1.7078
3,1.5239
4,1.821
5,1.6424
6,1.889
7,1.7227
8,1.6555
9,1.4821
10,1.2581



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.743
2,1.6614
3,1.692
4,1.2568
5,1.4674
6,1.6556
7,1.7761
8,1.4402
9,1.5032
10,1.8499



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.2139
2,1.4464
3,1.9007
4,1.3314
5,1.6678
6,1.7375
7,1.6586
8,1.5701
9,1.7934
10,1.5901



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.0109
2,2.2901
3,2.2331
4,2.0945
5,2.176
6,1.532
7,1.9596
8,1.8683
9,1.8068
10,1.547



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.7919
2,1.9978
3,1.8334
4,2.096
5,2.0542
6,1.4539
7,1.8631
8,2.0736
9,1.7332
10,1.5448



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.2835
2,1.4655
3,1.1892
4,2.2452
5,1.8186
6,1.1991
7,1.3919
8,1.383
9,1.7899
10,2.2972



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.3505
2,1.4667
3,1.555
4,1.5605
5,1.4918
6,1.7564
7,1.2449
8,1.8957
9,1.4032
10,1.8206



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.7059
2,1.5553
3,1.6885
4,1.6824
5,1.18
6,1.7188
7,1.9496
8,1.5692
9,0.9261
10,1.6723



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.8112
2,1.401
3,1.4009
4,1.505
5,1.4856
6,1.1918
7,1.0874
8,1.6566
9,1.2999
10,1.5996



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.0088
2,1.4641
3,1.6261
4,1.3689
5,1.3401
6,1.7191
7,1.6223
8,1.8401
9,2.1264
10,1.2174



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.4477
2,1.0739
3,1.536
4,1.6353
5,1.332
6,1.8157
7,1.6917
8,1.7693
9,1.4571
10,1.5356



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.3017
2,1.5449
3,2.1879
4,1.5258
5,1.4981
6,1.679
7,1.441
8,2.0077
9,1.1004
10,1.3128



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,1.5685
2,1.7529
3,2.2836
4,1.2232
5,1.3928
6,1.4055
7,1.7946
8,1.8389
9,1.5623
10,1.6536



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,3.2028
2,3.5202
3,3.3911
4,3.1392
5,2.9857
6,2.9373
7,3.3147
8,3.0934
9,2.3095
10,2.784



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.6599
2,2.8617
3,2.7154
4,2.1007
5,2.0344
6,1.6221
7,1.6315
8,1.7613
9,2.2064
10,1.9978



Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
max_steps is given, it will override any value given in num_train_epochs


Step,Training Loss
1,2.5309
2,2.0129
3,1.828
4,1.92
5,2.3698
6,1.6563
7,1.8249
8,1.7433
9,2.1406
10,1.7938


In [42]:
#Test the model with new prompts(added few example prompt)

#PIQA (Physical Interaction QA)
# Goal: How can you safely carry a large stack of plates?

# BoolQA (Boolean Question Answering)
# Question: Can a penguin fly?

# Winograd (Winograd Schema Challenge)
# Sentence: The city councilmen refused the demonstrators a permit because they feared violence. Who feared violence?

# ARC-C (AI2 Reasoning Challenge - Challenge Set)
# Question: What is the primary function of leaves in a plant?
# Choices:
# A) To absorb water and nutrients.
# B) To produce seeds.
# C) To protect the plant from predators.
# D) To perform photosynthesis.

text = "Is Tomato a fruit"
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=100,temperature=0.1,top_k=50,top_p=0.95)
print(tokenizer.decode(outputs[0],skip_special_tokens=True))

Is Tomato a fruit or a vegetable?
Tomato is a fruit. It is a type of fruit called a berry. It grows on a plant and contains seeds. It is often used in salads, sauces, and other dishes. It is also a popular ingredient in many cuisines around the world. Tomatoes are typically red, but they can also be yellow, green, or purple. They are a nutritious food, high in vitamins and antioxidants. They are also a good source of fiber and potassium. Tomatoes are


In [43]:
#@title Show current memory status and usage
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
7.797 GB of memory reserved.


In [49]:
#Showing the overall performance and details.
!nvidia-smi

Tue Oct  1 20:31:52 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P0              32W /  70W |   8115MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    