<a href="https://colab.research.google.com/github/shreyazmz/Adding-Notifications-to-an-App-using-Angular/blob/master/llama3_json_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training the Llama 3 model

In [1]:
#imp cell to run
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers "trl<0.9.0" peft accelerate bitsandbytes

In [2]:
!pip install bitsandbytes

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch->bitsandbytes)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch->bitsandbyte

In [3]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
    "unsloth/llama-3-8b-Instruct-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
    "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


config.json:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

==((====))==  Unsloth: Fast Llama patching release 2024.7
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [5]:
#run this code
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
import json
from datasets import Dataset

# Load your dataset from a JSON file, make sure to include the file name and extension
with open('/content/drive/MyDrive/training_data.json') as f: # Replace 'your_file.json' with the actual name of your JSON file
    custom_data= json.load(f)

In [7]:
def format_custom_data(data):
    formatted_data = {"instruction": [], "output": []}
    for entry in data:
        text = entry["text"]
        qa_pairs = text.split('**Q:')
        for qa in qa_pairs:
            if '**A:' in qa:
                parts = qa.split('**A:')
                question = parts[0].strip()
                answer = '**A:'.join(parts[1:]).strip()  # Join remaining parts if there are multiple **A:
                formatted_data["instruction"].append(question)
                formatted_data["output"].append(answer)
    return Dataset.from_dict(formatted_data)

dataset = format_custom_data(custom_data)  # Use custom_data instead of dataset if needed

alpaca_prompt = """Below is an instruction. Write a response that appropriately completes the request.

### Instruction:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

# Check the formatted data
print(dataset[0])



Map:   0%|          | 0/923 [00:00<?, ? examples/s]

{'instruction': '** What happens if you are deficient in B12?', 'output': "** Symptoms of B12 deficiency can include fatigue, paleness, memory problems, dementia, Alzheimer's disease, neurological issues like peripheral neuropathy, and anemia.", 'text': "Below is an instruction. Write a response that appropriately completes the request.\n\n### Instruction:\n** What happens if you are deficient in B12?\n\n### Response:\n** Symptoms of B12 deficiency can include fatigue, paleness, memory problems, dementia, Alzheimer's disease, neurological issues like peripheral neuropathy, and anemia.<|end_of_text|>"}


In [8]:

def format_custom_data(data):
    formatted_data = {"instruction": [], "output": []}
    for entry in data:
        text = entry["text"]
        qa_pairs = text.split('**Q:')
        for qa in qa_pairs:
            if '**A:' in qa:
                parts = qa.split('**A:')
                question = parts[0].strip()
                answer = '**A:'.join(parts[1:]).strip()  # Join remaining parts if there are multiple **A:
                formatted_data["instruction"].append(question)
                formatted_data["output"].append(answer)
    return Dataset.from_dict(formatted_data)

dataset = format_custom_data(custom_data)

# Print the first few entries to verify the formatting
for i in range(5):
    print(f"Entry {i}:")
    print("Instruction:", dataset["instruction"][i])
    print("Output:", dataset["output"][i])
    print()


Entry 0:
Instruction: ** What happens if you are deficient in B12?
Output: ** Symptoms of B12 deficiency can include fatigue, paleness, memory problems, dementia, Alzheimer's disease, neurological issues like peripheral neuropathy, and anemia.

Entry 1:
Instruction: ** What is the most common cause of B12 deficiency?
Output: ** The most common cause of B12 deficiency is low stomach acid, not necessarily a poor diet.

Entry 2:
Instruction: ** How can you improve B12 absorption if you have low stomach acid?
Output: ** Drinking apple cider vinegar with meals can help acidify the stomach and improve B12 absorption.

Entry 3:
Instruction: ** Who is most likely to have a B12 deficiency?
Output: ** One-third of people over the age of 65 have a B12 deficiency, often associated with low stomach acid.

Entry 4:
Instruction: ** What should you do if you are taking antacids or not eating a healthy diet?
Output: ** If you are taking antacids or not consuming enough minerals, you may lose stomach ac

In [9]:
alpaca_prompt = """Below is an instruction. You are a knowledgeable nutritionist. Please provide a detailed and accurate response to the following question.

### Instruction:
{}

### Response:
{}"""
EOS_TOKEN = tokenizer.eos_token  # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    outputs = examples["output"]
    texts = []
    for instruction, output in zip(instructions, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

dataset = dataset.map(formatting_prompts_func, batched=True)

# Print the first few formatted entries to verify the formatting
for i in range(5):
    print(f"Formatted Entry {i}:")
    print(dataset[i]["text"])
    print()


Map:   0%|          | 0/923 [00:00<?, ? examples/s]

Formatted Entry 0:
Below is an instruction. You are a knowledgeable nutritionist. Please provide a detailed and accurate response to the following question.

### Instruction:
** What happens if you are deficient in B12?

### Response:
** Symptoms of B12 deficiency can include fatigue, paleness, memory problems, dementia, Alzheimer's disease, neurological issues like peripheral neuropathy, and anemia.<|end_of_text|>

Formatted Entry 1:
Below is an instruction. You are a knowledgeable nutritionist. Please provide a detailed and accurate response to the following question.

### Instruction:
** What is the most common cause of B12 deficiency?

### Response:
** The most common cause of B12 deficiency is low stomach acid, not necessarily a poor diet.<|end_of_text|>

Formatted Entry 2:
Below is an instruction. You are a knowledgeable nutritionist. Please provide a detailed and accurate response to the following question.

### Instruction:
** How can you improve B12 absorption if you have low 

New Hyperparameters adjusted

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=4,  # Adjusted batch size
        gradient_accumulation_steps=8,  # Adjusted gradient accumulation steps
        warmup_steps=50,  # Increased warmup steps
        max_steps=200,  # Increased max steps for longer training
        learning_rate=1e-4,  # Adjusted learning rate
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=10,  # Increased logging steps for more frequent updates
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=42,  # Ensure reproducibility
        output_dir="outputs",
    ),
)

trainer.train()


Map (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 923 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 8
\        /    Total batch size = 32 | Total steps = 200
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
10,1.9467
20,1.557
30,1.3661
40,1.2494
50,1.1708
60,1.0873
70,1.0529
80,1.0437
90,1.0115
100,0.9088


TrainOutput(global_step=200, training_loss=0.990162878036499, metrics={'train_runtime': 3492.9579, 'train_samples_per_second': 1.832, 'train_steps_per_second': 0.057, 'total_flos': 4.502577498741965e+16, 'train_loss': 0.990162878036499, 'epoch': 6.926406926406926})

In [17]:
#save the model now
!mkdir -p "/content/drive/My Drive/fine_tuned_model"

In [18]:
model.save_pretrained("/content/drive/My Drive/fine_tuned_model")

# Save the tokenizer
tokenizer.save_pretrained("/content/drive/My Drive/fine_tuned_model")

('/content/drive/My Drive/fine_tuned_model/tokenizer_config.json',
 '/content/drive/My Drive/fine_tuned_model/special_tokens_map.json',
 '/content/drive/My Drive/fine_tuned_model/tokenizer.json')

In [23]:

import torch

# Define the model name and other configurations
model_name = "unsloth/llama-3-8b-bnb-4bit"
use_4bit = True
bnb_4bit_quant_type = "fp16"
bnb_4bit_compute_dtype = "float16"
use_nested_quant = False
device_map = "auto"  # Adjust based on your hardware setup

In [30]:
#adding conversational memory to fine-tuned model
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to the directory where your model is saved
model_path = "/content/drive/My Drive/fine_tuned_model"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_path)


`low_cpu_mem_usage` was None, now set to True since model is quantized.


In [31]:
conversation_history = []

In [32]:
def update_conversation_history(input_text, model_response):
    conversation_history.append((input_text, model_response))

In [33]:
def generate_prompt_with_memory(input_text):
    full_prompt = ""
    for (user_input, model_response) in conversation_history:
        full_prompt += f"User: {user_input}\nModel: {model_response}\n"
    full_prompt += f"User: {input_text}\nModel:"
    return full_prompt

In [34]:
def generate_response(input_text):
    # Generate prompt with conversation memory
    prompt = generate_prompt_with_memory(input_text)

    # Tokenize input prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')

    # Generate response
    with torch.no_grad():
        output = model.generate(input_ids, max_length=100, pad_token_id=tokenizer.eos_token_id)

    # Decode and return model response
    model_response = tokenizer.decode(output[0], skip_special_tokens=True)
    return model_response

In [36]:
# Save the updated model
model.save_pretrained("/content/drive/My Drive/fine_tuned_model")

# Save the tokenizer (only if you've made changes to it)
tokenizer.save_pretrained("/content/drive/My Drive/fine_tuned_model")



('/content/drive/My Drive/fine_tuned_model/tokenizer_config.json',
 '/content/drive/My Drive/fine_tuned_model/special_tokens_map.json',
 '/content/drive/My Drive/fine_tuned_model/tokenizer.json')

In [None]:
#save model and test performance before adding additional functionalities

In [None]:
#calling nutrition api to answer nutrition based questions
import requests

def fetch_nutrition_info(query):
    api_url = 'https://api.api-ninjas.com/v1/nutrition?query={}'.format(query)
    headers = {'X-Api-Key': 'NdQIVABDKI3MTuhFLCBWpA==0bLdL8lP0nzaqU7t'}  # Replace with your actual API key
    response = requests.get(api_url, headers=headers)

    if response.status_code == requests.codes.ok:
        nutrition_data = response.json()
        return nutrition_data
    else:
        print("Error:", response.status_code, response.text)
        return None

In [None]:
def process_nutrition_data(nutrition_data):
    if not nutrition_data:
        return "Sorry, I couldn't fetch the nutrition information at the moment."

    nutrition_info = nutrition_data.get('nutrition', {})
    if not nutrition_info:
        return "No nutrition information found for this query."

    result = []
    result.append(f"A {nutrition_info.get('serving_size')} serving typically contains:")
    result.append(f"- Calories: {nutrition_info.get('calories')}")

    # Add more fields as needed
    if 'total_fat' in nutrition_info:
        result.append(f"- Total Fat: {nutrition_info.get('total_fat')['amount']} grams")
        result.append(f"  (of which Saturated Fat is {nutrition_info.get('total_fat').get('saturated_fat', {}).get('amount', 'N/A')} grams)")

    # Add more fields based on the API response structure

    return "\n".join(result)

In [None]:
#previous code for hyperparameters
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
    ),
)

trainer.train()


Map (num_proc=2):   0%|          | 0/923 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 923 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 100
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.3581
2,2.5852
3,2.9929
4,2.4592
5,2.7237
6,2.0559
7,2.0262
8,1.6958
9,1.6529
10,1.8102


TrainOutput(global_step=100, training_loss=1.3590038061141967, metrics={'train_runtime': 502.7584, 'train_samples_per_second': 1.591, 'train_steps_per_second': 0.199, 'total_flos': 4225189357240320.0, 'train_loss': 1.3590038061141967, 'epoch': 0.8658008658008658})

In [None]:
#save the model now
!mkdir -p "/content/drive/My Drive/Llama3_fine_tuned"

In [None]:
model.save_pretrained("/content/drive/My Drive/Llama3_finetuned")

# Save the tokenizer
tokenizer.save_pretrained("/content/drive/My Drive/Llama3_finetuned")

('/content/drive/My Drive/Llama3_finetuned/tokenizer_config.json',
 '/content/drive/My Drive/Llama3_finetuned/special_tokens_map.json',
 '/content/drive/My Drive/Llama3_finetuned/tokenizer.json')

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import torch

# Define the model name and other configurations
model_name = "unsloth/llama-3-8b-bnb-4bit"
use_4bit = True
bnb_4bit_quant_type = "fp16"
bnb_4bit_compute_dtype = "float16"
use_nested_quant = False
device_map = "auto"  # Adjust based on your hardware setup

# Load the base model with quantization configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
from peft import PeftModel

# Load your PeftModel from the saved checkpoint in Google Drive
model = PeftModel.from_pretrained(base_model,'/content/drive/MyDrive/Llama3_finetuned')
model = model.merge_and_unload()  # Merge LoRA weights and unload them if applicable



In [None]:
print(base_model)

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): Ll

In [None]:
print(model.config)  # Print configuration details
print(model.state_dict().keys())

LlamaConfig {
  "_name_or_path": "unsloth/llama-3-8b-bnb-4bit",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 128000,
  "eos_token_id": 128001,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 8192,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "num_key_value_heads": 8,
  "pretraining_tp": 1,
  "quantization_config": {
    "_load_in_4bit": true,
    "_load_in_8bit": false,
    "bnb_4bit_compute_dtype": "bfloat16",
    "bnb_4bit_quant_storage": "uint8",
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": true,
    "llm_int8_enable_fp32_cpu_offload": false,
    "llm_int8_has_fp16_weight": false,
    "llm_int8_skip_modules": null,
    "llm_int8_threshold": 6.0,
    "load_in_4bit": true,
    "load_in_8bit": false,
    "quant_method": "bitsandbytes"
  },
  "rm

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/464 [00:00<?, ?B/s]

In [None]:
prompt = "How to train a LLM"

# Tokenize the input
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

In [None]:
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

In [None]:
print(response)