# data training


In [2]:
%%capture
# Installs Unsloth, Xformers (Flash Attention) and all other packages!
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# fourbit_models = [
#     "unsloth/mistral-7b-v0.3-bnb-4bit",      # New Mistral v3 2x faster!
#     "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
#     "unsloth/llama-3-8b-bnb-4bit",           # Llama-3 15 trillion tokens model 2x faster!
#     "unsloth/llama-3-8b-Instruct-bnb-4bit",
#     "unsloth/llama-3-70b-bnb-4bit",
#     "unsloth/Phi-3-mini-4k-instruct",        # Phi-3 2x faster!
#     "unsloth/Phi-3-medium-4k-instruct",
#     "unsloth/mistral-7b-bnb-4bit",
#     "unsloth/gemma-7b-bnb-4bit",             # Gemma 2.2x faster!
# ] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/llama-3-8b-Instruct-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.1.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/220 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.8 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [None]:
import pandas as pd
from datasets import Dataset
from unsloth.chat_templates import get_chat_template

# Step 1: Load the CSV file into a pandas DataFrame
csv_file_path = "/content/drive/MyDrive/datatraining/eyecare_appointment_training_data.csv"
df = pd.read_csv(csv_file_path)

# Convert the DataFrame to a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Step 2: Define your chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",  # Replace with your desired template
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # Mapping as per your data format
)

# Step 3: Define the formatting function
def formatting_prompts_func(examples):
    # Convert each row of the DataFrame to the required format
    convos = []
    for i in range(len(examples['Human Message 1'])):
        convo = [
            {"from": "human", "value": examples['Human Message 1'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 1'][i]},
            {"from": "human", "value": examples['Human Message 2'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 2'][i]},
            {"from": "human", "value": examples['Human Message 3'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 3'][i]},
            {"from": "human", "value": examples['Human Message 4'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 4'][i]},
            {"from": "human", "value": examples['Human Message 5'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 5'][i]},
            {"from": "human", "value": examples['Human Message 6'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 6'][i]},
            {"from": "human", "value": examples['Human Message 7'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 8'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 9'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 10'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 11'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 12'][i]},
            {"from": "gpt", "value": examples['Chatbot Response 13'][i]},
        ]
        convos.append(convo)

    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# Step 4: Apply the formatting function to the dataset
dataset = dataset.map(formatting_prompts_func, batched=True)

# Now the dataset is ready for training with the formatted prompts


Map:   0%|          | 0/65 [00:00<?, ? examples/s]

In [None]:
dataset[5]

{'Human Message 1': "Hello, this is Olivia Brown calling to book an eye examination appointment for my child at the eyecare hospital. My child has been experiencing some difficulty seeing the board at school. Here are the details:\nParent's Name: Olivia Brown\nChild's Name: Lily Brown\nDOB: June 1, 2012\nPreferred Appointment Date: next tuesday\nPreferred Time: Afternoon, if possible\nReason for Visit: Difficulty seeing the board at school\nPlease call me back at your earliest convenience to confirm the appointment or suggest an alternative time if my preferred slot is not available. You can reach me at the contact number provided above. Thank you.\nBest regards,\nOlivia Brown\n(555) 789-0123",
 'Chatbot Response 1': 'Ok, thank you for sharing the details. Can you please help provide an email address and Date Of Birth?',
 'Human Message 2': 'test@first-insight.com , DOB is December 5, 2000',
 'Chatbot Response 2': 'Can you please confirm the following contact details\nPhone Number: (55

In [None]:
print(dataset[5]["text"])

<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello, this is Anthony Poole calling to book an eye examination appointment for my child at the eyecare hospital. My child, Karen, has been experiencing complaints about distant vision. Here are the details: Parent's Name: Anthony Poole, Child's Name: Karen, DOB: August 19, 2014, Preferred Appointment Date: August 27, 2024, Preferred Time: Afternoon, Reason for Visit: Complaints about distant vision.<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Ok, thank you for sharing the details. Can you please help provide an email address and Date Of Birth?<|eot_id|><|start_header_id|>user<|end_header_id|>

stevensrobyn@example.net, DOB is January 04, 1990<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Can you please confirm the following contact details? Phone Number: 793-632-2247x29006 Email: stevensrobyn@example.net DOB: January 04, 1990<|eot_id|><|start_header_id|>user<|end_header_id|>

Yes, it's correct.<|eot_id|><

In [None]:
unsloth_template = \
    "{{ bos_token }}"\
    "{{ 'You are a helpful assistant to the user\n' }}"\
    "{% for message in messages %}"\
        "{% if message['role'] == 'user' %}"\
            "{{ '>>> User: ' + message['content'] + '\n' }}"\
        "{% elif message['role'] == 'assistant' %}"\
            "{{ '>>> Assistant: ' + message['content'] + eos_token + '\n' }}"\
        "{% endif %}"\
    "{% endfor %}"\
    "{% if add_generation_prompt %}"\
        "{{ '>>> Assistant: ' }}"\
    "{% endif %}"
unsloth_eos_token = "eos_token"

if False:
    tokenizer = get_chat_template(
        tokenizer,
        chat_template = (unsloth_template, unsloth_eos_token,), # You must provide a template and EOS token
        mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
        map_eos_token = True, # Maps <|im_end|> to </s> instead
    )

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 7,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        max_steps = 60,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "/content/drive/MyDrive/datatraining",
    ),
)

Map (num_proc=2):   0%|          | 0/65 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
trainer_stats = trainer.train()


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 65 | Num Epochs = 30
O^O/ \_/ \    Batch size per device = 7 | Gradient Accumulation steps = 4
\        /    Total batch size = 28 | Total steps = 60
 "-____-"     Number of trainable parameters = 41,943,040


Step,Training Loss
1,2.871
2,2.9278
3,2.9371
4,2.6039
5,2.1013
6,1.7205
7,1.8707
8,1.7403
9,1.5349
10,1.4681


In [None]:
#@title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.748 GB.
6.699 GB of memory reserved.


In [None]:
#@title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory         /max_memory*100, 3)
lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

616.2519 seconds used for training.
10.27 minutes used for training.
Peak reserved memory = 6.699 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 45.423 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [None]:
from unsloth.chat_templates import get_chat_template

tokenizer = get_chat_template(
    tokenizer,
    chat_template = "llama-3", # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping = {"role" : "from", "content" : "value", "user" : "human", "assistant" : "gpt"}, # ShareGPT style
)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

messages = [
    {"from": "human", "value": "i want to book appointment"}
]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize = True,
    add_generation_prompt = True, # Must add for generation
    return_tensors = "pt",
).to("cuda")

outputs = model.generate(input_ids = inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


['<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\ni want to book appointment<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nPlease provide the following details:\n\n1. Name\n2. DOB (Date of Birth)\n3. Preferred Appointment Date\n4. Preferred Time (Morning/Afternoon)\n5. Reason for Visit (Select one):\n\t* Eye Check-up\n\t* Eye Problem\n\t* Eye Comfort\n\t* Eye Surgery']

In [None]:
from unsloth.chat_templates import get_chat_template

# Initialize the tokenizer with the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",  # Supports zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, unsloth
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)

# Initialize conversation list
messages = []
max_sequence_length = 2048
# Conversation loop
while True:
    # Get input from the user (human message)
    human_input = input("You: ")

    # Add the human message to the conversation
    messages.append({"from": "human", "value": human_input})

    # Apply the chat template, tokenize, and prepare for generation
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")
    if inputs.size(1) > max_sequence_length:
        inputs = inputs[:, -max_sequence_length:]
    # Generate the response from the model
    outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)

    # Decode the generated output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract the chatbot's response and add to the conversation
    chatbot_response = decoded_output[0]

    # Print the chatbot's response
    print(f"ChatBot: {chatbot_response}")
    messages.append({"from": "gpt", "value": chatbot_response})

    # Check if the conversation should continue or end
    if human_input.lower() in ["exit", "quit", "bye"]:
        print("Ending the conversation. Goodbye!")
        break


You: Hello This is John Doe, I ordered glasses on my last Visit, Can I pick up the glasses and contacts on Thursday around 8 am?
ChatBot: user

Hello This is John Doe, I ordered glasses on my last Visit, Can I pick up the glasses and contacts on Thursday around 8 am?assistant

Ok sure, Can you please help provide Date of Birth, phone number or email address
You: 555123456 test@first-insight.com 3rd Aug
ChatBot: user

Hello This is John Doe, I ordered glasses on my last Visit, Can I pick up the glasses and contacts on Thursday around 8 am?assistant

user

Hello This is John Doe, I ordered glasses on my last Visit, Can I pick up the glasses and contacts on Thursday around 8 am?assistant

Ok sure, Can you please help provide Date of Birth, phone number or email addressuser

555123456 test@first-insight.com 3rd Augassistant

Thank you for the details, can you please provide your birth Year?
You: 1998
ChatBot: user

Hello This is John Doe, I ordered glasses on my last Visit, Can I pick up t

In [None]:
from google.colab import files
files.download('/content/drive/MyDrive/datatraining/checkpoint-60/README.md')
files.download('/content/drive/MyDrive/datatraining/checkpoint-60/adapter_config.json')
files.download('/content/drive/MyDrive/datatraining/checkpoint-60/adapter_model.safetensors')
files.download('/content/drive/MyDrive/datatraining/checkpoint-60/adapter_model.safetensors')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r dataset.zip /content/drive/MyDrive/datatraining

  adding: content/drive/MyDrive/datatraining/ (stored 0%)
  adding: content/drive/MyDrive/datatraining/eyecare_appointment_training_data.csv (deflated 87%)
  adding: content/drive/MyDrive/datatraining/runs/ (stored 0%)
  adding: content/drive/MyDrive/datatraining/runs/Aug21_02-31-47_97386205e7bd/ (stored 0%)
  adding: content/drive/MyDrive/datatraining/runs/Aug21_02-31-47_97386205e7bd/events.out.tfevents.1724207517.97386205e7bd.915.0 (deflated 65%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/ (stored 0%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/README.md (deflated 66%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/adapter_model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/adapter_config.json (deflated 54%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/tokenizer_config.json (deflated 96%)
  adding: content/drive/MyDrive/datatraining/checkpoint-60/special_tokens_map.json (deflated 70%)
  ad

In [None]:
from google.colab import files
files.download('/content/drive/MyDrive/')


# model load

In [9]:
!pip install transformers




In [4]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel

import torch
# Specify the path to the directory where the model and tokenizer files are saved
model_directory = "/content/drive/MyDrive/checkpoint-60"

# Load the tokenizer from the directory
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Initialize the tokenizer with the chat template
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",  # Choose the template style that fits your use case
    mapping={"role": "from", "content": "value", "user": "human", "assistant": "gpt"},  # ShareGPT style
)

# Load the model from the directory and set the precision to float16
model = AutoModelForCausalLM.from_pretrained(model_directory, torch_dtype=torch.float16, trust_remote_code=True)

# Enable native 2x faster inference
FastLanguageModel.for_inference(model)



🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


`low_cpu_mem_usage` was None, now set to True since model is quantized.


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096, padding_idx=128255)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
            (lora_dropout): ModuleDict(
              (default): Identity()
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=4096, out_features=16, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=16, out_features=4096, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (k_proj): lora.Linear4bit(
            (base_layer): Linear4bit(in_features=4096, out_features=1024, bias=False)
            (lora_dropout): ModuleDict(
  

In [None]:
# Initialize conversation list
messages = []
max_sequence_length = 2048

# Conversation loop
while True:
    # Get input from the user (human message)
    human_input = input("You: ")

    # Add the human message to the conversation
    messages.append({"from": "human", "value": human_input})

    # Apply the chat template, tokenize, and prepare for generation
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Must add for generation
        return_tensors="pt",
    ).to("cuda")

    # Truncate the input to fit within the model's max sequence length
    if inputs.size(1) > max_sequence_length:
        inputs = inputs[:, -max_sequence_length:]

    # Generate the response from the model
    outputs = model.generate(input_ids=inputs, max_new_tokens=64, use_cache=True)

    # Decode the generated output
    decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    # Extract the chatbot's response and add to the conversation
    chatbot_response = decoded_output[0]

    # Print the chatbot's response
    print(f"ChatBot: {chatbot_response}")
    messages.append({"from": "gpt", "value": chatbot_response})

    # Check if the conversation should continue or end
    if human_input.lower() in ["exit", "quit", "bye"]:
        print("Ending the conversation. Goodbye!")
        break


You: I have an appointment tomorrow with Dr Clark, but something came up, and I need to cancel it
ChatBot: user

I have an appointment tomorrow with Dr Clark, but something came up, and I need to cancel itassistant

Sure, Can you please let me know your Name, DOB and Phone Number or Email?
You: My name is John Doe, DOB: 10th Feb 1990, My mobile number is 3333333333.assistant
ChatBot: user

I have an appointment tomorrow with Dr Clark, but something came up, and I need to cancel itassistant

user

I have an appointment tomorrow with Dr Clark, but something came up, and I need to cancel itassistant

Sure, Can you please let me know your Name, DOB and Phone Number or Email?user

My name is John Doe, DOB: 10th Feb 1990, My mobile number is 3333333333.assistantassistant

Thanks for the details, We have sent the OTP on your mobile number to verify your identity. Can you enter OTP?
You: 1111
ChatBot: user

I have an appointment tomorrow with Dr Clark, but something came up, and I need to canc