In [None]:
%%capture
!pip install unsloth

!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

In [None]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, DataCollatorForSeq2Seq
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Maximum sequence length (in tokens) used when loading the model and, later,
# by the trainer and the generation helpers.
max_seq_length = 1024

# Load the 4-bit base model and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct", # "unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    load_in_4bit=True,
    dtype=None,
)

# Attach LoRA adapters to the attention and MLP projection layers.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
    use_rslora=True,
    use_gradient_checkpointing="unsloth",
    random_state = 32,
    loftq_config = None,
)

# print_trainable_parameters() prints the summary itself and returns None, so
# wrapping it in print() only added a stray "None" line to the cell output.
model.print_trainable_parameters()

==((====))==  Unsloth 2024.10.3: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: Tesla T4. Max memory: 14.748 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.4.1+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers via:
`pip uninstall transformers -y && pip install --upgrade --no-cache-dir "git+https://github.com/huggingface/transformers.git"`
Unsloth 2024.10.3 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


trainable params: 24,313,856 || all params: 3,631,066,112 || trainable%: 0.6696
None


In [None]:
# System prompt placed first in every conversation passed to the chat helpers
# further down (llama_response and the demo `messages` list).
# NOTE(review): the saved outputs of the later `sys_prompt` display cells show
# text that differs from this literal, so those outputs appear to come from
# other revisions of this cell -- re-run the notebook top to bottom to refresh.
sys_prompt = '''You're Chatterbot, a friendly AI English tutor for Indian learners. Use simple words, short responses (max 15 words per question), and a light Indian accent. Start with basic questions about favorite colors, foods, and hometown. If the student converses well, ask about family and friends.

Key points:
1. Use relatable Indian examples and gentle humor
2. Subtly correct mistakes in verb tenses, pronouns, articles, and word order
3. Be patient and encouraging
4. Keep conversations natural while providing quick English tips
5. Adapt your style to the user's responses and English level

If unsure, admit it.
'''

### **Note:** Each conversation in the data must be in this format:


```
[
  {"role": "system", "content": "You are an assistant"},
  {"role": "user", "content": "What is 2+2?"},
  {"role": "assistant", "content": "It's 4."}
]
```



In [None]:
def prepared_text_llama(messages):
  """Render one training conversation as a single Llama 3 chat-format string.

  Args:
    messages: The conversation, either as a list of ``{"role", "content"}``
      dicts or as the string form of such a list (a Python literal, as stored
      in the ``messages`` column). The first three entries are read as the
      system prompt, the first user message and the first assistant reply.

  Returns:
    The system block followed by every complete user/assistant pair, each
    message terminated by ``<|eot_id|>``. A trailing message that has no
    partner is dropped.

  Raises:
    ValueError, SyntaxError: If ``messages`` is a string that is not a valid
      Python literal.
    IndexError: If the conversation has fewer than three messages.
  """
  import ast  # local import so this cell does not depend on another cell's imports

  if isinstance(messages, str):
    # The text comes from a data file, so parse it with literal_eval: unlike
    # eval() it only accepts literals and cannot execute arbitrary expressions.
    messages = ast.literal_eval(messages)

  system_prompt = messages[0]['content']
  first_user = messages[1]['content']
  first_assistant = messages[2]['content']
  rendered = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

{first_user}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{first_assistant}<|eot_id|>'''

  # Walk the remaining messages two at a time; stopping at len - 1 skips a
  # dangling final message that has no assistant reply to pair with.
  remaining = messages[3:]
  for idx in range(0, len(remaining) - 1, 2):
    user_dt = remaining[idx]['content']
    assistant_dt = remaining[idx + 1]['content']
    rendered += f'''<|start_header_id|>user<|end_header_id|>\n\n{user_dt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_dt}<|eot_id|>'''

  return rendered
#   return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

# {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

# {user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# {assistant_output}<|eot_id|>'''

def prepared_text_llama_response(messages):
  """Build the Llama 3 generation prompt for the next assistant reply.

  Args:
    messages: List of ``{"role", "content"}`` dicts. The first entry supplies
      the system prompt and the last entry the user message to answer;
      everything in between is earlier history.

  Returns:
    The system block, every complete user/assistant pair from the history,
    the current user message, and an open assistant header for the model to
    continue from.
  """
  system_prompt, curr_user_dt = messages[0]['content'], messages[-1]['content']
  history = messages[1:-1]

  if len(history) % 2:
    # An odd history contains exactly one unpaired message. When the history
    # was trimmed so that it now starts with an assistant reply, that leading
    # reply is the orphan: dropping the last message instead would pair every
    # assistant text under a user header and vice versa. Otherwise (including
    # messages without a "role" key) drop the dangling final message.
    if history[0].get('role') == 'assistant':
      history = history[1:]
    else:
      history = history[:-1]

  prompt = f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|>"
  for idx in range(0, len(history), 2):
    user_dt = history[idx]['content']
    assistant_dt = history[idx + 1]['content']
    prompt += f'''<|start_header_id|>user<|end_header_id|>\n\n{user_dt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{assistant_dt}<|eot_id|>'''

  prompt += f"<|start_header_id|>user<|end_header_id|>\n\n{curr_user_dt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
  return prompt
#   return f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

# {system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>

# {user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

# '''

def formatting_prompt(examples):
  """Render a batch of conversations into training text.

  Args:
    examples: A batch mapping whose ``'messages'`` entry is an iterable of
      conversations, each one passed unchanged to ``prepared_text_llama``.

  Returns:
    A dict with a single ``"text"`` key holding one rendered string per
    conversation, in the same order as the input.
  """
  rendered = [prepared_text_llama(conversation) for conversation in examples['messages']]
  return {"text": rendered}

In [None]:
# Load the fine-tuning conversations. The `messages` column of this frame is
# what formatting_prompt reads once the frame is wrapped in a Dataset.
# NOTE(review): absolute path -- assumes the CSV is already present at /content.
final_cleaned_df = pd.read_csv("/content/final_data.csv")

In [None]:
final_cleaned_df.shape

(878, 1)

In [None]:
# Wrap the DataFrame in a Hugging Face Dataset, then add a `text` column by
# running formatting_prompt over the rows in batches.
training_data = Dataset.from_pandas(final_cleaned_df)
training_data = training_data.map(formatting_prompt, batched=True)

Map:   0%|          | 0/878 [00:00<?, ? examples/s]

In [None]:
training_data

Dataset({
    features: ['messages', 'text'],
    num_rows: 878
})

In [None]:
training_data[50]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nYou\'re Chatterbot, a friendly AI English tutor for Indian learners. Use simple words, short responses (max 15 words per question), and a light Indian accent. Start with basic questions about favorite colors, foods, and hometown. If the student converses well, ask about family and friends.\n\nKey points:\n1. Use relatable Indian examples and gentle humor\n2. Subtly correct mistakes in verb tenses, pronouns, articles, and word order\n3. Be patient and encouraging\n4. Keep conversations natural while providing quick English tips\n5. Adapt your style to the user\'s responses and English level\n\nIf unsure, admit it.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nNamaste! I\'m Chatterbot, your English buddy. What\'s your name?<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nMy name is Arjun.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nNic

In [None]:
# Supervised fine-tuning run: trains the LoRA-wrapped `model` on the `text`
# column of `training_data`, then starts training immediately.
trainer=SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=training_data,
    dataset_text_field="text",  # column produced by formatting_prompt
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False, # True
    args=TrainingArguments(
        learning_rate=5e-05,
        lr_scheduler_type="linear",
        # 4 samples per step x 4 accumulation steps = 16 samples per optimizer update.
        per_device_train_batch_size=4, # 16
        gradient_accumulation_steps=4, # 8
        num_train_epochs=1, # 40
        # warmup_steps = 5,
        # max_steps = 60,
        # Exactly one mixed-precision mode is enabled: bf16 when the helper
        # reports support for it, fp16 otherwise.
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,  # log the training loss at every step
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=5, # 10
        output_dir="output",
        seed=0,
        report_to=[],  # Disable W&B logging
    ),
)

trainer.train()

Map (num_proc=2):   0%|          | 0/878 [00:00<?, ? examples/s]

**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers and Unsloth!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 878 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 4 | Gradient Accumulation steps = 4
\        /    Total batch size = 16 | Total steps = 55
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,2.8046
2,3.083
3,2.8989
4,2.7796
5,2.5699
6,2.3092
7,2.1433
8,2.0127
9,1.8022
10,1.6246


TrainOutput(global_step=55, training_loss=0.7756207239898768, metrics={'train_runtime': 440.4144, 'train_samples_per_second': 1.994, 'train_steps_per_second': 0.125, 'total_flos': 6182299595120640.0, 'train_loss': 0.7756207239898768, 'epoch': 1.0})

In [None]:
# model.save_pretrained("chatterbot_lora_model") # Local saving
# tokenizer.save_pretrained("chatterbot_lora_model")

from google.colab import userdata

# Upload settings, gathered in one place so the target repository and the
# enabled upload steps are not scattered through the cell.
HF_REPO_ID = "Sravana/llama_chatterbot_4"  # Change to your own <username>/<repo>!
PUSH_MODEL_TO_HUB = False                  # Online saving of model + tokenizer
PUSH_GGUF_TO_HUB = True                    # Online saving of the GGUF exports
GGUF_QUANTIZATION_METHODS = ["q4_k_m", "q8_0", "q5_k_m",]

# Read the token only when an upload is actually enabled, and only once.
# Get a token at https://huggingface.co/settings/tokens
if PUSH_MODEL_TO_HUB or PUSH_GGUF_TO_HUB:
    hf_token = userdata.get('HF_TOKEN')

if PUSH_MODEL_TO_HUB:
    model.push_to_hub(HF_REPO_ID, token = hf_token)
    tokenizer.push_to_hub(HF_REPO_ID, token = hf_token)

# Save to multiple GGUF options - much faster if you want multiple!
if PUSH_GGUF_TO_HUB:
    model.push_to_hub_gguf(
        HF_REPO_ID,
        tokenizer,
        quantization_method = GGUF_QUANTIZATION_METHODS,
        token = hf_token,
    )

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which will take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 2.2G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 5.65 out of 12.67 RAM for saving.


100%|██████████| 28/28 [00:01<00:00, 14.93it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Unsloth: Saving Sravana/llama_chatterbot_4/pytorch_model-00001-of-00002.bin...
Unsloth: Saving Sravana/llama_chatterbot_4/pytorch_model-00002-of-00002.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp will take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits will take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['q4_k_m', 'q8_0', 'q5_k_m'] will take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: [0] Installing llama.cpp. This will take 3 minutes...
Unsloth: [1] Converting model at Sravana/llama_chatterbot_4 into f16 GGUF format.
The output location will be /content/Sravana/llama_chatterbot_4/unsloth.F16.gguf
This will take 3 minutes...
INFO:hf-to-gguf:Loading model: llama_chatterbot_4
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:rope_freqs.weight,           torch.float32 --> F32, shape = {64}
INFO:hf-to-gguf:gguf: loading model weight map from 'pytorch_model.bin.index.json'
INFO:hf-to-gguf:gguf: loading model part 'pytorch_model-00001-of-00

unsloth.F16.gguf:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

Saved GGUF to https://huggingface.co/Sravana/llama_chatterbot_4
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q4_K_M.gguf:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/Sravana/llama_chatterbot_4
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q8_0.gguf:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/Sravana/llama_chatterbot_4
Unsloth: Uploading GGUF to Huggingface Hub...


unsloth.Q5_K_M.gguf:   0%|          | 0.00/2.32G [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Saved GGUF to https://huggingface.co/Sravana/llama_chatterbot_4


In [None]:

text="Hello how are you?"
text

'Hello how are you?'

In [None]:
# Same value as the max_seq_length set before the model was loaded; the
# generation helpers below pass it as max_new_tokens.
max_seq_length = 1024
# Switch the fine-tuned model to Unsloth's inference mode.
# NOTE(review): rebinding `model` relies on for_inference returning the model;
# the commented-out loader cell near the end of this notebook calls it without
# assigning the result -- confirm the return value for the installed version.
model = FastLanguageModel.for_inference(model)

In [None]:
sys_prompt

'You\'re Chatterbot, a friendly AI English tutor for Indian learners. Use simple words, short responses (max 15 words per question), and a light Indian accent. Start with basic questions about favorite colors, foods, and hometown. If the student converses well, ask about family and friends.\n\nKey points:\n1. Use relatable Indian examples and gentle humor\n2. Subtly correct mistakes in verb tenses, pronouns, articles, and word order\n3. Be patient and encouraging\n4. Keep conversations natural while providing quick English tips\n5. Adapt your style to the user\'s responses and English level\n\nIf unsure, admit it.\n\nExample:\nBot: "Namaste! What\'s your plan for today?"\nUser: "I go to market for vegetables."\nBot: "Nice! Shopping for veggies? Quick tip: Try \'I\'m going to the market.\' Which sabzi is your favorite?'

In [None]:
!pip install -q gradio
import gradio as gr

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.3/42.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.8/319.8 kB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.7/94.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.0/78.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m447.4/447.4 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.9/141.9 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Conversation state used by llama_response. It lives at module level, so it
# is shared by every call to the function.
# NOTE(review): because of that, all chats served through the Gradio interface
# would write into this one history -- confirm that is acceptable.
messages = [{"role": "system", "content": sys_prompt}]

def llama_response(message, history):
  """Answer one chat message and record the exchange in ``messages``.

  Args:
    message: The user's new message text.
    history: Chat history supplied by the caller; it is not used, because the
      conversation is tracked in the module-level ``messages`` list instead.

  Returns:
    The assistant's reply with the chat-template markers and surrounding
    whitespace removed.
  """
  global messages

  # Bound the prompt size: keep the system prompt plus the last two complete
  # user/assistant turns. At this point the list is [system, u, a, u, a, ...],
  # so an even-sized tail starts on a user message; a 3-message tail would
  # start on an orphaned assistant reply and misalign every later pair.
  if len(messages) > 5:
    messages = [messages[0]] + messages[-4:]

  messages.append({
      "role": "user",
      "content": message
  })

  # NOTE(review): the prompt text already begins with <|begin_of_text|>;
  # confirm the tokenizer does not prepend a second BOS token here
  # (passing add_special_tokens=False would rule that out).
  inputs = tokenizer([prepared_text_llama_response(messages)], return_tensors = "pt").to("cuda")
  outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)
  decoded = tokenizer.batch_decode(outputs)
  # The reply is whatever follows the last header and precedes the end-of-turn
  # marker. The prompt puts "\n\n" after that header, so strip it; otherwise
  # the blank lines get stored in the history and re-fed on the next turn.
  answer = decoded[0].split("<|end_header_id|>")[-1].split("<|eot_id|>")[0].strip()

  messages.append({
      "role": "assistant",
      "content": answer
  })

  return answer

In [None]:
gr.ChatInterface(llama_response).launch(debug = False)

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4d30a37f1a34b14348.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
# Hand-written conversation for a quick memory check: the last user message
# asks for the name given in the first one.
# NOTE: this rebinds the module-level `messages` list that llama_response
# reads and appends to via `global messages`, so running this cell replaces
# the chat history used by that function.
messages = [
    {"role": "system", "content": sys_prompt},
    {"role": "user", "content": "my name is rakesh"},
    {"role": "assistant", "content": "nice to meet you rakesh. how are you"},
    {"role": "user", "content": "what is my name?"},
]

In [None]:
def bot(messages):
  """Generate a reply for a conversation and print it.

  Args:
    messages: List of ``{"role", "content"}`` dicts with the system prompt
      first and the user message to answer last, as expected by
      ``prepared_text_llama_response``.

  Returns:
    None. The reply is printed, followed by a separator line.
  """
  prompt = prepared_text_llama_response(messages)
  inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

  outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)
  decoded = tokenizer.batch_decode(outputs)
  # Keep only the text after the last header and before the end-of-turn
  # marker, so the printed answer no longer ends in a literal "<|eot_id|>".
  answer = decoded[0].split("<|end_header_id|>")[-1].split("<|eot_id|>")[0]
  print(f"Llama Ans: {answer.strip()}")
  print()
  print("="*10)
  print()

In [None]:
bot(messages)

Llama Ans: Your name is Rakesh.<|eot_id|>




In [None]:
# Simple console chat: each input is answered on its own (no history is
# carried between inputs). Enter "x", "X" or "quit" to stop.
while True:
  input_text = input("User Input: ")
  if input_text in ("x", "X", "quit"):
    break
  # bot() expects a list of role/content dicts with the system prompt first
  # and the user message last; passing the raw string would fail as soon as
  # the prompt builder indexes it like a list of dicts.
  bot([
      {"role": "system", "content": sys_prompt},
      {"role": "user", "content": input_text},
  ])

User Input: x


In [None]:
# @markdown **Load save lora-adapter**

# if True:
#     from unsloth import FastLanguageModel
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name = "demomern/chatterbot", # "lora_model", # YOUR MODEL YOU USED FOR TRAINING
#         max_seq_length = max_seq_length,
#         dtype = None,
#         load_in_4bit = True,
#     )
#     FastLanguageModel.for_inference(model) # Enable native 2x faster inference

In [None]:
sys_prompt

"You're Chatterbot, a friendly AI English tutor for Indian learners. Your goal: Help improve English through fun, casual chats"

In [None]:
# messages = [
#     {"role": "system", "content": sys_prompt},
#     {"role": "user", "content": "who are you?"},
#     {"role": "assistant", "content": "I am good, thank you! what about you?"},
#     {"role": "user", "content": "what is your brother name?"},
# ]
# inputs = tokenizer.apply_chat_template(
#     messages,
#     tokenize = False,
#     add_generation_prompt = False, # Must add for generation
#     return_tensors = "pt",
# )#.to("cuda")

# from transformers import TextStreamer
# text_streamer = TextStreamer(tokenizer, skip_prompt = True)
# _ = model.generate(input_ids = inputs, streamer = text_streamer, max_new_tokens = 128,
#                    use_cache = True, temperature = 1.5, min_p = 0.1)