In [None]:
# !pip install -U datasets

In [4]:
!pip install -q accelerate peft bitsandbytes transformers trl

In [5]:
import math
import os
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset
from transformers import (AutoModelForCausalLM,
                   AutoTokenizer,
                   BitsAndBytesConfig,
                   HfArgumentParser,
                   TrainingArguments,
                   TextDataset,
                   DataCollatorForLanguageModeling,
                   pipeline,
                   logging)
from accelerate import Accelerator
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [None]:
# promt format for Mistral
# <s> [INST] <<SYS>> System_Promt <</SYS>> User_prompt [/INST] LLM_Response </s>

# Data Preprocessing

> Add blockquote



In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
remove_words = ["উত্তর-", "উঃ।"]

with open('data_chatbot.txt', 'w', encoding="utf8") as f:
    for index, item in data_chatbot.iterrows():
        answer = item["উত্তর"].replace('</s>', "").strip()
        question ="<s>[INST]"+" "+item["প্রশ্ন"].strip()+"[/INST]"
        #             temp_data = {"Question": question, "Answer": answer}
        #             csv_data = csv_data.append(temp_data, ignore_index = True)
        data_qa = question+" "+answer
        f.write(data_qa.replace('\n', "").replace('\r', "")+" </s>\n")

In [None]:
remove_words = ["উত্তর-", "উঃ।"]

with open('data_processed.txt', 'w', encoding="utf8") as f:
  for index, item in data.iterrows():
    answer = data2["exp_bangla"][index]
    if type(answer).__name__ == "str":
      question ="<s>[INST]"+" "+item["question_bangla"].strip()+"[/INST]"
#       question = item["question_bangla"]

      for rm_w in remove_words:
          answer = answer.replace(rm_w, "").replace('</s>', '').strip()
#         temp_data = {"Question": question, "Answer": answer}
#         csv_data = csv_data.append(temp_data, ignore_index = True)
          data_qa = question+" "+answer
          f.write(data_qa.replace('\n', "").replace('\r', "")+"</s>\n")

# LLM Model

In [None]:
# The model that you want to train from the Hugging Face hub
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# Fine-tuned model name
new_model = "Mistral-7b-QA"

# QLoRA parameters

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

# bitsandbytes parameters

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

# TrainingArguments parameters

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule
lr_scheduler_type = "cosine"

# Number of training steps (overrides num_train_epochs)
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# SFT parameters

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
device_map = {"": 0}

In [None]:
# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules = [
    "lm_head",
    "q_proj",
    "k_proj",
    "v_proj",
    "up_proj",
    "down_proj",
    "o_proj",
    "gate_proj"
  ]
)

In [None]:
max_seq_length = model.config.max_position_embeddings
print(f"Model can handle maximum {max_seq_length} tokens")

length = [len(tokenizer.encode(data["text"])) for data in dataset]
print("Max sequence length of the dataset is {}".format(max(length)))
# chatbot 785
# old 930

# For Training Only

# Inference

In [None]:
peft_model_id = "/content/drive/MyDrive/dataset_chatbot/mistral_qa/Mistral-7b-QA"
model.load_adapter(peft_model_id)

def filter_response(response):
    list_res = list(response.rsplit("।", -1))
    unique_parts = set(list_res)

    index_tuple = []

    for sentence in unique_parts:
        if sentence in list_res:
            index = list_res.index(sentence)
            index_tuple.append((sentence, index))

    sorted_list = sorted(index_tuple, key=lambda x: x[1])

    if len(sorted_list) > 3:
        sorted_list = sorted_list[:3]

    final_res = ""
    for sentence in sorted_list:
        final_res += (sentence[0]+"।")

    return final_res

def generate_text(prompt, model, tokenizer):
    prompt = f"<s>[INST] {prompt}[/INST]"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    with torch.no_grad():
        generated_code = tokenizer.decode(model.generate(**inputs, max_new_tokens=max_seq_length, pad_token_id=2)[0], skip_special_tokens=True)
#     print(generated_code)
    return filter_response(generated_code.rsplit('[/INST]', 1)[1].strip())

In [None]:
question = "আমার বাচ্চার বয়স মাত্র ৪ মাস। সে এখনও শাকসবজি খাওয়া শুরু করেনি। কি করতে পারি?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "আমার ৩ বছর এর বাচ্চা থার্মোমিটার মুখ দিয়ে ভেঙ্গে ফেলেছে। মুখে একটু কেটে গেছে। এখন কি করব?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "বাচ্চার সর্দি কাশি হলে কি কমলা বা মাল্টা খাওয়ানো যাবে?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "আমার শিশু একটি আপেলের টুকরো গিলে ফেলেছে এবং এটি ভুল পথে চলে গেছে। এখন ঠিক আছে বলে মনে হচ্ছে, কিন্তু আমার কি চিন্তা করা উচিত?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "আমার বাচ্চাকে দিনে কত বার খাওানো উচিৎ? বাচ্ছার বয়স ৩ বছর।"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "বাচ্চাদের মধ্যে অ্যাসপিরেশন কিভাবে চিকিৎসা করা হয়?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "অ্যাসপিরেশন কিভাবে চিকিৎসা করা হয়?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
question = "অ্যাসপিরেশন কিভাবে চিকিৎসা করা হয়?"
response = generate_text(question, model, tokenizer)
print(response)

In [None]:
# Import necessary libraries
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    TrainerCallback,
    TrainerState,
    TrainerControl,
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from trl import SFTTrainer
import math

In [3]:
pip install -U bitsandbytes


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.0-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.0
Note: you may need to restart the kernel to use updated packages.


In [4]:
# Import necessary libraries
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
from datasets import load_dataset
from trl import SFTTrainer
from accelerate import infer_auto_device_map, init_empty_weights

# Set environment variable to reduce fragmentation
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# The model that you want to train from the Hugging Face hub
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Fine-tuned model name
new_model = "Mistral-7b-QA"

# QLoRA parameters
lora_r = 32  # Reduced to save memory
lora_alpha = 16
lora_dropout = 0.05  # Reduced dropout
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]  # Updated target modules

# bitsandbytes parameters
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = False

# TrainingArguments parameters
output_dir = "./results"
gradient_checkpointing = True
max_grad_norm = 0.3
learning_rate = 2e-4
weight_decay = 0.001
optim = "paged_adamw_32bit"
lr_scheduler_type = "cosine"
max_steps = -1
warmup_ratio = 0.03
group_by_length = True
save_steps = 0

# SFT parameters
packing = False
num_train_epochs = 1
fp16 = True
bf16 = False
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 4

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="left",
    use_fast=False,
    add_bos_token=False,
    add_eos_token=False,
)
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad token is set

# Load the dataset
dataset_name = "/kaggle/input/noahkalifinal/Noakhali Train Translation - Sheet3.csv"
dataset = load_dataset('csv', data_files=dataset_name, split='train')

# Remove leading/trailing whitespace from column names
dataset = dataset.rename_columns({col: col.strip() for col in dataset.column_names})

# Verify the column names
print("Column Names after renaming:", dataset.column_names)

# Combine 'Question' and 'Answer' into a single text field
question_column = 'প্রশ্ন'
answer_column = 'উত্তর'

# Check if the columns exist
for col in [question_column, answer_column]:
    if col not in dataset.column_names:
        raise ValueError(f"Column '{col}' not found in the dataset. Available columns are: {dataset.column_names}")

# Shuffle the dataset
dataset = dataset.shuffle(seed=42)

# Filter valid examples
def is_valid_example(example):
    question = example[question_column]
    answer = example[answer_column]
    return (
        question is not None and answer is not None and
        isinstance(question, str) and isinstance(answer, str) and
        question.strip() != '' and answer.strip() != ''
    )

dataset = dataset.filter(is_valid_example)

# Combine 'Question' and 'Answer'
def combine_questions_answers(example):
    question = example.get(question_column, '').strip()
    answer = example.get(answer_column, '').strip()
    example['text'] = f"প্রশ্ন: {question}\nউত্তর: {answer}"
    return example

dataset = dataset.map(combine_questions_answers)

# Remove the original columns
dataset = dataset.remove_columns([question_column, answer_column])

# Verify the combined text
print("Sample entries after combining 'Question' and 'Answer':")
for i in range(min(3, len(dataset))):
    print(dataset[i]['text'])
    print()

# Configure bitsandbytes quantization with CPU offloading
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=use_nested_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    load_in_4bit_fp32_cpu_offload=True,  # Enable CPU offloading
)

# Load the model with empty weights to infer device map
with init_empty_weights():
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        trust_remote_code=True,
    )

# Define maximum memory
max_memory = {
    0: "14GiB",  # Adjust based on your GPU's free memory
    "cpu": "16GiB",  # Adjust based on your CPU's available RAM
}

# Infer the device map
from transformers import LlamaConfig
no_split_modules = ["LlamaDecoderLayer"] if isinstance(model.config, LlamaConfig) else []

device_map = infer_auto_device_map(
    model,
    max_memory=max_memory,
    no_split_module_classes=no_split_modules,
)

print("Device Map:", device_map)

# Reload the model with the device map
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
)

# Disable caching
model.config.use_cache = False

# Enable gradient checkpointing
if gradient_checkpointing:
    model.gradient_checkpointing_enable()

# Prepare LoRA configuration with updated target modules
model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,  # Updated target modules
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Determine maximum sequence length
max_seq_length = min(512, model.config.max_position_embeddings)  # Adjust as needed
print(f"Model can handle maximum {max_seq_length} tokens")

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=max_seq_length,
        padding="max_length",
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=['text'])

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none",
    logging_steps=10,
    evaluation_strategy="no",
    save_strategy="no",
)

# Initialize the SFTTrainer
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

# Start training
trainer.train()

# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
tokenizer.save_pretrained(new_model)



Generating train split: 0 examples [00:00, ? examples/s]

Column Names after renaming: ['প্রশ্ন', 'উত্তর']


Filter:   0%|          | 0/1879 [00:00<?, ? examples/s]

Map:   0%|          | 0/1879 [00:00<?, ? examples/s]

Unused kwargs: ['load_in_4bit_fp32_cpu_offload']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


Sample entries after combining 'Question' and 'Answer':
প্রশ্ন: বৌ খানের সময় বড্ডা এক্কান আসি দি কইলো, পাক কিরুম অইছে?
উত্তর: তো তুই কিয়া কইস হিয়ারে?

প্রশ্ন: এই কথাত্তে চখেত্তে হানি আইয়ের??
উত্তর: বৌ এর চোখ কান ছলছল

প্রশ্ন: ওমাহ এইচ্ছা অইলে কেন্নে অইবো??
উত্তর: হেতে কারো লগে কথা কয়না ,কারো বাসাত যায়না



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device Map: OrderedDict([('', 0)])


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model can handle maximum 512 tokens


Map:   0%|          | 0/1879 [00:00<?, ? examples/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,3.4452
20,1.9972
30,1.5705
40,1.4511
50,1.3579
60,1.2688
70,1.2965
80,1.2229
90,1.2226
100,1.2394


('Mistral-7b-QA/tokenizer_config.json',
 'Mistral-7b-QA/special_tokens_map.json',
 'Mistral-7b-QA/tokenizer.model',
 'Mistral-7b-QA/added_tokens.json')

In [5]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os

# Define the base model name (same as in your training code)
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Define the path to your fine-tuned model
fine_tuned_model_path = "/kaggle/working/Mistral-7b-QA"

# Create the offload folder if it doesn't exist
offload_folder = "/kaggle/working/offload"
os.makedirs(offload_folder, exist_ok=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    fine_tuned_model_path,
    trust_remote_code=True,
    use_fast=False,
)

# Load the base model with the correct offload parameter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    offload_folder=offload_folder,  # Use 'offload_folder' here
)

# Disable caching to prevent memory issues
base_model.config.use_cache = False

# Load the PEFT model with the correct offload parameter
model = PeftModel.from_pretrained(
    base_model,
    fine_tuned_model_path,
    offload_folder=offload_folder,  # Use 'offload_folder' here
)

# Move the model to the appropriate device (if necessary)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare your prompt
prompt = "প্রশ্ন: আইজ্জা অন্যের মনের কিসে?\nউত্তর:"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,       # Adjust as needed
        do_sample=True,          # Enable sampling for variability
        temperature=0.7,         # Adjust for creativity vs. coherence
        top_p=0.9,               # Nucleus sampling
        repetition_penalty=1.1,  # Penalty to reduce repetition
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings about missing pad_token_id
    )

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Generated Response:")
print(generated_text)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: You can't move a model that has some modules offloaded to cpu or disk.

In [2]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os

# Define the base model name (same as in your training code)
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Define the path to your fine-tuned model
fine_tuned_model_path = "/kaggle/working/Mistral-7b-QA"

# Create the offload directory if it doesn't exist
offload_dir = "/kaggle/working/offload"
os.makedirs(offload_dir, exist_ok=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    fine_tuned_model_path,
    trust_remote_code=True,
    use_fast=False,
)

# Load the base model with the correct offload parameter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    offload_dir=offload_dir,  # Use 'offload_dir' here
)

# Disable caching to prevent memory issues
base_model.config.use_cache = False

# Load the PEFT model with the correct offload parameter
model = PeftModel.from_pretrained(
    base_model,
    fine_tuned_model_path,
    offload_dir=offload_dir,  # Use 'offload_dir' here
)

# Remove model.to(device) because the model is already assigned to devices
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Prepare your prompt
prompt = "প্রশ্ন: আইজ্জা অন্যের মনের কিসে?\nউত্তর:"

# Get the device of the model's first parameter
device = next(iter(model.parameters())).device

# Tokenize the input prompt and move inputs to the correct device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,       # Adjust as needed
        do_sample=True,          # Enable sampling for variability
        temperature=0.7,         # Adjust for creativity vs. coherence
        top_p=0.9,               # Nucleus sampling
        repetition_penalty=1.1,  # Penalty to reduce repetition
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings about missing pad_token_id
    )

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Generated Response:")
print(generated_text)


ModuleNotFoundError: No module named 'peft'

In [12]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
import os

# Define the base model name
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Define the path to your fine-tuned model
fine_tuned_model_path = "/kaggle/working/Mistral-7b-QA"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    fine_tuned_model_path,
    trust_remote_code=True,
    use_fast=False,
)

# Set up bitsandbytes configuration for 8-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
)

# Load the base model with 8-bit quantization
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map="auto",
)

# Disable caching to prevent memory issues
base_model.config.use_cache = False

# Load the PEFT model
model = PeftModel.from_pretrained(
    base_model,
    fine_tuned_model_path,
)

# Prepare your prompt
prompt = "প্রশ্ন: আইজ্জা অন্যের মনের কিসে?\nউত্তর:"

# Get the device of the model's first parameter
device = next(iter(model.parameters())).device

# Tokenize the input prompt and move inputs to the correct device
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,       # Adjust as needed
        do_sample=True,          # Enable sampling for variability
        temperature=0.7,         # Adjust for creativity vs. coherence
        top_p=0.9,               # Nucleus sampling
        repetition_penalty=1.1,  # Penalty to reduce repetition
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings about missing pad_token_id
    )

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Generated Response:")
print(generated_text)


Unused kwargs: ['bnb_8bit_compute_dtype']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [6]:
# Import necessary libraries
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import os

# Define the base model name (same as in your training code)
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Define the path to your fine-tuned model
fine_tuned_model_path = "/kaggle/working/Mistral-7b-QA"

# Create the offload folder if it doesn't exist
offload_folder = "/kaggle/working/offload"
os.makedirs(offload_folder, exist_ok=True)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    fine_tuned_model_path,
    trust_remote_code=True,
    use_fast=False,
)

# Load the base model with the correct offload parameter
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
    offload_folder=offload_folder,  # Use 'offload_folder' here
)

# Disable caching to prevent memory issues
base_model.config.use_cache = False

# Load the PEFT model with the correct offload parameter
model = PeftModel.from_pretrained(
    base_model,
    fine_tuned_model_path,
    offload_folder=offload_folder,  # Use 'offload_folder' here
)

# Move the model to the appropriate device (if necessary)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare your prompt
prompt = "প্রশ্ন: আইজ্জা অন্যের মনের কিসে?\nউত্তর:"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,       # Adjust as needed
        do_sample=True,          # Enable sampling for variability
        temperature=0.7,         # Adjust for creativity vs. coherence
        top_p=0.9,               # Nucleus sampling
        repetition_penalty=1.1,  # Penalty to reduce repetition
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings about missing pad_token_id
    )

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Generated Response:")
print(generated_text)

OSError: Incorrect path_or_model_id: '/kaggle/working/Mistral-7b-QA'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
# Prepare your prompt
prompt = "প্রশ্ন: আইজ্জা অন্যের মনের কিসে?\nউত্তর:"

# Tokenize the input prompt
inputs = tokenizer(prompt, return_tensors="pt").to(device)

# Generate the model's response
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,       # Adjust as needed
        do_sample=True,          # Enable sampling for variability
        temperature=0.7,         # Adjust for creativity vs. coherence
        top_p=0.9,               # Nucleus sampling
        repetition_penalty=1.1,  # Penalty to reduce repetition
        pad_token_id=tokenizer.eos_token_id,  # Avoid warnings about missing pad_token_id
    )

# Decode the generated tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print the result
print("Generated Response:")
print(generated_text)

In [None]:
# Prepare your prompt
prompt = "প্রশ্ন: থাকগোই বাদ দে এলগা?\nউত্তর:"
# Print the result
print("Generated Response:")
print(generated_text)

In [1]:
# Install the evaluate library
!pip install evaluate --quiet

# Restart the kernel after installation (uncomment the following lines if running interactively)
# import sys
# sys.exit()

# Import necessary libraries for evaluation
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import numpy as np
from datasets import load_dataset
import evaluate

# Define the path to your fine-tuned model
new_model = "Mistral-7b-QA"  # Replace with your actual path if different

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    new_model,
    trust_remote_code=True,
    use_fast=False,
)

# Load the base model (same as used during training)
model_name = "/kaggle/input/mistral/pytorch/7b-instruct-v0.1-hf/1"

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Disable caching to prevent memory issues
base_model.config.use_cache = False

# Load the fine-tuned model with LoRA adapters
model = PeftModel.from_pretrained(
    base_model,
    new_model,
)

# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Prepare the test dataset
# Let's assume you have the 'dataset' variable from your code
# If not, reload your dataset
dataset_name = "/kaggle/input/noahkalifinal/Noakhali Train Translation - Sheet3.csv"
dataset = load_dataset('csv', data_files=dataset_name, split='train')

# Remove leading/trailing whitespace from column names
dataset = dataset.rename_columns({col: col.strip() for col in dataset.column_names})

# Combine 'Question' and 'Answer' into a single text field
question_column = 'Question'
answer_column = 'Answer'

# Filter valid examples (reuse your `is_valid_example` function)
def is_valid_example(example):
    question = example[question_column]
    answer = example[answer_column]
    return (
        question is not None and answer is not None and
        isinstance(question, str) and isinstance(answer, str) and
        question.strip() != '' and answer.strip() != ''
    )

dataset = dataset.filter(is_valid_example)

# Combine 'Question' and 'Answer'
def combine_questions_answers(example):
    question = example.get(question_column, '').strip()
    answer = example.get(answer_column, '').strip()
    example['text'] = f"প্রশ্ন: {question}\nউত্তর: {answer}"
    return example

dataset = dataset.map(combine_questions_answers)
dataset = dataset.remove_columns([question_column, answer_column])

# Split the dataset into train and test sets
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset['train']
test_dataset = split_dataset['test']

# Prepare the test dataset for evaluation
def prepare_test_examples(examples):
    questions = []
    answers = []
    for text in examples['text']:
        if 'প্রশ্ন:' in text and '\nউত্তর:' in text:
            q_a = text.split('\nউত্তর:')
            question = q_a[0]
            answer = q_a[1]
            questions.append(question + '\nউত্তর:')
            answers.append(answer.strip())
        else:
            questions.append(text)
            answers.append('')
    return {'questions': questions, 'answers': answers}

test_dataset = test_dataset.map(prepare_test_examples, batched=True, remove_columns=['text'])

# Generate predictions
from tqdm import tqdm

predictions = []
references = []

for i in tqdm(range(len(test_dataset))):
    question = test_dataset[i]['questions']
    reference = test_dataset[i]['answers']
    
    # Tokenize the question
    inputs = tokenizer(question, return_tensors='pt').to(device)
    
    # Generate the model's response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            pad_token_id=tokenizer.eos_token_id
        )
    
    # Decode the generated response
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract the answer part
    if 'উত্তর:' in generated_text:
        generated_answer = generated_text.split('উত্তর:')[1].strip()
    else:
        generated_answer = generated_text.strip()
    
    predictions.append(generated_answer)
    references.append(reference)

# Compute evaluation metrics
# We'll compute Exact Match (EM), BLEU, and ROUGE scores

# Exact Match
exact_matches = [int(pred.strip() == ref.strip()) for pred, ref in zip(predictions, references)]
em_score = np.mean(exact_matches) * 100
print(f"Exact Match (EM) Score: {em_score:.2f}%")

# BLEU Score
bleu_metric = evaluate.load('bleu')
bleu_score = bleu_metric.compute(predictions=predictions, references=references)['bleu'] * 100
print(f"BLEU Score: {bleu_score:.2f}%")

# ROUGE Score
rouge_metric = evaluate.load('rouge')
rouge_scores = rouge_metric.compute(predictions=predictions, references=references)
print("ROUGE Scores:")
for key in rouge_scores:
    score = rouge_scores[key].mid.fmeasure * 100
    print(f"{key}: {score:.2f}%")

# You can also print some sample predictions
for i in range(5):
    print(f"Question: {test_dataset[i]['questions']}")
    print(f"True Answer: {test_dataset[i]['answers']}")
    print(f"Generated Answer: {predictions[i]}")
    print('-' * 50)


ModuleNotFoundError: No module named 'peft'