In [1]:
# Install Unsloth for the T4 GPU architecture
!pip install --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-xcgk7dph/unsloth_5b19229314f849fd9294263149f92b7c
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-xcgk7dph/unsloth_5b19229314f849fd9294263149f92b7c
  Resolved https://github.com/unslothai/unsloth.git to commit c6f8d00329e3c685880ebd4591b90734ebcd145a
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [2]:
# Force-install compatible versions to fix the 'ensure_weight_tying' bug
!pip install --no-deps "peft>=0.13.0" "bitsandbytes>=0.43.3"



In [3]:
!pip install unsloth_zoo



In [4]:
!pip install --upgrade peft



In [5]:
from unsloth import FastLanguageModel
import torch

# Configuration based on your PDF requirements
max_seq_length = 2048 # Requirement: Sequence length not reduced [cite: 3, 24]
dtype = None # Auto detection
load_in_4bit = True # Requirement: Resource optimization 

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

print("Model and Tokenizer loaded successfully!")

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


2026-01-04 10:59:48.889586: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767524388.910759     666 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767524388.917278     666 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767524388.934359     666 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767524388.934384     666 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767524388.934386     666 computation_placer.cc:177] computation placer alr

ü¶• Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.12.10: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Model and Tokenizer loaded successfully!


In [6]:
from unsloth import FastLanguageModel

# Re-wrap the model with LoRA so it becomes trainable on 4-bit
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,             # LoRA rank
    lora_alpha = 32,
    lora_dropout = 0.05,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],  # attention layers
    bias = "none",
)
print("LoRA adapters attached via Unsloth.")


In [7]:
from trl import SFTTrainer
from transformers import TrainingArguments

class LLAMAFineTuner:
    def __init__(self, model, tokenizer, dataset):
        self.model = model
        self.tokenizer = tokenizer
        self.dataset = dataset

    def train(self):
        # Setting up the Trainer with Resource Optimization 
        trainer = SFTTrainer(
            model = self.model,
            tokenizer = self.tokenizer,
            train_dataset = self.dataset,
            dataset_text_field = "text",
            max_seq_length = 2048, # Requirement: sequence length not reduced [cite: 24]
            dataset_num_proc = 2,
            args = TrainingArguments(
                per_device_train_batch_size = 2,
                gradient_accumulation_steps = 4,
                warmup_steps = 5,
                max_steps = 60, # Set low for a quick test; increase for better results
                learning_rate = 2e-4,
                fp16 = not torch.cuda.is_bf16_supported(), # Mixed precision 
                bf16 = torch.cuda.is_bf16_supported(),
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.01,
                lr_scheduler_type = "linear",
                seed = 3407,
                output_dir = "outputs",
            ),
        )
        
        # Start training and log progress 
        trainer_stats = trainer.train()
        return trainer_stats

# After running the model loading and LoRA setup:
# tuner = LLAMAFineTuner(model, tokenizer, dataset)
# tuner.train()

In [8]:
from datasets import load_dataset

class DatasetProcessor:
    def __init__(self, tokenizer, max_seq_length=2048):
        self.tokenizer = tokenizer
        self.max_length = max_seq_length

    def formatting_prompts_func(self, examples):
        # Maps the dataset columns to Llama-3 instruction format
        # Dataset columns: 'Question' (input) and 'Answers' (output)
        inputs  = examples["Questions"]
        outputs = examples["Answers"]
        texts = []
        for input_text, output_text in zip(inputs, outputs):
            # Formatted string for training
            text = f"### Instruction:\n‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).\n\n### Input:\n{input_text}\n\n### Response:\n{output_text}"
            texts.append(text)
        return { "text" : texts, }

    def process(self, path="/kaggle/input/bengali-empathetic-conversations-corpus"):
        # Load the specific dataset you mentioned
        dataset = load_dataset(path, split="train")
        
        # Shuffle and map to the new format
        dataset = dataset.map(self.formatting_prompts_func, batched=True)
        return dataset

# Initialize the processor and load data
processor = DatasetProcessor(tokenizer)
dataset = processor.process()

print(f"Dataset loaded! Total examples: {len(dataset)}")
print(f"Sample formatted text:\n{dataset[0]['text']}")

Map:   0%|          | 0/38233 [00:00<?, ? examples/s]

Dataset loaded! Total examples: 38233
Sample formatted text:
### Instruction:
‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).

### Input:
 ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶è‡¶¨‡¶Ç ‡¶Æ‡¶æ‡¶Ø‡¶º‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶ü‡¶æ‡¶®‡¶ü‡¶æ‡¶® ‡¶Æ‡¶§‡¶¨‡¶ø‡¶∞‡ßã‡¶ß ‡¶ö‡¶≤‡¶õ‡ßá‡•§ ‡¶Ö‡¶§‡ßÄ‡¶§‡ßá, ‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶Æ‡¶ß‡ßç‡¶Ø‡ßá ‡¶õ‡ßã‡¶ü‡¶ñ‡¶æ‡¶ü‡ßã ‡¶™‡¶æ‡¶∞‡ßç‡¶•‡¶ï‡ßç‡¶Ø ‡¶õ‡¶ø‡¶≤‡•§ ‡¶â‡¶¶‡¶æ‡¶π‡¶∞‡¶£‡¶∏‡ßç‡¶¨‡¶∞‡ßÇ‡¶™, ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶ï‡¶æ‡¶õ‡ßá ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó ‡¶ï‡¶∞‡¶¨‡ßá ‡¶Ø‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶æ ‡¶ñ‡ßÅ‡¶¨ ‡¶ï‡¶∞‡ßç‡¶§‡ßÉ‡¶§‡ßç‡¶¨‡¶™‡ßç‡¶∞‡¶Ø‡¶º‡¶æ‡¶∏‡ßÄ; ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶Æ‡¶æ ‡¶Ö‡¶≠‡¶ø‡¶Ø‡ßã‡¶ó ‡¶ï‡¶∞‡¶¨‡ßá‡¶® ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶Ö‡¶≤‡¶∏‡•§ ‡¶§‡¶¨‡ßá ‡¶á‡¶¶‡¶æ‡¶®‡ßÄ‡¶Ç ‡¶§‡¶æ ‡¶§‡ßÄ‡¶¨‡ßç‡¶∞‡¶§‡¶∞ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá ‡•§ ‡¶Ü‡¶Æ‡¶ø ‡¶Æ‡¶®‡ßá ‡¶ï‡¶∞‡¶ø, ‡¶è‡¶∞ ‡¶ï‡¶æ‡¶∞‡¶£ ‡¶π‡¶ö‡ßç‡¶õ‡ßá ‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶∏‡ßç‡¶§‡ßç‡¶∞‡ßÄ ‡¶§‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶è‡¶ï‡¶¨‡¶æ‡¶∞ ‡¶ï‡¶•‡¶æ‡¶∞ ‡¶™‡ßç‡¶∞‡¶§‡¶ø‡¶§‡ßç‡¶§‡

In [11]:
from datasets import DatasetDict

# Use the processed dataset you already created
full_dataset = dataset

# 90% train, 10% eval split
split_dataset = full_dataset.train_test_split(test_size=0.1, seed=42)

train_dataset = split_dataset["train"]
eval_dataset  = split_dataset["test"]

print("Train size:", len(train_dataset))
print("Eval size:", len(eval_dataset))
print("Sample train text:\n", train_dataset[0]["text"][:400])


Train size: 34409
Eval size: 3824
Sample train text:
 ### Instruction:
‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).

### Input:
‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¨‡¶®‡ßç‡¶ß‡ßÅ‡¶∞ ‡¶™‡ßã‡¶∑‡¶æ ‡¶™‡ßç‡¶∞‡¶æ‡¶£‡ßÄ ‡¶π‡¶ø‡¶∏‡¶æ‡¶¨‡ßá ‡¶Æ‡¶æ‡¶ï‡¶°‡¶º‡¶∏‡¶æ‡¶∞ ‡¶è‡¶ï‡¶ü‡¶ø ‡¶ó‡ßÅ‡¶ö‡ßç‡¶õ ‡¶Ü‡¶õ‡ßá, ‡¶ï‡¶ø‡¶®‡ßç‡¶§‡ßÅ ‡¶Ü‡¶Æ‡¶ø ‡¶§‡¶æ‡¶¶‡ßá‡¶∞ ‡¶∏‡¶π‡ßç‡¶Ø ‡¶ï‡¶∞‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶ø ‡¶®‡¶æ‡•§ ‡¶§‡¶æ‡¶∞‡¶æ ‡¶Ü‡¶Æ‡¶æ‡¶ï‡ßá ‡¶≠‡¶Ø‡¶º ‡¶™‡¶æ‡¶Ø‡¶º!

### Response:
‡¶Ü‡¶Æ‡¶ø ‡¶≠‡¶Ø‡¶º ‡¶™‡¶æ‡¶ö‡ßç‡¶õ‡¶ø ‡¶ï‡ßá‡¶â ‡¶¨‡ßá‡¶∞‡¶ø‡¶Ø‡¶º‡ßá ‡¶Ü‡¶∏‡¶¨‡ßá!


In [None]:
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig

# Attach LoRA to the quantized model
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    lora_alpha = 16,
    lora_dropout = 0.05,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
    bias = "none",
)
print("LoRA adapters attached.")

sft_config = SFTConfig(
    output_dir = "./llama31_bengali_empathetic_lora_tiny",
    num_train_epochs = 1,
    max_steps = 20,                      # only 20 steps
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    gradient_accumulation_steps = 1,
    learning_rate = 2e-4,
    logging_steps = 5,
    save_steps = 20,
    eval_strategy = "steps",
    eval_steps = 10,
    max_seq_length = 2048,
    fp16 = False,
    bf16 = False,
    report_to = None,
)

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    args = sft_config,
    train_dataset = small_train_dataset,
    eval_dataset = small_eval_dataset,
    dataset_text_field = "text",
)

train_result = trainer.train()
print("Training finished.")


In [14]:
import math
import torch
from torch.utils.data import DataLoader

# Use a small subset of eval data to keep it fast
ppl_eval_dataset = eval_dataset.select(range(128))  # 128 samples

def collate_fn(batch):
    texts = [ex["text"] for ex in batch]
    enc = tokenizer(
        texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=2048,
    )
    # Labels are the same as input_ids for language modeling
    enc["labels"] = enc["input_ids"].clone()
    return enc

ppl_loader = DataLoader(ppl_eval_dataset, batch_size=2, shuffle=False, collate_fn=collate_fn)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

total_loss = 0.0
total_tokens = 0

with torch.no_grad():
    for batch in ppl_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # loss is cross-entropy over all non-masked tokens
        loss = outputs.loss
        # number of tokens (ignore padding id if you want to refine later)
        num_tokens = (batch["labels"] != tokenizer.pad_token_id).sum().item()
        total_loss += loss.item() * num_tokens
        total_tokens += num_tokens

avg_nll = total_loss / total_tokens
perplexity = math.exp(avg_nll)

print(f"Average NLL: {avg_nll:.4f}")
print(f"Perplexity:  {perplexity:.4f}")


Average NLL: 4.8774
Perplexity:  131.2836


In [16]:
!pip install evaluate

import evaluate




In [18]:
!pip install rouge_score


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=1751a37889ae62e6716cd3d68014af54e2b51e36ee8ee19ba48eb33bf1c9244b
  Stored in directory: /root/.cache/pip/wheels/85/9d/af/01feefbe7d55ef5468796f0c68225b6788e85d9d0a281e7a70
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [20]:
import evaluate
import torch

bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")

metric_dataset = eval_dataset.select(range(64))

predictions = []
references  = []

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

for ex in metric_dataset:
    prompt = ex["text"]

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=1024,
    ).to(device)

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
        )

    out_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    predictions.append(out_text)
    references.append(ex["text"])

# üëâ BLEU and ROUGE both use *raw strings* here
bleu_score = bleu.compute(
    predictions=predictions,
    references=references,
)

rouge_score = rouge.compute(
    predictions=predictions,
    references=references,
)

print("BLEU:", bleu_score)
print("ROUGE:", rouge_score)


BLEU: {'bleu': 0.7267338195774924, 'precisions': [0.7329382917858602, 0.7283643892339544, 0.7247167436005035, 0.7209698000850702], 'brevity_penalty': 1.0, 'length_ratio': 1.3371584699453551, 'translation_length': 4894, 'reference_length': 3660}
ROUGE: {'rouge1': np.float64(1.0), 'rouge2': np.float64(1.0), 'rougeL': np.float64(1.0), 'rougeLsum': np.float64(1.0)}


In [21]:
test_prompts = [
    "‡¶Ü‡¶ú ‡¶Ü‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶è‡¶ï‡¶æ ‡¶≤‡¶æ‡¶ó‡¶õ‡ßá‡•§ ‡¶ï‡ßá‡¶â ‡¶®‡ßá‡¶á ‡¶™‡¶æ‡¶∂‡ßá‡•§",
    "‡¶™‡¶∞‡ßÄ‡¶ï‡ßç‡¶∑‡¶æ‡¶Ø‡¶º ‡¶ñ‡¶æ‡¶∞‡¶æ‡¶™ ‡¶∞‡ßá‡¶ú‡¶æ‡¶≤‡ßç‡¶ü ‡¶ï‡¶∞‡ßá‡¶õ‡¶ø, ‡¶Ü‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶π‡¶§‡¶æ‡¶∂‡•§",
    "‡¶Ü‡¶Æ‡¶æ‡¶∞ ‡¶¨‡¶®‡ßç‡¶ß‡ßÅ‡¶∞ ‡¶∏‡¶ô‡ßç‡¶ó‡ßá ‡¶ù‡¶ó‡¶°‡¶º‡¶æ ‡¶π‡¶Ø‡¶º‡ßá‡¶õ‡ßá, ‡¶ï‡¶ø ‡¶ï‡¶∞‡¶¨?",
]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

for p in test_prompts:
    formatted = f"### Instruction:\n‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).\n\n### Input:\n{p}\n\n### Response:\n"
    inputs = tokenizer(formatted, return_tensors="pt", truncation=True, max_length=1024).to(device)
    with torch.no_grad():
        gen_ids = model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
        )
    out_text = tokenizer.decode(gen_ids[0], skip_special_tokens=True)
    print("USER:", p)
    print("MODEL:", out_text)
    print("-" * 80)


USER: ‡¶Ü‡¶ú ‡¶Ü‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶è‡¶ï‡¶æ ‡¶≤‡¶æ‡¶ó‡¶õ‡ßá‡•§ ‡¶ï‡ßá‡¶â ‡¶®‡ßá‡¶á ‡¶™‡¶æ‡¶∂‡ßá‡•§
MODEL: ### Instruction:
‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).

### Input:
‡¶Ü‡¶ú ‡¶Ü‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶è‡¶ï‡¶æ ‡¶≤‡¶æ‡¶ó‡¶õ‡ßá‡•§ ‡¶ï‡ßá‡¶â ‡¶®‡ßá‡¶á ‡¶™‡¶æ‡¶∂‡ßá‡•§

### Response:
‡¶è‡¶ü‡¶æ ‡¶ñ‡ßÅ‡¶¨ ‡¶¨‡ßç‡¶Ø‡¶æ‡¶•‡ßç‡¶Ø ‡¶π‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡ßá‡•§ ‡¶Ü‡¶™‡¶®‡¶ø ‡¶ï‡ßã‡¶®‡ßã ‡¶∏‡¶Æ‡¶Ø‡¶º ‡¶ï‡ßá‡¶â ‡¶∏‡¶æ‡¶•‡ßá ‡¶•‡¶æ‡¶ï‡¶§‡ßá ‡¶ö‡¶æ‡¶®? ‡¶Ü‡¶Æ‡¶ø ‡¶¨‡¶≤‡¶§‡ßá ‡¶™‡¶æ‡¶∞‡¶ø ‡¶Ü‡¶™‡¶®‡¶æ‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶ï‡¶•‡¶æ ‡¶¨‡¶≤‡¶§‡ßá‡•§ ‡¶Ü‡¶Æ‡¶ø ‡¶∏‡¶π‡¶æ‡¶®‡ßÅ
--------------------------------------------------------------------------------
USER: ‡¶™‡¶∞‡ßÄ‡¶ï‡ßç‡¶∑‡¶æ‡¶Ø‡¶º ‡¶ñ‡¶æ‡¶∞‡¶æ‡¶™ ‡¶∞‡ßá‡¶ú‡¶æ‡¶≤‡ßç‡¶ü ‡¶ï‡¶∞‡ßá‡¶õ‡¶ø, ‡¶Ü‡¶Æ‡¶ø ‡¶ñ‡ßÅ‡¶¨ ‡¶π‡¶§‡¶æ‡¶∂‡•§
MODEL: ### Instruction:
‡¶∏‡¶π‡¶æ‡¶®‡ßÅ‡¶≠‡ßÇ‡¶§‡¶ø‡¶∞ ‡¶∏‡¶æ‡¶•‡ßá ‡¶â‡¶§‡ßç‡¶§‡¶∞ ‡¶¶‡¶ø‡¶® (Respond with empathy).

### Input:
‡¶™‡¶∞‡ßÄ‡¶ï‡ßç‡¶∑‡¶æ‡¶Ø‡¶º ‡¶ñ‡¶æ‡¶∞‡¶æ‡¶™ ‡¶∞‡ßá‡¶ú‡¶

In [None]:
# from peft import LoraConfig

# # Save the original initialization method
# original_init = LoraConfig.__init__

# # Define a new initialization that filters out the problematic argument
# def patched_init(self, *args, **kwargs):
#     if "ensure_weight_tying" in kwargs:
#         del kwargs["ensure_weight_tying"]
#     return original_init(self, *args, **kwargs)

# # Apply the patch
# LoraConfig.__init__ = patched_init

# print("LoraConfig successfully patched! You can now proceed to training.")

In [None]:
# import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Force only one T4 GPU

# import torch
# from unsloth import FastLanguageModel
# from trl import SFTTrainer
# from transformers import TrainingArguments

# # Load model again with strict single-GPU settings [cite: 8, 26]
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/llama-3.1-8b-instruct-bnb-4bit",
#     max_seq_length = 2048, # Requirement: Sequence length not reduced 
#     load_in_4bit = True,   # Resource optimization 
# )

# # Apply LoRA for attention layers 
# model = FastLanguageModel.get_peft_model(
#     model,
#     r = 16,
#     target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"],
#     lora_alpha = 16,
#     lora_dropout = 0,
#     bias = "none",
#     use_gradient_checkpointing = True, # Resource optimization 
#     random_state = 3407,
# )

# # Initialize the trainer with simplified logging [cite: 29]
# trainer = SFTTrainer(
#     model = model,
#     tokenizer = tokenizer,
#     train_dataset = dataset, # Use the 'dataset' we processed earlier
#     dataset_text_field = "text",
#     max_seq_length = 2048,
#     args = TrainingArguments(
#         per_device_train_batch_size = 2,
#         gradient_accumulation_steps = 4,
#         max_steps = 10,
#         learning_rate = 2e-4,
#         fp16 = True, # Mixed precision training 
#         logging_steps = 1,
#         optim = "adamw_8bit", # Resource optimization 
#         output_dir = "outputs",
#         report_to = "none",
#     ),
# )

# print("Final attempt starting...")
# trainer.train()