<a href="https://www.kaggle.com/code/samardas/fine-tuning-llama-bengali-task?scriptVersionId=290221860" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

Import libraries

In [1]:
!pip install -q -U bitsandbytes transformers peft datasets evaluate sentencepiece safetensors rouge-score

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m108.4 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m556.4/556.4 kB[0m [31m35.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m512.3/512.3 kB[0m [31m32.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.2/507.2 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge-score (setup.py) ... [?25l[?25

Fine-tune LLaMA 3.1-8B-Instruct on Bengali Empathetic Data

In [14]:
# ============================================================
# Fine-tune LLaMA 3.1-8B-Instruct on Bengali Empathetic Data
# LoRA + 4-bit (bitsandbytes) – OOM-aware on T4/Colab
#
# Requirements covered:
# - LoRA (via PEFT) on LLaMA 3.1-8B
# - 4-bit quantization (QLoRA style)
# - Full max_seq_len = 512 with dynamic padding
# - OOP: DatasetProcessor, LLAMAFineTuner, Evaluator
# - Strategy pattern: LoRAStrategy (+ UnslothStrategy stub)
# - Metrics: Perplexity, BLEU, ROUGE
# - SQLite logging: LLAMAExperiments, GeneratedResponses
# - Human eval export file
# ============================================================

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256"

import gc
import math
import json
import sqlite3
from datetime import datetime

import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)
import evaluate

# ============================================================
# 0. GLOBAL CONFIG & UTILITIES
# ============================================================

MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
MAX_SEQ_LEN = 512
DB_PATH = "experiments.db"

# HF token: set via environment or replace string below for local testing.
HF_TOKEN = os.environ.get("HF_TOKEN", "****")
# IMPORTANT: Replace the placeholder above or set HF_TOKEN in your env.
# Do NOT commit your real token.

def clear_memory():
    """Free Python and CUDA memory to reduce fragmentation / OOM risk."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

clear_memory()

# ============================================================
# 1. STRATEGY PATTERN: FineTuneStrategy, LoRAStrategy, UnslothStrategy
# ============================================================

class FineTuneStrategy:
    """
    Abstract base class for fine-tuning strategies (LoRA vs Unsloth).
    """
    def setup_model(self, model_name: str, hf_token: str, max_seq_len: int):
        """
        Should return: (model, tokenizer, lora_config_dict)
        """
        raise NotImplementedError


class LoRAStrategy(FineTuneStrategy):
    """
    Standard Hugging Face + PEFT + bitsandbytes LoRA (QLoRA style).
    """

    def __init__(self, r=8, lora_alpha=16, lora_dropout=0.05):
        self.r = r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

    def setup_model(self, model_name: str, hf_token: str, max_seq_len: int):
        # 1) Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            token=hf_token,
            use_fast=False,
        )
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # 2) BitsAndBytes 4-bit config
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )

        # 3) Load 4-bit base model
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            token=hf_token,
            quantization_config=bnb_config,
            device_map="auto",           # let HF decide placement
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )

        # 4) Prepare for k-bit training + gradient checkpointing
        model.gradient_checkpointing_enable()
        model = prepare_model_for_kbit_training(model)

        # 5) LoRA config (attention-only for efficiency)
        lora_config = LoraConfig(
            r=self.r,
            lora_alpha=self.lora_alpha,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
            lora_dropout=self.lora_dropout,
            bias="none",
            task_type="CAUSAL_LM",
        )

        # 6) Apply LoRA
        model = get_peft_model(model, lora_config)
        model.config.use_cache = False
        model.print_trainable_parameters()

        lora_config_dict = {
            "type": "LoRA-PEFT",
            "r": self.r,
            "lora_alpha": self.lora_alpha,
            "lora_dropout": self.lora_dropout,
            "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj"],
        }

        return model, tokenizer, lora_config_dict


class UnslothStrategy(FineTuneStrategy):
    """
    Stub for Unsloth-based strategy.
    You can implement this with unsloth.FastLanguageModel if you install Unsloth.
    For the assignment, this class + LoRAStrategy demonstrate the Strategy pattern.
    """

    def setup_model(self, model_name: str, hf_token: str, max_seq_len: int):
        raise NotImplementedError(
            "UnslothStrategy not implemented in this script. "
            "Implement with unsloth.FastLanguageModel if desired."
        )

# ============================================================
# 2. DATASET PROCESSOR
# ============================================================

class DatasetProcessor:
    """
    Handles:
    - Building LLaMA chat-style prompts from input/response pairs.
    - Tokenization with full sequence length and dynamic padding.
    """

    def __init__(self, tokenizer, max_seq_len: int):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def build_prompt(self, example):
        """
        Build LLaMA 3.1 chat-style prompt.

        Structure:
          <|begin_of_text|><|start_header_id|>user<|end_header_id|>

          {input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

          {response}<|eot_id|>
        """
        return (
            "<|begin_of_text|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{example['input']}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
            f"{example['response']}<|eot_id|>"
        )

    def tokenize(self, example):
        """
        Tokenize a single example:
        - Truncate at max_seq_len (512)
        - No padding here (dynamic padding in data collator)
        - labels = input_ids for causal LM
        """
        text = self.build_prompt(example)
        enc = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_seq_len,
            padding=False,
        )
        enc["labels"] = enc["input_ids"].copy()
        return enc

    def prepare(self, data_list, val_size=0.2):
        """
        Convert a Python list of dicts to HF Dataset,
        split into train/val, and tokenize.

        data_list: list of {"input": ..., "response": ...}
        returns: (train_tokenized, val_tokenized, val_raw_for_eval)
        """
        ds = Dataset.from_list(data_list).train_test_split(
            test_size=val_size,
            seed=42,
        )
        train_raw = ds["train"]
        val_raw = ds["test"]

        train_tokenized = train_raw.map(
            self.tokenize,
            remove_columns=train_raw.column_names,
        )
        val_tokenized = val_raw.map(
            self.tokenize,
            remove_columns=val_raw.column_names,
        )

        return train_tokenized, val_tokenized, val_raw

# ============================================================
# 3. LLAMAFineTuner
# ============================================================

class LLAMAFineTuner:
    """
    Orchestrates:
    - Model + tokenizer setup via a FineTuneStrategy (LoRA or Unsloth)
    - Trainer creation
    """

    def __init__(self, strategy: FineTuneStrategy, model_name: str, hf_token: str, max_seq_len: int):
        self.strategy = strategy
        self.model_name = model_name
        self.hf_token = hf_token
        self.max_seq_len = max_seq_len

        # Use the chosen strategy to get model + tokenizer + lora_config dict
        self.model, self.tokenizer, self.lora_config = self.strategy.setup_model(
            model_name=self.model_name,
            hf_token=self.hf_token,
            max_seq_len=self.max_seq_len,
        )

    def get_trainer(self, train_ds, val_ds, output_dir="./llama31_bn_lora_outputs"):
        """
        Build a Hugging Face Trainer for causal LM fine-tuning.
        """
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=False,
        )

        training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=1,
            per_device_eval_batch_size=1,
            gradient_accumulation_steps=4,
            num_train_epochs=3,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=5,
            save_steps=1000,
            save_total_limit=1,
            report_to="none",
        )

        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=val_ds,
            data_collator=data_collator,
        )
        return trainer

# ============================================================
# 4. EVALUATOR (perplexity, BLEU, ROUGE, human eval export)
# ============================================================

class Evaluator:
    """
    Handles:
    - Perplexity computation via Trainer
    - Text generation
    - BLEU & ROUGE
    - Export of samples for human evaluation
    """

    def __init__(self, model, tokenizer, processor: DatasetProcessor, trainer: Trainer, max_seq_len: int):
        self.model = model
        self.tokenizer = tokenizer
        self.processor = processor
        self.trainer = trainer
        self.max_seq_len = max_seq_len
        self.device = next(self.model.parameters()).device

        self.bleu_metric = evaluate.load("bleu")
        self.rouge_metric = evaluate.load("rouge")

    def compute_perplexity(self, eval_dataset):
        """
        Compute perplexity from eval loss.
        """
        results = self.trainer.evaluate(eval_dataset=eval_dataset)
        eval_loss = results["eval_loss"]
        ppl = math.exp(eval_loss)
        return ppl, eval_loss

    def generate_response(self, user_text: str, max_new_tokens: int = 128):
        """
        Generate a response for a given user_text using chat-style prompt.
        """
        prompt = (
            "<|begin_of_text|>"
            "<|start_header_id|>user<|end_header_id|>\n\n"
            f"{user_text}<|eot_id|>"
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        inputs = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_seq_len,
        ).to(self.device)

        self.model.eval()
        with torch.no_grad():
            output_ids = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
                top_k=50,
                repetition_penalty=1.2,
                no_repeat_ngram_size=3,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        full_text = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
        return full_text

    def compute_bleu_rouge(self, val_raw, num_samples: int = None, log_callback=None):
        """
        Compute BLEU & ROUGE on a subset of val_raw.

        val_raw: HF Dataset with columns ["input", "response"]
        num_samples: limit number of samples (None = use all)
        log_callback: optional function(input_text, pred_text) for logging
        """
        if num_samples is None:
            n = len(val_raw)
        else:
            n = min(num_samples, len(val_raw))

        inputs = []
        preds = []
        refs = []

        for ex in val_raw.select(range(n)):
            user_text = ex["input"]
            ref = ex["response"]
            pred = self.generate_response(user_text)

            inputs.append(user_text)
            preds.append(pred)
            refs.append(ref)

            if log_callback is not None:
                log_callback(user_text, pred)

        bleu = self.bleu_metric.compute(
            predictions=preds,
            references=[[r] for r in refs],
        )
        rouge = self.rouge_metric.compute(
            predictions=preds,
            references=refs,
        )

        metrics = {
            "bleu": bleu["bleu"],
            "rouge1": rouge["rouge1"],
            "rouge2": rouge["rouge2"],
            "rougeL": rouge["rougeL"],
        }
        return metrics, inputs, preds, refs

    def prepare_human_eval_file(self, inputs, references, predictions, path="human_eval_samples.jsonl"):
        """
        Export a JSONL file for human evaluation.

        Each line: {
          "user_input": ...,
          "reference_response": ...,
          "model_response": ...
        }
        Human raters can add empathy/relevance/fluency scores.
        """
        with open(path, "w", encoding="utf-8") as f:
            for u, r, p in zip(inputs, references, predictions):
                rec = {
                    "user_input": u,
                    "reference_response": r,
                    "model_response": p,
                }
                f.write(json.dumps(rec, ensure_ascii=False) + "\n")
        return path

# ============================================================
# 5. SQLITE LOGGER
# ============================================================

class ProjectLogger:
    """
    Logs experiments and generated responses into SQLite.

    Tables:
      LLAMAExperiments(id, model_name, lora_config, train_loss, val_loss, metrics, timestamp)
      GeneratedResponses(id, experiment_id, input_text, response_text, timestamp)
    """

    def __init__(self, db_path=DB_PATH):
        self.db_path = db_path
        self._init_db()

    def _init_db(self):
        conn = sqlite3.connect(self.db_path)
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS LLAMAExperiments (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                model_name TEXT,
                lora_config TEXT,
                train_loss REAL,
                val_loss REAL,
                metrics TEXT,
                timestamp TEXT
            )
            """
        )
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS GeneratedResponses (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                experiment_id INTEGER,
                input_text TEXT,
                response_text TEXT,
                timestamp TEXT
            )
            """
        )
        conn.commit()
        conn.close()

    def log_experiment(self, model_name, lora_config_dict, train_loss, val_loss, metrics_dict):
        """
        Insert one experiment row and return experiment_id.
        """
        conn = sqlite3.connect(self.db_path)
        cur = conn.cursor()
        cur.execute(
            """
            INSERT INTO LLAMAExperiments (model_name, lora_config, train_loss, val_loss, metrics, timestamp)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            (
                model_name,
                json.dumps(lora_config_dict, ensure_ascii=False),
                float(train_loss) if train_loss is not None else None,
                float(val_loss) if val_loss is not None else None,
                json.dumps(metrics_dict, ensure_ascii=False),
                datetime.now().isoformat(),
            ),
        )
        experiment_id = cur.lastrowid
        conn.commit()
        conn.close()
        return experiment_id

    def log_response(self, experiment_id: int, input_text: str, response_text: str):
        """
        Log a single generated response linked to an experiment.
        """
        conn = sqlite3.connect(self.db_path)
        conn.execute(
            """
            INSERT INTO GeneratedResponses (experiment_id, input_text, response_text, timestamp)
            VALUES (?, ?, ?, ?)
            """,
            (
                experiment_id,
                input_text,
                response_text,
                datetime.now().isoformat(),
            ),
        )
        conn.commit()
        conn.close()

# ============================================================
# 6. BENGALI EMPATHETIC DATA (same as your script)
# ============================================================

bengali_data = [
    # Loneliness
    {"input": "আমার খুব একা লাগছে", "response": "আমি বুঝতে পারছি, একাকীত্ব অনেক কষ্টের হতে পারে। আমি তোমার পাশে আছি।"},
    {"input": "আমার সাথে কেউ কথা বলে না", "response": "আমি তোমার কথা শুনতে চাই, নিজেকে একা মনে করো না। আমি এখানে আছি তোমার জন্য।"},
    {"input": "সবার মাঝে থেকেও নিজেকে খুব একা মনে হয়", "response": "মাঝে মাঝে ভিড়ের মাঝেও এমন অনুভূতি হতে পারে। তুমি কি কারো সাথে মন খুলে কথা বলতে পেরেছ?"},
    # Sadness & depression
    {"input": "আজ আমার মন খুব খারাপ", "response": "দুঃখিত শুনে, মন খারাপ হওয়াটা অস্বাভাবিক কিছু না। চাইলে তুমি কারণটি আমার সাথে শেয়ার করতে পারো।"},
    {"input": "আমি হতাশ বোধ করছি", "response": "তুমি একা না, জীবন মাঝে মাঝে কঠিন হয়ে পড়ে। আমি এখানে আছি তোমার কথা শোনার জন্য।"},
    {"input": "সবকিছু কেমন জানি অর্থহীন লাগছে", "response": "হতাশা যখন ঘিরে ধরে তখন এমন মনে হতে পারে। তবে মনে রেখো, তোমার এই অনুভূতিগুলো সাময়িক, আমি তোমার পাশে আছি।"},
    {"input": "আমার কান্না পাচ্ছে কিন্তু আমি কাঁদতে পারছি না", "response": "আবেগ আটকে রাখা খুব কষ্টকর। তুমি যদি মন হালকা করতে চাও, আমি ধৈর্য ধরে শুনছি।"},
    # Anxiety & stress
    {"input": "আমি অনেক চাপের মধ্যে আছি", "response": "চাপের ফলে এমন লাগা স্বাভাবিক। একটু গভীর নিশ্বাস নাও এবং চেষ্টা করো পর্যাপ্ত বিশ্রাম নেওয়ার।"},
    {"input": "ভবিষ্যৎ নিয়ে আমি খুব দুশ্চিন্তায় আছি", "response": "ভবিষ্যৎ অনিশ্চিত বলে চিন্তা হওয়াটা স্বাভাবিক। তবে ছোট ছোট পদক্ষেপে এগিয়ে যাওয়াটাই এখন সবচেয়ে ভালো হবে।"},
    {"input": "সবকিছু কেন আমার সাথে খারাপ হচ্ছে?", "response": "আমি বুঝতে পারছি তুমি অনেক কষ্টের মধ্য দিয়ে যাচ্ছ। ধৈর্য ধরো, সবকিছু ধীরে ধীরে ঠিক হয়ে যাবে।"},
    {"input": "আমার খুব ভয় লাগছে", "response": "ভয় পাওয়াটা মানবিক অনুভূতি। তুমি কি বলতে পারবে ঠিক কোন জিনিসটা তোমাকে ভাবিয়ে তুলছে?"},
    # Failure & confidence
    {"input": "আমি মনে হয় জীবনে কিছুই করতে পারব না", "response": "ব্যর্থতা মানেই শেষ নয়, এটি সাফল্যের একটি অংশ। তোমার ভেতরে অনেক সম্ভাবনা আছে।"},
    {"input": "সবাই আমাকে ঘৃণা করে", "response": "তোমার এমন কেন মনে হচ্ছে? তুমি অনেক মূল্যবান একজন মানুষ, নিজেকে ছোট ভেবো না।"},
    {"input": "আমি কি কখনো সুখী হতে পারব?", "response": "অবশ্যই পারবে। সুখের সময়গুলো যেমন ফিরে আসে, দুঃখের সময়গুলোও তেমন চলে যায়। বিশ্বাস হারিও না।"},
    # Anger & frustration
    {"input": "আমি খুব রেগে আছি সবার ওপর", "response": "রাগ হওয়াটা স্বাভাবিক। চাইলে তুমি রাগটা প্রকাশ করতে পারো, আমি শুনছি। রাগ কমানোর জন্য গভীর শ্বাস নেওয়া খুব কার্যকর হতে পারে।"},
    {"input": "কেউ আমাকে বোঝে না", "response": "না বোঝাটা খুব যন্ত্রণাদায়ক। আমি তোমাকে বোঝার চেষ্টা করছি, তুমি কি বিস্তারিত বলতে চাও?"},
    # Social / family
    {"input": "পরিবারের সাথে আমার কোনো বনিবনা হচ্ছে না", "response": "পারিবারিক সম্পর্কের জটিলতা অনেক চাপের সৃষ্টি করে। তুমি কি এ নিয়ে কারো সাথে আলাপ করেছ?"},
    {"input": "বন্ধুরা আমাকে অবহেলা করছে", "response": "অবহেলা খুব কষ্ট দেয়। অনেক সময় ভুল বোঝাবুঝির কারণেও এমন হয়, তুমি কি তাদের সাথে কথা বলে দেখেছ?"},
]

# ============================================================
# 7. END-TO-END PIPELINE
# ============================================================

# 7.1 Choose strategy (LoRA or Unsloth)
strategy = LoRAStrategy(r=4, lora_alpha=8, lora_dropout=0.05)
# (UnslothStrategy stub exists to satisfy Strategy pattern requirement.)

# 7.2 Initialize fine-tuner
tuner = LLAMAFineTuner(
    strategy=strategy,
    model_name=MODEL_NAME,
    hf_token=HF_TOKEN,
    max_seq_len=MAX_SEQ_LEN,
)

# 7.3 Prepare dataset
processor = DatasetProcessor(tuner.tokenizer, MAX_SEQ_LEN)
train_ds, val_ds, val_raw = processor.prepare(bengali_data, val_size=0.2)

# 7.4 Build Trainer
trainer = tuner.get_trainer(train_ds, val_ds)

# 7.5 Train
trainer.train()

# 7.6 Evaluation: Perplexity, BLEU, ROUGE
clear_memory()
evaluator = Evaluator(
    model=tuner.model,
    tokenizer=tuner.tokenizer,
    processor=processor,
    trainer=trainer,
    max_seq_len=MAX_SEQ_LEN,
)

ppl, val_loss = evaluator.compute_perplexity(val_ds)
print(f"\nValidation loss: {val_loss:.4f}")
print(f"Perplexity: {ppl:.2f}")

metrics_br, inputs, preds, refs = evaluator.compute_bleu_rouge(
    val_raw,
    num_samples=None,   # all val examples
)

print("\nBLEU:", metrics_br["bleu"])
print("ROUGE-1:", metrics_br["rouge1"])
print("ROUGE-2:", metrics_br["rouge2"])
print("ROUGE-L:", metrics_br["rougeL"])

# 7.7 Human eval export file
human_eval_path = evaluator.prepare_human_eval_file(
    inputs=inputs,
    references=refs,
    predictions=preds,
    path="human_eval_samples.jsonl",
)
print("Human eval samples saved to:", human_eval_path)

# 7.8 Logging to SQLite
logger = ProjectLogger()

# Get last training loss from Trainer log
train_loss = None
if trainer.state.log_history:
    for log in reversed(trainer.state.log_history):
        if "loss" in log:
            train_loss = log["loss"]
            break

all_metrics = {"perplexity": ppl}
all_metrics.update(metrics_br)

experiment_id = logger.log_experiment(
    model_name=MODEL_NAME,
    lora_config_dict=tuner.lora_config,
    train_loss=train_loss,
    val_loss=val_loss,
    metrics_dict=all_metrics,
)
print("Experiment logged with ID:", experiment_id)

# Log generated responses for this experiment
for inp, pred in zip(inputs, preds):
    logger.log_response(
        experiment_id=experiment_id,
        input_text=inp,
        response_text=pred,
    )

# ============================================================
# 8. SAVE MODEL & TOKENIZER
# ============================================================

tuner.model.to("cpu")
clear_memory()

save_dir = "bengali-llama31-lora-4bit-final"
tuner.model.save_pretrained(save_dir)
tuner.tokenizer.save_pretrained(save_dir)

print("\nModel & tokenizer saved to:", save_dir)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

trainable params: 3,407,872 || all params: 8,033,669,120 || trainable%: 0.0424


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

  return fn(*args, **kwargs)


Step,Training Loss
5,1.0703
10,0.8999



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.



Validation loss: 1.0266
Perplexity: 2.79

BLEU: 0.0
ROUGE-1: 0.0
ROUGE-2: 0.0
ROUGE-L: 0.0
Human eval samples saved to: human_eval_samples.jsonl
Experiment logged with ID: 6



Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct/resolve/main/config.json.
Access to model meta-llama/Llama-3.1-8B-Instruct is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Llama-3.1-8B-Instruct.



Model & tokenizer saved to: bengali-llama31-lora-4bit-final


In [15]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

MODEL_BASE = "meta-llama/Llama-3.1-8B-Instruct"
LORA_PATH = "bengali-llama31-lora-4bit-final"  # your saved folder
HF_TOKEN = os.environ.get("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")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_BASE,
    token=HF_TOKEN,
    use_fast=False
)
tokenizer.pad_token = tokenizer.eos_token

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_BASE,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.float16,
)

model = PeftModel.from_pretrained(base_model, LORA_PATH)
model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=4, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=4, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

In [19]:
def test_finetuned_model(
    model,
    tokenizer,
    user_text: str,
    max_new_tokens: int = 128,
    temperature: float = 0.7,
):
    """
    Simple post-finetuning test function.
    Call this after training to verify behavior.
    """

    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_text}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    ).to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
            repetition_penalty=1.2,
            pad_token_id=tokenizer.eos_token_id,
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response


In [20]:
test_cases = [
    "আমার খুব একা লাগছে",
    "আমি খুব হতাশ বোধ করছি",
    "কেউ আমাকে বোঝে না",
    "ভবিষ্যৎ নিয়ে আমি খুব চিন্তিত",
    "আমি মনে হয় জীবনে কিছুই করতে পারব না",
]

print("\n===== MODEL TEST RESULTS =====\n")

for i, text in enumerate(test_cases, 1):
    output = test_finetuned_model(model, tokenizer, text)
    print(f"[{i}] USER: {text}")
    print(f"[{i}] ASSISTANT:\n{output}")
    print("-" * 60)



===== MODEL TEST RESULTS =====

[1] USER: আমার খুব একা লাগছে
[1] ASSISTANT:
user

আমার খুব একা লাগছেassistant

তোর সাথে যোগাযোগ করি, হ্যাংগিন অনুভৃতি পড়ে।
------------------------------------------------------------
[2] USER: আমি খুব হতাশ বোধ করছি
[2] ASSISTANT:
user

আমি খুব হতাশ বোধ করছিassistant

সে যা নিজেই প্রণয়, এটি অন্যের চেষ্টাও। ভালো-মন্দ ফুরিয়ে গেলে মাথায় ঠিকমতো ঢেউ ডুমবে না।

*   ঘৃণ্য না হতে
------------------------------------------------------------
[3] USER: কেউ আমাকে বোঝে না
[3] ASSISTANT:
user

কেউ আমাকে বোঝে নাassistant

"তুচ্ছ-হয়, সন্দেহ।"

* যারা এর ভিডিয়ো দেখেছে, তারা জানে প্রশ্নটি কীই থাকল।
 
 "অফ... গুড!"
------------------------------------------------------------
[4] USER: ভবিষ্যৎ নিয়ে আমি খুব চিন্তিত
[4] ASSISTANT:
user

ভবিষ্যৎ নিয়ে আমি খুব চিন্তিতassistant

এরা সাধারণই।  প্রথমে, হৃদয়ে উন্মাদ করো না, জীবনের অংশ-অংশে ছোড়া ফেলো. সঙ্গে সঙ্গে এটাও বলো - 'আমি কার
------------------------------------------------------------
[5] USER: আমি মনে হয় জীবনে কিছুই কর

In [None]:
# ===============================
#  Test multiple inputs nicely
# ===============================

test_cases = [
    "আমার খুব একা লাগছে",
    "আমি খুব হতাশ বোধ করছি",
    "কেউ আমাকে বোঝে না",
    "ভবিষ্যৎ নিয়ে আমি খুব চিন্তিত",
    "আমি মনে হয় জীবনে কিছুই করতে পারব না",
]

def test_finetuned_model(model, tokenizer, user_input):
    """
    Wrapper function for generating responses with nice prompt formatting.
    """
    prompt = (
        "<|begin_of_text|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_input}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Extract only assistant's reply
    response = full_text.split("assistant")[-1].strip() if "assistant" in full_text else full_text
    return response

print("\n===== MODEL TEST RESULTS =====\n")

for i, text in enumerate(test_cases, 1):
    output = test_finetuned_model(model, tokenizer, text)
    print(f"[{i}] USER:      {text}")
    print(f"[{i}] ASSISTANT: {output}\n")
    print("="*80)



===== MODEL TEST RESULTS =====

[1] USER:      আমার খুব একা লাগছে
[1] ASSISTANT: নির্ভয়, তোমরা সহায়তা পেতে চিন্তিত থাকো না। ধরুন শুরূতি দিই:

*   ডাওন উঠলে, সন্ধ্যার (4-5:00 pm) ৩/৪ ঘন্টা অফিসে থেমনে যাও।
*   6:30 টা, ঘৃ�

[2] USER:      আমি খুব হতাশ বোধ করছি
[2] ASSISTANT: নিজের চ্যালেঞ্জগু‌লো পা‌র-সফল ভাবে অতিক্র‌‍‌‌‌‍া‌ন ‌থে‌কে‌‌‍ এই নেওয়া উচি‌ত। দৃ‌ষ্টিভঙ্গি পরিবর‌্তন ক‌রা, নিজস্ব আত্মবিশ্বাস বাড�

[3] USER:      কেউ আমাকে বোঝে না
[3] ASSISTANT: "এরপর, যদি স্টিফেন হতে চাওছিল অন্য জীভার গঠন, শুধু এই দৃষ্টান্তেই।  'খাও,' বলত'—- সংঘণাঙ্কাঞ্জ কৌড়া থেক নিবৈ গুৎ. ঐ সামজে �



In [None]:
# ===============================
#  TEST MULTIPLE INPUTS USING STEP 12 PROMPT
# ===============================

test_cases = [
    "আমার খুব একা লাগছে",
    "আমি খুব হতাশ বোধ করছি",
    "কেউ আমাকে বোঝে না",
    "ভবিষ্যৎ নিয়ে আমি খুব চিন্তিত",
    "আমি মনে হয় জীবনে কিছুই করতে পারব না",
]

def test_finetuned_model(model, tokenizer, user_input):
    """
    Generate response using STEP 12 LLaMA 3.1 prompt format.
    """
    prompt = (
        f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    
    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # Extract only assistant's reply
    response = full_text.split("assistant")[-1].strip() if "assistant" in full_text else full_text
    return response

print("\n===== MODEL TEST RESULTS =====\n")

for i, text in enumerate(test_cases, 1):
    output = test_finetuned_model(model, tokenizer, text)
    print(f"[{i}] USER:      {text}")
    print(f"[{i}] ASSISTANT: {output}\n")
    print("="*80)



===== MODEL TEST RESULTS =====

[1] USER:      আমার খুব একা লাগছে
[1] ASSISTANT: নিজের সাথে যোগাযু্হ্ধ, নিরাপদ ও শান্ত। 
- 1.অভ্যাস: ঘরে চলুন টেলিফোন ডাইল বন্ধ করু'ন। 

2. উঠো ঝুয়ে দিন।

3. বাইরেরেকটের্ণে ঢু:ক।
4. প্�

