### Install requirements:

**.py:** ```pip install transformers datasets peft trl bitsandbytes accelerate sentencepiece```

**.ipynb:** ```!pip install transformers datasets peft trl bitsandbytes accelerate sentencepiece```


In [None]:
# !pip install transformers datasets peft trl bitsandbytes accelerate sentencepiece

In [None]:
!pip install transformers datasets peft trl bitsandbytes accelerate sentencepiece

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from peft import LoraConfig, get_peft_model

# ========== Settings ==========
# Identifier of the base model that is loaded and wrapped with LoRA.
BASE_MODEL = "openai-community/gpt2-medium"
# Directory used for Trainer output, logs (under "logs/"), and the saved adapter.
OUTPUT_DIR = "/content/lora-bulgarian-output"
# Dataset name and configuration passed to load_dataset().
DATASET_NAME = "cc100"
DATASET_CONFIG = "bg"
# Per-device training batch size handed to TrainingArguments.
BATCH_SIZE = 2
# NOTE(review): EPOCHS is not referenced by the training code in this cell;
# setup_trainer() bounds training with max_steps instead.
EPOCHS = 3
LEARNING_RATE = 2e-4
# Every example is truncated/padded to exactly this many tokens.
MAX_SEQ_LENGTH = 512
# NOTE(review): DEVICE is not used by the training functions in this cell;
# the model is loaded with device_map="auto".
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# ========== Dataset Loader ==========
def load_bulgarian_dataset():
    """Stream the configured dataset's train split and keep only longer texts.

    Only the first 1000 streamed records are considered. Of those, records
    whose stripped ``text`` field is shorter than 100 characters are dropped,
    so fewer than 1000 examples may remain.

    Returns:
        The filtered streaming dataset.
    """
    def has_enough_text(example):
        # Length is measured after trimming surrounding whitespace.
        return len(example['text'].strip()) >= 100

    stream = load_dataset(
        DATASET_NAME,
        DATASET_CONFIG,
        split="train",
        streaming=True,
        trust_remote_code=True,
    )

    # Cap first, then filter: the 1000-record limit applies before filtering.
    return stream.take(1000).filter(has_enough_text)

# ========== Model Loader ==========
def load_model_and_tokenizer():
    """Load the tokenizer and causal language model named by ``BASE_MODEL``.

    The tokenizer's pad token is set to its EOS token.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    hub_kwargs = {"trust_remote_code": True}

    tok = AutoTokenizer.from_pretrained(BASE_MODEL, **hub_kwargs)
    # Reuse EOS as the padding token.
    tok.pad_token = tok.eos_token

    base = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        device_map="auto",
        **hub_kwargs,
    )
    return base, tok

# ========== LoRA Configuration ==========
def add_lora(model):
    """Wrap ``model`` with a LoRA adapter targeting its ``c_attn`` modules.

    Args:
        model: The base causal language model to adapt.

    Returns:
        The PEFT-wrapped model produced by ``get_peft_model``.
    """
    adapter_settings = dict(
        r=8,
        lora_alpha=16,
        target_modules=["c_attn"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )
    return get_peft_model(model, LoraConfig(**adapter_settings))

# ========== Trainer Setup ==========
def setup_trainer(model, tokenizer, dataset):
    """Build a ``Trainer`` that fine-tunes ``model`` on the tokenized ``dataset``.

    Args:
        model: Model to train.
        tokenizer: Tokenizer used to encode each example's ``text`` field.
            It must have a pad token, because examples are padded to
            ``MAX_SEQ_LENGTH``.
        dataset: Dataset whose examples carry a ``text`` field.

    Returns:
        The configured ``Trainer`` (training is not started here).
    """
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        per_device_train_batch_size=BATCH_SIZE,
        gradient_accumulation_steps=16,
        learning_rate=LEARNING_RATE,
        # Training length is bounded by steps, not by an epoch count.
        max_steps=1000,
        logging_dir=f"{OUTPUT_DIR}/logs",
        logging_steps=10,
        save_strategy="steps",
        save_steps=500,
        fp16=False,
        optim="adamw_torch",
        report_to="none"  # 🚀 Disable wandb
    )

    def tokenize_function(examples):
        """Tokenize a batch of texts and attach loss labels that ignore padding."""
        tokens = tokenizer(
            examples['text'],
            truncation=True,
            padding="max_length",
            max_length=MAX_SEQ_LENGTH,
        )
        # Labels mirror input_ids, except padding positions are set to -100 so
        # the loss skips them. A plain copy would train the model to predict
        # the pad (= EOS) token over every padded position, which dominates
        # the loss for short texts padded to MAX_SEQ_LENGTH.
        tokens["labels"] = [
            [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
            for ids, mask in zip(tokens["input_ids"], tokens["attention_mask"])
        ]
        return tokens

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer
    )

    return trainer

# ========== Save LoRA Adapter ==========
def save_lora_adapter(trainer):
    """Save the model held by ``trainer`` into ``OUTPUT_DIR``.

    Args:
        trainer: Object exposing the trained model as ``trainer.model``.
    """
    trained_model = trainer.model
    trained_model.save_pretrained(OUTPUT_DIR)

# ========== Main ==========
if __name__ == "__main__":
    # End-to-end training run: load data and model, attach LoRA, train,
    # save the adapter, then zip the output directory.
    import subprocess  # local import: only this entry point needs it

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("\n🔵 Loading Bulgarian dataset...")
    dataset = load_bulgarian_dataset()

    print("\n🔵 Loading model and tokenizer...")
    model, tokenizer = load_model_and_tokenizer()

    print("\n🔵 Adding LoRA adapter...")
    model = add_lora(model)

    print("\n🔵 Setting up Trainer...")
    trainer = setup_trainer(model, tokenizer, dataset)

    print("\n🚀 Starting training...")
    trainer.train()

    print("\n💾 Saving LoRA adapter...")
    save_lora_adapter(trainer)

    print("\n📦 Zipping the output...")
    try:
        # An argument list (no shell) keeps the paths from being re-parsed by
        # a shell, and check=True surfaces a failing zip instead of ignoring it.
        subprocess.run(["zip", "-r", f"{OUTPUT_DIR}.zip", OUTPUT_DIR], check=True)
    except (OSError, subprocess.CalledProcessError) as err:
        # The adapter is already saved at this point, so report the problem
        # rather than claiming the archive exists.
        print(f"\n⚠️ Adapter saved to {OUTPUT_DIR}, but zipping failed: {err}")
    else:
        print("\n✅ Done! Bulgarian LoRA Adapter trained, saved, and zipped.")



🔵 Loading Bulgarian dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

cc100.py:   0%|          | 0.00/6.37k [00:00<?, ?B/s]


🔵 Loading model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.



🔵 Adding LoRA adapter...

🔵 Setting up Trainer...

🚀 Starting training...


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,6.6779
20,5.9335
30,3.5682
40,1.7796
50,1.2956
60,1.2107
70,1.1761
80,1.1562
90,1.1416
100,1.1313



💾 Saving LoRA adapter...

📦 Zipping the output...

✅ Done! Bulgarian LoRA Adapter trained, saved, and zipped.


---

**To save the output after training, or at any point mid-way:** ```!zip -r lora-bulgarian-output.zip /content/lora-bulgarian-output```

In [None]:
!zip -r lora-bulgarian-output.zip /content/lora-bulgarian-output

---

# LoRA Bulgarian Inference:

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# ========== Settings ==========
# Base model the LoRA adapter is attached to.
BASE_MODEL = "openai-community/gpt2-medium"
LORA_ADAPTER_PATH = "/content/lora-bulgarian-output"  # Directory containing the saved adapter files
# Device the model and the tokenized prompt are moved to.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Default cap on the number of tokens generated per prompt.
MAX_NEW_TOKENS = 200

# ========== Load Model + Attach LoRA ==========
def load_model_with_lora():
    """Load the base model, attach the saved LoRA adapter, and merge it in.

    The base model is moved to ``DEVICE`` before the adapter from
    ``LORA_ADAPTER_PATH`` is attached. The adapter is then merged into the
    model weights and the result is switched to eval mode.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    print("\n🔵 Loading tokenizer and base model...")
    tok = AutoTokenizer.from_pretrained(BASE_MODEL)
    base = AutoModelForCausalLM.from_pretrained(BASE_MODEL)
    base = base.to(DEVICE)

    print("\n🔵 Attaching LoRA adapter...")
    adapted = PeftModel.from_pretrained(base, LORA_ADAPTER_PATH)

    print("\n🔵 (Optional) Merging LoRA into model weights...")
    # Despite the "(Optional)" label, the merge always runs.
    merged = adapted.merge_and_unload()

    merged.eval()
    return merged, tok

# ========== Generate Text ==========
def generate_text(model, tokenizer, prompt, max_new_tokens=MAX_NEW_TOKENS):
    """Sample a continuation of ``prompt`` from ``model`` and decode it.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Text to continue.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded first generated sequence, with special tokens skipped.
    """
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.1,
        # EOS is passed as the pad token id for generation.
        "pad_token_id": tokenizer.eos_token_id,
    }

    encoded = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        sequences = model.generate(**encoded, **sampling_options)

    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# ========== Main ==========
if __name__ == "__main__":
    # Interactive loop: read a prompt, generate a continuation, print it,
    # and repeat until the user types "exit".
    model, tokenizer = load_model_with_lora()

    while True:
        prompt = input("\n📝 Enter a Bulgarian prompt (or 'exit' to quit):\n> ")

        # Compare on a trimmed copy so " exit " or "Exit\t" also quits. The
        # untrimmed prompt is still what gets sent to the model.
        command = prompt.strip()
        if command.lower() == "exit":
            break
        if not command:
            # A blank prompt gives the model nothing to continue; ask again
            # instead of passing it to generation.
            continue

        print("\n🚀 Generating text...")
        output = generate_text(model, tokenizer, prompt)
        print("\n🧠 Generated Text:\n")
        print(output)