<a href="https://colab.research.google.com/github/Nwokike/igbo-bilingual-chat/blob/main/Fine_tune_Igbo_Chat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Cell 1: Setup
# --- 1. Install Libraries ---
# Installs Unsloth (with its "colab-new" extra) together with transformers, peft,
# bitsandbytes, datasets and trl before the import statements that follow.
print("--- [1/10] Installing libraries (Unsloth, Transformers, etc.)... ---")
# NOTE: `!pip ...` is IPython/Colab shell syntax, so this cell only runs inside a
# notebook. No versions are pinned, so every session installs whatever pip
# resolves at that moment.
!pip install "unsloth[colab-new]" transformers peft bitsandbytes datasets trl
import torch
from unsloth import FastLanguageModel
from datasets import load_dataset, concatenate_datasets
from transformers import TrainingArguments
from trl import SFTTrainer
import time
import os
from huggingface_hub import login, create_repo, HfApi, snapshot_download
import random
import getpass
print("--- ‚úÖ [1/10] Libraries installed ---")

# --- 2. HUGGING FACE LOGIN & AUTO-REPO CREATION ---
# Prompts for a write token, logs in, looks up the account name and makes sure
# the target model repo exists. `token` and `NEW_HUB_REPO` are ordinary
# module-level names, so later cells can read them directly; a `global`
# statement at module level has no effect and is therefore not used.
print("\n--- [2/10] Hugging Face Login & Repo Setup ---")
# getpass reads the token without echoing it into the cell output.
token = getpass.getpass("Please paste your Hugging Face 'write' token: ")
login(token=token)

try:
    whoami_info = HfApi().whoami(token=token)
    whoami = whoami_info['name']
except Exception as e:
    print(f"--- ‚ö†Ô∏è Could not get username. Please check your token. Error: {e} ---")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise
else:
    print(f"--- Logged in as: {whoami} ---")

# Checkpoints go to a repo under the logged-in account.
NEW_HUB_REPO = f"{whoami}/Igbo-Phi3-Bilingual-Chat-v1"
print(f"--- Your new checkpoints will be saved to: {NEW_HUB_REPO} ---")

print(f"--- Checking for/creating repo {NEW_HUB_REPO}...")
try:
    # exist_ok=True makes this call safe to repeat when the repo already exists.
    create_repo(repo_id=NEW_HUB_REPO, exist_ok=True, repo_type="model")
except Exception as e:
    print(f"--- ‚ö†Ô∏è Could not create repo. Error: {e} ---")
    raise
else:
    print(f"--- ‚úÖ [2/10] Repo '{NEW_HUB_REPO}' is ready! ---")

# --- 3. Load the *BASE* Model & Tokenizer ---
print("\n--- [3/10] Loading base Phi-3 model... ---")
# Loader settings; `max_seq_length` is reused further down when the LoRA
# adapters and the trainer are created.
max_seq_length = 1024
dtype = None  # passed through to the loader unchanged
load_in_4bit = True

# The checkpoint id is written in lowercase.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/phi-3-mini-4k-instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
print("--- ‚úÖ [3/10] Base model loaded ---")

# --- 4. Add PEFT/LoRA Adapters ---
print("\n--- [4/10] Adding PEFT/LoRA adapters... ---")
# Projection layers that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]
# Rebinds `model` to the adapter-wrapped model returned by Unsloth.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=lora_target_modules,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing=True,
    random_state=42,
    max_seq_length=max_seq_length,
)
print("--- ‚úÖ [4/10] PEFT adapters added ---")

# --- 5. Load Dataset 1: "Fluency" (522k Sentences) ---
print("\n--- [5/10] Loading 1/3: Igbo Sentences (ccibeekeoc42)... ---")
# Train split of the English/Igbo sentence-pair dataset from the Hub.
igbo_sentences = load_dataset(
    "ccibeekeoc42/english_to_igbo",
    split="train",
)

def format_translation_chat(example):
    """Turn one English/Igbo sentence pair into a single-turn chat sample.

    Args:
        example: Mapping with 'English' and 'Igbo' entries.

    Returns:
        ``{"text": <chat string>}`` using one of three prompt templates picked
        at random, or ``{"text": None}`` when either entry is missing or empty.
    """
    english = example.get('English')
    igbo = example.get('Igbo')
    if not english or not igbo:
        return {"text": None}

    # Two English->Igbo phrasings plus one bare Igbo->English turn.
    candidates = (
        f"<s><|user|>\nHow would I say '{english}' in Igbo?<|end|>\n<|assistant|>\nYou would say '{igbo}'.<|end|>",
        f"<s><|user|>\nWhat's the Igbo for '{english}'?<|end|>\n<|assistant|>\n{igbo}<|end|>",
        f"<s><|user|>\n{igbo}<|end|>\n<|assistant|>\n{english}<|end|>",
    )
    chosen = random.choice(candidates)
    return {"text": chosen}

# Convert every sentence pair to chat text, dropping the raw columns so only
# "text" remains, then discard rows the formatter rejected.
igbo_sentence_columns = list(igbo_sentences.features)
formatted_igbo_chat = igbo_sentences.map(
    format_translation_chat,
    remove_columns=igbo_sentence_columns,
    num_proc=os.cpu_count(),  # one worker per CPU core reported by the OS
)
formatted_igbo_chat = formatted_igbo_chat.filter(
    lambda row: row["text"] is not None
)
print(f"--- ‚úÖ [5/10] Loaded {len(formatted_igbo_chat)} 'Fluency' examples ---")

# --- 6. Load Dataset 2: "Vocabulary" (11.7MB Dictionary) ---
print("\n--- [6/10] Loading 2/3: Igbo Dictionary (nkowaokwu/ibo-dict)... ---")
# Only the single JSON file of the dictionary repo is loaded.
igbo_dict = load_dataset(
    "nkowaokwu/ibo-dict",
    data_files="ibo-dict.json",
    split="train",
)

def format_dictionary_chat(example):
    """Turn one dictionary entry into a single-turn "what does X mean" chat sample.

    Args:
        example: Mapping with 'igbo' (headword) and 'english' (definition)
            entries. Assumes both hold plain strings; a non-string value would
            be interpolated via its ``str()`` form (TODO: confirm against the
            dataset schema).

    Returns:
        ``{"text": <chat string>}`` using one of two prompt templates picked at
        random, or ``{"text": None}`` when either entry is missing or empty.
    """
    igb_word = example.get('igbo')
    eng_def = example.get('english')
    # A single guard is enough: once both values are known to be truthy, the
    # lookups cannot fail, so no try/except or second check is needed.
    if not (igb_word and eng_def):
        return {"text": None}

    templates = [
        f"<s><|user|>\nWhat does the Igbo word '{igb_word}' mean?<|end|>\n<|assistant|>\nIt means: {eng_def}<|end|>",
        f"<s><|user|>\nDefine '{igb_word}'.<|end|>\n<|assistant|>\n{igb_word} means {eng_def}.<|end|>",
    ]
    return {"text": random.choice(templates)}

# Convert every dictionary entry to chat text, dropping the raw columns so only
# "text" remains, then discard rows the formatter rejected.
igbo_dict_columns = list(igbo_dict.features)
formatted_igbo_dict = igbo_dict.map(
    format_dictionary_chat,
    remove_columns=igbo_dict_columns,
    num_proc=os.cpu_count(),  # one worker per CPU core reported by the OS
)
formatted_igbo_dict = formatted_igbo_dict.filter(
    lambda row: row["text"] is not None
)
print(f"--- ‚úÖ [6/10] Loaded {len(formatted_igbo_dict)} 'Vocabulary' examples ---")

# --- 7. Load Dataset 3: "Memory" (200k Chat) ---
print("\n--- [7/10] Loading 3/3: General Chat (HuggingFaceH4/ultrachat_200k)... ---")
# The SFT training split of the general-purpose chat dataset.
general_chat = load_dataset(
    "HuggingFaceH4/ultrachat_200k",
    split="train_sft",
)

def format_general_chat(example):
    """Turn the first user/assistant exchange of a conversation into a chat sample.

    Only the first two messages are used; any later turns are ignored.

    Args:
        example: Mapping with a 'messages' sequence of ``{'role', 'content'}``
            mappings.

    Returns:
        ``{"text": <chat string>}`` when the conversation opens with a user
        message followed by an assistant message and the user prompt does not
        contain "translate" (case-insensitive); otherwise ``{"text": None}``.
        Malformed rows (missing keys, wrong types) also yield ``{"text": None}``.
    """
    try:
        messages = example['messages']
        if len(messages) < 2:
            return {"text": None}
        first, second = messages[0], messages[1]
        if first['role'] != 'user' or second['role'] != 'assistant':
            return {"text": None}
        user_prompt = first['content']
        ai_response = second['content']
        # Skip any prompt that mentions translation.
        if "translate" in user_prompt.lower():
            return {"text": None}
    except (KeyError, IndexError, TypeError, AttributeError):
        # Malformed row: treated as "no sample" rather than aborting the map.
        # Only the lookup/type errors such a row can raise are caught, so
        # unrelated failures are no longer hidden.
        return {"text": None}

    text = f"<s><|user|>\n{user_prompt}<|end|>\n<|assistant|>\n{ai_response}<|end|>"
    return {"text": text}

# Convert every conversation to chat text, dropping the raw columns so only
# "text" remains, then discard rows the formatter rejected.
general_chat_columns = list(general_chat.features)
formatted_general_chat = general_chat.map(
    format_general_chat,
    remove_columns=general_chat_columns,
    num_proc=os.cpu_count(),  # one worker per CPU core reported by the OS
)
formatted_general_chat = formatted_general_chat.filter(
    lambda row: row["text"] is not None
)
print(f"--- ‚úÖ [7/10] Loaded {len(formatted_general_chat)} 'Memory' examples ---")

# --- 8. Combine All Datasets ---
print("\n--- [8/10] Combining all 3 datasets into one... ---")
# Concatenate the three formatted sources, then shuffle with a fixed seed so
# they are interleaved in a repeatable order.
dataset_parts = [
    formatted_igbo_chat,
    formatted_igbo_dict,
    formatted_general_chat,
]
combined_dataset = concatenate_datasets(dataset_parts)
final_dataset = combined_dataset.shuffle(seed=42)
TOTAL_EXAMPLES = len(final_dataset)
print(f"--- ‚úÖ [8/10] TOTAL EXAMPLES FOR TRAINING: {TOTAL_EXAMPLES} ---")

# --- 9. Define Training Arguments ---
# Builds the TrainingArguments used by the trainer and derives the step budget
# from the dataset size.
print("\n--- [9/10] Setting up Training Arguments... ---")
# Imported here so this section brings its own dependency into scope.
from unsloth import is_bfloat16_supported

# Single source of truth for the batch geometry: the same two constants feed
# both the step calculation and TrainingArguments, so they cannot drift apart.
PER_DEVICE_BATCH_SIZE = 8
GRAD_ACCUM_STEPS = 2
EFFECTIVE_BATCH_SIZE = PER_DEVICE_BATCH_SIZE * GRAD_ACCUM_STEPS

# One optimizer step consumes EFFECTIVE_BATCH_SIZE rows; floor at 1 so a very
# small dataset never produces max_steps == 0.
# NOTE(review): the trainer in this notebook is created with packing=True. If
# packing merges several short rows into one sequence, this count would cover
# more than one pass over the data - confirm the intended number of epochs.
NEW_MAX_STEPS = max(1, TOTAL_EXAMPLES // EFFECTIVE_BATCH_SIZE)
print(f"--- Total Examples: {TOTAL_EXAMPLES} ---")
print(f"--- Effective Batch Size: {EFFECTIVE_BATCH_SIZE} ---")
print(f"--- CALCULATED MAX STEPS: {NEW_MAX_STEPS} (for ~1 epoch) ---")

# The model is loaded with dtype=None, so the mixed-precision flag must follow
# what the GPU supports instead of being hard-coded to fp16.
use_bf16 = is_bfloat16_supported()

training_args = TrainingArguments(
    max_steps = NEW_MAX_STEPS,
    per_device_train_batch_size = PER_DEVICE_BATCH_SIZE,
    gradient_accumulation_steps = GRAD_ACCUM_STEPS,
    optim = "adamw_8bit",  # explicitly select the 8-bit AdamW optimizer
    learning_rate = 2e-5,
    lr_scheduler_type = "linear",
    save_strategy = "steps",
    save_steps = 1000,
    save_total_limit = 1,
    push_to_hub = True,
    hub_model_id = NEW_HUB_REPO,
    hub_strategy = "checkpoint",
    logging_steps = 500,
    fp16 = not use_bf16,
    bf16 = use_bf16,
    group_by_length = True,
    report_to = "none",
)
print(f"--- ‚úÖ [9/10] Training Arguments set for {NEW_MAX_STEPS} steps ---")

# --- 10. Initialize the Trainer ---
print("\n--- [10/10] Initializing the SFTTrainer... ---")
# `trainer` is a module-level name that the training cells use afterwards.
# Samples are read from the "text" column of the combined dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=final_dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    args=training_args,
    packing=True,
)
print("--- ‚úÖ [10/10] Trainer is ready! ---")

# Closing banner summarising where checkpoints will be pushed.
banner_rule = "=" * 50
print("\n" + banner_rule)
print("   üöÄ ALL SETUP IS COMPLETE AND CORRECT üöÄ")
print("   You are ready to train the BILINGUAL CHATBOT.")
print(f"   Your checkpoints will be saved to: {NEW_HUB_REPO}")
print(banner_rule + "\n")

In [None]:
# Cell 2: Start the "Colab Relay Race" (Worker 1 ONLY)
# Runs training from step 0. Errors raised by the trainer are reported and not
# re-raised; the duration summary is printed whether or not training finished.
print("--- üöÄ STARTING the 'Colab Relay Race' (Worker 1) ---")
print("--- This will run from step 0 and create the first checkpoint. ---")
print(f"--- Training for a total of {NEW_MAX_STEPS} steps. ---")

start_time_train = time.time()
try:
    # Worker 1 starts from scratch, so no resume path is passed.
    trainer.train()
except Exception as e:
    print(f"\n--- üí• Training interrupted by unexpected error: {e} ---")
else:
    print("\n--- üéâ TRAINING COMPLETED NORMALLY! ---")
finally:
    end_time_train = time.time()
    elapsed_minutes = (end_time_train - start_time_train) / 60
    print(f"--- Training run duration: {elapsed_minutes:.2f} minutes ---")
    print(f"--- üõë Session ended. Your checkpoint is safe on Hugging Face: {NEW_HUB_REPO} ---")

In [None]:
# Cell 3: Resume the "Colab Relay Race" (Worker 2, 3...)
# Downloads the most recent checkpoint folder from the Hub repo and continues
# training from it. Requires Cell 1 to have run in this session (it defines
# `token`, `NEW_HUB_REPO` and `trainer`).

# --- Configuration ---
HUB_MODEL_ID = NEW_HUB_REPO
HUB_CHECKPOINT_SUBFOLDER = "last-checkpoint"
LOCAL_CHECKPOINT_PATH = os.path.join(os.path.expanduser("~"), "local_hub_resume")

print("--- üëü RESUMING Training (Worker 2/3/...) ---")
print(f"--- ‚¨áÔ∏è Downloading latest checkpoint from Hub: {HUB_MODEL_ID} ---")

# --- 1. Download Checkpoint Files Locally ---
try:
    snapshot_download(
        repo_id=HUB_MODEL_ID,
        # Fetch only the checkpoint subfolder, not the whole repo.
        allow_patterns=[f"{HUB_CHECKPOINT_SUBFOLDER}/*"],
        local_dir=LOCAL_CHECKPOINT_PATH,
        local_dir_use_symlinks=False,
        token=token,  # defined in Cell 1
    )
except Exception as e:
    print(f"--- ‚ö†Ô∏è FAILED to download checkpoint. Did Worker 1 run and save a checkpoint? Error: {e} ---")
    # Bare `raise` re-raises the active exception with its original traceback.
    raise

# --- 2. Define the Local Path to Resume From ---
RESUME_PATH = os.path.join(LOCAL_CHECKPOINT_PATH, HUB_CHECKPOINT_SUBFOLDER)
# A download whose pattern matches no files still succeeds, so check explicitly
# that a checkpoint arrived instead of letting the resume fail obscurely.
if not os.path.isdir(RESUME_PATH) or not os.listdir(RESUME_PATH):
    raise FileNotFoundError(
        f"No '{HUB_CHECKPOINT_SUBFOLDER}' folder was found in {HUB_MODEL_ID}. "
        "Run Cell 2 (Worker 1) until it has pushed a checkpoint before resuming."
    )
print("--- ‚úÖ Checkpoint downloaded. ---")
print(f"--- üéØ Resuming from LOCAL PATH: {RESUME_PATH} ---")

# --- 3. Run the Training ---
# Errors raised by the trainer are reported and not re-raised; the duration
# summary is printed whether or not training finished.
start_time_train = time.time()
try:
    trainer.train(resume_from_checkpoint = RESUME_PATH)
    print("\n--- üéâ TRAINING COMPLETED NORMALLY! ---")
except Exception as e:
    print(f"\n--- üí• Training interrupted by unexpected error: {e} ---")
finally:
    end_time_train = time.time()
    print(f"--- Training run duration: {(end_time_train - start_time_train) / 60:.2f} minutes ---")
    print("--- üõë Session ended. The latest checkpoint is safe on Hugging Face. ---")

In [None]:
# Cell 4: Merge and Save to GGUF
# Exports the fine-tuned model in two forms and pushes each to its own Hub repo.
# Requires Cell 1 to have run in this session (it defines `model`, `tokenizer`,
# `token` and `NEW_HUB_REPO`).
print("--- Merging model and converting to GGUF format... ---")

# Target repos are derived from the checkpoint repo created in Cell 1. The
# previous `YOUR_MODEL_REPO` name was never defined anywhere in this notebook,
# so every line that used it raised NameError.
MERGED_REPO = f"{NEW_HUB_REPO}-merged"
GGUF_REPO = f"{NEW_HUB_REPO}-GGUF"

# 1. Save 16-bit Merged Model (Good for other HF users)
# Merges the LoRA adapter into the base model and writes the result locally,
# then pushes the same merged form to the Hub.
model.save_pretrained_merged("merged_model", tokenizer, save_method = "merged_16bit",)
model.push_to_hub_merged(MERGED_REPO, tokenizer, save_method = "merged_16bit", token=token)

# 2. Save to GGUF (Good for Ollama / LM Studio)
# Writes a 'q4_k_m' quantized GGUF locally, then pushes the same format to the Hub.
model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
model.push_to_hub_gguf(GGUF_REPO, tokenizer, quantization_method = "q4_k_m", token=token)

print(f"--- ‚úÖ Success! Your model is saved to {GGUF_REPO} ---")
print("You can now download the .gguf file and run it locally!")