In [None]:
# Install all required packages
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install transformers
!pip install peft
!pip install datasets
!pip install accelerate
!pip install tqdm
!pip install tensorboard
!pip install bitsandbytes
!pip install sentencepiece
!pip install protobuf

# Verify installations
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Number of GPUs: {torch.cuda.device_count()}")


Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.47.0
PyTorch version: 2.8.0+cu126
CUDA available: True
CUDA version: 12.6
Number of GPUs: 1


In [None]:
# GPU version of this notebook: TPU-specific imports were removed; the imports below are the standard GPU stack.
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
import torch.nn.utils.prune as prune
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm
import json

# Select the compute device and describe the visible hardware.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using device: {device}")
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    device = torch.device("cpu")
    print(f"Using device: {device}")

# Fetch the tokenizer first, then the model weights, from the same checkpoint.
model_name = "tiiuae/Falcon3-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto",     # let the library decide where the weights are placed
    dtype=torch.float16,   # half precision to reduce GPU memory use
)

# Fall back to the end-of-sequence token when the tokenizer ships without a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


Using device: cuda
GPU: Tesla T4
GPU Memory: 15.83 GB


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/658 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.47G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/113 [00:00<?, ?B/s]

In [None]:
import os
import json

# Create the local "data" folder (no error if it already exists); the converted
# JSONL files are written into it.
os.makedirs("data", exist_ok=True)

def convert_to_instruction_output(infile, outfile):
    """Convert a JSON-lines file of raw records into instruction/output pairs.

    Every non-blank line of ``infile`` is parsed as JSON. Lines that are not
    valid JSON, or that do not hold a JSON object, are reported and skipped.
    For each remaining record one line is written to ``data/<outfile>`` with
    two keys:

    * ``instruction`` -- ``"#User\\n"`` followed by the instruction text, an
      optional ``<Reference API>:`` section built from the record's
      ``api_data``, and a closing ``###Assistant`` marker.
    * ``output`` -- the text after ``###Output:`` in the record's ``code``
      field, or else the record's ``output`` / ``answer`` value.

    The instruction text is taken from between ``###Instruction:`` and
    ``###Output:`` in the ``code`` field when the first marker is present,
    otherwise from the record's ``instruction`` / ``prompt`` value.

    Args:
        infile: Path of the JSON-lines file to read (UTF-8).
        outfile: File name to create inside the ``data`` folder.

    Returns:
        None. A summary line with the number of written entries is printed.
    """
    outpath = os.path.join("data", outfile)
    # Create the target folder here so the function does not depend on an
    # earlier notebook cell having been run.
    os.makedirs(os.path.dirname(outpath), exist_ok=True)
    written = 0

    with open(infile, "r", encoding="utf-8") as f, open(outpath, "w", encoding="utf-8") as out:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping bad line {line_no}: {e}")
                continue
            if not isinstance(obj, dict):
                # Valid JSON, but not a record we can read fields from.
                print(f"Skipping bad line {line_no}: expected a JSON object, got {type(obj).__name__}")
                continue

            # A missing, null or non-string "code" value is treated as empty so
            # the marker checks below cannot raise.
            code_field = obj.get("code")
            if not isinstance(code_field, str):
                code_field = ""

            # --- Extract instruction ---
            if "###Instruction:" in code_field:
                after_ins = code_field.split("###Instruction:", 1)[1]
                # Keep only the part before the output marker, when there is one.
                instruction = after_ins.split("###Output:", 1)[0].strip()
            else:
                instruction = obj.get("instruction") or obj.get("prompt") or ""

            # Prepend user tag
            instruction = "#User\n" + instruction

            # --- Append Reference API if available ---
            api_data = obj.get("api_data", "")
            if api_data:
                if isinstance(api_data, str):
                    api_str = api_data
                else:
                    api_str = json.dumps(api_data, ensure_ascii=False)
                instruction += f"\n<Reference API>:{api_str}"

            # Always close the prompt with the assistant marker. Previously it
            # was only added when api_data was present, so records without
            # api_data ended up with a different prompt format.
            instruction += "\n###Assistant"

            # --- Extract output ---
            if "###Output:" in code_field:
                output = code_field.split("###Output:", 1)[1].strip()
            else:
                output = obj.get("output") or obj.get("answer") or ""

            out_obj = {
                "instruction": instruction,
                "output": output
            }
            out.write(json.dumps(out_obj, ensure_ascii=False) + "\n")
            written += 1

    print(f"✅ Wrote {written} processed JSONL entries into {outpath}")


# Convert the train and eval files; the results are written under data/
for raw_file, converted_name in (
    ("huggingface_train.json", "hf_train.jsonl"),
    ("huggingface_eval.json", "hf_eval.jsonl"),
):
    convert_to_instruction_output(raw_file, converted_name)


✅ Wrote 8191 processed JSONL entries into data/hf_train.jsonl
✅ Wrote 911 processed JSONL entries into data/hf_eval.jsonl


In [None]:
from datasets import load_dataset

# Read the converted JSONL files back in. A single data file is exposed under the
# "train" split, which is requested directly here.
train_dataset = load_dataset("json", data_files="./data/hf_train.jsonl", split="train")
eval_dataset = load_dataset("json", data_files="./data/hf_eval.jsonl", split="train")

# Report how many examples each dataset holds.
print("Train samples:", len(train_dataset))
print("Eval samples:", len(eval_dataset))



Train samples: 8191
Eval samples: 911


In [None]:
import json
from transformers import AutoTokenizer
from tqdm import tqdm

# === Config ===
file_path = "./data/hf_train.jsonl"

# Running totals over the whole file.
total_tokens = 0
num_examples = 0

with open(file_path, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)

        # Build the text the same way `preprocess` does: list outputs are joined
        # with spaces, everything else is converted with str().
        output = obj.get("output", "")
        if isinstance(output, list):
            output_text = " ".join(str(part) for part in output)
        else:
            output_text = str(output)

        # Combine instruction + output as training text
        text = obj.get("instruction", "") + "\n" + output_text

        # Tokenize without truncation so long examples are counted in full
        tokens = tokenizer(text, truncation=False, return_tensors=None)
        total_tokens += len(tokens["input_ids"])
        num_examples += 1

# An empty file would otherwise raise ZeroDivisionError when averaging.
average_tokens = total_tokens / num_examples if num_examples else 0.0

print(f"📊 Total tokens in dataset: {total_tokens:,}")
print(f"📈 Average tokens per example: {average_tokens:.2f}")
print(f"📦 Number of examples: {num_examples}")


📊 Total tokens in dataset: 6,095,024
📈 Average tokens per example: 744.11
📦 Number of examples: 8191


In [None]:
def preprocess(examples):
    """Build one training text per example and tokenize the whole batch.

    Each text is the instruction, a newline, and the output. A list-valued
    output is joined with single spaces (items are converted with ``str``);
    any other output is converted with ``str``.

    Args:
        examples: Batch mapping with parallel ``"instruction"`` and
            ``"output"`` sequences.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns for the list of
        texts, called with ``truncation=False`` and ``padding=False``.
    """
    # NOTE(review): no end-of-sequence token is appended to the text here and
    # nothing is truncated -- confirm both are intended for training.
    texts = []
    for instr, out in zip(examples["instruction"], examples["output"]):
        # Ensure output is a string; str() per item keeps the join from failing
        # on non-string list entries.
        if isinstance(out, list):
            output_text = " ".join(str(part) for part in out)
        else:
            output_text = str(out)

        texts.append(f"{instr}\n{output_text}")

    # The per-example debug print was removed: when this function is applied to
    # a whole dataset it wrote every training text to the cell output.
    return tokenizer(texts, truncation=False, padding=False)


# Spot-check `preprocess` on the first two training examples.
# This previously read from `train_dataset1`, a name that is never defined in the
# notebook, so the cell failed on a fresh kernel; `train_dataset` is used instead.
# It has to run while `train_dataset` still has its "instruction" and "output"
# columns, i.e. before the cell that maps `preprocess` over the dataset.
x = [train_dataset[0], train_dataset[1]]


# Run preprocess on just these two examples and show the tokenized result.
processed_examples = preprocess({"instruction": [ex["instruction"] for ex in x], "output": [ex["output"] for ex in x]})
print(processed_examples)

Map:   0%|          | 0/8191 [00:00<?, ? examples/s]

Map:   0%|          | 0/911 [00:00<?, ? examples/s]

In [None]:
# Tokenize both datasets batch-wise and drop the raw text columns from the result.
raw_text_columns = ["instruction", "output"]
train_dataset = train_dataset.map(preprocess, batched=True, remove_columns=raw_text_columns)
eval_dataset = eval_dataset.map(preprocess, batched=True, remove_columns=raw_text_columns)

In [None]:
# Show one tokenized row from each dataset (eval row 0, then train row 4796).
for split_data, row_index in ((eval_dataset, 0), (train_dataset, 4796)):
    print(split_data[row_index])


{'input_ids': [2026, 6861, 12, 19502, 2265, 5811, 2346, 2265, 4072, 4513, 3974, 2302, 5480, 8090, 2302, 5021, 4047, 2354, 2761, 5003, 2278, 8090, 2402, 2302, 2582, 7927, 10506, 8090, 2037, 12, 2051, 13566, 7759, 28368, 8655, 12585, 3817, 2226, 2025, 32878, 15488, 27146, 89857, 26868, 2445, 2839, 2226, 2025, 13889, 3817, 2226, 2025, 2063, 70544, 5738, 58166, 2839, 2226, 2025, 106816, 3817, 2226, 2025, 23155, 76704, 2839, 2226, 2025, 5526, 2086, 3318, 3817, 2226, 2025, 42982, 21802, 2036, 106902, 2038, 9247, 2909, 2036, 19846, 117844, 2036, 2322, 69058, 2036, 6283, 2839, 2226, 2025, 5526, 2086, 3782, 3817, 2226, 2025, 17997, 7736, 2037, 5280, 2086, 3719, 37930, 3108, 42982, 21802, 2036, 106902, 2038, 9247, 2909, 2036, 19846, 117844, 2036, 2322, 69058, 2036, 6283, 5646, 2839, 2226, 2025, 5526, 2086, 15163, 3817, 4419, 2035, 2226, 2025, 20454, 2086, 25492, 2086, 55644, 3817, 2226, 6730, 13736, 2387, 26045, 2226, 2025, 8894, 2086, 3980, 3817, 4419, 2035, 2226, 2025, 23857, 3817, 2226, 8655,

In [None]:
# Print the qualified name of every module inside the model, one per line.
for module_name, _submodule in model.named_modules():
    print(module_name)



model
model.embed_tokens
model.layers
model.layers.0
model.layers.0.self_attn
model.layers.0.self_attn.q_proj
model.layers.0.self_attn.k_proj
model.layers.0.self_attn.v_proj
model.layers.0.self_attn.o_proj
model.layers.0.mlp
model.layers.0.mlp.gate_proj
model.layers.0.mlp.up_proj
model.layers.0.mlp.down_proj
model.layers.0.mlp.act_fn
model.layers.0.input_layernorm
model.layers.0.post_attention_layernorm
model.layers.1
model.layers.1.self_attn
model.layers.1.self_attn.q_proj
model.layers.1.self_attn.k_proj
model.layers.1.self_attn.v_proj
model.layers.1.self_attn.o_proj
model.layers.1.mlp
model.layers.1.mlp.gate_proj
model.layers.1.mlp.up_proj
model.layers.1.mlp.down_proj
model.layers.1.mlp.act_fn
model.layers.1.input_layernorm
model.layers.1.post_attention_layernorm
model.layers.2
model.layers.2.self_attn
model.layers.2.self_attn.q_proj
model.layers.2.self_attn.k_proj
model.layers.2.self_attn.v_proj
model.layers.2.self_attn.o_proj
model.layers.2.mlp
model.layers.2.mlp.gate_proj
model.l

In [None]:
# LoRA settings: rank 16, alpha 32, applied to the four attention projection layers.
lora_config = LoraConfig(
    bias="none",
    lora_dropout=0.05,
    lora_alpha=32,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    task_type=TaskType.CAUSAL_LM,
)

# Wrap the base model with the adapters, then put the wrapper into training mode.
model1 = get_peft_model(model, lora_config)
model1.train()
model1.enable_input_require_grads()

# Walk the base model's parameters and make sure every LoRA weight is trainable,
# reporting each one as it is enabled.
for name, param in model.named_parameters():
    if 'lora' not in name.lower():
        continue
    param.requires_grad = True
    print(f"Enabled gradients for: {name}")



Enabled gradients for: model.layers.0.self_attn.q_proj.lora_A.default.weight
Enabled gradients for: model.layers.0.self_attn.q_proj.lora_B.default.weight
Enabled gradients for: model.layers.0.self_attn.k_proj.lora_A.default.weight
Enabled gradients for: model.layers.0.self_attn.k_proj.lora_B.default.weight
Enabled gradients for: model.layers.0.self_attn.v_proj.lora_A.default.weight
Enabled gradients for: model.layers.0.self_attn.v_proj.lora_B.default.weight
Enabled gradients for: model.layers.0.self_attn.o_proj.lora_A.default.weight
Enabled gradients for: model.layers.0.self_attn.o_proj.lora_B.default.weight
Enabled gradients for: model.layers.1.self_attn.q_proj.lora_A.default.weight
Enabled gradients for: model.layers.1.self_attn.q_proj.lora_B.default.weight
Enabled gradients for: model.layers.1.self_attn.k_proj.lora_A.default.weight
Enabled gradients for: model.layers.1.self_attn.k_proj.lora_B.default.weight
Enabled gradients for: model.layers.1.self_attn.v_proj.lora_A.default.weight

In [None]:
# Gather (name, element count) for every parameter that requires gradients, then
# print the total count and the first 20 names.
trainable = []
for param_name, param in model1.named_parameters():
    if param.requires_grad:
        trainable.append((param_name, param.numel()))

print("num trainable params:", sum(count for _, count in trainable))
print("sample trainable names:", [entry_name for entry_name, _ in trainable][:20])


num trainable params: 7208960
sample trainable names: ['base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight', 'base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight', 'base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight', 'base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight', 'base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight', 'base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight', 'base_model.model.model.layers.1.self_attn.v_proj.lora_A.

In [None]:
import os
# Presumably meant to switch off Weights & Biases logging for the Trainer.
# NOTE(review): the Trainer output recorded in this notebook reports this variable
# as deprecated (to be removed in v5) and points to `report_to` instead.
os.environ["WANDB_DISABLED"] = "true"


In [None]:
 from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# ================== TrainingArguments ==================
# One epoch of LoRA fine-tuning. With batch size 1 and 16 accumulation steps each
# optimizer step covers 16 examples per device.
training_args = TrainingArguments(
    output_dir="./falcon3b_instruct_lora_1st",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="steps",              # save ~2 times per epoch
    save_steps=250,
    eval_strategy="steps",
    eval_steps=100,
    learning_rate=2e-4,                # LoRA usually higher lr than full FT
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    num_train_epochs=1,
    fp16=True,
    gradient_checkpointing=True,
    dataloader_num_workers=2,
    dataloader_pin_memory=True,
    remove_unused_columns=False,
    push_to_hub=False,
    save_total_limit=2,
    # Name the logging integration explicitly instead of relying on the deprecated
    # WANDB_DISABLED environment variable. TensorBoard is installed by this
    # notebook; use "none" to turn reporting off entirely.
    report_to="tensorboard",
)



# ================== Data Collator ==================
# mlm=False: batches are built without the masked-language-modelling objective.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# ================== Trainer ==================
# Ties together the LoRA-wrapped model, the tokenized datasets and the arguments.
trainer = Trainer(
    model=model1,
    args=training_args,
    data_collator=data_collator,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# ================== Status ==================
print("Trainer initialized ✅")
print(f"Train samples: {len(train_dataset)}, Eval samples: {len(eval_dataset)}")

# ================== GPU Memory Info ==================
# Report currently allocated and reserved CUDA memory in GB, when a GPU is present.
if not torch.cuda.is_available():
    print("No GPU available - using CPU")
else:
    print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
    print(f"GPU memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Trainer initialized ✅
Train samples: 8191, Eval samples: 911
GPU memory allocated: 6.48 GB
GPU memory cached: 7.36 GB


In [None]:

# Print the trainable vs. total parameter counts of the LoRA-wrapped model.
model1.print_trainable_parameters()


trainable params: 7,208,960 || all params: 3,234,864,128 || trainable%: 0.2229


In [None]:
# Run the fine-tuning and save the result.
def _log_gpu_memory(stage):
    """Print allocated and reserved CUDA memory in GB, prefixed with ``[stage]``.

    Prints a fallback message instead when no CUDA device is available.
    """
    if torch.cuda.is_available():
        print(f"[{stage}] GPU memory allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
        print(f"[{stage}] GPU memory cached: {torch.cuda.memory_reserved() / 1e9:.2f} GB")
    else:
        print("No GPU memory to check")


print("🚀 Starting training...")

# Memory snapshot before training
_log_gpu_memory("BEFORE TRAIN")

# Run training
train_result = trainer.train()

# Memory snapshot after training
_log_gpu_memory("AFTER TRAIN")

# Save model
trainer.save_model("./falcon3b_instruct_1stepoch")
print("✅ Training finished and model saved!")


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 2023}.


🚀 Starting training...
[BEFORE TRAIN] GPU memory allocated: 6.48 GB
[BEFORE TRAIN] GPU memory cached: 7.36 GB


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,0.759,0.766083
200,0.6817,0.693901
300,0.6063,0.644355
400,0.6149,0.615935


Step,Training Loss,Validation Loss
100,0.759,0.766083
200,0.6817,0.693901
300,0.6063,0.644355
400,0.6149,0.615935
500,0.6307,0.60267


[AFTER TRAIN] GPU memory allocated: 6.56 GB
[AFTER TRAIN] GPU memory cached: 11.88 GB
✅ Training finished and model saved!


In [None]:
from google.colab import files
import os

# Package the saved model folder as a zip archive and download it from Colab.
import shutil  # imported here because no other cell in view imports it

folder_name = "./falcon3b_instruct_1stepoch"
zip_name = folder_name + ".zip"

# Create zip file with the standard library instead of shelling out to `zip` with
# interpolated variables. The archive is written to `zip_name` and its entries
# are stored under the folder's own name.
shutil.make_archive(
    base_name=folder_name,                      # ".zip" is appended by make_archive
    format="zip",
    root_dir=os.path.dirname(folder_name) or ".",
    base_dir=os.path.basename(folder_name),
)

# Download the zip file
files.download(zip_name)
