In [None]:
!pip install -q transformers datasets peft accelerate bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m37.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m43.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

def finetune_qwen_colab_lightweight(train_file, model_name="Qwen/Qwen2.5-3B-Instruct", output_dir="./qwen2.5-3b-research-qa-lora"):
    """
    Fine-tunes a Qwen model using a custom dataset.
    Optimized for minimal memory usage in Google Colab to avoid OutOfMemoryError.

    Key memory reduction strategies applied:
    - Reduced per_device_train_batch_size to 1
    - Kept gradient_accumulation_steps to 4 (effective batch size 4)
    - Reduced max_length for tokenization to 256
    - Using 4-bit quantization (load_in_4bit=True)
    - CPU Offloading enabled

    Args:
        train_file (str): Path to the JSON file containing the training data (in Colab environment).
        model_name (str): Name of the Qwen model to fine-tune.
        output_dir (str): Output directory for saving the fine-tuned LoRA adapters (in Colab environment).
    """

    # --- Google Colab Specific Setup (Installation - Run this in Colab) ---
    # print("Installing required libraries in Colab...")
    # !pip install -q transformers datasets peft accelerate bitsandbytes

    # --- Clear GPU Cache ---
    print("Clearing GPU memory cache...")
    torch.cuda.empty_cache()

    # --- 1. Load Model and Tokenizer ---
    print(f"Loading tokenizer and model: {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        offload_folder="offload",
        offload_state_dict=True,
        # Remove load_in_4bit=True  <--- REMOVE THIS LINE
        quantization_config={"load_in_4bit": True} # KEEP quantization_config
    )

    # --- 2. Prepare Model for QLoRA ---
    print("Preparing model for QLoRA...")
    model = prepare_model_for_kbit_training(model)

    config = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=["q_proj", "v_proj"]
    )
    model = get_peft_model(model, config)

    # --- 3. Load and Tokenize Dataset ---
    print(f"Loading and tokenizing dataset from: {train_file}...")
    def preprocess_function(examples):
        inputs = [f"Question: {q} Answer: " for q in examples["question"]]
        targets = [a for a in examples["answer"]]
        model_inputs = tokenizer(inputs, text_target=targets, max_length=256, truncation=True, # Reduced max_length to 256
                                   padding="max_length")
        return model_inputs

    try:
        dataset = load_dataset("json", data_files=train_file, split="train")
        tokenized_train_dataset = dataset.map(preprocess_function, batched=True)
    except FileNotFoundError:
        print(f"Error: Training data file not found at {train_file}. Make sure to upload it to Colab.")
        return
    except Exception as e:
        print(f"Error: An error occurred reading the JSON file: {e}")
        return

    # --- 4. Set up Training Arguments ---
    print("Setting up training arguments (Lightweight Config)...")
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=1, # **Reduced to 1 for minimal memory**
        gradient_accumulation_steps=4, # Effective batch size = 4 (still reasonable)
        learning_rate=2e-4,
        num_train_epochs=3,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        evaluation_strategy="no",
        fp16=True,
        optim="paged_adamw_8bit",
        lr_scheduler_type="cosine",
        warmup_ratio=0.05,
        report_to="none",
        push_to_hub=False,
    )

    # --- 5. Create Trainer and Train ---
    print("Initializing Trainer and starting training...")
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=tokenized_train_dataset,
    )

    trainer.train()

    # --- 6. Save Trained Model (LoRA Adapters) ---
    print("Saving trained LoRA adapters...")
    model.save_pretrained(output_dir)
    print(f"Fine-tuning complete! LoRA adapters saved to {output_dir}")


if __name__ == "__main__":
    # ---  Instructions for Google Colab Users (LIGHTWEIGHT VERSION) ---
    print("\n--- Instructions for Google Colab (LIGHTWEIGHT VERSION) ---")
    print("1. **Upload Dataset:** Upload your `dataset.json` file to the Colab environment.")
    print("   You can do this by dragging and dropping it into the Files sidebar (left side in Colab).")
    print("2. **Set `train_data_file` Path:** Ensure `train_data_file` below points to the correct path")
    print("   where you uploaded `dataset.json` in Colab.  For example: `'dataset.json'` or `'./data/dataset.json'`")
    print("3. **Run the Code:** Execute this Python code cell in Colab.")
    print("4. **Check Output:** After training, LoRA adapters will be in `qwen2.5-3b-research-qa-lora` folder.")
    print("   Download this folder from Colab's Files sidebar.")
    print("---")
    print("\n**This version is optimized for minimal memory usage in Colab.**")
    print("**If you still get OutOfMemoryError, consider further reducing `max_length` to 128 in the code.**")
    print("---")

    train_data_file = "./dataset.json"  # Path to your JSON training data file in Colab
    finetune_qwen_colab_lightweight(train_data_file)  # Run the lightweight fine-tuning function


--- Instructions for Google Colab (LIGHTWEIGHT VERSION) ---
1. **Upload Dataset:** Upload your `dataset.json` file to the Colab environment.
   You can do this by dragging and dropping it into the Files sidebar (left side in Colab).
2. **Set `train_data_file` Path:** Ensure `train_data_file` below points to the correct path
   where you uploaded `dataset.json` in Colab.  For example: `'dataset.json'` or `'./data/dataset.json'`
3. **Run the Code:** Execute this Python code cell in Colab.
4. **Check Output:** After training, LoRA adapters will be in `qwen2.5-3b-research-qa-lora` folder.
   Download this folder from Colab's Files sidebar.
---

**This version is optimized for minimal memory usage in Colab.**
**If you still get OutOfMemoryError, consider further reducing `max_length` to 128 in the code.**
---
Clearing GPU memory cache...
Loading tokenizer and model: Qwen/Qwen2.5-3B-Instruct...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Preparing model for QLoRA...
Loading and tokenizing dataset from: ./dataset.json...


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Setting up training arguments (Lightweight Config)...
Initializing Trainer and starting training...


  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, *

Step,Training Loss
50,7.8668
100,1.899
150,1.5463


Saving trained LoRA adapters...
Fine-tuning complete! LoRA adapters saved to ./qwen2.5-3b-research-qa-lora


In [None]:
!zip -r /content/qwen2.5-3b-research-qa-lora.zip /content/qwen2.5-3b-research-qa-lora/

  adding: content/qwen2.5-3b-research-qa-lora/ (stored 0%)
  adding: content/qwen2.5-3b-research-qa-lora/README.md (deflated 66%)
  adding: content/qwen2.5-3b-research-qa-lora/adapter_model.safetensors (deflated 8%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/ (stored 0%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/added_tokens.json (deflated 67%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/trainer_state.json (deflated 58%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/README.md (deflated 66%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/training_args.bin (deflated 51%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/vocab.json (deflated 61%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/tokenizer.json (deflated 81%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/tokenizer_config.json (deflated 83%)
  adding: content/qwen2.5-3b-research-qa-lora/checkpoint-156/opti

In [None]:
!cp /content/qwen2.5-3b-research-qa-lora.zip /content/drive/MyDrive/qwen2.5-3b-research-qa-lora.zip

In [None]:
!pip install transformers torch



In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer
model_name = "./qwen2.5-3b-research-qa-lora.zip"  # Path to your fine-tuned model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)

# Save the model in a format compatible with llama.cpp
model.save_pretrained("./qwen-gguf")
tokenizer.save_pretrained("./qwen-gguf")

# Convert to gguf format using llama.cpp
# Run this in your terminal after saving the model
# python3 llama.cpp/convert.py --model ./qwen-gguf --outfile ./qwen-gguf/model.gguf

In [None]:
!git clone https://github.com/ggerganov/llama.cpp

Cloning into 'llama.cpp'...
remote: Enumerating objects: 45719, done.[K
remote: Counting objects: 100% (349/349), done.[K
remote: Compressing objects: 100% (252/252), done.[K
remote: Total 45719 (delta 235), reused 98 (delta 97), pack-reused 45370 (from 2)[K
Receiving objects: 100% (45719/45719), 96.33 MiB | 23.33 MiB/s, done.
Resolving deltas: 100% (32957/32957), done.


In [None]:
!./llama.cpp/Makefile

/bin/bash: line 1: ./llama.cpp/Makefile: Permission denied


In [None]:
!unzip /content/drive/MyDrive/intellihack5_models/qwen2.5-3b-research-qa-lora.zip -d /content/drive/MyDrive/intellihack5_models/

Archive:  /content/drive/MyDrive/intellihack5_models/qwen2.5-3b-research-qa-lora.zip
   creating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/README.md  
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/adapter_model.safetensors  
   creating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/checkpoint-156/
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/checkpoint-156/added_tokens.json  
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/checkpoint-156/trainer_state.json  
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/checkpoint-156/README.md  
  inflating: /content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora/checkpoint-156/training_

In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

base_model_name = "Qwen/Qwen2.5-3B"
lora_path = "/content/drive/MyDrive/intellihack5_models/content/qwen2.5-3b-research-qa-lora"  # Should be a directory, not .gguf!

# Load base model
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16, device_map="auto")

# Load LoRA adapter
model = PeftModel.from_pretrained(model, lora_path)

# Test response
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    output = model.generate(**inputs, max_new_tokens=512)
    return tokenizer.decode(output[0], skip_special_tokens=True)

print(generate_response("Hello!"))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.