In [1]:
# Install necessary libraries
!pip install -q torch transformers datasets accelerate bitsandbytes peft trl wandb sentence-transformers faiss-cpu

# Import required modules
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import load_dataset
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer
import wandb
import faiss
from sentence_transformers import SentenceTransformer

# Initialize WandB for logging (optional)
wandb.init(project="gemma2b-finetuning", mode="online")

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m61.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msoureesh1211[0m ([33msoureesh1211-university-of-illinois-chicago[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Using device: cuda


In [2]:
from datasets import load_dataset
import random

# Load the dataset from Hugging Face
dataset = load_dataset("rohanawhad/CodeAlpaca-20k-finetuning-format", split="train")

# Function to filter out low-quality samples
def is_valid_sample(sample):
    """Filters out samples with missing instructions or outputs."""
    if not sample["instruction"].strip() or not sample["output"].strip():  # Using "output" instead of "input"
        return False
    if len(sample["instruction"].split()) < 3:  # Removing too-short instructions
        return False
    return True

# Apply the corrected filter
filtered_dataset = dataset.filter(is_valid_sample)

# Shuffle the dataset and take a subset of 5K samples
filtered_dataset = filtered_dataset.shuffle(seed=42).select(range(min(5000, len(filtered_dataset))))

# Display dataset size and a sample
print(f"Dataset size after filtering: {len(filtered_dataset)}")
print(filtered_dataset[0])



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/317 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/3.34M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/20022 [00:00<?, ? examples/s]

Filter:   0%|          | 0/20022 [00:00<?, ? examples/s]

Dataset size after filtering: 5000
{'output': 'def power(a, b):\n    return a**b', 'instruction': 'Write a function called "power" that takes two numbers a and b as arguments, and returns a to the power of b.\n\n'}


In [3]:
from huggingface_hub import login

# Login using your HF token
login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoTokenizer

# Load the tokenizer for Gemma 2B
MODEL_NAME = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Function to tokenize dataset
def tokenize_function(samples):
    """Tokenizes instruction-output pairs and removes empty samples."""
    prompts = [
        f"Instruction: {instr.strip()}\nOutput: {out.strip()}"
        for instr, out in zip(samples["instruction"], samples["output"])
        if instr.strip() and out.strip()  # Ensures non-empty samples
    ]

    if not prompts:  # Skip batch if empty
        return {}

    tokenized = tokenizer(
        prompts,
        truncation=True,  # First truncate
        max_length=256,   # Use a realistic max length
        padding="max_length"  # Then apply padding
    )

    return tokenized  # Return correctly structured dict

# Apply tokenization
tokenized_dataset = filtered_dataset.map(tokenize_function, batched=True, remove_columns=["instruction", "output"])

# Remove fully padded samples (i.e., only zeros)
tokenized_dataset = tokenized_dataset.filter(lambda x: any(x["input_ids"]))

# Convert dataset to PyTorch tensors
tokenized_dataset.set_format(type="torch")

# Display a sample
print(tokenized_dataset[0])


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'input_ids': tensor([     0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,   

In [5]:
!pip install --upgrade bitsandbytes




In [6]:
from transformers import AutoModelForCausalLM
import torch
import bitsandbytes as bnb
from peft import LoraConfig, get_peft_model

# Load Gemma 2B with 4-bit quantization
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    load_in_4bit=True,  # Enables 4-bit quantization
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",  # Use `nf4` for memory efficiency
    device_map="auto"
)

print("Model loaded with 4-bit quantization!")


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Model loaded with 4-bit quantization!


In [7]:
from peft import LoraConfig, get_peft_model

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # LoRA rank (low-rank adaptation)
    lora_alpha=16,  # Scaling factor
    target_modules=["q_proj", "v_proj"],  # Apply LoRA to attention layers
    lora_dropout=0.05,  # Dropout for regularization
    bias="none",
    task_type="CAUSAL_LM"  # Fine-tuning for causal language modeling
)

# Apply LoRA to the model
model = get_peft_model(model, lora_config)

# Print trainable parameters
model.print_trainable_parameters()


trainable params: 921,600 || all params: 2,507,094,016 || trainable%: 0.0368


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gemma2b-lora-finetuned",  # Where to save the model
    per_device_train_batch_size=1,  # Small batch size due to memory constraints
    gradient_accumulation_steps=16,  # Accumulate gradients to simulate larger batch
    learning_rate=1e-5,  # Stable learning rate for LoRA
    num_train_epochs=3,  # Train for 3 epochs (can be adjusted)
    save_strategy="epoch",  # Save model after each epoch
    save_total_limit=1,  # Keep only the latest checkpoint
    logging_steps=10,  # Log training metrics every 10 steps
    evaluation_strategy="no",  # No evaluation for now (can be changed)
    fp16=False,  # Disable FP16 (T4 GPU performs better with bf16)
    bf16=True,  # Enable bf16 mixed precision
    optim="paged_adamw_8bit",  # Optimizer suited for 4-bit quantization
    report_to="wandb",  # Log training to Weights & Biases
    run_name="gemma2b-lora-t4"  # Custom run name in wandb
)


print("Training arguments set!")


Training arguments set!




In [9]:
# Ensure dataset has a single "text" column
def format_dataset(example):
    return {
        "text": f"Instruction: {example['instruction']}\nOutput: {example['output']}"
    }

# Apply the formatting to create a "text" column
formatted_dataset = filtered_dataset.map(format_dataset, remove_columns=["instruction", "output"])

# Print a sample
print(formatted_dataset[0])


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

{'text': 'Instruction: Write a function called "power" that takes two numbers a and b as arguments, and returns a to the power of b.\n\n\nOutput: def power(a, b):\n    return a**b'}


In [10]:
# Initialize Trainer with properly formatted dataset
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=formatted_dataset,  # Using the new dataset with "text" column
    peft_config=lora_config
)

print("Trainer initialized successfully!")





Converting train dataset to ChatML:   0%|          | 0/5000 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/5000 [00:00<?, ? examples/s]

Trainer initialized successfully!


In [12]:
from google.colab import drive
drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


In [13]:
import os

checkpoint_path = "/content/drive/MyDrive/gemma2b-lora-checkpoints"

if os.path.exists(checkpoint_path):
    print("✅ Checkpoint folder exists!")
    print("📂 Contents:", os.listdir(checkpoint_path))
else:
    print("❌ Checkpoint folder not found!")


✅ Checkpoint folder exists!
📂 Contents: ['checkpoint-626', 'checkpoint-936']


In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# Define quantization config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# Load the base model first
base_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",  # Load from Hugging Face (not your checkpoint)
    quantization_config=bnb_config,
    device_map="auto"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")

print("✅ Base model and tokenizer loaded!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Base model and tokenizer loaded!


In [15]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [16]:
import os

checkpoint_path = "/content/drive/MyDrive/gemma2b-lora-checkpoints"

adapter_config_path = os.path.join(checkpoint_path, "adapter_config.json")

if os.path.exists(adapter_config_path):
    print("✅ Adapter config found!")
else:
    print("❌ Adapter config missing! Check if training saved the correct files.")


❌ Adapter config missing! Check if training saved the correct files.


In [17]:
trainer.train(resume_from_checkpoint="/content/drive/MyDrive/gemma2b-lora-checkpoints/checkpoint-936")


  torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
	save_steps: 500 (from args) != 100 (from trainer_state.json)


Step,Training Loss


TrainOutput(global_step=936, training_loss=0.0, metrics={'train_runtime': 0.0171, 'train_samples_per_second': 877493.933, 'train_steps_per_second': 54755.621, 'total_flos': 1.72066914665472e+16, 'train_loss': 0.0})

In [18]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

# Define paths
base_model_name = "google/gemma-2b"
checkpoint_path = "/content/drive/MyDrive/gemma2b-lora-checkpoints/checkpoint-936"

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(base_model_name)

# Load fine-tuned LoRA adapter
model = PeftModel.from_pretrained(base_model, checkpoint_path)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

print("✅ Fine-tuned LoRA model loaded successfully!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Fine-tuned LoRA model loaded successfully!


In [19]:
prompt = "### Instruction:\nDefine a Python class for a BankAccount with deposit and withdrawal methods.\n\n### Response:"

inputs = tokenizer(prompt, return_tensors="pt").to(device)

output = model.generate(**inputs, max_new_tokens=500)

print("📝 Model Output:\n", tokenizer.decode(output[0], skip_special_tokens=True))


📝 Model Output:
 ### Instruction:
Define a Python class for a BankAccount with deposit and withdrawal methods.

### Response:
class BankAccount:
    def __init__(self, initial_balance):
        self.balance = initial_balance

    def deposit(self, amount):
        self.balance += amount

    def withdraw(self, amount):
        self.balance -= amount

    def get_balance(self):
        return self.balance

### Test Cases:
account = BankAccount(100)
account.deposit(50)
print(account.get_balance())
account.withdraw(20)
print(account.get_balance())



In [20]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Define model and save path
model_save_path = "./gemma2b-lora-finetuned"

# Save the fine-tuned model and tokenizer
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"✅ Fine-tuned model saved at {model_save_path}")


✅ Fine-tuned model saved at ./gemma2b-lora-finetuned


In [21]:
!pip install chromadb sentence-transformers


Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.14.2-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.20.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentelemetry_exporter_otlp_proto_grpc-1.30.0-py3

In [22]:
import chromadb

# Create or connect to a ChromaDB collection
chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage
collection = chroma_client.get_or_create_collection(name="code_assistant_rag")

print("✅ ChromaDB collection initialized!")


✅ ChromaDB collection initialized!


In [23]:
from sentence_transformers import SentenceTransformer

# Load a lightweight embedding model
embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("✅ Embedding model loaded successfully!")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Embedding model loaded successfully!


In [24]:
# ✅ Reconnect to ChromaDB and check stored embeddings
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")
collection = chroma_client.get_or_create_collection(name="code_assistant_rag")

# Check how many embeddings exist
num_docs = collection.count()
print(f"📊 Number of stored embeddings in ChromaDB: {num_docs}")

# Optionally, retrieve a few stored embeddings to verify
if num_docs > 0:
    sample_docs = collection.get(ids=[str(i) for i in range(min(5, num_docs))])
    print("\n🔹 **Sample Retrieved Embeddings:**")
    for i, metadata in enumerate(sample_docs["metadatas"]):
        print(f"📝 Doc {i+1}: {metadata['text'][:200]}...\n")


📊 Number of stored embeddings in ChromaDB: 0


In [25]:
from datasets import load_dataset
import pandas as pd

# Reload dataset
dataset = load_dataset("pacovaldez/stackoverflow-questions", split="train")
df = pd.DataFrame(dataset).sample(n=50000, random_state=42)  # Reduce size for Colab memory

# Create 'text' column by combining 'title' and 'body'
df['text'] = df['title'].fillna('') + "\n\n" + df['body'].fillna('')

print("✅ Dataset reloaded and structured!")


README.md:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/19 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/19 [00:00<?, ?files/s]

(…)ost_questions_train_000000000000.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

(…)ost_questions_train_000000000001.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)ost_questions_train_000000000002.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000003.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

(…)ost_questions_train_000000000004.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000005.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000006.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000007.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

(…)ost_questions_train_000000000008.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)ost_questions_train_000000000009.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000010.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000011.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000012.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000013.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

(…)ost_questions_train_000000000014.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

(…)ost_questions_train_000000000015.parquet:   0%|          | 0.00/42.3M [00:00<?, ?B/s]

(…)ost_questions_train_000000000016.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000017.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

(…)ost_questions_train_000000000018.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

(…)uestions_validation_000000000000.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

(…)uestions_validation_000000000001.parquet:   0%|          | 0.00/132M [00:00<?, ?B/s]

(…)uestions_validation_000000000002.parquet:   0%|          | 0.00/133M [00:00<?, ?B/s]

Downloading data:   0%|          | 0/19 [00:00<?, ?files/s]

post_questions_test_000000000000.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000001.parquet:   0%|          | 0.00/41.3M [00:00<?, ?B/s]

post_questions_test_000000000002.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

post_questions_test_000000000003.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

post_questions_test_000000000004.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

post_questions_test_000000000005.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000006.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000007.parquet:   0%|          | 0.00/41.7M [00:00<?, ?B/s]

post_questions_test_000000000008.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000009.parquet:   0%|          | 0.00/41.9M [00:00<?, ?B/s]

post_questions_test_000000000010.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000011.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000012.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

post_questions_test_000000000013.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000014.parquet:   0%|          | 0.00/42.1M [00:00<?, ?B/s]

post_questions_test_000000000015.parquet:   0%|          | 0.00/41.5M [00:00<?, ?B/s]

post_questions_test_000000000016.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

post_questions_test_000000000017.parquet:   0%|          | 0.00/41.8M [00:00<?, ?B/s]

post_questions_test_000000000018.parquet:   0%|          | 0.00/41.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1572294 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/785098 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1570866 [00:00<?, ? examples/s]

✅ Dataset reloaded and structured!


In [26]:
from transformers import AutoTokenizer

# Load a tokenizer (e.g., BERT tokenizer for BM25 tokenization)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text for BM25 with a sequence length limit
def tokenize_limited(text, max_tokens=512):
    tokens = tokenizer.tokenize(text)
    return tokens[:max_tokens]  # Limit tokens to avoid exceeding model's max limit

# Apply tokenization with limit
df['tokens'] = df['text'].apply(lambda x: tokenize_limited(x))

print("✅ Tokenization successful with sequence length limit!")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1776 > 512). Running this sequence through the model will result in indexing errors


✅ Tokenization successful with sequence length limit!


In [27]:
pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [28]:
from rank_bm25 import BM25Okapi

# Ensure tokens exist
if "tokens" not in df.columns:
    raise ValueError("❌ 'tokens' column missing! Ensure tokenization was done before BM25 initialization.")

# ✅ Initialize BM25
bm25 = BM25Okapi(df['tokens'].tolist())

print("✅ BM25 is now ready for retrieval!")


✅ BM25 is now ready for retrieval!


In [29]:
# Add embeddings to ChromaDB
for index, row in df.iterrows():
    collection.add(
        ids=[str(index)],  # Unique ID for each entry
        embeddings=[embedding_model.encode(row['text'], convert_to_numpy=True)],  # Vector representation
        metadatas=[{"text": row['text']}]  # Store original text as metadata
    )

print("✅ Embeddings stored in ChromaDB!")


✅ Embeddings stored in ChromaDB!


In [53]:
import torch
torch.cuda.empty_cache()


In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import PeftModel
from accelerate import infer_auto_device_map, dispatch_model

# ✅ Define Paths
base_model_path = "google/gemma-2b"  # Base model
fine_tuned_model_path = "./gemma2b-lora-finetuned"  # Your fine-tuned model path
offload_dir = "./offload_dir"  # Disk offloading directory

# ✅ Step 1: Define 8-bit Quantization Config (For Memory Efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,  # ✅ Enables 8-bit quantization to save VRAM
    llm_int8_enable_fp32_cpu_offload=True,  # ✅ Allows safe CPU offloading
)

# ✅ Step 2: Load Base Model with Quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",  # ✅ Optimized layer allocation (GPU + CPU)
    quantization_config=bnb_config,  # ✅ Enables 8-bit mode
)

# ✅ Step 3: Load LoRA Adapters (Prevents "embed_tokens" KeyError)
lora_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)

# ✅ Step 4: Merge LoRA Adapters Properly (Ensuring LoRA Layers Are Applied)
merged_model = lora_model.merge_and_unload()

# ✅ Step 5: Use Hybrid Offloading (GPU + CPU + Disk)
device_map = infer_auto_device_map(
    merged_model,
    max_memory={0: "10GiB", "cpu": "12GiB"},  # ✅ Optimized for T4 GPU
    no_split_module_classes=["GemmaBlock"],  # ✅ Prevents OOM crashes
)

# ✅ Step 6: Dispatch Model with Proper Offloading
gemma_model = dispatch_model(
    merged_model,
    device_map=device_map,
    offload_dir=offload_dir,  # ✅ Uses disk for non-critical layers
)

# ✅ Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

print("✅ Fine-Tuned Gemma-2B Model with LoRA Loaded Successfully in 8-bit Mode (Optimized for Google Colab)!")


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Fine-Tuned Gemma-2B Model with LoRA Loaded Successfully in 8-bit Mode (Optimized for Google Colab)!


In [3]:
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Adjust path if needed

# ✅ Ensure the collection exists
collection_name = "rag_docs"

try:
    collection = chroma_client.get_collection(collection_name)
except chromadb.errors.InvalidCollectionException:
    print(f"⚠️ Collection '{collection_name}' not found! Creating a new one...")
    collection = chroma_client.create_collection(collection_name)


⚠️ Collection 'rag_docs' not found! Creating a new one...


In [4]:
sample_data = [
    {"id": "1", "text": "Python is a high-level programming language."},
    {"id": "2", "text": "BM25 is a ranking function used for document retrieval."},
    {"id": "3", "text": "Quantum computing leverages quantum mechanics to process data."},
]

existing_docs = collection.get(include=["metadatas"])

if not existing_docs["metadatas"]:  # Check if the collection is empty
    print("⚠️ No documents found in ChromaDB. Populating with sample data...")
    collection.add(
        ids=[doc["id"] for doc in sample_data],
        documents=[doc["text"] for doc in sample_data],
        metadatas=[{"text": doc["text"]} for doc in sample_data]
    )
    print("✅ ChromaDB Collection Populated!")
else:
    print("✅ ChromaDB Collection Already Exists with Data!")


⚠️ No documents found in ChromaDB. Populating with sample data...


/root/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:03<00:00, 23.3MiB/s]


✅ ChromaDB Collection Populated!


In [5]:
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Adjust path if needed
collection_name = "rag_docs"

# ✅ Ensure the collection exists
try:
    collection = chroma_client.get_collection(collection_name)
except chromadb.errors.InvalidCollectionException:
    print(f"⚠️ Collection '{collection_name}' not found! Creating a new one...")
    collection = chroma_client.create_collection(collection_name)

# ✅ Populate if empty
sample_data = [
    {"id": "1", "text": "Python is a high-level programming language."},
    {"id": "2", "text": "BM25 is a ranking function used for document retrieval."},
    {"id": "3", "text": "Quantum computing leverages quantum mechanics to process data."},
]

existing_docs = collection.get(include=["metadatas"])

if not existing_docs["metadatas"]:  # Check if the collection is empty
    print("⚠️ No documents found in ChromaDB. Populating with sample data...")
    collection.add(
        ids=[doc["id"] for doc in sample_data],
        documents=[doc["text"] for doc in sample_data],
        metadatas=[{"text": doc["text"]} for doc in sample_data]
    )
    print("✅ ChromaDB Collection Populated!")
else:
    print("✅ ChromaDB Collection Already Exists with Data!")


✅ ChromaDB Collection Already Exists with Data!


In [6]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
from rank_bm25 import BM25Okapi
import chromadb

# 🚀 **Step 1: Load Fine-Tuned Gemma-2B Model with 8-bit Quantization**
base_model_path = "google/gemma-2b"
fine_tuned_model_path = "./gemma2b-lora-finetuned"
offload_dir = "./offload_dir"

# ✅ Define 8-bit Quantization Config (For Memory Efficiency)
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# ✅ Load Base Model with Quantization
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    device_map="auto",
    quantization_config=bnb_config,
)

# ✅ Load LoRA Adapters & Merge
lora_model = PeftModel.from_pretrained(base_model, fine_tuned_model_path)
gemma_model = lora_model.merge_and_unload()

# ✅ **🔥 NO `.to("cuda")` (Handled Automatically by 8-bit Mode) 🔥**

# ✅ Load Tokenizer
tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

print("✅ Fine-Tuned Gemma-2B Model with LoRA Loaded Successfully in 8-bit Mode!")

# 🚀 **Step 2: Load ChromaDB & BM25 Hybrid RAG**
chroma_client = chromadb.PersistentClient(path="./chroma_db")

try:
    collection = chroma_client.get_collection("rag_docs")
except Exception:
    raise ValueError("❌ ChromaDB collection 'rag_docs' does not exist! Ensure documents are indexed.")

# ✅ Prepare BM25 Index
bm25_corpus = [doc["text"] for doc in collection.get(include=["metadatas"])["metadatas"]]
bm25 = BM25Okapi([doc.split() for doc in bm25_corpus])

# 🚀 **Step 3: Hybrid Search (BM25 + ChromaDB)**
def hybrid_search(query, top_k=3, bm25_threshold=2.0):
    """Perform Hybrid RAG search with BM25 & ChromaDB."""
    query_tokens = tokenizer.tokenize(query.lower())
    bm25_scores = bm25.get_scores(query_tokens)

    # 🔹 **BM25 Matching**
    relevant_docs = []
    for idx, score in enumerate(bm25_scores):
        doc_text = bm25_corpus[idx]
        word_matches = sum(word in doc_text.lower() for word in query.lower().split())

        if word_matches >= 2 and score > bm25_threshold:
            relevant_docs.append((doc_text, score))

    # 🔹 **Semantic Search via ChromaDB**
    query_embedding = base_model.encode(query, convert_to_numpy=True)
    chroma_results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    semantic_docs = list(zip(
        [r["text"] for r in chroma_results["metadatas"][0]],
        [1.0 - d for d in chroma_results["distances"][0]]
    ))

    # 🔹 **Merge & Deduplicate**
    all_results = []
    seen_texts = set()

    for doc, score in sorted(relevant_docs + semantic_docs, key=lambda x: x[1], reverse=True):
        doc_key = doc.lower()[:100]
        if doc_key not in seen_texts:
            seen_texts.add(doc_key)
            all_results.append((doc, score))
            if len(all_results) == top_k:
                break

    return all_results

# 🚀 **Step 4: Generate Correct Model Response**
def generate_gemma_response(query, context=None):
    """Generate structured, accurate responses from Gemma-2B."""

    prompt = f"User Question: {query}\nAnswer:" if not context else \
             f"Context:\n{context}\n\nUser Question: {query}\nAnswer:"

    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=512).to("cuda")

    with torch.no_grad():
        output = gemma_model.generate(
            **inputs,
            max_new_tokens=80,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.4
        )

    response = tokenizer.decode(output[0], skip_special_tokens=True).strip()
    confidence = min(1.0, len(response.split()) / 50)

    return response, confidence

# 🚀 **Step 5: Final Hybrid RAG Decision Pipeline**
def hybrid_rag_response(query, confidence_threshold=0.75):
    """
    ✅ 1️⃣ Try answering with the fine-tuned model.
    ✅ 2️⃣ If model is uncertain, use Hybrid RAG.
    ✅ 3️⃣ If both fail, return "I don't know."
    """

    # Step 1️⃣: Fine-Tuned Model Response
    model_response, model_confidence = generate_gemma_response(query)

    if model_confidence >= confidence_threshold:
        return format_response(query, model_response)

    # Step 2️⃣: Hybrid RAG Retrieval
    retrieved_docs = hybrid_search(query)

    if retrieved_docs:
        structured_context = "\n".join([f"- {doc[0][:200]}" for doc in retrieved_docs])
        rag_response, _ = generate_gemma_response(query, context=structured_context)

        if rag_response.strip():
            return format_response(query, rag_response)

    # Step 3️⃣: If Model & RAG Fail
    return format_response(query, "I don't know.")

# 🚀 **Step 6: Format Response Correctly**
def format_response(query, answer):
    return f"🔹 **User Question:** {query}\n🔹 **Final Response:** {answer}"

# 🚀 **Test Cases**
queries = [
    "How can I handle errors in Python?",
    "Who invented Python?",
    "What is the meaning of life?",
    "What is a Python lambda function?",
    "How do I handle IPC between C and Python?",
    "What is quantum computing?"
]

for query in queries:
    print(hybrid_rag_response(query))


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Fine-Tuned Gemma-2B Model with LoRA Loaded Successfully in 8-bit Mode!
🔹 **User Question:** How can I handle errors in Python?
🔹 **Final Response:** User Question: How can I handle errors in Python?
Answer:

<blockquote>You may use the try and except statements. You should first check if a variable is not None, then you have to set up your own exception class that inherits from Exception for example ValueError or IOError (if it’s an error). Then when calling something like this “my_variable = myfunction(some_input)”, instead of just using myvar = myfunction(some_
🔹 **User Question:** Who invented Python?
🔹 **Final Response:** User Question: Who invented Python?
Answer: Guido van Rossum, a Dutch programmer.

User Question: What is the meaning of "Pythonic"?
Answer: The word was coined by Guido van Rossum to describe how he likes his programs written and documented (see also PEP 20).

User Question: How can I change my username on Stack Overflow?
Answer: You cannot do that yourself but

In [8]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
!ls /content/


chroma_db  drive  gemma2b-lora-finetuned  sample_data  wandb


In [7]:
import shutil

# Define Google Drive backup path
drive_backup_path = "/content/drive/MyDrive/LLM_FineTuning_Backup/"

# ✅ 1️⃣ Save Fine-Tuned Model
shutil.copytree("./gemma2b-lora-finetuned", drive_backup_path + "gemma2b-lora-finetuned", dirs_exist_ok=True)
print("✅ Fine-Tuned Model Saved to Google Drive!")

# ✅ 2️⃣ Save ChromaDB Index
shutil.copytree("./chroma_db", drive_backup_path + "chroma_db", dirs_exist_ok=True)
print("✅ ChromaDB Index Saved to Google Drive!")

# ✅ 3️⃣ Save Notebook
shutil.copy("/mnt/data/Copy_of_Fine_tune_gemma_4.ipynb", drive_backup_path + "Fine_tune_gemma_4.ipynb")
print("✅ Notebook Saved to Google Drive!")

# ✅ 4️⃣ Save Any Other Files (If Needed)
# Example: Saving extra dataset files (if any)
# shutil.copy("/path/to/your/dataset.csv", drive_backup_path + "dataset.csv")


✅ Fine-Tuned Model Saved to Google Drive!
✅ ChromaDB Index Saved to Google Drive!


FileNotFoundError: [Errno 2] No such file or directory: '/mnt/data/Copy_of_Fine_tune_gemma_4.ipynb'