# Step 1: Mount Google Drive
This step mounts your Google Drive to Colab so you can access files stored there. It's especially useful for loading datasets or saving model checkpoints persistently.

In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

print("✅ Google Drive mounted successfully!")

Mounted at /content/drive
✅ Google Drive mounted successfully!


# Step 2: Install Required Libraries and Login to Hugging Face Hub
This cell installs the necessary libraries (`transformers`, `huggingface_hub`) and logs into the Hugging Face Hub using your personal access token. Logging in is required for loading models or datasets that are gated or private.

In [2]:
# %%
!pip install -qq transformers huggingface_hub
from huggingface_hub import login

# Paste your token here
hf_token = "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"

# Log in
login(token=hf_token)

if hf_token:
    try:
        login(token=hf_token)
        print("✅ Successfully logged in to Hugging Face Hub.")
    except Exception as e:
        print(f"❌ Failed to log in to Hugging Face Hub: {e}")
else:
    print("ℹ️ Hugging Face token not found. Skipping login.")


✅ Successfully logged in to Hugging Face Hub.


# Step 3: Load Base Model and Prepare for LoRA Fine-Tuning
This cell does the following:
- Loads the OPT 1.3B model and its tokenizer
- Sets the padding token to be the same as the end-of-sequence token
- Defines a `LoRAConfig` specific to OPT architecture using `q_proj` and `v_proj` as target modules
- Wraps the model using `get_peft_model` to prepare it for Parameter-Efficient Fine-Tuning (LoRA)
- Finally, it prints the number of trainable parameters

In [3]:
from IPython import get_ipython
from IPython.display import display
from transformers import AutoConfig, AutoModelForCausalLM
from huggingface_hub import hf_hub_download
import json
import os # Import os
import torch


from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import get_peft_model, LoraConfig, TaskType

# model_id = "facebook/opt-1.3b"

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto"
)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # OPT-specific modules
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/551 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/608 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


In [4]:
import os
import json
from datasets import Dataset

folder_path = "/content/drive/MyDrive/Colab_data/codex/json_files"
all_data = []

# 🔁 Step 1: Load all JSON files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".json"):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, "r") as f:
            try:
                file_data = json.load(f)
                if isinstance(file_data, list):
                    all_data.extend(file_data)
                else:
                    print(f"⚠️ Skipped {filename}: not a list of records.")
            except Exception as e:
                print(f"⚠️ Failed to load {filename}: {e}")

# ✅ Step 2: Convert to Hugging Face Dataset
dataset = Dataset.from_list(all_data)

# 🧩 Step 3: Format into Alpaca-style prompts
def format_prompt(example):
    return {
        "text": f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format_prompt)

# 🧠 Step 4: Tokenize using tokenizer (must be defined before this block)
def tokenize(example):
    return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize, batched=True)

# 🏷️ Step 5: Align labels
tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})


Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

Map:   0%|          | 0/166 [00:00<?, ? examples/s]

In [5]:
# from datasets import Dataset

# # Load JSON file manually
# with open("/content/drive/MyDrive/Colab_data/codex/DATASET_TESTCASES_alpaca.json", "r") as f:
#     data = json.load(f)

# # Convert list of dicts to Hugging Face Dataset
# dataset = Dataset.from_list(data)

# # Format prompts
# def format_prompt(example):
#     return {
#         "text": f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
#     }

# dataset = dataset.map(format_prompt)


# def tokenize(example):
#     return tokenizer(example["text"], truncation=True, padding="max_length", max_length=512)

# tokenized_dataset = dataset.map(tokenize, batched=True)
# tokenized_dataset = tokenized_dataset.map(lambda x: {"labels": x["input_ids"]})

In [6]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./opt13b_lora_testcases",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    num_train_epochs=20,
    learning_rate=2e-4,
    save_steps=500,
    logging_steps=250,
    fp16=True,
    # evaluation_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
250,1.0578
500,0.2965
750,0.221
1000,0.1692
1250,0.1252
1500,0.1011


TrainOutput(global_step=1660, training_loss=0.3054753763129912, metrics={'train_runtime': 390.3413, 'train_samples_per_second': 8.505, 'train_steps_per_second': 4.253, 'total_flos': 1.056252138356736e+16, 'train_loss': 0.3054753763129912, 'epoch': 20.0})

In [7]:
model.save_pretrained("./opt13b_lora_testcases")

In [14]:
from transformers import pipeline, TextStreamer

stop_token = "### Instruction:"
tokenizer.pad_token = tokenizer.eos_token  # Ensure no pad token issues

# Custom function to truncate model outputs at the stop token
def truncate_at_stop(text, stop_token="### Instruction:"):
    return text.split(stop_token)[0].strip()

# Updated inference code
def compare_base_and_lora(prompts, base_model_id="facebook/opt-1.3b", lora_adapter_path="./opt13b_lora_testcases"):
    tokenizer = AutoTokenizer.from_pretrained(base_model_id)
    tokenizer.pad_token = tokenizer.eos_token

    base_model = AutoModelForCausalLM.from_pretrained(base_model_id, torch_dtype=torch.float16, device_map="auto")
    base_pipe = pipeline("text-generation", model=base_model, tokenizer=tokenizer)

    lora_model = PeftModel.from_pretrained(base_model, lora_adapter_path, is_trainable=False)
    lora_model.eval()
    lora_pipe = pipeline("text-generation", model=lora_model, tokenizer=tokenizer)

    for i, prompt in enumerate(prompts):
        print(f"\n📌 Prompt {i+1}:\n{prompt.strip()}\n")

        # base_output = base_pipe(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
        # base_result = truncate_at_stop(base_output.replace(prompt, ""))
        # print("🔹 Base Model Response:\n", base_result, "\n")

        lora_output = lora_pipe(prompt, max_new_tokens=200, do_sample=False)[0]["generated_text"]
        lora_result = truncate_at_stop(lora_output.replace(prompt, ""))
        print("🔸 LoRA Model Response:\n", lora_result, "\n")
        print("—" * 100)

In [15]:
test_prompts = [
    """### Instruction:
SCR01 - Revised risk weights for rated corporate exposure - Validate revised Basel 3.1 risk weights for a rated corporate exposure using BigQuery and Looker Studio

### Input:
""

### Response:
""",

        """### Instruction:
Off-Balance-Sheet Exposure Conversion - Verify application of new CCFs

### Input:
""

### Response:
"""
]

compare_base_and_lora(test_prompts, model_id, "./opt13b_lora_testcases")


Device set to use cuda:0
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



📌 Prompt 1:
### Instruction:
SCR01 - Revised risk weights for rated corporate exposure - Validate revised Basel 3.1 risk weights for a rated corporate exposure using BigQuery and Looker Studio

### Input:
""

### Response:



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔹 Base Model Response:
 Step 1: Query BigQuery for exposure record and look-through partner (exposure ID, BigQuery table reference)
Test Data: SELECT * FROM exposures_dataset.exposures WHERE exposure_id = 'E_C_001'
Step 2: Query Looker Studio's visualization for the corporate exposure in the risk weightage table
Test Data: view dataset. Corporate_Exposure_RWA - Line Chart
Step 3: Validate risk weight for the corporate exposure (% risk weight) using BigQuery's standard deviation function
Expected Result: Risk weight = 100% (calculated as 100% ± standard deviation of rating category)
Test Data: BigQuery table reference: exposures_dataset.corporate_exposure
Scope: Databus integration and Looker Studio dashboard 



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔸 LoRA Model Response:
 Step 1: Query BigQuery for exposure record and look-through partner (exposure ID, BigQuery table reference)
Test Data: SELECT * FROM exposures_dataset.exposures WHERE exposure_id = 'E_C_001'
Step 2: Query Looker Studio's visualization for the corporate exposure in the risk weightage table
Test Data: view dataset. Corporate_Exposure_RWA - Line Chart
Step 3: Validate risk weight for the corporate exposure (% risk weight) using BigQuery's standard deviation function
Expected Result: Risk weight = 100% (calculated as 100% ± standard deviation of rating category)
Test Data: BigQuery table reference: exposures_dataset.corporate_exposure
Scope: Databus integration and Looker Studio dashboard 

————————————————————————————————————————————————————————————————————————————————————————————————————

📌 Prompt 2:
### Instruction:
Off-Balance-Sheet Exposure Conversion - Verify application of new CCFs

### Input:
""

### Response:



The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🔹 Base Model Response:
 Test CFT502: Conversion of Off-Balance-Sheet Exposures to CCFs
Input CFT502: Off-balance-sheet exposure with expected CCF class
Expected Output: Off-balance-sheet exposure converted to CCF format, reflecting new CCF reference rate and CCF weighting factor 

🔸 LoRA Model Response:
 Test CFT502: Conversion of Off-Balance-Sheet Exposures to CCFs
Input CFT502: Off-balance-sheet exposure with expected CCF class
Expected Output: Off-balance-sheet exposure converted to CCF format, reflecting new CCF reference rate and CCF weighting factor 

————————————————————————————————————————————————————————————————————————————————————————————————————


In [10]:
test_prompts = [
    """### Instruction:
Verify that users can register with valid information.

### Input:
""

### Response:
""",
    """### Instruction:
Verify that users cannot register with invalid information (e.g., invalid email format, duplicate username).

### Input:
""

### Response:
""",
    """### Instruction:
SCR01 - Revised risk weights for rated corporate exposure - Validate revised Basel 3.1 risk weights for a rated corporate exposure using BigQuery and Looker Studio

### Input:
""

### Response:
""",
    """### Instruction:
Write test cases for Two-Factor Authentication (2FA) login.

### Input:
User logs in with username and password, and then is prompted to enter a one-time code sent to their registered email or phone. If the code is incorrect or expired, login should fail.

### Response:
""",
    """### Instruction:
Write test cases for updating user profile information.

### Input:
Users should be able to update their name and contact number. Changes must be saved and reflected on subsequent logins. Empty fields or invalid phone numbers must trigger error messages.

### Response:
""",
    """### Instruction:
Generate test cases for deleting user accounts.

### Input:
Users can delete their accounts from settings. They must confirm the action via a confirmation prompt. Deleted accounts should be marked inactive and inaccessible.

### Response:
"""
]

# compare_base_and_lora(test_prompts, model_id, "./opt13b_lora_testcases")
