### Open the notebook on Colab

You should have already started a notebook server in a container on a Chameleon GPU host, and set up an SSH tunnel to this notebook server. Now, you will connect this Colab notebook to that runtime on Chameleon. This is a convenient way to work, because the notebook and its outputs will be saved automatically in your Google Drive.

-   Next to the “Connect” button in the top right, there is a ▼ symbol. Click on this symbol to expand the menu, and choose “Connect to a local runtime”.
-   Paste the URL you copied earlier (`http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX`) into the text field that appears, and choose “Connect”.

**Alternatively, if you prefer not to use Colab** (or can’t, for some reason): just put the `http://127.0.0.1:8888/lab?token=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX` URL you copied earlier into your browser to open the Jupyter interface directly. In that case, you will need to open a terminal in that Jupyter interface and run

    wget https://raw.githubusercontent.com/teaching-on-testbeds/llm-chi/refs/heads/main/workspace/2_single_gpu_a100.ipynb

to get a copy of this notebook in that workspace.

In [1]:
# Install necessary packages
!pip install transformers datasets torch accelerate bitsandbytes sentencepiece peft trl bert-score



In [2]:
import gc
import json
import os
import subprocess
import time
from types import MethodType

import accelerate
import mlflow
import mlflow.pytorch
import numpy as np
import pandas as pd
import torch
from accelerate import infer_auto_device_map, dispatch_model
from bert_score import score as bertscore
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, GenerationConfig
from transformers import TrainerCallback, pipeline
from transformers.cache_utils import Cache, DynamicCache
from trl import SFTTrainer

In [3]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Show which device string ("cuda" or "cpu") was selected.
print(device)

cuda


In [5]:
# Lookup table from a dtype name to the corresponding torch dtype.
# load_model_for_qlora() reads it to choose the 4-bit compute dtype.
DTYPE_MAP = dict(
    # float32=torch.float32,  (disabled)
    float16=torch.float16,
    bfloat16=torch.bfloat16,
)

In [6]:
# Function to load models


def load_model_for_qlora(model_name, dtype_str='float16'):
    """Load a tokenizer and a 4-bit quantized causal LM prepared for QLoRA.

    Args:
        model_name: Model identifier or path handed to ``from_pretrained``.
        dtype_str: Key into ``DTYPE_MAP`` selecting the 4-bit compute dtype.
            An unknown key makes ``DTYPE_MAP.get`` return ``None``, which is
            passed through to ``BitsAndBytesConfig`` unchanged.

    Returns:
        A ``(tokenizer, model)`` tuple; the model has already been passed
        through ``prepare_model_for_kbit_training``.
    """
    compute_dtype = DTYPE_MAP.get(dtype_str)
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    tok = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        device_map="auto",
        quantization_config=quant_config,
    )

    # Required for QLoRA
    return tok, prepare_model_for_kbit_training(base_model)



def patched_prepare_inputs_for_generation(
    self,
    input_ids,
    past_key_values=None,
    attention_mask=None,
    inputs_embeds=None,
    **kwargs,
):
    """Assemble the model inputs for one generation step.

    Bound onto a model instance by ``load_and_patch_model`` as its
    ``prepare_inputs_for_generation`` method. When a cache is supplied, the
    tokens it already covers are trimmed from ``input_ids``, and the attention
    mask is cropped if the cache's maximum length would be exceeded.

    Args:
        input_ids: Token ids, indexed here as a 2-D tensor
            (presumably ``(batch, seq)`` — TODO confirm).
        past_key_values: ``None``, a ``Cache`` instance, or a legacy nested
            sequence whose ``[0][0].shape[2]`` gives the cached length.
        attention_mask: Optional mask; also used to derive ``position_ids``
            when none are given in ``kwargs``.
        inputs_embeds: Optional embeddings; used instead of ``input_ids`` only
            when ``past_key_values`` is ``None``.
        **kwargs: Only ``position_ids`` and ``use_cache`` are read.

    Returns:
        dict with either ``inputs_embeds`` or ``input_ids``, plus
        ``position_ids``, ``past_key_values``, ``use_cache`` and
        ``attention_mask``.
    """
    if past_key_values is not None:
        if isinstance(past_key_values, Cache):
            # NOTE(review): `seen_tokens` / `get_max_cache_shape` availability
            # depends on the installed transformers version — confirm.
            cache_length = past_key_values.get_seq_length()
            past_length = past_key_values.seen_tokens
            max_cache_length = past_key_values.get_max_cache_shape()
        else:
            # Legacy format: take the cached length from the first entry's shape.
            cache_length = past_length = past_key_values[0][0].shape[2]
            max_cache_length = None

        # Keep only the unprocessed tokens:
        # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
        # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as
        # input)
        if (
            attention_mask is not None
            and attention_mask.shape[1] > input_ids.shape[1]
        ):
            input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
        # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
        # input_ids based on the past_length.
        elif past_length < input_ids.shape[1]:
            input_ids = input_ids[:, past_length:]
        # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.

        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
        if (
            max_cache_length is not None
            and attention_mask is not None
            and cache_length + input_ids.shape[1] > max_cache_length
        ):
            attention_mask = attention_mask[:, -max_cache_length:]

    position_ids = kwargs.get("position_ids", None)
    if attention_mask is not None and position_ids is None:
        # create position_ids on the fly for batch generation
        position_ids = attention_mask.long().cumsum(-1) - 1
        # Masked-out positions get a placeholder position of 1.
        position_ids.masked_fill_(attention_mask == 0, 1)
        if past_key_values:
            # With a non-empty cache, keep positions only for the remaining input tokens.
            position_ids = position_ids[:, -input_ids.shape[1] :]

    # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
    if inputs_embeds is not None and past_key_values is None:
        model_inputs = {"inputs_embeds": inputs_embeds}
    else:
        model_inputs = {"input_ids": input_ids}

    model_inputs.update(
        {
            "position_ids": position_ids,
            "past_key_values": past_key_values,
            "use_cache": kwargs.get("use_cache"),
            "attention_mask": attention_mask,
        }
    )
    return model_inputs


def load_and_patch_model(path, dtype='bfloat16'):
    """Load a QLoRA-ready model and swap in the patched input-preparation hook.

    Args:
        path: Model identifier or path forwarded to ``load_model_for_qlora``.
        dtype: Dtype name forwarded to ``load_model_for_qlora``.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tok, net = load_model_for_qlora(path, dtype)
    # Bind the replacement to this instance; the attribute is set on the
    # object itself, not on its class.
    bound_hook = MethodType(patched_prepare_inputs_for_generation, net)
    net.prepare_inputs_for_generation = bound_hook
    return tok, net



In [7]:
def print_gpu_memory(note=""):
    """Print used/total GPU memory as reported by ``nvidia-smi``.

    With exactly one GPU the output is a single line
    ``"{note} GPU memory: {used} MiB / {total} MiB"``. With several GPUs one
    line per device is printed, labelled ``GPU <index>``. If ``nvidia-smi`` is
    missing, fails, or reports nothing, a short "unavailable" line is printed
    instead of raising.

    Args:
        note: Free-text prefix for every printed line.
    """
    query = ["nvidia-smi", "--query-gpu=memory.used,memory.total", "--format=csv,nounits,noheader"]
    try:
        result = subprocess.run(query, stdout=subprocess.PIPE, text=True)
    except FileNotFoundError:
        # No NVIDIA driver tools on this host (e.g. CPU-only runtime).
        print(f"{note} GPU memory: unavailable (nvidia-smi not found)")
        return

    # nvidia-smi emits one CSV row per GPU; blank lines carry no data.
    rows = [line for line in result.stdout.splitlines() if line.strip()]
    if result.returncode != 0 or not rows:
        print(f"{note} GPU memory: unavailable (nvidia-smi returned no data)")
        return

    for index, row in enumerate(rows):
        used, total = (int(value) for value in row.split(','))
        label = "GPU memory" if len(rows) == 1 else f"GPU {index} memory"
        print(f"{note} {label}: {used} MiB / {total} MiB")


In [8]:
def generate_code_with_profiling(model, tokenizer, prompt, max_new_tokens=64):
    """Greedy-generate from ``prompt`` and report wall time and peak GPU memory.

    CUDA bookkeeping (cache clearing, peak-stat reset, synchronisation) is only
    performed when the model's first parameter lives on a CUDA device, so the
    function also runs on a CPU-only runtime; peak memory is reported as ``0``
    in that case.

    Args:
        model: Model exposing ``parameters()`` and ``generate()``.
        tokenizer: Callable tokenizer exposing ``eos_token_id`` and ``decode``.
        prompt: Text to tokenize and feed to ``generate``.
        max_new_tokens: Upper bound on newly generated tokens.

    Returns:
        ``(decoded_output, inference_time, peak_memory_usage)`` where the time
        is in seconds and the memory is in bytes.
    """
    # Tokenize input and move it to the device of the model's first parameter
    device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    on_cuda = device.type == "cuda"

    if on_cuda:
        # Clear GPU memory and reset peak memory stats
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats(device)
        torch.cuda.synchronize()

    with torch.no_grad():
        # perf_counter is monotonic, so the measurement cannot be skewed by
        # wall-clock adjustments.
        start_time = time.perf_counter()

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
            use_cache=True
        )

        if on_cuda:
            # Wait for queued kernels so the timer covers the actual work.
            torch.cuda.synchronize()
        end_time = time.perf_counter()

    # Calculate stats
    inference_time = end_time - start_time
    # Peak memory used during generate(); not tracked off-GPU.
    peak_memory_usage = torch.cuda.max_memory_allocated(device) if on_cuda else 0

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"⏱️ Inference time: {inference_time:.4f} seconds")
    print(f"📈 Peak memory usage: {peak_memory_usage} bytes")

    return decoded_output, inference_time, peak_memory_usage


In [9]:
# def clear_previous_model():
#     if 'loaded_model' in locals() and hasattr(loaded_model, 'clear_cache'):
#         loaded_model.clear_cache()
#     try:
#         del loaded_model
#         del tokenizer
#     except NameError:
#         pass
#     gc.collect()
#     torch.cuda.empty_cache()
#     print_gpu_memory("After freeing previous model")
def clear_previous_model():
    """Drop the notebook-level model and tokenizer and release cached GPU memory.

    Removes the globals ``model`` and ``loaded_model`` (both names are handled
    because either may hold a model) as well as ``tokenizer``, if they exist.
    A model's ``clear_cache()`` is called first when it provides one.
    Afterwards the garbage collector runs, the CUDA caching allocator is
    emptied and the current GPU memory usage is printed.

    Objects that are still referenced from elsewhere (for example by a trainer)
    stay alive; only the global names are removed here.
    """
    namespace = globals()

    for name in ("model", "loaded_model"):
        previous = namespace.pop(name, None)
        if previous is None:
            continue
        try:
            if hasattr(previous, 'clear_cache'):
                previous.clear_cache()
        except Exception as e:
            # Best effort: a failing cache hook must not block the cleanup.
            print("Error clearing model:", e)
        # Drop the local reference too, so gc.collect() below can reclaim it.
        del previous

    namespace.pop("tokenizer", None)

    gc.collect()
    torch.cuda.empty_cache()
    print_gpu_memory("After freeing previous model")

In [10]:
# # DeepSeek Coder V2
# model_name = "Lite-Base"
# model_address = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"


# for dtype_name, torch_dtype in DTYPE_MAP.items():
#     clear_previous_model()

#     torch.cuda.empty_cache()
#     gc.collect()
#     print_gpu_memory("Before loading model with dtype ",dtype_name)
#     tokenizer, loaded_model = load_and_patch_model(model_address, dtype_name)
#     print_gpu_memory("After loading model with dtype ",dtype_name)

#     for idx, sample in enumerate(dataset):
#         print(f"\n==================== Sample {idx + 1} ====================")
#         print(f"Intent: {sample['intent']}")
#         prompt = f"### Instruction:\n{sample['intent']}\n\n### Response:"
        
#         with mlflow.start_run(run_name=f"{model_name}-sample-{idx}"):
#             output, inference_time, memory_usage = generate_code_with_profiling(loaded_model, tokenizer, prompt)
        
#             # Log to MLflow
#             mlflow.log_param("model_name", model_name)
#             mlflow.log_param("dtype", dtype)
#             mlflow.log_metric("inference_time", inference_time)
#             mlflow.log_metric("memory_usage_bytes", memory_usage)
#             mlflow.log_metric("reserved_memory_usage_bytes", reserved_memory_usage)

#         print(f"\n🔹 Output from {model_name}:\n{output}")
#         print(f"⏱️ Inference time: {inference_time:.4f} seconds\n")


In [11]:
# model_name = "Lite-Instruct"
# model_address = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
# dtype_name = 'float16'
# tokenizer, loaded_model = load_and_patch_model(model_address, dtype_name)
# print_gpu_memory("After loading model with dtype ")

In [12]:
def compute_bertscore(eval_dataset, model, tokenizer, max_samples=None):
    """Generate a prediction per sample and score it against the reference with BERTScore.

    Only the newly generated tokens are decoded. ``generate()`` on a causal LM
    returns the prompt followed by the continuation, so decoding the whole
    sequence would score the prompt text as part of the prediction.

    Args:
        eval_dataset: Iterable of mappings with ``"input"`` and ``"output"`` keys.
        model: Model exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the input and decode the output.
        max_samples: Stop after this many samples; a falsy value means no limit.

    Returns:
        dict with the mean ``bertscore_precision``, ``bertscore_recall`` and
        ``bertscore_f1`` over the evaluated samples.
    """
    # NOTE(review): the prompt here is the raw "input" field, while training uses
    # the "### Instruction / ### Input / ### Response" template — confirm whether
    # evaluation should use the same template.
    model.eval()
    preds, refs = [], []

    for i, sample in enumerate(eval_dataset):
        if max_samples and i >= max_samples:
            break

        input_text = sample["input"]
        reference = sample["output"]

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True).to(model.device)
        prompt_length = inputs["input_ids"].shape[1]
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=128,
                do_sample=False,
                num_beams=4
            )
        # Skip the echoed prompt tokens and keep only the continuation.
        generated_ids = output_ids[0][prompt_length:]
        prediction = tokenizer.decode(generated_ids, skip_special_tokens=True)

        preds.append(prediction)
        refs.append(reference)

    # Compute BERTScore
    P, R, F1 = bertscore(preds, refs, lang="en")
    return {
        "bertscore_precision": P.mean().item(),
        "bertscore_recall": R.mean().item(),
        "bertscore_f1": F1.mean().item()
    }


In [13]:
def load_comment_jsons(folder_path):
    """Build an instruction-tuning table from review comments and their diffs.

    For every ``<name>_comments.jsonl`` file in ``folder_path`` that has a
    matching ``<name>.diff`` file, each JSON line containing both ``diff_hunk``
    and ``body`` becomes one row. Files are visited in sorted order so the row
    order (and any seeded split made from it) is reproducible across machines.
    Blank lines in the JSONL files are skipped.

    Args:
        folder_path: Directory holding the ``.diff`` and ``_comments.jsonl`` files.

    Returns:
        ``pandas.DataFrame`` with the columns ``instruction``, ``input`` and
        ``output``; the columns are present even when no rows were found.
    """
    suffix = "_comments.jsonl"
    columns = ["instruction", "input", "output"]
    instruction = """You are a code reviewer for a Jenkins plugin. Review the following diff for potential improvements or guideline violations.

Key guidelines to follow:
- Use standard Java libraries instead of external ones like Commons I/O when possible.
- Avoid deprecated APIs, especially in Jenkins core and plugins.
- Write clear, descriptive method and variable names.
- Add or update tests when modifying functionality or fixing bugs.
- Do not include commented-out code or leftover TODOs.
- Update documentation if user-facing behavior changes.
- Keep commits focused and avoid mixing unrelated changes.
- Code must compile cleanly and pass all tests.
- Maintain consistent formatting and follow Jenkins coding style.

Also consider other good practices not explicitly listed above."""

    data = []
    for fname in sorted(os.listdir(folder_path)):
        if not fname.endswith(suffix):
            continue

        # Strip only the trailing suffix; str.replace would also remove an
        # occurrence in the middle of the name.
        repo_base = fname[:-len(suffix)]
        diff_path = os.path.join(folder_path, f"{repo_base}.diff")
        if not os.path.exists(diff_path):
            continue

        # Load full diff file
        with open(diff_path, "r", encoding="utf-8") as f:
            full_diff = f.read()

        # Load comment JSONL
        with open(os.path.join(folder_path, fname), "r", encoding="utf-8") as f:
            for line in f:
                if not line.strip():
                    # Tolerate blank or trailing empty lines in the JSONL file.
                    continue
                entry = json.loads(line)
                if "diff_hunk" in entry and "body" in entry:
                    data.append({
                        "instruction": instruction,
                        "input": f"Full Diff:\n{full_diff}\n\nFocused Hunk:\n{entry['diff_hunk']}",
                        "output": entry["body"]
                    })
    return pd.DataFrame(data, columns=columns)


# Build the review table from the files in the "git-client-plugin" folder and preview it.
df = load_comment_jsons(folder_path="git-client-plugin")
df.head()


Unnamed: 0,instruction,input,output
0,You are a code reviewer for a Jenkins plugin. ...,Full Diff:\ndiff --git a/src/test/java/org/jen...,Please don't make whiitespace changes to exist...
1,You are a code reviewer for a Jenkins plugin. ...,Full Diff:\ndiff --git a/src/test/java/org/jen...,Please don't make white space changes in exist...
2,You are a code reviewer for a Jenkins plugin. ...,Full Diff:\ndiff --git a/src/test/java/org/jen...,Please remove the pure white space changes in ...
3,You are a code reviewer for a Jenkins plugin. ...,Full Diff:\ndiff --git a/Jenkinsfile b/Jenkins...,I know you're only updating the jenkins versio...
4,You are a code reviewer for a Jenkins plugin. ...,Full Diff:\ndiff --git a/Jenkinsfile b/Jenkins...,Good question. I will add a comment explainin...


In [14]:
# Write the table as JSON Lines: one record object per line.
output_path = "pr_review_dataset.json"
df.to_json(path_or_buf=output_path, lines=True, orient="records")


In [15]:

# Read the JSON Lines file and hold out 10% of the rows for validation (fixed seed).
dataset = load_dataset(
    "json", data_files="pr_review_dataset.json", split="train"
).train_test_split(test_size=0.1, seed=42)
train_dataset, val_dataset = dataset["train"], dataset["test"]

Generating train split: 0 examples [00:00, ? examples/s]

In [16]:


# Free any previously loaded model before loading a new one.
clear_previous_model()

# Load tokenizer + 4-bit quantized model
model_name = "Lite-Instruct"
model_address = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
dtype_name = 'float16'
tokenizer, model = load_and_patch_model(model_address, dtype_name)
print_gpu_memory(f"After loading model with dtype {dtype_name}")

# load_and_patch_model() -> load_model_for_qlora() already applies
# prepare_model_for_kbit_training(), so it is not repeated here.

# LoRA config
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # tune depending on DeepSeek architecture
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

# Training args (fp16 matches the float16 compute dtype chosen above)
training_args = TrainingArguments(
    output_dir="./qlora-output",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    fp16=True,
    report_to="none"
)










After freeing previous model GPU memory: 1 MiB / 81920 MiB


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

After loading model with dtype  GPU memory: 15779 MiB / 81920 MiB


In [17]:
def formatting_func(example):
    """Render one dataset row as an instruction / input / response prompt string."""
    sections = (
        f"### Instruction:\n{example['instruction']}",
        f"### Input:\n{example['input']}",
        f"### Response:\n{example['output']}",
    )
    return "\n\n".join(sections)


# Supervised fine-tuning trainer; each row is turned into text by formatting_func.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    formatting_func=formatting_func,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)



Applying formatting function to train dataset:   0%|          | 0/528 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/528 [00:00<?, ? examples/s]

Adding EOS to train dataset:   0%|          | 0/528 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/528 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (19245 > 16384). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/528 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/59 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/59 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/59 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/59 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/59 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [18]:
# Select the MLflow experiment by name for the runs started below.
mlflow.set_experiment("pr_reviewer_model_exp")

<Experiment: artifact_location='mlflow-artifacts:/1', creation_time=1746855207918, experiment_id='1', last_update_time=1746855207918, lifecycle_stage='active', name='pr_reviewer_model_exp', tags={}>

In [19]:
# from accelerate.state import AcceleratorState

# # Reset global accelerator state
# AcceleratorState._reset_state()


In [None]:
class EpochProfilingCallback(TrainerCallback):
    """Log per-epoch wall time, peak GPU memory and BERTScore to the active MLflow run.

    ``trainer.train()`` already runs all ``num_train_epochs`` epochs, so the
    per-epoch measurements are taken from the trainer's epoch hooks instead of
    calling ``train()`` once per epoch (which would repeat the whole schedule
    each time).
    """

    def __init__(self, eval_dataset, model, tokenizer, max_samples=100):
        self.eval_dataset = eval_dataset
        self.model = model
        self.tokenizer = tokenizer
        self.max_samples = max_samples
        self._epoch_start = None

    def on_epoch_begin(self, args, state, control, **kwargs):
        epoch = int(round(state.epoch or 0)) + 1
        print(f"\n--- Epoch {epoch} ---")

        # Reset and measure peak memory before the training epoch
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        torch.cuda.synchronize()
        self._epoch_start = time.time()

    def on_epoch_end(self, args, state, control, **kwargs):
        torch.cuda.synchronize()
        train_time = time.time() - self._epoch_start
        peak_memory = torch.cuda.max_memory_allocated()
        epoch = int(round(state.epoch or 0))

        # Log timing and memory
        mlflow.log_metric(f"train_time_epoch_{epoch}", train_time)
        mlflow.log_metric(f"peak_memory_epoch_{epoch}", peak_memory)

        # Evaluate
        print("\nEvaluating with BERTScore...")
        bert_metrics = compute_bertscore(
            self.eval_dataset, self.model, self.tokenizer, max_samples=self.max_samples
        )
        mlflow.log_metrics({f"{k}_epoch_{epoch}": v for k, v in bert_metrics.items()})
        print(f"📊 BERTScore F1 (Epoch {epoch}): {bert_metrics['bertscore_f1']:.4f}")


# Re-running this cell must not stack duplicate callbacks on the trainer.
trainer.remove_callback(EpochProfilingCallback)
trainer.add_callback(EpochProfilingCallback(val_dataset, model, tokenizer, max_samples=100))

with mlflow.start_run(run_name="qlora-deepseek-pr-review"):

    mlflow.log_params({
        "model_name": model_name,
        "batch_size": training_args.per_device_train_batch_size,
        "epochs": training_args.num_train_epochs,
        "learning_rate": training_args.learning_rate,
        "gradient_accumulation_steps": training_args.gradient_accumulation_steps,
        "lora_r": lora_config.r,
        "lora_alpha": lora_config.lora_alpha
    })

    # A single call trains for training_args.num_train_epochs epochs.
    trainer.train()



--- Epoch 1 ---


  return fn(*args, **kwargs)


Step,Training Loss
10,1.1552
20,0.9979
30,0.9326
40,0.8401
50,0.7279


In [None]:
# Persist the current model weights as a raw PyTorch state dict.
# NOTE(review): `model` is the PEFT-wrapped model at this point, so this
# presumably serializes the quantized base weights as well as the LoRA
# adapter — consider model.save_pretrained(...) to store only the adapter;
# confirm what the downstream loader expects.
torch.save(model.state_dict(), "my_finetuned_model.pth")


In [None]:

model.eval()
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# The text-generation pipeline expects a prompt string, so render the sample
# with the same template used for training, leaving the response empty for
# the model to fill in.
sample = {
    "instruction": "Review the following diff for guideline violations.",
    "input": "@@ -536,7 +536,7 @@ public void tag(String name, String message) throws GitException {\n- Ref tag = repo.getRefDatabase().getRef(R_TAGS + tagName);\n+ Ref tag = repo.getRefDatabase().exactRef(R_TAGS + tagName);",
    "output": "",
}
prompt = formatting_func(sample)

# return_full_text=False prints only the generated review, not the echoed prompt.
response = pipe(prompt, max_new_tokens=128, do_sample=False, return_full_text=False)

print(response[0]['generated_text'])