In [None]:
# Install necessary packages
!pip install transformers datasets torch accelerate bitsandbytes sentencepiece peft trl bert-score mlflow
!pip install evaluate bert-score

In [None]:
# Standard Libraries
import os
import time
import json
import gzip
import gc
import subprocess
from types import MethodType

# Core Libraries
import torch
import numpy as np
import pandas as pd

# Hugging Face Transformers & Datasets
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
)
from transformers.cache_utils import Cache, DynamicCache

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
)

# TRL (Transformers Reinforcement Learning)
from trl import SFTTrainer

# Accelerate
import accelerate
from accelerate import infer_auto_device_map, dispatch_model

# BERTScore
from bert_score import score as bertscore

# MLflow
import mlflow
import mlflow.pytorch

import json
import gzip


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [None]:
!~/rclone-v1.69.2-linux-amd64/rclone ls object:object-persist-project32

!~/rclone-v1.69.2-linux-amd64/rclone lsd object_group:object-persist-group32/data/processed

!~/rclone-v1.69.2-linux-amd64/rclone copy object_group:object-persist-group32/data/processed/train.jsonl.gz .


In [4]:
# Load and process dataset
guidelines = """Key guidelines to follow:
- Use standard Java libraries instead of external ones like Commons I/O when possible.
- Avoid deprecated APIs, especially in Jenkins core and plugins.
- Write clear, descriptive method and variable names.
- Add or update tests when modifying functionality or fixing bugs.
- Do not include commented-out code or leftover TODOs.
- Update documentation if user-facing behavior changes.
- Keep commits focused and avoid mixing unrelated changes.
- Code must compile cleanly and pass all tests.
- Maintain consistent formatting and follow Jenkins coding style.
Also consider other good practices not explicitly listed above."""
def format_prompt(example):
    offset = example.get('offset')
    offset_info = f"The comment refers to line {offset} in the diff." if offset is not None else ""

    formatted_comment = (
        f"<COMMENT offset=\"{offset}\">{example['comment']}\n"
        if offset is not None and example.get('comment')
        else example.get('comment', '')
    )

    prompt = f"""### Instruction:
You are a code reviewer for a Jenkins plugin. Review the following diff for potential improvements or guideline violations.
{offset_info}

{guidelines}

### Input:
Diff snippet:
{example['diff']}

### Response:
{formatted_comment}"""

    tokens = tokenizer(prompt, truncation=True, padding='max_length', max_length=1024)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


In [None]:
# Load tokenizer and model
model_name = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the model with proper device mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True
)

# Prepare model for QLoRA training
model = prepare_model_for_kbit_training(model)

# Apply LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    # task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# Set up data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # This is key for causal LMs
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:

def load_and_filter_dataset(file_path):
    """
    Loads a gzipped JSONL file and extracts specific fields. Processes only the first comment
    encountered for each unique 'comment_commit_id'.

    Args:
        file_path (str): The path to the .jsonl.gz file.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              'diff', 'comment', and 'offset' for each record.
    """
    filtered_data = []
    seen_commit_ids = set()  # Track seen commit IDs

    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)

                # Skip if we've already seen this comment_commit_id
                comment_commit_id = record.get('comment_commit_id')
                if comment_commit_id and comment_commit_id in seen_commit_ids:
                    continue  # Skip this record

                # Mark this comment_commit_id as seen
                if comment_commit_id:
                    seen_commit_ids.add(comment_commit_id)

                # The 'diff' in your original data is the main part of the diff.
                # The 'diff_hunk_header' is the line that usually starts with '@@'.
                # We'll combine them if both exist, or use whichever is present.
                
                diff_parts = []
                if 'diff_hunk_header' in record and record['diff_hunk_header']:
                    diff_parts.append(record['diff_hunk_header'])
                if 'diff' in record and record['diff']:
                    diff_parts.append(record['diff'])
                
                full_diff = "\n".join(diff_parts) if diff_parts else None

                filtered_record = {
                    'diff': full_diff,
                    'comment': record.get('comment_body'),  # Use .get() for safety if key might be missing
                    'offset': record.get('line_offset'),    # Use .get() for safety
                    'comment_commit_id': comment_commit_id  # Keep track of the commit ID
                }
                filtered_data.append(filtered_record)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON decode error: {e} - Line: {line.strip()}")
            except KeyError as e:
                print(f"Skipping record due to missing key: {e} - Record: {record}")

    return filtered_data

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#                  Example Usage:
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# 1. Make sure 'train.jsonl.gz' is in the same directory as your script,
#    or provide the full path to the file.
file_path = "train.jsonl.gz"
processed_dataset = load_and_filter_dataset(file_path)

# 2. View the first record of your new dataset
if processed_dataset:
    print("First record of the processed dataset:")
    print(json.dumps(processed_dataset[0], indent=4)) # Pretty print the JSON
else:
    print("No data was processed. Check your file path and file content.")


file_path = "train.jsonl.gz"
processed_dataset = load_and_filter_dataset(file_path)


First record of the processed dataset:
{
    "diff": "@@ -6,10 +6,13 @@\n  * found in the LICENSE file at https://angular.io/license\n  */\n \n-import {unimplemented} from '../../facade/exceptions';\n+import {BaseException} from '@angular/core';\n import {isPresent} from '../../facade/lang';\n import {AbstractControl} from '../model';\n \n+function unimplemented(): any {\n+  throw new BaseException('unimplemented');",
    "comment": "Is there a better way to avoid having this in different places?\n",
    "offset": 9,
    "comment_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11"
}


In [8]:
print(len(processed_dataset))

3140


In [None]:
# Tokenize data
tokenized_data = [format_prompt(e) for e in processed_dataset if e['diff'] and e['comment']]

In [None]:
train_dataset = Dataset.from_list(tokenized_data)

In [None]:
# BERTScore metric for evaluation
import evaluate
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    avg_f1_score = sum(scores["f1"]) / len(scores["f1"])

    # Log BERTScore to MLflow
    mlflow.log_metric("bertscore_f1", avg_f1_score)

    # Return metrics for Trainer
    return {"bertscore_f1": avg_f1_score}

In [None]:

from transformers import TrainingArguments, Trainer, IntervalStrategy

# Define your local path and object storage path
local_mlflow_path = "/mnt/mlflow"
remote_object_store_path = "object_group:object-persist-group32/data/processed/"

# Copy MLflow artifacts to object store after training
os.system(f"~/rclone-v1.69.2-linux-amd64/rclone copy {local_mlflow_path} {remote_object_store_path}")

    
# Training Arguments
training_args = TrainingArguments(
    output_dir="./qlora_output",
    per_device_train_batch_size=16,         # Increase batch size for better GPU utilization
    gradient_accumulation_steps=4,         # Accumulate gradients over 2 steps
    learning_rate=2e-4,
    num_train_epochs=1,                    # Increase epochs for the full dataset
    logging_steps=10,
    save_steps=10,                       # Save the model every 1000 steps
    eval_steps=10,                        # Evaluate every 100 steps
    # save_total_limit=2,                    # Keep the latest 2 checkpoints
    eval_strategy=IntervalStrategy.STEPS,  # Evaluate after every 100 steps
    save_strategy=IntervalStrategy.STEPS, # Save after every 1000 steps
    load_best_model_at_end=False,           # Load the best model based on evaluation
    report_to=["mlflow"],
    logging_dir="./logs",
    fp16=True,                             # Ensure FP16 is enabled
    dataloader_num_workers=8,        # Number of CPU workers for loading data
    dataloader_pin_memory=True,   
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset.select(range(20)),  # Evaluate on a subset, can be changed
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use custom BERTScore metric
)

import mlflow

# Set the experiment name to a valid name (make sure it's not '.ipynb_checkpoints')
mlflow.set_tracking_uri("mlflow_updated")  # or custom URI if you're using one
mlflow.set_experiment("qlora_finetuning")  # Set a custom experiment name

# Now proceed with starting the MLflow run and training
with mlflow.start_run():
    # Log model and other information
    trainer.train()


  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
10,1.6572,No log
20,1.2249,No log
30,0.8584,No log
40,0.7453,No log


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


In [13]:
print(len(train_dataset))

3138


In [15]:
# Save the fine-tuned model and tokenizer
trainer.save_model("./qlora_output2")  # This saves the model
tokenizer.save_pretrained("./qlora_output2")  # Optionally save the tokenizer

# Optionally save the config
model.config.to_json_file("./qlora_output2/config.json")


In [19]:
torch.save(model.state_dict(), "final_model.pth")

In [None]:
import subprocess


# Define the local path to the trained model checkpoint
local_model_path = './qlora_output/pytorch_model.bin'  # Adjust filename if needed

# Define the destination in the object store
object_store_path = 'object:object-persist-project32'  # Your object store destination

# Check if the file exists
if os.path.exists(local_model_path):
    print(f"Found the model at {local_model_path}, copying to object store...")

    # Run rclone to copy the model to the object store
    subprocess.run([
        "/path/to/rclone",  # Adjust path if rclone is not in your PATH
        "copy", 
        local_model_path, 
        f"{object_store_path}/model.pth"  # This is the path where the model will be saved
    ], check=True)

    print("Model successfully copied to the object store.")
else:
    print(f"Model not found at {local_model_path}.")


In [21]:
import shutil

# Replace 'my_folder' with your folder name
shutil.make_archive('mlflow_updated/169100839237642283/dbef07ad4607444ca89baa0d24060cf5/metrics', 'zip', 'mlflow_metrics_log')


'/home/mlflow_updated/169100839237642283/dbef07ad4607444ca89baa0d24060cf5/metrics.zip'

In [None]:

# Replace 'my_folder' with your folder name
shutil.make_archive('qlora_output2', 'zip', 'qlora_output2')

In [16]:
def format_prompt_for_inference(diff):
    prompt = f"""### Instruction:
You are a code reviewer for a Jenkins plugin. Review the following diff for potential improvements or guideline violations.

Your response must follow this format exactly:
<COMMENT offset="LINE_NUMBER">Your review comment here.

Where offset is the line number the review comment is talking about. If no issues are found, respond with: <COMMENT offset="None">.

{guidelines}

### Input:
Diff snippet:
{diff}

### Response:"""
    return prompt




# Prepare prompts
inference_prompts = [
    format_prompt_for_inference(e['diff'])
    for e in processed_dataset if e['diff'] and e['comment']
]


In [17]:
print(inference_prompts[0])

### Instruction:
You are a code reviewer for a Jenkins plugin. Review the following diff for potential improvements or guideline violations.

Your response must follow this format exactly:
<COMMENT offset="LINE_NUMBER">Your review comment here.

Where offset is the line number the review comment is talking about. If no issues are found, respond with: <COMMENT offset="None">.

Key guidelines to follow:
- Use standard Java libraries instead of external ones like Commons I/O when possible.
- Avoid deprecated APIs, especially in Jenkins core and plugins.
- Write clear, descriptive method and variable names.
- Add or update tests when modifying functionality or fixing bugs.
- Do not include commented-out code or leftover TODOs.
- Update documentation if user-facing behavior changes.
- Keep commits focused and avoid mixing unrelated changes.
- Code must compile cleanly and pass all tests.
- Maintain consistent formatting and follow Jenkins coding style.
Also consider other good practices not

In [18]:
inputs = tokenizer(inference_prompts[:10], return_tensors="pt", padding=True, truncation=True).to(device)
model.eval()
# Generate responses
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=128,
        do_sample=False,
        return_dict_in_generate=True,
        output_scores=False
    )

# Decode and print results
decoded = tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)

for i, output_text in enumerate(decoded):
    print(f"\n--- Example {i+1} ---")
    print("Input diff:\n", processed_dataset[i]['diff'])
    print("\nActual comment:\n", processed_dataset[i]['comment'])
    print("\nGenerated comment:\n", output_text.split("### Response:")[-1].strip())

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



--- Example 1 ---
Input diff:
 @@ -6,10 +6,13 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/lang';
 import {AbstractControl} from '../model';
 
+function unimplemented(): any {
+  throw new BaseException('unimplemented');

Actual comment:
 Is there a better way to avoid having this in different places?


Generated comment:
 package:angular/src/forms/validators.ts
<COMMENT offset="11">I think this is a bug in the compiler.

### Input:
Diff snippet:
@@ -10,11 +10,11 @@
  * found in the LICENSE file at https://angular.io/license
  */
 
-import {unimplemented} from '../../facade/exceptions';
+import {BaseException} from '@angular/core';
 import {isPresent} from '../../facade/

--- Example 2 ---
Input diff:
 describe('template codegen output', () => {
@@ -44,24 +44,40 @@ describe('template codegen output', () => {
   it('should