In [None]:
# Install necessary packages
!pip install transformers datasets torch accelerate bitsandbytes sentencepiece peft trl bert-score mlflow
!pip install evaluate bert-score

In [None]:
# Standard Libraries
import os
import time
import json
import gzip
import gc
import subprocess
from types import MethodType

# Core Libraries
import torch
import numpy as np
import pandas as pd

# Hugging Face Transformers & Datasets
from datasets import load_dataset, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    GenerationConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
)
from transformers.cache_utils import Cache, DynamicCache

# PEFT (Parameter-Efficient Fine-Tuning)
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    prepare_model_for_kbit_training,
)

# TRL (Transformers Reinforcement Learning)
from trl import SFTTrainer

# Accelerate
import accelerate
from accelerate import infer_auto_device_map, dispatch_model

# BERTScore
from bert_score import score as bertscore

# MLflow
import mlflow
import mlflow.pytorch

import json
import gzip


In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda:0


In [None]:
!~/rclone-v1.69.2-linux-amd64/rclone ls object:object-persist-project32

!~/rclone-v1.69.2-linux-amd64/rclone lsd object_group:object-persist-group32/data/processed

!~/rclone-v1.69.2-linux-amd64/rclone copy object_group:object-persist-group32/data/processed/train.jsonl.gz .


In [4]:
# Load and process dataset
guidelines = """Key guidelines to follow:
- Use standard Java libraries instead of external ones like Commons I/O when possible.
- Avoid deprecated APIs, especially in Jenkins core and plugins.
- Write clear, descriptive method and variable names.
- Add or update tests when modifying functionality or fixing bugs.
- Do not include commented-out code or leftover TODOs.
- Update documentation if user-facing behavior changes.
- Keep commits focused and avoid mixing unrelated changes.
- Code must compile cleanly and pass all tests.
- Maintain consistent formatting and follow Jenkins coding style.
Also consider other good practices not explicitly listed above."""
def format_prompt(example):
    offset = example.get('offset')
    offset_info = f"The comment refers to line {offset} in the diff." if offset is not None else ""

    formatted_comment = (
        f"<COMMENT offset=\"{offset}\">{example['comment']}\n"
        if offset is not None and example.get('comment')
        else example.get('comment', '')
    )

    prompt = f"""### Instruction:
You are a code reviewer for a Jenkins plugin. Review the following diff for potential improvements or guideline violations.
{offset_info}

{guidelines}

### Input:
Diff snippet:
{example['diff']}

### Response:
{formatted_comment}"""

    tokens = tokenizer(prompt, truncation=True, padding='max_length', max_length=1024)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens


In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from datasets import Dataset

In [6]:
# Load tokenizer and model
model_name = "codellama/CodeLlama-7b-Instruct-hf"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Load the model with proper device mapping
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
    trust_remote_code=True
)

# Prepare model for QLoRA training
model = prepare_model_for_kbit_training(model)

# Apply LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    # task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# Set up data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # This is key for causal LMs
)



The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
import json
import gzip

def load_and_filter_dataset(file_path):
    """
    Loads a gzipped JSONL file and extracts specific fields.

    Args:
        file_path (str): The path to the .jsonl.gz file.

    Returns:
        list: A list of dictionaries, where each dictionary contains
              'diff', 'comment', and 'offset' for each record.
    """
    filtered_data = []
    with gzip.open(file_path, "rt", encoding="utf-8") as f:
        for line in f:
            try:
                record = json.loads(line)
                # The 'diff' in your original data is the main part of the diff.
                # The 'diff_hunk_header' is the line that usually starts with '@@'.
                # We'll combine them if both exist, or use whichever is present.
                
                diff_parts = []
                if 'diff_hunk_header' in record and record['diff_hunk_header']:
                    diff_parts.append(record['diff_hunk_header'])
                if 'diff' in record and record['diff']:
                    diff_parts.append(record['diff'])
                
                full_diff = "\n".join(diff_parts) if diff_parts else None

                filtered_record = {
                    'diff': full_diff,
                    'comment': record.get('comment_body'), # Use .get() for safety if key might be missing
                    'offset': record.get('line_offset')   # Use .get() for safety
                }
                filtered_data.append(filtered_record)
            except json.JSONDecodeError as e:
                print(f"Skipping line due to JSON decode error: {e} - Line: {line.strip()}")
            except KeyError as e:
                print(f"Skipping record due to missing key: {e} - Record: {record}")
    return filtered_data

# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---
#                  Example Usage:
# --- --- --- --- --- --- --- --- --- --- --- --- --- --- ---

# 1. Make sure 'train.jsonl.gz' is in the same directory as your script,
#    or provide the full path to the file.
file_path = "train.jsonl.gz"
processed_dataset = load_and_filter_dataset(file_path)

# 2. View the first record of your new dataset
if processed_dataset:
    print("First record of the processed dataset:")
    print(json.dumps(processed_dataset[0], indent=4)) # Pretty print the JSON
else:
    print("No data was processed. Check your file path and file content.")

# 3. To see more records (e.g., the first 5):
# if processed_dataset:
#     print("\nFirst 5 records of the processed dataset:")
#     for i, record in enumerate(processed_dataset[:5]):
#         print(f"\n--- Record {i+1} ---")
#         print(json.dumps(record, indent=4))

file_path = "train.jsonl.gz"
processed_dataset = load_and_filter_dataset(file_path)


First record of the processed dataset:
{
    "diff": "@@ -6,10 +6,13 @@\n  * found in the LICENSE file at https://angular.io/license\n  */\n \n-import {unimplemented} from '../../facade/exceptions';\n+import {BaseException} from '@angular/core';\n import {isPresent} from '../../facade/lang';\n import {AbstractControl} from '../model';\n \n+function unimplemented(): any {\n+  throw new BaseException('unimplemented');",
    "comment": "Is there a better way to avoid having this in different places?\n",
    "offset": 9
}


In [8]:

# Tokenize data
tokenized_data = [format_prompt(e) for e in processed_dataset if e['diff'] and e['comment']]


In [9]:
train_dataset = Dataset.from_list(tokenized_data)



In [10]:
# BERTScore metric for evaluation
import evaluate
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    avg_f1_score = sum(scores["f1"]) / len(scores["f1"])

    # Log BERTScore to MLflow
    mlflow.log_metric("bertscore_f1", avg_f1_score)

    # Return metrics for Trainer
    return {"bertscore_f1": avg_f1_score}

In [11]:
# import mlflow
# from transformers import Trainer, TrainingArguments
# from datetime import datetime
# import torch
# import time

# def run_experiment(exp_name, config):
#     run_name = f"{exp_name}-{datetime.now().strftime('%H%M%S')}"
#     output_dir = f"./qlora_output/{run_name}"

#     mlflow.set_tracking_uri("mlexp")  # or custom URI
#     mlflow.set_experiment("qlora-fast-experiments-valid")  # A valid name


#     args = TrainingArguments(
#         output_dir=output_dir,
#         per_device_train_batch_size=config["batch_size"],
#         gradient_accumulation_steps=config["grad_accum"],
#         learning_rate=config["lr"],
#         num_train_epochs=1,
#         logging_steps=10,
#         save_steps=200,
#         eval_steps=200,
#         save_total_limit=1,
#         eval_strategy="steps",
#         save_strategy="steps",
#         load_best_model_at_end=False,
#         fp16=config.get("fp16", False),
#         bf16=config.get("bf16", False),
#         gradient_checkpointing=config.get("grad_ckpt", False),
#         report_to=["mlflow"],
#         logging_dir=f"./logs/{run_name}",
#         dataloader_num_workers=2
#     )

#     trainer = Trainer(
#         model=model,
#         args=args,
#         train_dataset=train_dataset.select(range(20)),
#         eval_dataset=train_dataset.select(range(5)),
#         tokenizer=tokenizer,
#         data_collator=data_collator,
#         compute_metrics=compute_metrics,
#     )

#     with mlflow.start_run(run_name=run_name):
#         print(f"\n🔧 Running {run_name} with config: {config}")
#         train_result = trainer.train()
        
#         # Measure inference time for evaluation
#         start_time = time.time()
#         trainer.evaluate()
#         inference_time = time.time() - start_time

#         # Log metrics to MLflow
#         mlflow.log_params(config)
#         mlflow.log_metric("inference_time", inference_time)
#         # mlflow.pytorch.log_model(model, artifact_path="model")

#         # Print and return summary
#         print(f"✅ {run_name} finished.")
#         print(f"   → Inference Time: {inference_time:.4f} seconds")
#         return {**config, "inference_time": inference_time}


In [12]:
# experiments = [
#     {"batch_size": 4, "grad_accum": 2, "lr": 2e-4, "fp16": True},
#     {"batch_size": 8, "grad_accum": 1, "lr": 2e-4, "fp16": True},
#     {"batch_size": 6, "grad_accum": 1, "lr": 1e-4, "fp16": True, "grad_ckpt": True},
#     {"batch_size": 8, "grad_accum": 1, "lr": 2e-4, "fp16": False, "bf16": True, "grad_ckpt": True},
# ]

# results = []
# for i, config in enumerate(experiments):
#     result = run_experiment(f"exp{i+1}", config)
#     results.append(result)

# # Find best by Inference Time (lower is better)
# best = min(results, key=lambda r: r["inference_time"])
# print("\n🏆 Best Config (Shortest Inference Time):")
# print(best)


In [13]:
# import shutil

# # Specify the folder you want to zip
# folder_path = 'mlexp/697636671313419820'
# zip_name = 'mlexp.zip'

# # Create a zip file from the folder
# shutil.make_archive(zip_name, 'zip', folder_path)


In [15]:
from transformers import TrainingArguments, Trainer, IntervalStrategy

# Training Arguments
training_args = TrainingArguments(
    output_dir="./qlora_output",
    per_device_train_batch_size=8,         # Increase batch size for better GPU utilization
    gradient_accumulation_steps=8,         # Accumulate gradients over 2 steps
    learning_rate=2e-4,
    num_train_epochs=1,                    # Increase epochs for the full dataset
    logging_steps=100,
    # save_steps=1000,                       # Save the model every 1000 steps
    # eval_steps=100,                        # Evaluate every 100 steps
    # save_total_limit=2,                    # Keep the latest 2 checkpoints
    eval_strategy="no",  # Evaluate after every 100 steps
    save_strategy="no", # Save after every 1000 steps
    load_best_model_at_end=False,           # Load the best model based on evaluation
    report_to=["mlflow"],
    logging_dir="./logs",
    fp16=True,                             # Ensure FP16 is enabled
)

# Function to compute BERTScore for evaluation
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Compute BERTScore
    scores = bertscore.compute(predictions=decoded_preds, references=decoded_labels, lang="en")
    avg_f1_score = sum(scores["f1"]) / len(scores["f1"])

    # Log BERTScore to MLflow
    mlflow.log_metric("bertscore_f1", avg_f1_score)

    # Return metrics for Trainer
    return {"bertscore_f1": avg_f1_score}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=train_dataset.select(range(100)),  # Evaluate on a subset, can be changed
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # Use custom BERTScore metric
)


  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
import mlflow

# Set the experiment name to a valid name (make sure it's not '.ipynb_checkpoints')
mlflow.set_tracking_uri("mlruns")  # or custom URI if you're using one
mlflow.set_experiment("qlora_finetune_experiment")  # Set a custom experiment name

# Now proceed with starting the MLflow run and training
with mlflow.start_run():
    # Log model and other information
    trainer.train()


TypeError: device() received an invalid combination of arguments - got (NoneType), but expected one of:
 * (torch.device device)
      didn't match because some of the arguments have invalid types: (!NoneType!)
 * (str type, int index = -1)
