In [63]:
import os
import gzip, json, time
import torch 
import numpy as np
from pathlib import Path
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType
from bert_score import score
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import onnx
import onnxruntime as ort

In [64]:
# Load the model

In [65]:
model_path = "/mnt/object/my_model.pth"
data_path = "/mnt/object_group/data/"

In [66]:
model_size = os.path.getsize(model_path) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")


Model Size on Disk: 31415.38 MB


In [67]:
root_dir = "/mnt/object_group/data"
preview_lines = 2  # number of lines/content lines to print

for dirpath, dirnames, filenames in os.walk(root_dir):
    print(f"\n{dirpath}")
    for fname in filenames[:2]:  # preview up to 5 files
        full_path = os.path.join(dirpath, fname)
        print(f"  └── {fname}")
        
        try:
            with open(full_path, 'r') as f:
                print("     --- File preview ---")
                for i, line in enumerate(f):
                    print("     " + line.strip())
                    if i + 1 >= preview_lines:
                        break
                print("     ---------------------")
        except UnicodeDecodeError:
            print("     [Binary or non-text file — skipped]")
        except Exception as e:
            print(f"     [Error reading file: {e}]")

    if len(filenames) > 5:
        print("  ... (more files hidden)")



/mnt/object_group/data

/mnt/object_group/data/metadata
  └── processed_prs.log
     --- File preview ---
     https://github.com/angular/angular/pull/10609
     https://github.com/angular/angular/pull/10616
     ---------------------

/mnt/object_group/data/processed
  └── dataset_card.md
     --- File preview ---
     # Dataset Card for v1
     
     ---------------------
  └── split_map.yml
     --- File preview ---
     test:
     - facebook_react
     ---------------------

/mnt/object_group/data/raw

/mnt/object_group/data/raw/angular

/mnt/object_group/data/raw/angular/angular
  └── angular_angular_10609.diff
     --- File preview ---
     diff --git a/modules/@angular/compiler-cli/package.json b/modules/@angular/compiler-cli/package.json
     index 11760c5c75f69..6b06d2ce052af 100644
     ---------------------
  └── angular_angular_10609_comments.jsonl
     --- File preview ---
     ---------------------
  ... (more files hidden)

/mnt/object_group/data/raw/apache

/mnt/object

In [68]:
train = "/mnt/object_group/data/processed/train.jsonl.gz"
with gzip.open(train, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print("    " + line.strip())
        if i >= 10:
            break


    {"comment_id": 74291026, "comment_user_login": "vikerman", "comment_body": "Is there a better way to avoid having this in different places?\n", "comment_created_at": "2016-08-10T17:30:42+00:00", "comment_html_url": "https://github.com/angular/angular/pull/10620#discussion_r74291026", "comment_path": "modules/@angular/common/src/forms-deprecated/directives/abstract_control_directive.ts", "comment_position": 10, "comment_original_position": 10, "comment_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "comment_original_commit_id": "ee8e802b7bc25eafbe54109b3924e9dbc36dce11", "diff_line_content": "}\n", "diff_line_type": "added", "diff_line_source_no": null, "diff_line_target_no": 15, "diff_hunk_header": "", "diff": "@@ -6,10 +6,13 @@\n  * found in the LICENSE file at https://angular.io/license\n  */\n \n-import {unimplemented} from '../../facade/exceptions';\n+import {BaseException} from '@angular/core';\n import {isPresent} from '../../facade/lang';\n import {AbstractControl} 

In [69]:
DATA_ROOT = Path("/mnt/object_group/data/processed")  # Adjust as needed

def load_samples(limit_per_split=10):
    diff_samples = []
    for repo_dir in DATA_ROOT.iterdir():
        if not repo_dir.is_dir():
            continue
        for pr_dir in (repo_dir / "diff").iterdir():
            pr_id = pr_dir.name
            diff_file = pr_dir
            comments_file = DATA_ROOT / repo_dir.name / "comments" / f"{repo_dir.name}_{pr_id}_comments.jsonl"

            if not diff_file.exists() or not comments_file.exists():
                continue

            with open(diff_file, "r", encoding="utf-8") as df:
                diff_content = df.read()

            with open(comments_file, "r", encoding="utf-8") as cf:
                for line in cf:
                    try:
                        comment = json.loads(line)
                        offset = comment.get("original_position")
                        side = comment.get("side", "RIGHT")
                        body = comment.get("body", "").strip()
                        if offset is not None and body:
                            sample = {
                                "input": f"<DIFF>\n{diff_content}\n</DIFF>\n<COMMENT side=\"{side}\" offset=\"{offset}\">",
                                "output": body
                            }
                            diff_samples.append(sample)
                    except json.JSONDecodeError:
                        continue

    random.shuffle(diff_samples)
    return {
        "train": diff_samples[:limit_per_split],
        "test": diff_samples[limit_per_split:2*limit_per_split],
        "eval": diff_samples[2*limit_per_split:3*limit_per_split],
    }


In [70]:
def load_jsonl_gz(file_path, limit):
    samples = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            entry = json.loads(line)
            prompt = f"<DIFF>\n{entry['diff']}\n</DIFF>\n"
            comment = f"<COMMENT side=\"{entry['side']}\" offset=\"{entry['line_offset']}\">{entry['comment_body']}"
            samples.append({'input': prompt, 'label': comment})
    return samples

train_data = load_jsonl_gz("/mnt/object_group/data/processed/train.jsonl.gz", 10)
test_data  = load_jsonl_gz("/mnt/object_group/data/processed/test.jsonl.gz", 8)
eval_data  = load_jsonl_gz("/mnt/object_group/data/processed/val.jsonl.gz", 4)

In [71]:
# train_data

In [72]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base")
def tokenize(example):
    input = tokenizer(example['input'], truncation=True, padding='max_length', max_length=512)
    label = tokenizer(example['label'], truncation=True, padding='max_length', max_length=128)
    input['labels'] = label['input_ids']
    return input

train_ds = Dataset.from_list(train_data).map(tokenize)
test_ds  = Dataset.from_list(test_data).map(tokenize)
eval_ds  = Dataset.from_list(eval_data).map(tokenize)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [73]:
print(torch.cuda.is_available())

True


In [74]:
base_model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code = True).to("cuda")
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Add this
)
model = get_peft_model(base_model, lora_cfg)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 79.25 GiB of which 2.75 MiB is free. Process 11898 has 79.24 GiB memory in use. Of the allocated memory 78.78 GiB is allocated by PyTorch, and 63.05 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

We get CUDA Out of Memory error even to get te pef model, hence do optimizations to reduce the size

### ONNX Optimizations

In [None]:
device = torch.device("cpu")

In [None]:
model_arch = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
model = AutoModelForCausalLM.from_pretrained(model_arch, trust_remote_code=True)

In [None]:
state_dict = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(state_dict)
model.eval()

In [None]:
# onnx_model_path = "/mnt/object/models/finetuned_deepseek_onnx/model.onnx"
# Path("/mnt/object/models/finetuned_deepseek_onnx").mkdir(parents=True, exist_ok=True)
onnx_model_path = "models/finetunedDeepseekcoV2.onnx"

In [None]:
# Sample input token count
idx = 9
sample_input = train_data[idx]['input']
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code=True)
tokens = tokenizer(sample_input, return_tensors="pt")
print("Token count:", tokens['input_ids'].shape[1])
print(train_data[idx]['input'] + "\n" + train_data[idx]['label'])
# print(train_data[idx].keys())
print("ho")
print(train_data[idx])

In [None]:
tokens.keys()

In [None]:
torch.onnx.export(model, (tokens["input_ids"], tokens["attention_mask"]), onnx_model_path,
                  export_params=True, opset_version=20,
                  do_constant_folding=True, input_names=['input_ids', 'attention_mask'],
                  output_names=['output'], 
                  dynamic_axes={
                      "input_ids": {0: "batch_size"},
                      "attention_mask": {0: "batch_size"},
                      "output": {0: "batch_size"}
                  }
                 )


In [None]:
print(f"ONNX model saved to {onnx_model_path}")

In [None]:
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model_path)

Creating an inference session

In [None]:
ort_session = ort.InferenceSession(onnx_model_path, providers=['CPUExecutionProvider'])

In [None]:
ort_session.get_providers()

### Check Model Size of ONNX Model

In [None]:
onnx_model_size_in_bytes = os.path.getsize(onnx_model_path) 
model_size_gb = onnx_model_size_in_bytes / (1024 ** 3)
model_size_mb = onnx_model_size_in_bytes / (1024 ** 2)
# print(f"ONNX model size: {model_size_gb:.5f} GiB, {model_size_mb:.5f} MiB")
print(f"Model Size on Disk: {onnx_model_size_in_bytes/ (1e6) :.2f} MB")

In [None]:
# deleting the loaded pytorch model of 32 GB RAM to clear up space
del model

In [78]:
print(f"{os.path.getsize('./models')/(1e6):.2f}")

0.23


### Test accuracy on ONNX Model

In [51]:
!free -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


               total        used        free      shared  buff/cache   available
Mem:           503Gi       264Gi       106Gi        14Mi       136Gi       238Gi
Swap:             0B          0B          0B


In [53]:
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat May 10 10:39:22 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:27:00.0 Off |                    0 |
| N/A   62C    P0             79W /  300W |   68093MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

From the above, there's plenty of memory available in CPU

In [None]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code=True)
onnx_model_path = "models/finetunedDeepseekcoV2.onnx"


In [54]:
test_samples = test_data[:10]
references = []
predictions = []
latencies = []

In [61]:
for data in test_samples:
    input_text = data["input"]
    reference_comment = data["input"]
    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="np", padding=True, truncation=True)

    # Measure Latency
    start = time.time()
    outputs = ort_session.run(None, {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    })
    end = time.time()

    latencies.append(end - start)

    # generate text from ids
    generated_ids = np.argmax(outputs[0], axis=-1)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    references.append(reference_comment)
    predictions.append(generated_text)

InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: input for the following indices
 index: 1 Got: 383 Expected: 136
 Please fix either the inputs/outputs or the model.

In [57]:
print([inp.name for inp in ort_session.get_inputs()])

['input', 'attention_mask']


In [None]:
training_args = TrainingArguments(
    output_dir="./deepseek-finetune",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()
