In [1]:
import os
import gzip, json, time
import torch 
import numpy as np
from pathlib import Path
from datasets import Dataset, load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM, 
    TrainingArguments, 
    Trainer, 
    DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType
from bert_score import score
import onnxruntime as ort
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import onnx
import onnxruntime as ort

In [2]:
!nvidia-smi

Sat May 10 18:13:38 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:27:00.0 Off |                    0 |
| N/A   53C    P0             47W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

In [None]:
!fuser -v /dev/nvidia*

In [None]:
!kill -9 <PID>

In [2]:
# Load the model

In [3]:
model_path = "/mnt/object/my_model.pth"
data_path = "/mnt/object_group/data/"

In [4]:
model_size = os.path.getsize(model_path) 
print(f"Model Size on Disk: {model_size/ (1e6) :.2f} MB")


Model Size on Disk: 31415.38 MB


In [5]:
root_dir = "/mnt/object_group/data"
preview_lines = 2  # number of lines/content lines to print

for dirpath, dirnames, filenames in os.walk(root_dir):
    print(f"\n{dirpath}")
    for fname in filenames[:2]:  # preview up to 5 files
        full_path = os.path.join(dirpath, fname)
        print(f"  └── {fname}")
        
        try:
            with open(full_path, 'r') as f:
                print("     --- File preview ---")
                for i, line in enumerate(f):
                    print("     " + line.strip())
                    if i + 1 >= preview_lines:
                        break
                print("     ---------------------")
        except UnicodeDecodeError:
            print("     [Binary or non-text file — skipped]")
        except Exception as e:
            print(f"     [Error reading file: {e}]")

    if len(filenames) > 5:
        print("  ... (more files hidden)")



/mnt/object_group/data

/mnt/object_group/data/metadata
  └── processed_prs.log
     --- File preview ---
     https://github.com/angular/angular/pull/10609
     https://github.com/angular/angular/pull/10616
     ---------------------

/mnt/object_group/data/processed
  └── dataset_card.md
     --- File preview ---
     # Dataset Card for v1
     
     ---------------------
  └── split_map.yml
     --- File preview ---
     test:
     - facebook_react
     ---------------------

/mnt/object_group/data/raw

/mnt/object_group/data/raw/angular

/mnt/object_group/data/raw/angular/angular
  └── angular_angular_10609.diff
     --- File preview ---
     diff --git a/modules/@angular/compiler-cli/package.json b/modules/@angular/compiler-cli/package.json
     index 11760c5c75f69..6b06d2ce052af 100644
     ---------------------
  └── angular_angular_10609_comments.jsonl
     --- File preview ---
     ---------------------
  ... (more files hidden)

/mnt/object_group/data/raw/apache

/mnt/object

KeyboardInterrupt: 

In [None]:
train = "/mnt/object_group/data/processed/train.jsonl.gz"
with gzip.open(train, 'rt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        print("    " + line.strip())
        if i >= 10:
            break


In [6]:
DATA_ROOT = Path("/mnt/object_group/data/processed")  # Adjust as needed

def load_samples(limit_per_split=10):
    diff_samples = []
    for repo_dir in DATA_ROOT.iterdir():
        if not repo_dir.is_dir():
            continue
        for pr_dir in (repo_dir / "diff").iterdir():
            pr_id = pr_dir.name
            diff_file = pr_dir
            comments_file = DATA_ROOT / repo_dir.name / "comments" / f"{repo_dir.name}_{pr_id}_comments.jsonl"

            if not diff_file.exists() or not comments_file.exists():
                continue

            with open(diff_file, "r", encoding="utf-8") as df:
                diff_content = df.read()

            with open(comments_file, "r", encoding="utf-8") as cf:
                for line in cf:
                    try:
                        comment = json.loads(line)
                        offset = comment.get("original_position")
                        side = comment.get("side", "RIGHT")
                        body = comment.get("body", "").strip()
                        if offset is not None and body:
                            sample = {
                                "input": f"<DIFF>\n{diff_content}\n</DIFF>\n<COMMENT side=\"{side}\" offset=\"{offset}\">",
                                "output": body
                            }
                            diff_samples.append(sample)
                    except json.JSONDecodeError:
                        continue

    random.shuffle(diff_samples)
    return {
        "train": diff_samples[:limit_per_split],
        "test": diff_samples[limit_per_split:2*limit_per_split],
        "eval": diff_samples[2*limit_per_split:3*limit_per_split],
    }


In [8]:
def load_jsonl_gz(file_path, limit):
    samples = []
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= limit:
                break
            entry = json.loads(line)
            prompt = f"<DIFF>\n{entry['diff']}\n</DIFF>\n"
            comment = f"<COMMENT side=\"{entry['side']}\" offset=\"{entry['line_offset']}\">{entry['comment_body']}"
            samples.append({'input': prompt, 'label': comment})
    return samples

train_data = load_jsonl_gz("/mnt/object_group/data/processed/train.jsonl.gz", 10)
test_data  = load_jsonl_gz("/mnt/object_group/data/processed/test.jsonl.gz", 8)
eval_data  = load_jsonl_gz("/mnt/object_group/data/processed/val.jsonl.gz", 4)

In [8]:
# train_data

In [9]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base")
def tokenize(example):
    input = tokenizer(example['input'], truncation=True, padding='max_length', max_length=512)
    label = tokenizer(example['label'], truncation=True, padding='max_length', max_length=128)
    input['labels'] = label['input_ids']
    return input

train_ds = Dataset.from_list(train_data).map(tokenize)
test_ds  = Dataset.from_list(test_data).map(tokenize)
eval_ds  = Dataset.from_list(eval_data).map(tokenize)


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [10]:
print(torch.cuda.is_available())

True


In [11]:
base_model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code = True).to("cuda")
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]  # Add this
)
model = get_peft_model(base_model, lora_cfg)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [12]:
# torch.cuda.empty_cache()
# torch.cuda.ipc_collect()


We get CUDA Out of Memory error even to get te pef model, hence do optimizations to reduce the size

In [13]:
device = torch.device("cpu")

In [14]:
model_arch = "deepseek-ai/DeepSeek-Coder-V2-Lite-Base"
model = AutoModelForCausalLM.from_pretrained(model_arch, trust_remote_code=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [15]:
state_dict = torch.load(model_path, map_location=device, weights_only=False)
model.load_state_dict(state_dict)
model.eval()

DeepseekV2ForCausalLM(
  (model): DeepseekV2Model(
    (embed_tokens): Embedding(102400, 2048)
    (layers): ModuleList(
      (0): DeepseekV2DecoderLayer(
        (self_attn): DeepseekV2Attention(
          (q_proj): Linear(in_features=2048, out_features=3072, bias=False)
          (kv_a_proj_with_mqa): Linear(in_features=2048, out_features=576, bias=False)
          (kv_a_layernorm): DeepseekV2RMSNorm()
          (kv_b_proj): Linear(in_features=512, out_features=4096, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): DeepseekV2YarnRotaryEmbedding()
        )
        (mlp): DeepseekV2MLP(
          (gate_proj): Linear(in_features=2048, out_features=10944, bias=False)
          (up_proj): Linear(in_features=2048, out_features=10944, bias=False)
          (down_proj): Linear(in_features=10944, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): DeepseekV2RMSNorm()
        (post_atten

In [16]:
class Wrapper(torch.nn.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model
    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits
# os._exit(0)

### ONNX Optimizations

In [4]:
# onnx_model_path = "/mnt/object/models/finetuned_deepseek_onnx/model.onnx"
# Path("/mnt/object/models/finetuned_deepseek_onnx").mkdir(parents=True, exist_ok=True)
onnx_model_path = "models/finetunedDeepseekcoV2.onnx"

In [19]:
# Sample input token count
idx = 9
sample_input = train_data[idx]['input']
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code=True)
tokens = tokenizer(sample_input, return_tensors="pt", truncation=True)
print("Token count:", tokens['input_ids'].shape[1])
print(train_data[idx]['input'] + "\n" + train_data[idx]['label'])
# print(train_data[idx].keys())
print("ho")
print(train_data[idx])

Token count: 136
<DIFF>
@@ -28,8 +28,9 @@ import {StaticAndDynamicReflectionCapabilities} from './static_reflection_capabi
 import {StaticReflector, StaticSymbol} from './static_reflector';
 
 function extract(
-    ngOptions: tsc.AngularCompilerOptions, program: ts.Program, host: ts.CompilerHost) {
-  const extractor = Extractor.create(ngOptions, program, host);
+    ngOptions: tsc.AngularCompilerOptions, cliOptions: tsc.CliOptions, program: ts.Program,
</DIFF>

<COMMENT side="RIGHT" offset="5">only add the format

ho
{'input': "<DIFF>\n@@ -28,8 +28,9 @@ import {StaticAndDynamicReflectionCapabilities} from './static_reflection_capabi\n import {StaticReflector, StaticSymbol} from './static_reflector';\n \n function extract(\n-    ngOptions: tsc.AngularCompilerOptions, program: ts.Program, host: ts.CompilerHost) {\n-  const extractor = Extractor.create(ngOptions, program, host);\n+    ngOptions: tsc.AngularCompilerOptions, cliOptions: tsc.CliOptions, program: ts.Program,\n</DIFF>\n", 'l

In [20]:
wrapped = Wrapper(model)
torch.onnx.export(wrapped, 
                  (tokens["input_ids"], tokens["attention_mask"]), 
                  onnx_model_path,
                  export_params=True, 
                  opset_version=20,
                  do_constant_folding=True, 
                  input_names=['input_ids', 'attention_mask'],
                  output_names=['output'], 
                  dynamic_axes={
                      "input_ids": {0: "batch_size", 1: "sequence_length"},
                      "attention_mask": {0: "batch_size", 1: "sequence_length"},
                      "output": {0: "batch_size", 1: "sequence_length"}
                  }
)

  if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
  if past_key_values_length > 0:
  if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
  if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
  if attn_output.size() != (bsz, self.num_heads, q_len, self.v_head_dim):
  tokens_per_expert = tokens_per_expert.cpu().numpy()
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/main/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shap

In [35]:
print("== ONNX Model Inputs ==")
for inp in ort_session.get_inputs():
    print(f"Name: {inp.name}, Shape: {inp.shape}, Type: {inp.type}")

# Print output details
print("\n== ONNX Model Outputs ==")
for out in ort_session.get_outputs():
    print(f"Name: {out.name}, Shape: {out.shape}, Type: {out.type}")


== ONNX Model Inputs ==
Name: input_ids, Shape: ['batch_size', 'sequence_length'], Type: tensor(int64)
Name: attention_mask, Shape: ['batch_size', 'sequence_length'], Type: tensor(int64)

== ONNX Model Outputs ==
Name: output, Shape: ['batch_size', 'sequence_length', 102400], Type: tensor(float)


In [38]:
print(len(sample_text))

1383


In [39]:
sample_text = test_data[0]['input']

# Tokenize with padding and truncation to consistent length
tokens = tokenizer(sample_text, return_tensors="np", padding="max_length", truncation=True, max_length=136)

# Make sure inputs are int64 numpy arrays (ONNX expects this)
input_dict = {
    "input_ids": tokens["input_ids"].astype("int64"),
    "attention_mask": tokens["attention_mask"].astype("int64")
}

# Run ONNX model
outputs = ort_session.run(None, input_dict)


In [25]:
state_dict.keys()

odict_keys(['model.embed_tokens.weight', 'model.layers.0.self_attn.q_proj.weight', 'model.layers.0.self_attn.kv_a_proj_with_mqa.weight', 'model.layers.0.self_attn.kv_a_layernorm.weight', 'model.layers.0.self_attn.kv_b_proj.weight', 'model.layers.0.self_attn.o_proj.weight', 'model.layers.0.mlp.gate_proj.weight', 'model.layers.0.mlp.up_proj.weight', 'model.layers.0.mlp.down_proj.weight', 'model.layers.0.input_layernorm.weight', 'model.layers.0.post_attention_layernorm.weight', 'model.layers.1.self_attn.q_proj.weight', 'model.layers.1.self_attn.kv_a_proj_with_mqa.weight', 'model.layers.1.self_attn.kv_a_layernorm.weight', 'model.layers.1.self_attn.kv_b_proj.weight', 'model.layers.1.self_attn.o_proj.weight', 'model.layers.1.mlp.experts.0.gate_proj.weight', 'model.layers.1.mlp.experts.0.up_proj.weight', 'model.layers.1.mlp.experts.0.down_proj.weight', 'model.layers.1.mlp.experts.1.gate_proj.weight', 'model.layers.1.mlp.experts.1.up_proj.weight', 'model.layers.1.mlp.experts.1.down_proj.weight

In [21]:
# torch.onnx.export(wrapped, 
#                   (tokens["input_ids"], tokens["attention_mask"]), 
#                   onnx_model_path,
#                   export_params=True, 
#                   opset_version=20,
#                   do_constant_folding=True, 
#                   input_names=['input_ids', 'attention_mask'],
#                   output_names=['output'], 
#                   dynamic_axes={
#                       "input_ids": {0: "batch_size", 1: "sequence_length"},
#                       "attention_mask": {0: "batch_size", 1: "sequence_length"},
#                       "output": {0: "batch_size", 1: "sequence_length"}
#                   }
#                  )


In [17]:
tokens.keys()

dict_keys(['input_ids', 'attention_mask'])

In [22]:
print(f"ONNX model saved to {onnx_model_path}")

ONNX model saved to models/finetunedDeepseekcoV2.onnx


In [24]:
onnx_model = onnx.load(onnx_model_path)
onnx.checker.check_model(onnx_model_path)

In [25]:
print(f"Number of initializers (weights): {len(onnx_model.graph.initializer)}")

Number of initializers (weights): 5063


In [47]:
# os.exit(0)
print("e")

e


Creating an inference session

In [5]:
ort_session = ort.InferenceSession(onnx_model_path, providers=['TensorrtExecutionProvider'])



In [6]:
ort_session.get_providers()

['TensorrtExecutionProvider', 'CPUExecutionProvider']

### Check Model Size of ONNX Model

In [28]:
onnx_model_size_in_bytes = os.path.getsize(onnx_model_path) 
model_size_gb = onnx_model_size_in_bytes / (1024 ** 3)
model_size_mb = onnx_model_size_in_bytes / (1024 ** 2)
# print(f"ONNX model size: {model_size_gb:.5f} GiB, {model_size_mb:.5f} MiB")
print(f"Model Size on Disk: {onnx_model_size_in_bytes/ (1e6) :.2f} MB")

Model Size on Disk: 7.93 MB


In [None]:
# deleting the loaded pytorch model of 32 GB RAM to clear up space
# del model

In [29]:
print(f"{os.path.getsize('./models')/(1e6):.2f}")

0.24


### BERT and Location Precision on ONNX Model

In [30]:
!free -h

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


               total        used        free      shared  buff/cache   available
Mem:           503Gi       209Gi       161Gi        14Mi       135Gi       293Gi
Swap:             0B          0B          0B


In [31]:
!nvidia-smi


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Sat May 10 16:29:55 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.05              Driver Version: 560.35.05      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off |   00000000:27:00.0 Off |                    0 |
| N/A   56C    P0             72W /  300W |   68093MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                

From the above, there's plenty of memory available in CPU

In [33]:
input_text

"<DIFF>\n@@ -838,45 +839,58 @@ function performConcurrentWorkOnRoot(root, didTimeout) {\n       throw fatalError;\n     }\n \n-    // Check if this render may have yielded to a concurrent event, and if so,\n-    // confirm that any newly rendered stores are consistent.\n-    // TODO: It's possible that even a concurrent render may never have yielded\n-    // to the main thread, if it was fast enough, or if it expired. We could\n-    // skip the consistency check in that case, too.\n-    const renderWasConcurrent = !includesBlockingLane(root, lanes);\n-    const finishedWork: Fiber = (root.current.alternate: any);\n-    if (\n-      renderWasConcurrent &&\n-      !isRenderConsistentWithExternalStores(finishedWork)\n-    ) {\n-      // A store was mutated in an interleaved event. Render again,\n-      // synchronously, to block further mutations.\n-      exitStatus = renderRootSync(root, lanes);\n-\n-      // We need to check again if something threw\n-      if (exitStatus === RootErrore

In [12]:
# Load tokenizer and ONNX session
print("Start")
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code=True)
print("Got tokenizer")
# ort_session = ort.InferenceSession("models/finetunedDeepseekcoV2.onnx", providers=["CPUExecutionProvider"])
print("Got ort session")
# List for predictions and labels
preds = []
labels = []

# List for latency tracking
latencies = []
num_trials = 5  # Number of trials for latency measurement

# Iterate through test data
for sample in test_data:
    input_text = sample["input"]
    label_text = sample["label"]

    # Tokenize input text
    tokens = tokenizer(input_text, return_tensors="np", padding=True, truncation=True, max_length=136)
    print("Tokenized input text")
    # Prepare input dict for ONNX model
    input_dict = {
        "input_ids": tokens["input_ids"],
        "attention_mask": tokens["attention_mask"]
    }

    # Warm-up run to avoid initial latency spike
    ort_session.run(None, input_dict)
    print("Started ort_session")
    # Measure inference latency
    start_time = time.time()
    for _ in range(num_trials):
        ort_outs = ort_session.run(None, input_dict)
        print("Infer no.:", _)
        latencies.append(time.time() - start_time)

    # Get the predicted token IDs
    output_ids = np.argmax(ort_outs[0], axis=-1)

    # Convert token IDs to text
    pred_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

    # Store prediction and ground truth
    preds.append(pred_text)
    labels.append(label_text)


Start
Got tokenizer
Got ort session
Tokenized input text
Started ort_session
Infer no.: 0
Infer no.: 1
Infer no.: 2
Infer no.: 3
Infer no.: 4
Tokenized input text
Started ort_session
Infer no.: 0
Infer no.: 1
Infer no.: 2
Infer no.: 3
Infer no.: 4
Tokenized input text


[1;31m2025-05-10 18:57:50.843013812 [E:onnxruntime:Default, tensorrt_execution_provider.h:89 log] [2025-05-10 18:57:50   ERROR] IExecutionContext::getTensorShape: Error Code 7: Internal Error (IShuffleLayer /model/model/layers.0/self_attn/Reshape_10: reshaping failed for tensor: /model/model/layers.0/self_attn/Add_2_output_0 reshape would change volume 4096 to 8704 Instruction: RESHAPEinput dims{1 1 64 64} reshape dims{136 64}.)[m
[1;31m2025-05-10 18:57:50.843078875 [E:onnxruntime:, sequential_executor.cc:572 ExecuteKernel] Non-zero status code returned while running TRTKernel_graph_main_graph_9653763781726499959_0 node. Name:'TensorrtExecutionProvider_TRTKernel_graph_main_graph_9653763781726499959_0_0' Status Message: cannot create std::vector larger than max_size()[m


RuntimeException: [ONNXRuntimeError] : 6 : RUNTIME_EXCEPTION : Non-zero status code returned while running TRTKernel_graph_main_graph_9653763781726499959_0 node. Name:'TensorrtExecutionProvider_TRTKernel_graph_main_graph_9653763781726499959_0_0' Status Message: cannot create std::vector larger than max_size()

In [None]:
# Evaluate BERTScore for predictions
P, R, F1 = score(preds, labels, lang="en")
print(f"Precision: {P.mean():.4f}, Recall: {R.mean():.4f}, F1: {F1.mean():.4f}")

In [None]:
# Location Precision Calculation (simple example with window size of 5)
window_size = 5
location_precision = []
for pred, label in zip(preds, labels):
    # Example comparison logic based on a sliding window (adjust based on your use case)
    pred_tokens = pred.split()
    label_tokens = label.split()
    correct_preds = sum(1 for p, l in zip(pred_tokens, label_tokens) if p == l)
    location_precision.append(correct_preds / min(len(pred_tokens), len(label_tokens)))

print(f"Location Precision: {np.mean(location_precision):.4f}")


In [None]:
# Calculate Inference Latency
print(f"Inference Latency (single sample, median): {np.percentile(latencies, 50) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 95th percentile): {np.percentile(latencies, 95) * 1000:.2f} ms")
print(f"Inference Latency (single sample, 99th percentile): {np.percentile(latencies, 99) * 1000:.2f} ms")
print(f"Inference Throughput (single sample): {num_trials / np.sum(latencies):.2f} FPS")

In [None]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-Coder-V2-Lite-Base", trust_remote_code=True)
onnx_model_path = "models/finetunedDeepseekcoV2.onnx"

In [54]:
test_samples = test_data[:10]
references = []
predictions = []
latencies = []

In [61]:
for data in test_samples:
    input_text = data["input"]
    reference_comment = data["input"]
    # Tokenize input
    inputs = tokenizer(input_text, return_tensors="np", padding=True, truncation=True)

    # Measure Latency
    start = time.time()
    outputs = ort_session.run(None, {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"]
    })
    end = time.time()

    latencies.append(end - start)

    # generate text from ids
    generated_ids = np.argmax(outputs[0], axis=-1)
    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    references.append(reference_comment)
    predictions.append(generated_text)

InvalidArgument: [ONNXRuntimeError] : 2 : INVALID_ARGUMENT : Got invalid dimensions for input: input for the following indices
 index: 1 Got: 383 Expected: 136
 Please fix either the inputs/outputs or the model.

In [57]:
print([inp.name for inp in ort_session.get_inputs()])

['input', 'attention_mask']


In [None]:
training_args = TrainingArguments(
    output_dir="./deepseek-finetune",
    per_device_train_batch_size=2,
    num_train_epochs=1,
    logging_steps=5,
    save_strategy="no",
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

trainer.train()
