In [1]:
from llmcoder import LLMCoder
from llmcoder.utils import get_data_dir, get_system_prompt
import json
import os
import numpy as np
from tqdm import tqdm
import tiktoken

In [2]:
# Run llmcoder preprocess -n codebert -s 100

In [3]:
llmcoder = LLMCoder()

In [4]:
base_messages = llmcoder.messages.copy()

In [5]:
CODE_BERT_DATA_DIR = get_data_dir("codebert")
print(CODE_BERT_DATA_DIR)

/home/psaegert/Projects/23ws-LLMcoder/data/codebert


In [6]:
# Scan through the pairs in the dataset and truncate the output.txt file to the first appearence of an empty line
for pair_folder in tqdm(sorted(os.listdir(os.path.join(CODE_BERT_DATA_DIR, "pairs")))):
    truncate_at = None
    with open(os.path.join(CODE_BERT_DATA_DIR, "pairs", pair_folder, "output.txt"), "r") as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            if line.strip() == "":
                truncate_at = i
                break

    if truncate_at is not None:
        with open(os.path.join(CODE_BERT_DATA_DIR, "pairs", pair_folder, "output.txt"), "w") as f:
            f.writelines(lines[:i])
            

100%|██████████| 2200/2200 [00:00<00:00, 107831.54it/s]


In [7]:
# Run llmcoder export -n codebert

In [8]:
# Read the conversations.jsonl file
with open(os.path.join(CODE_BERT_DATA_DIR, "conversations.jsonl")) as f:
    conversations = [json.loads(line) for line in f]
len(conversations)

2200

In [9]:
print(conversations[0]["messages"][1]["content"])

import argparse
import os
import subprocess


def get_commit_message():
    """Retrieve the commit message."""
    build_source_version_message = os.environ["BUILD_SOURCEVERSIONMESSAGE"]

    if os.environ["BUILD_REASON"] == "PullRequest":
        # By default pull requests use refs/pull/PULL_ID/merge as the source branch
        # which has a "Merge ID into ID" as a commit message. The latest commit
        # message is the second to last commit
        commit_id = build_source_version_message.split()[1]
        git_cmd = ["git", "log", commit_id, "-1", "--pretty=%B"]
        commit_message = subprocess.run(
            git_cmd, capture_output=True, text=True
        ).stdout.strip()
    else:
        commit_message = build_source_version_message

    # Sanitize the commit message to avoid introducing a vulnerability: a PR
    # submitter could include the "##vso" special marker in their commit
    # message to attempt to obfuscate the injection of arbitrary commands in
    # the Azur

In [10]:
N_COMPLETIONS = 10
OPENAI_MAX_TOKENS = 4097

In [11]:
scoring_prompt = get_system_prompt("2023-12-09_Scorer_v1.1.txt")

In [12]:
enc = tiktoken.get_encoding("p50k_base")

In [13]:
X = []
y = []

batch_size = 2

# Shuffle the conversations
np.random.shuffle(conversations)

# Generate completions with LLMCoder
for conversation in tqdm(conversations[:1000]):
    user_code = conversation["messages"][1]["content"]

    # Construct a message list
    messages = base_messages + [{
        "role": "user",
        "content": user_code
    }]

    # Generate N_COMPLETIONS completions
    chat_completions = llmcoder.client.chat.completions.create(
        messages=messages,
        model=llmcoder.model_first,
        temperature=0.7,  # Default is 0.7
        n=N_COMPLETIONS
    )

    # Extract the completion text
    completed_code_candidates = [user_code[-1000:] + choice.message.content for choice in chat_completions.choices]

    # Add the ground truth completion to the candidates
    completed_code_candidates.append(user_code[-1000:] + conversation["messages"][2]["content"])

    # Throw out any candidates that are too long
    if len(enc.encode(llmcoder.score_prompt(completed_code_candidates))) > OPENAI_MAX_TOKENS - 10: # What, do you think I trust this code?
        continue  # We have enough data, so we can skip this

    # Score the candidates in batches of batch_size
    scores = np.empty((len(completed_code_candidates), 4))
    for i in range(0, len(completed_code_candidates), batch_size):
        upper_bound = min(i + batch_size, len(completed_code_candidates) - 1)
        if upper_bound - i == 0:
            break
        batch_scores = llmcoder.score_code(completed_code_candidates[i : upper_bound], client=llmcoder.client, scoring_prompt=scoring_prompt, reduction=None)
        scores[i : upper_bound, :] = batch_scores

    # Give the ground truth completion a score of 10
    scores[-1] = 10

    # Add the scores to the X and y lists
    X.extend(completed_code_candidates)
    y.extend(scores)

100%|██████████| 1000/1000 [2:23:01<00:00,  8.58s/it] 


In [14]:
assert len(X) == len(y)

In [15]:
# Store the dataset with huggingface datasets
import datasets

dataset = datasets.Dataset.from_dict({
    "code": X,
    "score": y
})

# Split the dataset into train, validation and test sets
dataset = dataset.train_test_split(test_size=0.1)

dataset.save_to_disk(os.path.join(CODE_BERT_DATA_DIR, "score_codebert_dataset"))

Saving the dataset (0/1 shards):   0%|          | 0/4356 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/484 [00:00<?, ? examples/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['code', 'score'],
        num_rows: 4356
    })
    test: Dataset({
        features: ['code', 'score'],
        num_rows: 484
    })
})

In [47]:
print(dataset["train"][1]['score'])

[7.0, 6.0, 7.0, 6.0]


In [48]:
import torch
from torch import nn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [59]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeBERT-base")

In [60]:
from transformers import RobertaConfig, RobertaForSequenceClassification

config = RobertaConfig.from_pretrained("microsoft/CodeBERT-base", num_labels=4)
model = RobertaForSequenceClassification(config).to(device)
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [61]:
from torch.utils.data import Dataset

class CodeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(dataset["train"]["code"], truncation=True, padding=True)
train_labels = np.array(dataset["train"]["score"])

# Standard scale the labels
train_labels /= 10

train_dataset = CodeDataset(train_encodings, train_labels)

val_encodings = tokenizer(dataset["test"]["code"], truncation=True, padding=True)
val_labels = np.array(dataset["test"]["score"])

# Standard scale the labels
val_labels /= 10

val_dataset = CodeDataset(val_encodings, val_labels)

In [62]:
import numpy as np
from sklearn.metrics import mean_squared_error

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.squeeze(logits)  # Remove dimensions of size 1
    return {"mse": mean_squared_error(labels, predictions)}

In [63]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=16,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

In [64]:
from torch import nn
from transformers import Trainer

In [65]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [66]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mse
1,0.3684,0.391199,2.663119
2,0.3854,0.382157,1.768052
3,0.3578,0.363547,1.844101
4,0.376,0.360737,2.486769
5,0.3725,0.352571,3.222676
6,0.3414,0.351575,3.913135
7,0.3198,0.352505,5.246758
8,0.3417,0.350089,4.363342
9,0.3455,0.352147,5.661312
10,0.3305,0.346344,5.445999


TrainOutput(global_step=4368, training_loss=0.3484636201308324, metrics={'train_runtime': 943.5927, 'train_samples_per_second': 73.862, 'train_steps_per_second': 4.629, 'total_flos': 1.8338117409570816e+16, 'train_loss': 0.3484636201308324, 'epoch': 16.0})

In [67]:
trainer.evaluate()

{'eval_loss': 0.34902313351631165,
 'eval_mse': 6.709889888763428,
 'eval_runtime': 2.0956,
 'eval_samples_per_second': 230.962,
 'eval_steps_per_second': 3.818,
 'epoch': 16.0}

In [68]:
trainer.save_model('score_codebert_16')

In [73]:
# Run the model on a few examples


example1 = """def foo():
    print('Hello, world!')

if __name__ == '__main__':
    foo()
"""

input_ids = tokenizer.encode(example1, return_tensors="pt").to(device)
logits = model(input_ids).logits
print(logits)

tensor([[1.8515, 5.3318, 6.1809, 2.5942]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
