In [14]:
import os
import torch
import datasets
import numpy as np

from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error
from transformers import Trainer

from transformers import AutoTokenizer
from transformers import RobertaConfig, RobertaForSequenceClassification

from llmcoder.utils import get_data_dir

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
dataset = datasets.load_from_disk(os.path.join(get_data_dir(), "codebert", "score_codebert_dataset"))
dataset

DatasetDict({
    train: Dataset({
        features: ['code', 'score'],
        num_rows: 4356
    })
    test: Dataset({
        features: ['code', 'score'],
        num_rows: 484
    })
})

In [17]:
tokenizer = AutoTokenizer.from_pretrained("microsoft/CodeBERT-base")

In [18]:
config = RobertaConfig.from_pretrained("microsoft/CodeBERT-base", num_labels=4)
model = RobertaForSequenceClassification(config).to(device)
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
class CodeDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(dataset["train"]["code"], truncation=True, padding=True)
train_labels = np.array(dataset["train"]["score"])

# Standard scale the labels
train_labels /= 10

train_dataset = CodeDataset(train_encodings, train_labels)

val_encodings = tokenizer(dataset["test"]["code"], truncation=True, padding=True)
val_labels = np.array(dataset["test"]["score"])

# Standard scale the labels
val_labels /= 10

val_dataset = CodeDataset(val_encodings, val_labels)

In [None]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.squeeze(logits)  # Remove dimensions of size 1
    return {"mse": mean_squared_error(labels, predictions)}

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=16,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Mse
1,0.3684,0.391199,2.663119
2,0.3854,0.382157,1.768052
3,0.3578,0.363547,1.844101
4,0.376,0.360737,2.486769
5,0.3725,0.352571,3.222676
6,0.3414,0.351575,3.913135
7,0.3198,0.352505,5.246758
8,0.3417,0.350089,4.363342
9,0.3455,0.352147,5.661312
10,0.3305,0.346344,5.445999


TrainOutput(global_step=4368, training_loss=0.3484636201308324, metrics={'train_runtime': 943.5927, 'train_samples_per_second': 73.862, 'train_steps_per_second': 4.629, 'total_flos': 1.8338117409570816e+16, 'train_loss': 0.3484636201308324, 'epoch': 16.0})

In [None]:
trainer.evaluate()

{'eval_loss': 0.34902313351631165,
 'eval_mse': 6.709889888763428,
 'eval_runtime': 2.0956,
 'eval_samples_per_second': 230.962,
 'eval_steps_per_second': 3.818,
 'epoch': 16.0}

In [None]:
trainer.save_model('score_codebert_x')

In [None]:
# Run the model on a few examples


example1 = """def foo():
    print('Hello, world!')

if __name__ == '__main__':
    foo()
"""

input_ids = tokenizer.encode(example1, return_tensors="pt").to(device)
logits = model(input_ids).logits
print(logits)

tensor([[1.8515, 5.3318, 6.1809, 2.5942]], device='cuda:0',
       grad_fn=<AddmmBackward0>)
