In [1]:

from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import math
import time
import argparse

In [5]:
ds_train = load_dataset("code_search_net", "java", split="train")
ds_test = load_dataset("code_search_net", "java", split="test")
ds_valid = load_dataset("code_search_net", "java", split="validation")
raw_datasets = DatasetDict(
    {
        "train": ds_train.shuffle().select(range(4000)), #train_size)), # "train": ds_train,  # .shuffle().select(range(50000)),
        "test": ds_test.shuffle().select(range(500)),
        "valid": ds_valid.shuffle().select(range(500)) # "valid": ds_valid,  # .shuffle().select(range(500))
    }
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [6]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 500
    })
    valid: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 500
    })
})

In [7]:
print(raw_datasets["test"][0]["whole_func_string"])

public void deleteTableStyle(String featureTable, GeometryType geometryType) {
        deleteTableMapping(getTableStyleMappingDao(featureTable), featureTable,
                geometryType);
    }


In [8]:
for key in raw_datasets["train"][0]:
    print(f"{key.upper()}: {raw_datasets['train'][0][key][:1000]}")

REPOSITORY_NAME: avaje-common/avaje-jetty-runner
FUNC_PATH_IN_REPOSITORY: src/main/java/org/avaje/jettyrunner/BaseRunner.java
FUNC_NAME: BaseRunner.startServer
WHOLE_FUNC_STRING: public void startServer() {

    server = new Server(httpPort);
    server.setHandler(wrapHandlers());

    if (isWebSocketInClassPath()) {
      setupForWebSocket();
    }
    try {
      server.start();
      log().info("server started");

      Runtime.getRuntime().addShutdownHook(new Thread(new ShutdownRunnable()));
      
      if (useStdInShutdown) {
        // generally for use in IDE via JettyRun, Use CTRL-D in IDE console to shutdown
        BufferedReader systemIn = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
        while((systemIn.readLine()) != null) {
          // ignore anything except CTRL-D by itself
        }
        System.out.println("Shutdown via CTRL-D");
        System.exit(0);
      }

    } catch (Exception e) {
      e.printStackTrace();
      System.exit(100);
    

In [9]:
# use bert model checkpoint tokenizer
model_checkpoint = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [10]:
#define tokenize function to tokenize the dataset
def tokenize_function(data):
    result = tokenizer(data["whole_func_string"])
    return result

# batched is set to True to activate fast multithreading!
tokenize_dataset = raw_datasets.map(tokenize_function, batched = True, remove_columns = raw_datasets["train"].column_names)

print(f'[DBG] tokenized_dataset: {tokenize_dataset}')
print(f'[DBG] len(tokenizer): {len(tokenizer)}')
print(f'[DBG] tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'[DBG] tokenizer.eos_token_id: {tokenizer.eos_token_id}')


Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (573 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

[DBG] tokenized_dataset: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 500
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 500
    })
})
[DBG] len(tokenizer): 50265
[DBG] tokenizer.bos_token_id: 0
[DBG] tokenizer.eos_token_id: 2


In [11]:

def concat_chunk_dataset(data):
    chunk_size = 128
    # concatenate texts
    concatenated_sequences = {k: sum(data[k], []) for k in data.keys()}
    #compute length of concatenated texts
    total_concat_length = len(concatenated_sequences[list(data.keys())[0]])

    # drop the last chunk if is smaller than the chunk size
    total_length = (total_concat_length // chunk_size) * chunk_size

    # split the concatenated sentences into chunks using the total length
    result = {k: [t[i: i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_sequences.items()}

    '''we create a new labels column which is a copy of the input_ids of the processed text data,the labels column serve as 
    ground truth for our masked language model to learn from. '''
    
    result["labels"] = result["input_ids"].copy()

    return result

processed_dataset = tokenize_dataset.map(concat_chunk_dataset, batched = True)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForLanguageModeling

''' Apply random masking once on the whole test data, then uses the default data collector to handle the test dataset in batches '''

data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm_probability = 0.15)

# Function to insert random mask
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

# Map insert_random_mask function to test dataset
eval_dataset = processed_dataset["test"].map(insert_random_mask,batched=True,remove_columns=processed_dataset["test"].column_names
)

# Rename columns
eval_dataset = eval_dataset.rename_columns({
    "masked_input_ids": "input_ids",
    "masked_attention_mask": "attention_mask",
    "masked_labels": "labels"
})


Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [13]:
import os

# Disable tokenizers parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [14]:
model_checkpoint

'microsoft/codebert-base-mlm'

In [16]:

def training_function():

    # set batch size to 32, a larger bacth size when using a more powerful gpu
    batch_size = 32

    train_dataloader = DataLoader(processed_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator)
    eval_dataloader = DataLoader(processed_dataset["test"], batch_size=batch_size, collate_fn=default_data_collator)

    # initialize pretrained bert model
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    # set the optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # initialize accelerator for training
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    # set the number of epochs which is set to 30
    num_train_epochs = 5
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    # define the learning rate scheduler for training
    lr_scheduler = get_scheduler("linear",optimizer=optimizer,num_warmup_steps=0,num_training_steps=num_training_steps)


    progress_bar = tqdm(range(num_training_steps))

    # directory to save the models
    output_dir = "MLP_TrainedModels"

    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss # <===== Added.
            losses.append(accelerator.gather(loss.repeat(batch_size)))
            # loss = outputs.loss
            # losses.append(accelerator.gather(loss.repeat(batch_size)))

        # losses = torch.cat(losses)
        # losses = losses[: len(eval_dataset)]
        loss = torch.mean(torch.cat(losses))
        print(f">>> Epoch {epoch}: Loss: {loss.item()}")

        # perplexity metric used for mask language model training
        try:
            #perplexity = math.exp(torch.mean(losses))
            perplexity = torch.exp(torch.tensor(loss))
        except OverflowError:
            perplexity = float("inf")
        print(f">>> Epoch {epoch}: Perplexity: {perplexity.item()}")

        # Calculate probabilities
        losses_tensor = torch.cat(losses)  # Concatenate the list of tensors into a single tensor
        # losses_np = losses_tensor.cpu().numpy()  # Convert concatenated tensor to NumPy array
        # probabilities = torch.nn.functional.softmax(torch.tensor(losses_np), dim=0)  # Calculate probabilities
        probabilities = torch.nn.functional.softmax(-losses_tensor, dim=0)  # Taking negative of losses_tensor to ensure proper softmax calculation

        # Calculate entropy
        #entropy = -torch.sum(probabilities * torch.log(probabilities))
        entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-20)) 
        print(f">>> Epoch {epoch}: Entropy: {entropy.item()}")  # Print entropy

        # Save model
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)

notebook_launcher(training_function, num_processes= 2)

Launching training on 2 GPUs.


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a 

>>> Epoch 0: Loss: 0.1353398859500885>>> Epoch 0: Loss: 0.1353398859500885



  perplexity = torch.exp(torch.tensor(loss))
  perplexity = torch.exp(torch.tensor(loss))


>>> Epoch 0: Perplexity: 1.144925832748413>>> Epoch 0: Perplexity: 1.144925832748413

>>> Epoch 0: Entropy: 7.102756023406982>>> Epoch 0: Entropy: 7.102756023406982



 40%|████      | 298/745 [01:43<02:30,  2.96it/s]

>>> Epoch 1: Loss: 0.1361062228679657>>> Epoch 1: Loss: 0.1361062228679657

>>> Epoch 1: Perplexity: 1.145803689956665>>> Epoch 1: Perplexity: 1.145803689956665

>>> Epoch 1: Entropy: 7.102712631225586>>> Epoch 1: Entropy: 7.102712631225586



 60%|██████    | 447/745 [02:37<01:41,  2.94it/s]

>>> Epoch 2: Loss: 0.12380828708410263>>> Epoch 2: Loss: 0.12380828708410263

>>> Epoch 2: Perplexity: 1.1317988634109497>>> Epoch 2: Perplexity: 1.1317988634109497

>>> Epoch 2: Entropy: 7.102789878845215>>> Epoch 2: Entropy: 7.102789878845215



 80%|████████  | 596/745 [03:32<00:50,  2.93it/s]

>>> Epoch 3: Loss: 0.1227213442325592>>> Epoch 3: Loss: 0.1227213442325592

>>> Epoch 3: Perplexity: 1.130569338798523>>> Epoch 3: Perplexity: 1.130569338798523

>>> Epoch 3: Entropy: 7.102800369262695>>> Epoch 3: Entropy: 7.102800369262695



100%|██████████| 745/745 [04:27<00:00,  2.94it/s]

>>> Epoch 4: Loss: 0.1270119547843933>>> Epoch 4: Loss: 0.1270119547843933

>>> Epoch 4: Perplexity: 1.1354305744171143
>>> Epoch 4: Perplexity: 1.1354305744171143>>> Epoch 4: Entropy: 7.102749347686768

>>> Epoch 4: Entropy: 7.102749347686768


100%|██████████| 745/745 [04:30<00:00,  2.75it/s]
100%|██████████| 745/745 [04:30<00:00,  2.75it/s]


In [17]:
model = "MLP_TrainedModels"

pred_model = pipeline("fill-mask", model = "MLP_TrainedModels")

text = "public Evaluation create(SimpleNode node, Object source)\n    {\n        return <mask>(node, source, false);\n    }"

preds = pred_model(text)
print(preds)

[{'score': 0.9685537219047546, 'token': 1045, 'token_str': ' create', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return create(node, source, false);\n    }'}, {'score': 0.005051335785537958, 'token': 10516, 'token_str': ' evaluate', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return evaluate(node, source, false);\n    }'}, {'score': 0.0024361107498407364, 'token': 146, 'token_str': ' make', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return make(node, source, false);\n    }'}, {'score': 0.0014410096919164062, 'token': 609, 'token_str': ' process', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return process(node, source, false);\n    }'}, {'score': 0.0009554359130561352, 'token': 1119, 'token_str': ' build', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return build(node, source, fa

In [18]:
model = "MLP_TrainedModels"
pred_model = pipeline("fill-mask", model=model)
text = "public FileWatcher register(final Path path, final Class<? extends FileEventHandler> handler) {\n    return <mask>(path, handler, EMPTY);\n  }"

# Get predictions
preds = pred_model(text)
print(preds)

# Correct answer (assuming the correct answer is the first mask prediction)
correct_answer = preds[0]['token_str']
print(correct_answer)

# Compute the reciprocal ranks
ranks = [i + 1 for i, pred in enumerate(preds) if pred['token_str'] == correct_answer]  # Ranks start from 1
reciprocal_ranks = [1 / rank for rank in ranks]  # Compute reciprocals

# Calculate Mean Reciprocal Rank
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", mrr)


[{'score': 0.9833874702453613, 'token': 5124, 'token_str': ' register', 'sequence': 'public FileWatcher register(final Path path, final Class<? extends FileEventHandler> handler) {\n    return register(path, handler, EMPTY);\n  }'}, {'score': 0.004609870258718729, 'token': 1606, 'token_str': ' add', 'sequence': 'public FileWatcher register(final Path path, final Class<? extends FileEventHandler> handler) {\n    return add(path, handler, EMPTY);\n  }'}, {'score': 0.0015960102900862694, 'token': 1045, 'token_str': ' create', 'sequence': 'public FileWatcher register(final Path path, final Class<? extends FileEventHandler> handler) {\n    return create(path, handler, EMPTY);\n  }'}, {'score': 0.0011484508868306875, 'token': 23379, 'token_str': ' bind', 'sequence': 'public FileWatcher register(final Path path, final Class<? extends FileEventHandler> handler) {\n    return bind(path, handler, EMPTY);\n  }'}, {'score': 0.0005712391575798392, 'token': 29662, 'token_str': 'register', 'sequence'

In [21]:
model = "MLP_TrainedModels"
pred_model = pipeline("fill-mask", model=model)
text = "public Evaluation create(SimpleNode node, Object source)\n    {\n        return <mask>(node, source, false);\n    }"

# Get predictions
preds = pred_model(text)
print(preds)

# Sort predictions by score in descending order
sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
print(sorted_preds)
# Determine the rank of the correct answer
correct_answer = preds[0]['token_str']
print(correct_answer)
correct_rank = next(i+1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer)
print(correct_rank)

# Compute the reciprocal ranks
reciprocal_ranks = [1 / rank for rank in range(1, len(sorted_preds) + 1)]
print(reciprocal_ranks)

# Calculate Mean Reciprocal Rank
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", mrr)


[{'score': 0.9685537219047546, 'token': 1045, 'token_str': ' create', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return create(node, source, false);\n    }'}, {'score': 0.005051335785537958, 'token': 10516, 'token_str': ' evaluate', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return evaluate(node, source, false);\n    }'}, {'score': 0.0024361107498407364, 'token': 146, 'token_str': ' make', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return make(node, source, false);\n    }'}, {'score': 0.0014410096919164062, 'token': 609, 'token_str': ' process', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return process(node, source, false);\n    }'}, {'score': 0.0009554359130561352, 'token': 1119, 'token_str': ' build', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return build(node, source, fa

In [22]:
import pandas as pd

# Get the correct answer (assuming it's the first mask prediction)
correct_answer = preds[0]['token_str']

# Create a DataFrame from the predictions
df = pd.DataFrame(preds)

# Sort the DataFrame by score in descending order
df_sorted = df.sort_values(by='score', ascending=False)

# Reset the index of the sorted DataFrame
df_sorted.reset_index(drop=True, inplace=True)

# Determine the rank of the correct answer
correct_rank = df_sorted.index[df_sorted['token_str'] == correct_answer].tolist()[0] + 1  # Add 1 to start ranks from 1

# Calculate the reciprocal ranks
df_sorted['rank'] = df_sorted.index + 1
df_sorted['reciprocal_rank'] = 1 / df_sorted['rank']

# Calculate Mean Reciprocal Rank (MRR)
mrr = df_sorted['reciprocal_rank'].mean()

print("Mean Reciprocal Rank (MRR):", mrr)


Mean Reciprocal Rank (MRR): 0.45666666666666667
