In [1]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import math
import time
import argparse

In [2]:
codesearchnet_dataset = load_dataset("code_search_net", "java")
codesearchnet_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [4]:
sample = codesearchnet_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> code: {row['whole_func_string']}'")


'>>> code: public Boolean isWriteLocked(K token) {
	RWLock<K> lock = locks.get(token);
	if (lock == null) return null;
	return lock.isWriteLocked();
    }'

'>>> code: @Override
    public int getLevel() {
        Level level = log4jLogger.getLevel();
        if (level == null)
            level = Logger.getRootLogger().getLevel();
        switch (level.toInt()) {
            case Level.TRACE_INT:
                return TRACE;
            case Level.DEBUG_INT:
                return DEBUG;
            case Level.INFO_INT:
                return INFO;
            case Level.WARN_INT:
                return WARN;
            case Level.ERROR_INT:
                return ERROR;
            case Level.FATAL_INT:
                return FATAL;
            default:
                throw new IllegalArgumentException("Unsupported log4j level: " + level);
        }
    }'

'>>> code: public TerminalEmulatorDeviceConfiguration withCursorBlinking(boolean cursorBlinking) {
        if(this.cursorBli

In [5]:
model_checkpoint = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
codesearchnet_dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [7]:
def tokenize_function(examples):
    result = tokenizer(examples["whole_func_string"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = codesearchnet_dataset.map(
    tokenize_function, batched=True, remove_columns=['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15328
    })
})

In [9]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> code {idx} length: {len(sample)}'")

'>>> code 0 length: 65'
'>>> code 1 length: 84'
'>>> code 2 length: 168'


In [10]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated code length: {total_length}'")

'>>> Concatenated code length: 317'


In [11]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 61'


In [12]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1132440
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61821
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 31402
    })
})

In [14]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [15]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [16]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [17]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s>public<mask> inquire(String vaultName, String<mask>GroupName, String fabricName<mask> String container<mask>) {
       <mask>WithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName).toBl<mask>().single<mask>body();
    }</s><s>public ServiceFuture<<mask>oid> inquireAsync(<mask> vaultName<mask> String complexityGroup<mask><mask> String fabricName, String containerName, final Service<mask><mask>V<mask>> serviceCallback) {
   <mask>    return ServiceFuture.fromResponse(inqu<mask>WithServiceResponseAsync golfvault'

'>>> Name, resourceGroupName, fabric<mask>, containerName<mask> serviceCallback);
  <mask><mask></s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName<mask><mask> container<mask>, String filter) {
<mask>       return<mask>WithServiceResponseAsync(vaultName, resourceGroup<mask>, fabric<mask><mask> containerName, filter).map(new Func1<ServiceResponse<Void<mask><mask>>()<mask>
          humane  @�

In [18]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [19]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s>public void inquire(<mask> vaultName, String resourceGroupName, String fabricName, String containerName)<mask>
        inquireWithServiceResponseAsync(vaultName<mask> resourceGroupName<mask> fabricName<mask> containerName<mask>toBlocking().<mask>().body();
    }</s><s>public<mask><mask><Void<mask> inquireAsync(<mask> vaultName, String resourceGroupName, String<mask><mask>, String containerName<mask> final ServiceCallback<Void<mask> serviceCallback<mask><mask>
       <mask> ServiceFuture.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> Name, resourceGroupName, fabricName, containerName<mask> serviceCallback);
   <mask></s><s><mask><mask><mask><mask>Void> inquireAsync(String vaultName<mask> String resourceGroupName, String fabricName<mask> String containerName<mask><mask> filter) {
       <mask> inquireWithServiceResponseAsync(vaultName,<mask><mask><mask>, fabricName, containerName,<mask>).map<mask>new<mask><mask>1<mask>ServiceResponse<mask><mask><mask>>, Void>() {<mask

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from datasets import Dataset

train_size = 16000
valid_size = 2000

# Split the dataset into train, validation, and test sets
train_dataset = lm_datasets["train"].shuffle(seed=42).select(range(train_size))
valid_dataset = lm_datasets["validation"].shuffle(seed=42).select(range(valid_size))

# Print sizes of each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")


Train dataset size: 16000
Validation dataset size: 2000


In [23]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(train_dataset) // batch_size

training_args = TrainingArguments(
    output_dir="MLM_FinetunedModel",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [24]:
from transformers import Trainer

trainer = Trainer(
    model= AutoModelForMaskedLM.from_pretrained(model_checkpoint),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print (train_dataset)
print (valid_dataset)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 16000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 2000
})


In [25]:
import os
os.environ['WANDB_NOTEBOOK_NAME'] = 'MLM_wholewordmask.ipynb'
os.environ['WANDB_MODE'] = 'disabled'
#  WANDB_MODE=disabled

In [26]:

import math

# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])


# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")



>>> Perplexity: 3.53
>>> Loss: 1.26


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.524505
2,0.559700,0.518791
3,0.559700,0.510306


TrainOutput(global_step=375, training_loss=0.5416294250488282, metrics={'train_runtime': 328.0549, 'train_samples_per_second': 146.317, 'train_steps_per_second': 1.143, 'total_flos': 3159185559552000.0, 'train_loss': 0.5416294250488282, 'epoch': 3.0})

In [28]:
# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])

# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")


>>> Perplexity: 1.68
>>> Loss: 0.52


In [29]:
model = "MLM_FinetunedModel"

pred_model = pipeline("fill-mask", model = "MLM_FinetunedModel")

text = "public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn <mask>!= null? node.getChildrenNames().isEmpty() : true;\n\t}"

preds = pred_model(text)
print(preds)

[{'score': 0.9990767240524292, 'token': 37908, 'token_str': ' node', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn node!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 0.00031342700822278857, 'token': 23796, 'token_str': ' null', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn null!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 8.079819963313639e-05, 'token': 4095, 'token_str': ' parent', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 7.529496360803023e-05, 'token': 9749, 'token_str': ' root', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn root!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 6.670878065051511e-05, 'token

Checking with only one input text and one masked token

In [30]:
model = "MLM_FinetunedModel"
pred_model = pipeline("fill-mask", model=model)
text = "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.<mask>('Hello,World!'); \n} \n}"

# Get predictions
preds = pred_model(text)
print(preds)

# Sort predictions by score in descending order
sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
print(sorted_preds)
# Determine the rank of the correct answer
correct_answer = preds[0]['token_str']
print(correct_answer)
correct_rank = next(i+1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer)
print(correct_rank)

# Compute the reciprocal ranks
reciprocal_ranks = [1 / rank for rank in range(1, len(sorted_preds) + 1)]
print(reciprocal_ranks)

# Calculate Mean Reciprocal Rank
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", mrr)

[{'score': 0.9578849077224731, 'token': 49396, 'token_str': 'println', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.println('Hello,World!'); \n} \n}"}, {'score': 0.031606901437044144, 'token': 17265, 'token_str': 'print', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.print('Hello,World!'); \n} \n}"}, {'score': 0.005591296125203371, 'token': 29631, 'token_str': 'write', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.write('Hello,World!'); \n} \n}"}, {'score': 0.002985660918056965, 'token': 49775, 'token_str': 'printf', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.printf('Hello,World!'); \n} \n}"}, {'score': 0.0005502876010723412, 'token': 995, 'token_str': 'out', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.out('Hello,W

When correct answer is always the first token

In [31]:
# Get predictions for 50 masked texts
input_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"

with open(input_file, "r") as file:
    text_list = file.readlines()

pred_model = pipeline("fill-mask", model=model)

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

for i in range(50):
    # Generate a masked text (assuming 'text' is a list of 50 masked texts)
    text = text_list[i]
    # print(text)

    # Get predictions for the current masked text
    preds = pred_model(text)
    print(preds)
    print("*" * 50)

    # Sort predictions by score in descending order
    sorted_preds = sorted(preds, key=lambda x: int(x['score']), reverse=True)
    print(sorted_preds)

    # Determine the rank of the correct answer
    correct_answer = preds[0]['token_str']
    print(correct_answer)
    correct_rank = next((i + 1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer), 0)
    print(correct_rank)

    # Compute the reciprocal rank for the current masked text
    reciprocal_rank = 1 / correct_rank if correct_rank != 0 else 0
    print(reciprocal_rank)

    # Append reciprocal rank to the list
    reciprocal_ranks.append(reciprocal_rank)

# Calculate the Mean Reciprocal Rank (MRR) across all masked texts
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
print(sum(reciprocal_ranks))
print(len(reciprocal_ranks))

print("Mean Reciprocal Rank (MRR):", mrr)

[{'score': 0.9996681213378906, 'token': 37908, 'token_str': ' node', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn node!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 0.00010552719322731718, 'token': 23796, 'token_str': ' null', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn null!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 4.391767652123235e-05, 'token': 4095, 'token_str': ' parent', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 3.752904376597144e-05, 'token': 9749, 'token_str': ' root', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn root!= null? node.getChildrenNames().isEmpty() : true;\\n\\

when ground_truth_text = highest_score_sequence``

In [33]:
from transformers import pipeline

# Initialize the pipeline for masked text prediction
pred_model = pipeline("fill-mask", model=model_checkpoint)

# Path to the file containing masked texts
masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"
# Path to the file containing ground truth texts
ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/ground_truth.txt"

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

# Read the masked texts and ground truth texts from their respective files
with open(masked_texts_file, "r") as masked_file, open(ground_truth_file, "r") as truth_file:
    masked_texts = masked_file.readlines()
    ground_truth_texts = truth_file.readlines()

for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
    # Get predictions for the current masked text
    preds = pred_model(masked_text)
    print(preds)
    
    # Sort predictions by score in descending order
    sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
    
    # Assign ranks to each prediction based on their score
    for rank, pred in enumerate(sorted_preds, start=1):
        pred['rank'] = rank
        print (pred['rank'])
    
    # Determine the correct answer
    correct_answer = None
    if truth_text.strip() in sorted_preds[0]['sequence']:
        correct_answer = sorted_preds[0]['sequence']
        print(correct_answer)

    if correct_answer:
        # Determine the rank of the correct answer in the predictions
        correct_rank = next((pred['rank'] for pred in sorted_preds if pred['sequence'] == correct_answer), 0)
        print(correct_rank)
        
        reciprocal_ranks.append(1 / correct_rank if correct_rank != 0 else 0)  # Append reciprocal rank to the list

# Calculate the Mean Reciprocal Rank (MRR) across all masked texts
mrr = sum(reciprocal_ranks) / len(ground_truth_texts)

# Print the Mean Reciprocal Rank (MRR)
print("Mean Reciprocal Rank (MRR):", mrr)


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.9997387528419495, 'token': 37908, 'token_str': ' node', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn node!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 6.51895024930127e-05, 'token': 4095, 'token_str': ' parent', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 3.807593384408392e-05, 'token': 9749, 'token_str': ' root', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn root!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 3.418727283133194e-05, 'token': 23796, 'token_str': ' null', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn null!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}

MRRTest 1

Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-a 
output: token-a (99%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)MRR = 1 (in case token-a)
MRR = 2/5 (in case token-b)
MRR = 3/5 (in case token-c)
#----------------------------------------------------

Test 2
Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-x 
output: token-a (20%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)MRR = 1 (in case token-a)
MRR = 0 (in case token-x)

#----------------------------------------------------Total MRR = 50%
#----------------------------------------------------We need to remove overwap b/w trainig and testing datasets in CodeSearchNet.
For test dataset, we need to creat unseen datasets, open source repositories (e.g., RoBERTa datasets)

Using pandas

In [37]:
import pandas as pd

# Iterate through each masked text and its corresponding ground truth text
pred_model = pipeline("fill-mask", model=model_checkpoint)

# Path to the file containing masked texts and ground truth texts
ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/output_java.txt"

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

# Read the masked texts and ground truth texts from their respective files
with open(masked_texts_file, "r") as masked_file, open(ground_truth_file, "r") as truth_file:
    masked_texts = masked_file.readlines()
    ground_truth_texts = truth_file.readlines()
    
# Initialize an empty list to store predictions
all_preds = []

# Initialize a counter for masked token IDs
masked_token_id_counter = 0

# Initialize a counter for ground truth text line IDs
ground_truth_line_id_counter = 0

# Initialize an empty list to store ground truth texts and their IDs
ground_truth_data = []

# Iterate through masked texts and ground truth texts
for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
    # Increment the masked token ID counter for each new masked token
    masked_token_id_counter += 1
    
    # Get predictions for the current masked text
    preds = pred_model(masked_text, top_k= 10)
    
    # Initialize an empty list to store predictions for the current masked text
    masked_text_preds = []
    
    # Iterate through predictions for the current masked text
    for rank, pred in enumerate(sorted(preds, key=lambda x: x['score'], reverse=True), start=1):
        # Create a dictionary for each prediction with required fields
        pred_dict = {
            'token_id': masked_token_id_counter,
            'rank': rank,
            'score': pred['score'],
            'token': pred['token'],
            'token_str': pred['token_str'],
            'sequence': pred['sequence']
        }
        # Append the prediction dictionary to the list of predictions for the current masked text
        masked_text_preds.append(pred_dict)
    
    # Append the list of predictions for the current masked text to the list of all predictions
    all_preds.extend(masked_text_preds)
    
    # Increment the ground truth line ID counter
    ground_truth_line_id_counter += 1
    
    # Store ground truth text and its ID
    ground_truth_data.append({'ground_truth_text': truth_text, 'token_id': ground_truth_line_id_counter})

# Convert the list of predictions into a DataFrame
preds_df = pd.DataFrame(all_preds)

# Convert the list of ground truth data into a DataFrame
ground_truth_df = pd.DataFrame(ground_truth_data)

# Print the DataFrame containing predictions
print("Predictions DataFrame:")
print(preds_df)

# Print the DataFrame containing ground truth text
print("\nGround Truth DataFrame:")
print(ground_truth_df)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predictions DataFrame:
     token_id  rank     score  token token_str  \
0           1     1  0.990949  30921    return   
1           1     2  0.007247    671    return   
2           1     3  0.000237  42555    Return   
3           1     4  0.000144  42326        //   
4           1     5  0.000118  42964        ++   
..        ...   ...       ...    ...       ...   
555        56     6  0.000012   4030       New   
556        56     7  0.000011  21277        //   
557        56     8  0.000006   3211     throw   
558        56     9  0.000006    285    public   
559        56    10  0.000005   1437             

                                              sequence  
0    public static int factorial(int n) {\n\tif (n ...  
1    public static int factorial(int n) {\n\tif (n ...  
2    public static int factorial(int n) {\n\tif (n ...  
3    public static int factorial(int n) {\n\tif (n ...  
4    public static int factorial(int n) {\n\tif (n ...  
..                                

In [46]:
results = preds_df.merge(ground_truth_df, how='left', on=['token_id'])

def fill_sequence(row):
    if row['sequence'] in row['ground_truth_text']:
        return row['sequence']
    else:
        return None

# Apply the function to the 'sequence' column
results['match_sequence'] = results.apply(fill_sequence, axis=1)

# Display the results DataFrame
print(results['match_sequence'])

0      public static int factorial(int n) {\n\tif (n ...
1                                                   None
2                                                   None
3                                                   None
4                                                   None
                             ...                        
555                                                 None
556                                                 None
557                                                 None
558                                                 None
559                                                 None
Name: match_sequence, Length: 560, dtype: object


In [47]:
len(results['match_sequence'])

560

In [48]:
results['match_sequence'].fillna('None', inplace=True)

# Define the file path where you want to save the values
output_file_path = "match_sequence_values.txt"

# Open the file in write mode
with open(output_file_path, 'w') as output_file:
    # Write each value in results['match_sequence'] to the file
    for value in results['match_sequence']:
        output_file.write(str(value) + '\n')

# Print confirmation message
print("Match sequence values have been saved to:", output_file_path)

Match sequence values have been saved to: match_sequence_values.txt


In [49]:
results['match_sequence'].fillna('None', inplace=True)
# Group by 'token_id' and 'match_sequence', taking the minimum rank
relevances_rank = results.groupby(['token_id', 'match_sequence'])['rank'].min()

print(relevances_rank)

token_id  match_sequence                                                                                                                                                                     
1         None                                                                                                                                                                                   2
          public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                            1
2         None                                                                                                                                                                                   2
          public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num % i == 0)\n\treturn false;}\n\treturn true;}\n    1
3         None                

In [58]:
ranks = relevances_rank[relevances_rank.index.get_level_values('match_sequence') != 'None']

print(ranks)

token_id  match_sequence                                                                                                                                                                             
1         0.72%: public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t return n * factorial(n - 1);}\n                                                                            2
          99.09%: public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                            1
2         0.09%: public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num% i == 0)\n\treturn false;}\n\treturn true;}\n      8
          0.29%: public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num / i == 0)\n\treturn false;}\n\treturn true;

In [59]:
reciprocal_ranks = 1 / (ranks)
reciprocal_ranks

token_id  match_sequence                                                                                                                                                                             
1         0.72%: public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t return n * factorial(n - 1);}\n                                                                            0.500000
          99.09%: public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                            1.000000
2         0.09%: public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num% i == 0)\n\treturn false;}\n\treturn true;}\n      0.125000
          0.29%: public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num / i == 0)\n\treturn fa

In [60]:
reciprocal_ranks.mean()

0.3253844246031746