In [1]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import math
import time
import argparse

In [2]:
codesearchnet_dataset = load_dataset("code_search_net", "java")
codesearchnet_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [3]:
# ds_train = load_dataset("code_search_net", "java", split="train")
# ds_test = load_dataset("code_search_net", "java", split="test")
# ds_valid = load_dataset("code_search_net", "java", split="validation")
# raw_datasets = DatasetDict(
#     {
#         "train": ds_train.shuffle().select(range(12000)), #train_size)), # "train": ds_train,  # .shuffle().select(range(50000)),
#         "test": ds_test.shuffle().select(range(1500)),
#         "valid": ds_valid.shuffle().select(range(1500)) # "valid": ds_valid,  # .shuffle().select(range(500))
#     }
# )

In [4]:
sample = codesearchnet_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> code: {row['whole_func_string']}'")


'>>> code: public Boolean isWriteLocked(K token) {
	RWLock<K> lock = locks.get(token);
	if (lock == null) return null;
	return lock.isWriteLocked();
    }'

'>>> code: @Override
    public int getLevel() {
        Level level = log4jLogger.getLevel();
        if (level == null)
            level = Logger.getRootLogger().getLevel();
        switch (level.toInt()) {
            case Level.TRACE_INT:
                return TRACE;
            case Level.DEBUG_INT:
                return DEBUG;
            case Level.INFO_INT:
                return INFO;
            case Level.WARN_INT:
                return WARN;
            case Level.ERROR_INT:
                return ERROR;
            case Level.FATAL_INT:
                return FATAL;
            default:
                throw new IllegalArgumentException("Unsupported log4j level: " + level);
        }
    }'

'>>> code: public TerminalEmulatorDeviceConfiguration withCursorBlinking(boolean cursorBlinking) {
        if(this.cursorBli

In [5]:
model_checkpoint = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [6]:
codesearchnet_dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [7]:
def tokenize_function(examples):
    result = tokenizer(examples["whole_func_string"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = codesearchnet_dataset.map(
    tokenize_function, batched=True, remove_columns=['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
)
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15328
    })
})

In [8]:
# #define tokenize function to tokenize the dataset
# def tokenize_function(data):
#     result = tokenizer(data["whole_func_string"])
#     if tokenizer.is_fast:
#         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
#     return result

# # batched is set to True to activate fast multithreading!
# tokenize_dataset = raw_datasets.map(tokenize_function, batched = True, remove_columns = raw_datasets["train"].column_names)
# tokenize_dataset

In [9]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> code {idx} length: {len(sample)}'")

'>>> code 0 length: 65'
'>>> code 1 length: 84'
'>>> code 2 length: 168'


In [10]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated code length: {total_length}'")

'>>> Concatenated code length: 317'


In [11]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 61'


In [12]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1132440
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61821
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 31402
    })
})

In [14]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [15]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [16]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [17]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s>public void inquire(String vault<mask>, String resourceGroupName, String fabric<mask>, String containerName) {
       <mask>WithServiceResponse<mask>(vaultName, resource<mask>Name, fabricName<mask> container<mask>).to Aboveocking().single().body();
 <mask>  }</s><s>public ServiceFuture<Void> inquireAsync<mask><mask> vaultName, String resourceGroupName, String fabricName, String containerName, final ServiceCallback<V<mask>><mask><mask>) {
  <mask>     JPMorgan ServiceFuture.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> <mask>, resourceGroupName, fabricName, containerName), serviceCallback);<mask> <mask>  }</s><s><mask> Observable<Void><mask>Async(String vaultName, String resourceGroup<mask>, String fabric<mask>,<mask> containerName, String filter) {
      <mask> return inquireWithServiceResponseAsync<mask><mask>aultName<mask> resource<mask>Name, fabricName, containerName<mask><mask>).map(new Fun<mask>1<ServiceResponse<Void<mask> Void>()<mask>
     flask<mask>      @

In [18]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [19]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s>public<mask> inquire<mask>String vaultName<mask> String resourceGroupName<mask> String fabricName, String containerName<mask> {
       <mask><mask><mask><mask><mask>(vaultName<mask><mask><mask><mask>, fabricName, containerName).toBlocking().single().<mask>();<mask><mask><mask><mask> }</s><s>public ServiceFuture<Void> inquireAsync(String vaultName, String resourceGroupName, String<mask><mask>, String containerName, final ServiceCallback<Void> serviceCallback) {
        return ServiceFuture<mask><mask><mask>(inquireWithServiceResponseAsync(<mask><mask>'

'>>> Name,<mask><mask><mask>, fabricName<mask> containerName),<mask><mask>);
   <mask></s><s>public Observable<Void> inquireAsync<mask>String vaultName, String resourceGroupName, String fabricName,<mask> containerName, String filter) {<mask><mask><mask><mask><mask><mask><mask><mask> return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName<mask> containerName, filter).map(new Func1<ServiceResponse<Void>,<ma

In [20]:
# train_size = 5000
# test_size = int(0.1 * train_size)
# valid_size = int(0.1 * train_size)

# downsampled_dataset = lm_datasets["train"].train_test_split(
#     train_size=train_size, test_size=test_size, seed=42
# )
# downsampled_dataset

In [21]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
from datasets import Dataset

train_size = 12000
valid_size = 1500

# # Split the dataset into train and validation sets
# train_dataset, valid_dataset = lm_datasets["train"].train_test_split(train_size=train_size, test_size=valid_size, seed=42)

# # Print sizes of each split
# print(f"Train dataset size: {len(train_dataset)}")
# print(f"Validation dataset size: {len(valid_dataset)}")

# Split the dataset into train, validation, and test sets
train_dataset = lm_datasets["train"].shuffle(seed=42).select(range(train_size))
valid_dataset = lm_datasets["validation"].shuffle(seed=42).select(range(valid_size))

# Print sizes of each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")


Train dataset size: 12000
Validation dataset size: 1500


In [23]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(train_dataset) // batch_size
# print(train_dataset["train"])
# print(len(train_dataset["train"]))
# print(train_dataset)

training_args = TrainingArguments(
    output_dir="MLM_FinetunedModel",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [24]:
from transformers import Trainer

trainer = Trainer(
    model= AutoModelForMaskedLM.from_pretrained(model_checkpoint),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print (train_dataset)
print (valid_dataset)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 12000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 1500
})


In [25]:
import os
os.environ['WANDB_NOTEBOOK_NAME'] = 'MLM_wholewordmask.ipynb'
os.environ['WANDB_MODE'] = 'disabled'
#  WANDB_MODE=disabled

In [26]:
# import math

# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
import math

# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])


# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")



>>> Perplexity: 3.53
>>> Loss: 1.26


In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.516606
2,0.578400,0.541988
3,0.578400,0.510669


TrainOutput(global_step=282, training_loss=0.5571449766767785, metrics={'train_runtime': 246.4513, 'train_samples_per_second': 146.073, 'train_steps_per_second': 1.144, 'total_flos': 2369389169664000.0, 'train_loss': 0.5571449766767785, 'epoch': 3.0})

In [28]:
# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])

# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")


>>> Perplexity: 1.70
>>> Loss: 0.53


In [29]:
model = "MLM_FinetunedModel"

pred_model = pipeline("fill-mask", model = "MLM_FinetunedModel")

text = "public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn <mask>!= null? node.getChildrenNames().isEmpty() : true;\n\t}"

preds = pred_model(text)
print(preds)

[{'score': 0.9990767240524292, 'token': 37908, 'token_str': ' node', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn node!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 0.00031342700822278857, 'token': 23796, 'token_str': ' null', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn null!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 8.079819963313639e-05, 'token': 4095, 'token_str': ' parent', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 7.529496360803023e-05, 'token': 9749, 'token_str': ' root', 'sequence': 'public boolean noNamesBound() {\n\t\tfinal Node node = getNamesBoundNode(false);\n\t\treturn root!= null? node.getChildrenNames().isEmpty() : true;\n\t}'}, {'score': 6.670878065051511e-05, 'token

In [36]:
model = "MLM_FinetunedModel"
pred_model = pipeline("fill-mask", model=model)
text = "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.<mask>('Hello,World!'); \n} \n}"

# Get predictions
preds = pred_model(text)
print(preds)

# Sort predictions by score in descending order
sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
print(sorted_preds)
# Determine the rank of the correct answer
correct_answer = preds[0]['token_str']
print(correct_answer)
correct_rank = next(i+1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer)
print(correct_rank)

# Compute the reciprocal ranks
reciprocal_ranks = [1 / rank for rank in range(1, len(sorted_preds) + 1)]
print(reciprocal_ranks)

# Calculate Mean Reciprocal Rank
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", mrr)

[{'score': 0.9578849077224731, 'token': 49396, 'token_str': 'println', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.println('Hello,World!'); \n} \n}"}, {'score': 0.031606901437044144, 'token': 17265, 'token_str': 'print', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.print('Hello,World!'); \n} \n}"}, {'score': 0.005591296125203371, 'token': 29631, 'token_str': 'write', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.write('Hello,World!'); \n} \n}"}, {'score': 0.002985660918056965, 'token': 49775, 'token_str': 'printf', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.printf('Hello,World!'); \n} \n}"}, {'score': 0.0005502876010723412, 'token': 995, 'token_str': 'out', 'sequence': "public class HelloWorld {\n\tpublic static void main(String[] args) { \n\t\tSystem.out.out('Hello,W

In [31]:
# import pandas as pd

# # Get the correct answer (assuming it's the first mask prediction)
# correct_answer = preds[0]['token_str']
# print(correct_answer)

# # Create a DataFrame from the predictions
# df = pd.DataFrame(preds)

# # Sort the DataFrame by score in descending order
# df_sorted = df.sort_values(by='score', ascending=False)
# print(df_sorted)

# # Reset the index of the sorted DataFrame
# df_sorted.reset_index(drop=True, inplace=True)

# # Determine the rank of the correct answer
# correct_rank = df_sorted.index[df_sorted['token_str'] == correct_answer].tolist()[0] + 1  # Add 1 to start ranks from 1
# print(correct_rank)

# # Calculate the reciprocal ranks
# df_sorted['rank'] = df_sorted.index + 1
# print(df_sorted['rank'])
# df_sorted['reciprocal_rank'] = 1 / df_sorted['rank']

# # Calculate Mean Reciprocal Rank (MRR)
# mrr = df_sorted['reciprocal_rank'].mean()

# print("Mean Reciprocal Rank (MRR):", mrr)

# #The correct answer ("create") is at index 0 after sorting. So, its rank is 1.
# # The ranks would be [1, 2, 3, 4, 5], and the reciprocal ranks would be [1.0, 0.5, 0.333, 0.25, 0.2].
# # MRR is the mean of the reciprocal ranks, which is (1.0 + 0.5 + 0.333 + 0.25 + 0.2) / 5 = 0.45666666666666667.

In [32]:
# import pandas as pd

# # Sample outputs
# data = {
#     'score': [0.948937, 0.018123, 0.002204, 0.001642, 0.001627],
#     'token': [1045, 10516, 37131, 1119, 146],
#     'token_str': ['create', 'evaluate', 'eval', 'build', 'make'],
# }

# # df = pd.DataFrame(data)

# # Assuming 'create' is the correct token for the query
# ground_truth = 'create'

# # Sort the outputs by score
# sorted_df = df.sort_values(by='score', ascending=False)
# print(sorted_df)

# # Find the rank of the correct token
# rank = sorted_df.index[sorted_df['token_str'] == ground_truth][0] + 1
# print(rank)

# # Calculate reciprocal rank
# RR = 1 / rank

# print("Reciprocal Rank (RR):", RR)

# # Calculate Mean Reciprocal Rank (MRR)
# MRR = df.apply(lambda row: 1 / (sorted_df.index[sorted_df['token_str'] == row['token_str']][0] + 1), axis=1).mean()

# print("Mean Reciprocal Rank (MRR):", MRR)

In [37]:
# Get predictions for 50 masked texts
input_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"

with open(input_file, "r") as file:
    text_list = file.readlines()

pred_model = pipeline("fill-mask", model=model)

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

for i in range(50):
    # Generate a masked text (assuming 'text' is a list of 50 masked texts)
    text = text_list[i]
    # print(text)

    # Get predictions for the current masked text
    preds = pred_model(text)
    print(preds)
    print("*" * 50)

    # Sort predictions by score in descending order
    sorted_preds = sorted(preds, key=lambda x: int(x['score']), reverse=True)
    print(sorted_preds)

    # Determine the rank of the correct answer
    correct_answer = preds[0]['token_str']
    print(correct_answer)
    correct_rank = next((i + 1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer), 0)
    print(correct_rank)

    # Compute the reciprocal rank for the current masked text
    reciprocal_rank = 1 / correct_rank if correct_rank != 0 else 0
    print(reciprocal_rank)

    # Append reciprocal rank to the list
    reciprocal_ranks.append(reciprocal_rank)

# Calculate the Mean Reciprocal Rank (MRR) across all masked texts
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)
print(sum(reciprocal_ranks))
print(len(reciprocal_ranks))

print("Mean Reciprocal Rank (MRR):", mrr)

[{'score': 0.9996681213378906, 'token': 37908, 'token_str': ' node', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn node!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 0.00010552719322731718, 'token': 23796, 'token_str': ' null', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn null!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 4.391767652123235e-05, 'token': 4095, 'token_str': ' parent', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 3.752904376597144e-05, 'token': 9749, 'token_str': ' root', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn root!= null? node.getChildrenNames().isEmpty() : true;\\n\\

In [38]:
pred_model = pipeline("fill-mask", model=model_checkpoint)

# Path to the file containing masked texts
masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"
# Path to the file containing ground truth texts
ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/ground_truth.txt"

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

# Read the masked texts and ground truth texts from their respective files
with open(masked_texts_file, "r") as masked_file, open(ground_truth_file, "r") as truth_file:
    masked_texts = masked_file.readlines()
    ground_truth_texts = truth_file.readlines()

# # Iterate through each masked text and its corresponding ground truth text
# for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
#     preds = pred_model(masked_text)
#     print(preds)
#     print("**")
#     print(ground_truth_texts)

#     # Determine the correct answer
#     correct_answer = None
#     # print("##")
#     #print(pred['sequence'])
#     # print("*")
#     # print(truth_text.strip())
#     for pred in preds:
#         if pred['sequence'] == truth_text.strip(): 
#             correct_answer = pred['sequence']
#             print (correct_answer)
#             break

#     if correct_answer:
#         correct_rank = next((i + 1 for i, pred in enumerate(preds) if pred['sequence'] == correct_answer), 0)

#         reciprocal_ranks.append(1 / correct_rank if correct_rank != 0 else 0)

# mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

# print("Mean Reciprocal Rank (MRR):", mrr)


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [39]:
len(ground_truth_texts)

54

MRR

Test 1
Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-a 
output: token-a (20%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)

MRR = 1 (in case token-a)
MRR = 2/5 (in case token-b)
MRR = 3/5 (in case token-c)
#----------------------------------------------------
Test 2
Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-x 
output: token-a (20%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)

MRR = 1 (in case token-a)
MRR = 0 (in case token-x)
#----------------------------------------------------

Total MRR = 50%





In [40]:
# Iterate through each masked text and its corresponding ground truth text
for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
    # Get predictions for the current masked text
    preds = pred_model(masked_text)
    
    # Sort predictions by score in descending order
    sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
    
    # Get the sequence with the highest score
    highest_score_sequence = sorted_preds[0]['sequence']
    # print(highest_score_sequence.strip())
    # print("&&")
    # print(truth_text.strip())
    # Determine the correct answer
    correct_answer = None
    # if highest_score_sequence.strip() == truth_text.strip():
    #     correct_answer = highest_score_sequence
    if truth_text.strip() in highest_score_sequence:
        correct_answer = highest_score_sequence
        #print("corr")
        print(correct_answer)

    if correct_answer:
        # Determine the rank of the correct answer in the predictions
        correct_rank = next((i + 1 for i, pred in enumerate(sorted_preds) if pred['sequence'] == correct_answer), 0)
        print (correct_rank)
        
        reciprocal_ranks.append(1 / correct_rank if correct_rank != 0 else 0) # Append reciprocal rank to the list

# Calculate the Mean Reciprocal Rank (MRR) across all masked texts
mrr = sum(reciprocal_ranks) / len(ground_truth_texts)

# Print the Mean Reciprocal Rank (MRR)
print("Mean Reciprocal Rank (MRR):", mrr)

'public Long getLong(final String key) {\n\t\tNumber number = (Number) map.get(key);\n\n\t\tif (number == null) {\n\t\t\treturn null;\n\t\t}\n\t\tif (number instanceof Long) {\n\t\t\treturn (Long) number;\n\t\t}\n\t\treturn number.longValue();\n\t}'

1
'public static Duration between(Date start, Date end)\n\t{\n\t\treturn new Duration(end.getTime() - start.getTime());\n\t}'

1
'public int writeString(String s) throws IOException {\n    int length = s.length();\n    int count = writeVInt(length);\n    count += writeChars(s, 0, length);\n    return count;\n  }'

1
'protected boolean isDelayed(IMetric metric) {\n        long delay = clock.now().getMillis() - metric.getCollectionTime();\n        return delay > MAX_AGE_ALLOWED;\n    }'   

1
'protected final void addApplication(String name, Class<? extends ExecS_Application> clazz){\n\t\tthis.classmap.put(name, clazz);\n\t}'

1
'private boolean haveVisitedNodeAlready(Node node, Stack<Node> cycleDetectionStack) {\n    for (Node cycleNode : c

In [39]:
# # Read the ground truth texts from the file
# ground_truth_texts = []
# with open(ground_truth_file, "r") as file:
#     for line in file:
#         ground_truth_texts.append(line.strip())

# # Initialize list to store MRR values for each text
# mrr_values = []

# # Iterate through each text and its corresponding ground truth
# for text, truth_text in zip(masked_texts, ground_truth_texts):
#     # Predict masked tokens
#     preds = pred_model(text)
#     correct_answer = None
#     # print("##")
#     # print(pred['sequence'])
#     # print("*")
#     # print(truth_text.strip())
    
#     # Iterate through predictions to find the correct answer
#     for pred in preds:
#         if pred['sequence'] == truth_text.strip():
#             correct_answer = pred['sequence']
#             print(correct_answer)
#             break
    
#     if correct_answer is not None:
#         # Determine the rank of the correct answer
#         correct_rank = next(i+1 for i, pred in enumerate(preds) if pred['sequence'] == correct_answer)
        
#         # Calculate the reciprocal rank
#         reciprocal_rank = 1 / correct_rank
        
#         # Append reciprocal rank to the list
#         mrr_values.append(reciprocal_rank)

# # Calculate average MRR across all texts
# avg_mrr = np.mean(mrr_values)

# # Print average MRR
# print("Average Mean Reciprocal Rank (MRR) across all texts:", avg_mrr)


'protected Optional<Boolean> isMethodAllowed(String method) {\n        if (allowedMethods.isEmpty()) {\n            return Optional.empty();\n        }\n        return Optional.of(allowedMethods.contains(method));\n    }'
Average Mean Reciprocal Rank (MRR) across all texts: 1.0


In [33]:
# # Read input text file containing 50 texts
# input_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"

# with open(input_file, "r") as file:
#     input_texts = file.readlines()

# # Initialize list to store MRR values for each text
# mrr_values = []

# # Initialize pipeline for masked token prediction
# pred_model = pipeline("fill-mask", model=model_checkpoint)

# # Iterate through each text
# for text in input_texts:
#     # Predict masked tokens
#     preds = pred_model(text, top_k=1)  # Predict top 5 tokens
#     correct_answer = None
#     print("Predictions:", preds)
#     print("Mask token:", tokenizer.mask_token)
    
#     # Iterate through predictions to find the correct answer
#     # Iterate through predictions to find the correct answer
#     for pred in preds:
#         # Check if 'token_str' key exists in the dictionary
#         if 'token_str' in pred and pred['token_str'] == tokenizer.mask_token:
#             correct_answer = pred['token_str']
#             print(correct_answer)
#             break
    
#     if correct_answer is not None:
#         # Determine the rank of the correct answer
#         correct_rank = next(i+1 for i, pred in enumerate(preds) if pred['token_str'] == correct_answer)
        
#         # Calculate the reciprocal rank
#         reciprocal_rank = 1 / correct_rank
#         print(f"Reciprocal rank: {reciprocal_rank}")
        
#         # Append reciprocal rank to the list
#         mrr_values.append(reciprocal_rank)

# # Calculate average MRR across all texts
# avg_mrr = np.mean(mrr_values)
# print(f"Number of input texts: {len(input_texts)}")
# print(f"Number of reciprocal ranks calculated: {len(mrr_values)}")

# # Print average MRR
# print("Average Mean Reciprocal Rank (MRR) across all texts:", avg_mrr)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Predictions: [{'score': 0.9997387528419495, 'token': 37908, 'token_str': ' node', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn node!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}]
Mask token: <mask>
Predictions: [{'score': 0.9827080368995667, 'token': 346, 'token_str': ' number', 'sequence': "'public Long getLong(final String key) {\\n\\t\\tNumber number = (Number) map.get(key);\\n\\n\\t\\tif (number == null) {\\n\\t\\t\\treturn null;\\n\\t\\t}\\n\\t\\tif (number instanceof Long) {\\n\\t\\t\\treturn (Long) number;\\n\\t\\t}\\n\\t\\treturn number.longValue();\\n\\t}'\n"}]
Mask token: <mask>
Predictions: [{'score': 0.757841169834137, 'token': 29662, 'token_str': 'register', 'sequence': "'public void registerComponent(final Class component) {\\n\\t\\tString name = resolveBaseComponentName(component);\\n\\t\\tregister(name, component);\\n\\t}'\n"}]
Mask token: <mask>
Predictions: [{'score': 0.637135207653045

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [33]:
# model = "MLM_FinetunedModel"
# pred_model = pipeline("fill-mask", model=model)
# text = "public Evaluation create(SimpleNode node, Object source)\n    {\n        return <mask>(node, source, false);\n    }"

# # Get predictions
# preds = pred_model(text)
# print(preds)

# # Sort predictions by score in descending order
# sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
# print(sorted_preds)
# # Determine the rank of the correct answer
# correct_answer = preds[0]['token_str']
# print(correct_answer)
# correct_rank = next(i+1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer)
# print(correct_rank)

# # Compute the reciprocal ranks
# reciprocal_ranks = [1 / rank for rank in range(1, len(sorted_preds) + 1)]
# print(reciprocal_ranks)

# # Calculate Mean Reciprocal Rank
# mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

# print("Mean Reciprocal Rank (MRR):", mrr)


[{'score': 0.9904773235321045, 'token': 1045, 'token_str': ' create', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return create(node, source, false);\n    }'}, {'score': 0.0020682169124484062, 'token': 10516, 'token_str': ' evaluate', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return evaluate(node, source, false);\n    }'}, {'score': 0.0005082740099169314, 'token': 146, 'token_str': ' make', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return make(node, source, false);\n    }'}, {'score': 0.00044398324098438025, 'token': 1119, 'token_str': ' build', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return build(node, source, false);\n    }'}, {'score': 0.00033154641278088093, 'token': 120, 'token_str': ' get', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return get(node, source, false);

In [None]:
# import pandas as pd

# # Get the correct answer (assuming it's the first mask prediction)
# correct_answer = preds[0]['token_str']
# print(correct_answer)

# # Create a DataFrame from the predictions
# df = pd.DataFrame(preds)

# # Sort the DataFrame by score in descending order
# df_sorted = df.sort_values(by='score', ascending=False)
# print(df_sorted)

# # Reset the index of the sorted DataFrame
# df_sorted.reset_index(drop=True, inplace=True)

# # Determine the rank of the correct answer
# correct_rank = df_sorted.index[df_sorted['token_str'] == correct_answer].tolist()[0] + 1  # Add 1 to start ranks from 1
# print(correct_rank)

# # Calculate the reciprocal ranks
# df_sorted['rank'] = df_sorted.index + 1
# print(df_sorted['rank'])
# df_sorted['reciprocal_rank'] = 1 / df_sorted['rank']

# # Calculate Mean Reciprocal Rank (MRR)
# mrr = df_sorted['reciprocal_rank'].mean()

# print("Mean Reciprocal Rank (MRR):", mrr)

# #The correct answer ("create") is at index 0 after sorting. So, its rank is 1.
# # The ranks would be [1, 2, 3, 4, 5], and the reciprocal ranks would be [1.0, 0.5, 0.333, 0.25, 0.2].
# # MRR is the mean of the reciprocal ranks, which is (1.0 + 0.5 + 0.333 + 0.25 + 0.2) / 5 = 0.45666666666666667.

 create
      score  token  token_str  \
0  0.948937   1045     create   
1  0.018123  10516   evaluate   
2  0.002204  37131       eval   
3  0.001642   1119      build   
4  0.001627    146       make   

                                            sequence  
0  public Evaluation create(SimpleNode node, Obje...  
1  public Evaluation create(SimpleNode node, Obje...  
2  public Evaluation create(SimpleNode node, Obje...  
3  public Evaluation create(SimpleNode node, Obje...  
4  public Evaluation create(SimpleNode node, Obje...  
1
0    1
1    2
2    3
3    4
4    5
Name: rank, dtype: int64
Mean Reciprocal Rank (MRR): 0.45666666666666667


In [34]:
# import pandas as pd

# # Sample outputs
# data = {
#     'score': [0.948937, 0.018123, 0.002204, 0.001642, 0.001627],
#     'token': [1045, 10516, 37131, 1119, 146],
#     'token_str': ['create', 'evaluate', 'eval', 'build', 'make'],
# }

# df = pd.DataFrame(data)

# # Assuming 'create' is the correct token for the query
# ground_truth = 'create'

# # Sort the outputs by score
# sorted_df = df.sort_values(by='score', ascending=False)
# print(sorted_df)

# # Find the rank of the correct token
# rank = sorted_df.index[sorted_df['token_str'] == ground_truth][0] + 1
# print(rank)

# # Calculate reciprocal rank
# RR = 1 / rank

# print("Reciprocal Rank (RR):", RR)

# # Calculate Mean Reciprocal Rank (MRR)
# MRR = df.apply(lambda row: 1 / (sorted_df.index[sorted_df['token_str'] == row['token_str']][0] + 1), axis=1).mean()

# print("Mean Reciprocal Rank (MRR):", MRR)


      score  token token_str
0  0.948937   1045    create
1  0.018123  10516  evaluate
2  0.002204  37131      eval
3  0.001642   1119     build
4  0.001627    146      make
1
Reciprocal Rank (RR): 1.0
Mean Reciprocal Rank (MRR): 0.45666666666666667


In [37]:
# import pandas as pd

# # Get predictions for 50 masked texts
# pred_model = pipeline("fill-mask", model=model)

# # List to store reciprocal ranks for each masked text
# reciprocal_ranks = []

# for i in range(50):
#     # Generate a masked text (assuming 'text' is a list of 50 masked texts)
#     text = text_list[i]

#     # Get predictions for the current masked text
#     preds = pred_model(text)

#     # Create a DataFrame from the predictions
#     df = pd.DataFrame(preds)
#     #print(df)

#     if 'score' in df.columns:
#         # Sort the DataFrame by score in descending order
#         df_sorted = df.sort_values(by='score', ascending=False)
#         print(df_sorted)

#         # Reset the index of the sorted DataFrame
#         df_sorted.reset_index(drop=True, inplace=True)
#         #print(df_sorted.reset_index(drop=True, inplace=True))

#         # Find the index of the correct answer in the DataFrame
#         correct_index = df_sorted[df_sorted['token_str'] == text.split()[0]].index.tolist()
#         # print("*")
#         # print(correct_index)

#         if correct_index:
#             # Determine the rank of the correct answer
#             correct_rank = correct_index[0] + 1  # Add 1 to start ranks from 1

#             # Calculate the reciprocal rank
#             reciprocal_rank = 1 / correct_rank
#         else:
#             # If the correct token is not found, set reciprocal rank to 0
#             reciprocal_rank = 0
#     else:
#         # If the DataFrame doesn't contain 'score' column, set reciprocal rank to 0
#         reciprocal_rank = 0

#     # Append reciprocal rank to the list
#     reciprocal_ranks.append(reciprocal_rank)

# # Calculate the Mean Reciprocal Rank (MRR) across all masked texts
# mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

# print("Mean Reciprocal Rank (MRR):", mrr)


      score  token token_str  \
0  0.999668  37908      node   
1  0.000106  23796      null   
2  0.000044   4095    parent   
3  0.000038   9749      root   
4  0.000026    295         n   

                                            sequence  
0  'public boolean noNamesBound() {\n\t\tfinal No...  
1  'public boolean noNamesBound() {\n\t\tfinal No...  
2  'public boolean noNamesBound() {\n\t\tfinal No...  
3  'public boolean noNamesBound() {\n\t\tfinal No...  
4  'public boolean noNamesBound() {\n\t\tfinal No...  
      score  token token_str  \
0  0.944000    346    number   
1  0.009550    923     value   
2  0.006553   5456       map   
3  0.005361    762       key   
4  0.004527    898    result   

                                            sequence  
0  'public Long getLong(final String key) {\n\t\t...  
1  'public Long getLong(final String key) {\n\t\t...  
2  'public Long getLong(final String key) {\n\t\t...  
3  'public Long getLong(final String key) {\n\t\t...  
4  'publi

In [38]:
# # List to store reciprocal ranks for each masked text
# reciprocal_ranks = []

# for i in range(50):
#     # Generate a masked text (assuming 'text' is a list of 50 masked texts)
#     text = text_list[i]

#     # Get predictions for the current masked text
#     preds = pred_model(text)
#     print(preds)

#     if preds:
#         # Extract the first token from predictions as the correct token
#         correct_token = preds[0]['token_str']

#         # Find the index of the correct token in the list of tokens
#         correct_index = [pred['token_str'] for pred in preds].index(correct_token)

#         # Determine the rank of the correct answer
#         correct_rank = correct_index + 1  # Add 1 to start ranks from 1

#         # Calculate the reciprocal rank
#         reciprocal_rank = 1 / correct_rank
#     else:
#         # If there are no predictions, set reciprocal rank to 0
#         reciprocal_rank = 0

#     # Append reciprocal rank to the list
#     reciprocal_ranks.append(reciprocal_rank)

# # Calculate the Mean Reciprocal Rank (MRR) across all masked texts
# mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

# print("Mean Reciprocal Rank (MRR):", mrr)


[{'score': 0.9996681213378906, 'token': 37908, 'token_str': ' node', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn node!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 0.00010552719322731718, 'token': 23796, 'token_str': ' null', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn null!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 4.391767652123235e-05, 'token': 4095, 'token_str': ' parent', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn parent!= null? node.getChildrenNames().isEmpty() : true;\\n\\t}'\n"}, {'score': 3.752904376597144e-05, 'token': 9749, 'token_str': ' root', 'sequence': "'public boolean noNamesBound() {\\n\\t\\tfinal Node node = getNamesBoundNode(false);\\n\\t\\treturn root!= null? node.getChildrenNames().isEmpty() : true;\\n\\

In [90]:
# # List to store reciprocal ranks for each masked text
# reciprocal_ranks = []

# for preds in input_texts:
#     if preds:
#         # Extract the first token string from predictions as the correct token
#         correct_token = preds[0]['token_str']

#         # Find the index of the correct token in the list of tokens
#         correct_index = [pred['token_str'] for pred in preds].index(correct_token)

#         # Determine the rank of the correct answer
#         correct_rank = correct_index + 1  # Add 1 to start ranks from 1

#         # Calculate the reciprocal rank
#         reciprocal_rank = 1 / correct_rank
#     else:
#         # If there are no predictions, set reciprocal rank to 0
#         reciprocal_rank = 0

#     # Append reciprocal rank to the list
#     reciprocal_ranks.append(reciprocal_rank)

# # Calculate the Mean Reciprocal Rank (MRR) across all masked texts
# mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

# print("Mean Reciprocal Rank (MRR):", mrr)


TypeError: string indices must be integers, not 'str'