In [1]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import os
import math
import time
import argparse

In [2]:
codesearchnet_dataset = load_dataset("code_search_net", "java")
codesearchnet_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [3]:
sample = codesearchnet_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> code: {row['whole_func_string']}'")



'>>> code: public Boolean isWriteLocked(K token) {
	RWLock<K> lock = locks.get(token);
	if (lock == null) return null;
	return lock.isWriteLocked();
    }'

'>>> code: @Override
    public int getLevel() {
        Level level = log4jLogger.getLevel();
        if (level == null)
            level = Logger.getRootLogger().getLevel();
        switch (level.toInt()) {
            case Level.TRACE_INT:
                return TRACE;
            case Level.DEBUG_INT:
                return DEBUG;
            case Level.INFO_INT:
                return INFO;
            case Level.WARN_INT:
                return WARN;
            case Level.ERROR_INT:
                return ERROR;
            case Level.FATAL_INT:
                return FATAL;
            default:
                throw new IllegalArgumentException("Unsupported log4j level: " + level);
        }
    }'

'>>> code: public TerminalEmulatorDeviceConfiguration withCursorBlinking(boolean cursorBlinking) {
        if(this.cursorBli

In [4]:
model_checkpoint = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)


In [5]:
codesearchnet_dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [6]:
def tokenize_function(examples):
    result = tokenizer(examples["whole_func_string"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = codesearchnet_dataset.map(
    tokenize_function, batched=True, remove_columns=['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
)
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15328
    })
})

In [7]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> code {idx} length: {len(sample)}'")


'>>> code 0 length: 65'
'>>> code 1 length: 84'
'>>> code 2 length: 168'


In [8]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated code length: {total_length}'")


'>>> Concatenated code length: 317'


In [9]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")


'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 61'


In [10]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result


In [11]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1132440
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61821
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 31402
    })
})

In [12]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [13]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [14]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [15]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s><mask> void inquire(String<mask>Name<mask> String<mask> physicistName<mask> String fabricName,<mask> containerName) {
 <mask><mask><mask> <mask>  inquireWithServiceResponseAsync(v<mask>Name, resourceGroupName<mask> fabricName, containerName).to<mask>ocking().single().body();
    }</s><s>public ServiceFuture<Void<mask> inquireAsync(<mask> vaultName, String resourceGroupName, String fabricName, String containerName, final ServiceCallback<<mask>oid<mask><mask>Callback<mask> {
       disposal return<mask>Future.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> Name<mask> resourceGroupName, fabricName, containerName), service<mask>);
    }</s><s>public Observ True<Void> inquireAsync<mask>String vaultName, StringavezGroup<mask>, String fabricName, String containerName<mask> String filter)<mask>
<mask>       return inquireWithServiceResponse<mask>(vault<mask>, resourceGroupName, fabricName, containerName,<mask>ismamap(new Func1<<mask>Response<<mask><mask>>, Void>()<mask>
<mas

In [16]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [17]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s><mask> void inquire(String vaultName, String resourceGroupName, String fabricName,<mask> containerName)<mask>
        inquireWithServiceResponseAsync(<mask><mask><mask>, resourceGroupName, fabricName, containerName<mask>toBlocking().<mask>().body();
    }</s><s><mask> ServiceFuture<mask>Void> inquireAsync<mask>String vaultName,<mask> resourceGroupName, String fabricName,<mask> containerName<mask> final<mask><mask><Void> serviceCallback) {
        return ServiceFuture.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> Name, resourceGroupName, fabricName, containerName), serviceCallback);
   <mask></s><s>public<mask><mask><Void> inquireAsync(<mask> vaultName, String resourceGroupName, String fabricName,<mask> containerName,<mask> filter) {
        return inquireWithServiceResponseAsync(vaultName,<mask><mask><mask>, fabricName,<mask><mask>,<mask>).map(new<mask><mask>1<mask>ServiceResponse<Void>, Void>() {
            @Override
            public'


In [18]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [19]:
train_size = 8000
test_size = 1000

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1000
    })
})

In [20]:
# batch_size = 64
# # Show the training loss with every epoch
# logging_steps = len(downsampled_dataset["train"]) // batch_size

# training_args = TrainingArguments(
#     output_dir="MLM_FinetunedModel",
#     overwrite_output_dir=True,
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     weight_decay=0.01,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     push_to_hub=True,
#     fp16=True,
#     logging_steps=logging_steps,
#     remove_unused_columns=False
# )


# torch.cuda.is_initialized()

True

In [21]:
# os.environ['WANDB_NOTEBOOK_NAME'] = 'MLM_wholewordmask.ipynb'
# os.environ['WANDB_MODE'] = 'disabled'
# #  WANDB_MODE=disabled

In [22]:
# model_checkpoint

'microsoft/codebert-base-mlm'

In [23]:
# trainer = Trainer(
#     model=AutoModelForMaskedLM.from_pretrained(model_checkpoint),
#     args=training_args,
#     train_dataset=downsampled_dataset["train"],
#     eval_dataset=downsampled_dataset["test"],
#     data_collator=whole_word_masking_data_collator,
#     tokenizer=tokenizer,
# )

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [24]:
# # Evaluate the model
# eval_results = trainer.evaluate()

# # Calculate perplexity
# perplexity = math.exp(eval_results['eval_loss'])

# # Calculate loss
# loss = eval_results['eval_loss']

# print(f">>> Perplexity: {perplexity:.2f}")
# print(f">>> Loss: {loss:.2f}")





>>> Perplexity: 9.09
>>> Loss: 2.21


In [25]:
# trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.089174
2,1.242900,1.033669
3,1.242900,1.02511


TrainOutput(global_step=237, training_loss=1.1980971002377538, metrics={'train_runtime': 243.8762, 'train_samples_per_second': 123.013, 'train_steps_per_second': 0.972, 'total_flos': 1974490974720000.0, 'train_loss': 1.1980971002377538, 'epoch': 3.0})

In [26]:
# # Evaluate the model
# eval_results = trainer.evaluate()

# # Calculate perplexity
# perplexity = math.exp(eval_results['eval_loss'])


# # Calculate loss
# loss = eval_results['eval_loss']

# print(f">>> Perplexity: {perplexity:.2f}")
# print(f">>> Loss: {loss:.2f}")

>>> Perplexity: 2.78
>>> Loss: 1.02


In [27]:
# trainer.push_to_hub()

events.out.tfevents.1711752702.oisit-selab3:   0%|          | 0.00/311 [00:00<?, ?B/s]

events.out.tfevents.1711820889.oisit-selab3:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1711820621.oisit-selab3:   0%|          | 0.00/5.89k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/shradha01/MLM_FinetunedModel/commit/1a7ee64e0565e242afce266df7820007d6e100da', commit_message='End of training', commit_description='', oid='1a7ee64e0565e242afce266df7820007d6e100da', pr_url=None, pr_revision=None, pr_num=None)

In [20]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = whole_word_masking_data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [21]:
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [22]:
model_name = "MLM_FinetunedModel_accel"
repo_name = get_full_repo_name(model_name)
repo_name 

'shradha01/MLM_FinetunedModel_accel'

In [23]:
output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
/home/user1-selab3/shradha_test/MLM_FinetunedModel_accel is already a clone of https://huggingface.co/shradha01/MLM_FinetunedModel_accel. Make sure you pull the latest changes with `repo.git_pull()`.


In [24]:
def training_function():

    # set batch size to 32, a larger bacth size when using a more powerful gpu
    batch_size = 32

    train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=whole_word_masking_data_collator)
    eval_dataloader = DataLoader(downsampled_dataset["test"], batch_size=batch_size, collate_fn=whole_word_masking_data_collator)

    # initialize pretrained bert model
    model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)
    # set the optimizer
    optimizer = AdamW(model.parameters(), lr=5e-5)

    # initialize accelerator for training
    accelerator = Accelerator()
    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(model, optimizer, train_dataloader, eval_dataloader)

    # set the number of epochs which is set to 30
    num_train_epochs = 5
    num_update_steps_per_epoch = len(train_dataloader)
    num_training_steps = num_train_epochs * num_update_steps_per_epoch

    # define the learning rate scheduler for training
    lr_scheduler = get_scheduler("linear",optimizer=optimizer,num_warmup_steps=0,num_training_steps=num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(num_train_epochs):
        # Training
        model.train()
        for batch in train_dataloader:
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        # Evaluation
        model.eval()
        losses = []
        for step, batch in enumerate(eval_dataloader):
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss # <===== Added.
            losses.append(accelerator.gather(loss.repeat(batch_size)))
            # loss = outputs.loss
            # losses.append(accelerator.gather(loss.repeat(batch_size)))

        # losses = torch.cat(losses)
        # losses = losses[: len(eval_dataset)]
        loss = torch.mean(torch.cat(losses))
        print(f">>> Epoch {epoch}: Loss: {loss.item()}")

        # perplexity metric used for mask language model training
        try:
            #perplexity = math.exp(torch.mean(losses))
            perplexity = torch.exp(torch.tensor(loss))
        except OverflowError:
            perplexity = float("inf")
        print(f">>> Epoch {epoch}: Perplexity: {perplexity.item()}")

        # Calculate probabilities
        losses_tensor = torch.cat(losses)  # Concatenate the list of tensors into a single tensor
        # losses_np = losses_tensor.cpu().numpy()  # Convert concatenated tensor to NumPy array
        # probabilities = torch.nn.functional.softmax(torch.tensor(losses_np), dim=0)  # Calculate probabilities
        probabilities = torch.nn.functional.softmax(-losses_tensor, dim=0)  # Taking negative of losses_tensor to ensure proper softmax calculation

        # Calculate entropy
        #entropy = -torch.sum(probabilities * torch.log(probabilities))
        entropy = -torch.sum(probabilities * torch.log(probabilities + 1e-20)) 
        print(f">>> Epoch {epoch}: Entropy: {entropy.item()}")  # Print entropy

        # Save model
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
        if accelerator.is_main_process:
            tokenizer.save_pretrained(output_dir)
            repo.push_to_hub(
                commit_message=f"Training in progress epoch {epoch}", blocking=False
            )

notebook_launcher(training_function, num_processes= 2)

Launching training on 2 GPUs.


Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a 

>>> Epoch 0: Loss: 1.0228627920150757>>> Epoch 0: Loss: 1.0228627920150757



  perplexity = torch.exp(torch.tensor(loss))
  perplexity = torch.exp(torch.tensor(loss))


>>> Epoch 0: Perplexity: 2.7811450958251953>>> Epoch 0: Perplexity: 2.7811450958251953

>>> Epoch 0: Entropy: 6.916427135467529>>> Epoch 0: Entropy: 6.916427135467529



 20%|██        | 126/625 [00:43<09:56,  1.20s/it]Several commits (2) will be pushed upstream.
 40%|████      | 250/625 [01:54<02:03,  3.03it/s]  

>>> Epoch 1: Loss: 0.9158941507339478>>> Epoch 1: Loss: 0.9158941507339478



  perplexity = torch.exp(torch.tensor(loss))


>>> Epoch 1: Perplexity: 2.499008893966675>>> Epoch 1: Perplexity: 2.499008893966675

>>> Epoch 1: Entropy: 6.9206223487854>>> Epoch 1: Entropy: 6.9206223487854



 60%|██████    | 375/625 [03:08<01:22,  3.02it/s]  

>>> Epoch 2: Loss: 0.9623615145683289>>> Epoch 2: Loss: 0.9623615145683289


  perplexity = torch.exp(torch.tensor(loss))



>>> Epoch 2: Perplexity: 2.6178712844848633>>> Epoch 2: Perplexity: 2.6178712844848633

>>> Epoch 2: Entropy: 6.915360450744629>>> Epoch 2: Entropy: 6.915360450744629



 80%|████████  | 500/625 [04:23<00:41,  3.01it/s]

>>> Epoch 3: Loss: 0.9179523587226868>>> Epoch 3: Loss: 0.9179523587226868



  perplexity = torch.exp(torch.tensor(loss))


>>> Epoch 3: Perplexity: 2.504157543182373>>> Epoch 3: Perplexity: 2.504157543182373

>>> Epoch 3: Entropy: 6.91518497467041>>> Epoch 3: Entropy: 6.91518497467041



100%|██████████| 625/625 [05:37<00:00,  3.00it/s]

>>> Epoch 4: Loss: 0.8899433016777039>>> Epoch 4: Loss: 0.8899433016777039



  perplexity = torch.exp(torch.tensor(loss))


>>> Epoch 4: Perplexity: 2.4349915981292725>>> Epoch 4: Perplexity: 2.4349915981292725

>>> Epoch 4: Entropy: 6.922123432159424>>> Epoch 4: Entropy: 6.922123432159424



100%|██████████| 625/625 [05:40<00:00,  1.83it/s]
100%|██████████| 625/625 [05:55<00:00,  1.76it/s]


In [35]:
batch_size = 32
train_dataloader = DataLoader(downsampled_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=whole_word_masking_data_collator)
eval_dataloader = DataLoader(downsampled_dataset["test"], batch_size=batch_size, collate_fn=whole_word_masking_data_collator)

# Iterate over train_dataloader
print("Train DataLoader:")
for batch in train_dataloader:
    print(batch)
    break  # Print only the first batch for brevity

# Iterate over eval_dataloader
print("\nEval DataLoader:")
for batch in eval_dataloader:
    print(batch)
    break  # Print only the first batch for brevity


Train DataLoader:
{'input_ids': tensor([[15698, 37908, 12350,  ..., 46308,  4397, 50118],
        [50121, 50118, 50117,  ..., 50264, 50117,  1594],
        [50264, 50264,  1615,  ..., 50264, 50264, 50264],
        ...,
        [ 1386, 50264, 50264,  ...,  1437,  1437,  1437],
        [42645,    22, 50118,  ...,  1437,  1437,  1437],
        [45696,  2072, 47006,  ..., 50140,  1437,  1437]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ..., 50117,  -100,  -100],
        [  185,  1268,  -100,  ...,  1437,  1437,  1437],
        ...,
        [ -100, 50118,  1437,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100],
        [ -100,  -100,  -100,  ...,  -100,  -100,  -100

Checking the finetuned model

In [25]:
pred_model = pipeline("fill-mask", model = "shradha01/MLM_FinetunedModel_accel")

text = "private static void associateFactory(Map<String, ScriptEngineFactory> associations, String association, ScriptEngineFactory factory)\n\t{\n\tif (association == null || factory == null) throw new NullPointerException();\n\tassociations.put(association, <mask>);}"

preds = pred_model(text)
print(preds)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

[{'score': 0.9976598024368286, 'token': 5566, 'token_str': ' factory', 'sequence': 'private static void associateFactory(Map<String, ScriptEngineFactory> associations, String association, ScriptEngineFactory factory)\n\t{\n\tif (association == null || factory == null) throw new NullPointerException();\n\tassociations.put(association, factory);}'}, {'score': 0.0005281823687255383, 'token': 12126, 'token_str': ' factories', 'sequence': 'private static void associateFactory(Map<String, ScriptEngineFactory> associations, String association, ScriptEngineFactory factory)\n\t{\n\tif (association == null || factory == null) throw new NullPointerException();\n\tassociations.put(association, factories);}'}, {'score': 0.00038350705290213227, 'token': 5259, 'token_str': ' association', 'sequence': 'private static void associateFactory(Map<String, ScriptEngineFactory> associations, String association, ScriptEngineFactory factory)\n\t{\n\tif (association == null || factory == null) throw new NullPoi

MRRTest 1

Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-a 
output: token-a (99%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)MRR = 1 (in case token-a)
MRR = 2/5 (in case token-b)
MRR = 3/5 (in case token-c)
#----------------------------------------------------

Test 2
Input1: void foo() { token1 = token2.[MASK]; token3 = token 1 + token4; }, Input2 = token-x 
output: token-a (20%), token-b (19%), token-c (18%), token-d (16%), token-e (7%)MRR = 1 (in case token-a)
MRR = 0 (in case token-x)

#----------------------------------------------------Total MRR = 50%
#----------------------------------------------------We need to remove overwap b/w trainig and testing datasets in CodeSearchNet.
For test dataset, we need to creat unseen datasets, open source repositories (e.g., RoBERTa datasets)

Using pandas

In [28]:
import pandas as pd
# Iterate through each masked text and its corresponding ground truth text
pred_model = pipeline("fill-mask", model="shradha01/MLM_FinetunedModel_accel")

# Path to the file containing masked texts and ground truth texts
# masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"
# ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/ground_truth.txt"
masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/outputs.txt"
ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/output_java.txt"

# List to store reciprocal ranks for each masked text
reciprocal_ranks = []

# Read the masked texts and ground truth texts from their respective files
with open(masked_texts_file, "r") as masked_file, open(ground_truth_file, "r") as truth_file:
    masked_texts = masked_file.readlines()
    ground_truth_texts = truth_file.readlines()
    
# Initialize an empty list to store predictions
all_preds = []

# Initialize a counter for masked token IDs
masked_token_id_counter = 0

# Initialize a counter for ground truth text line IDs
ground_truth_line_id_counter = 0

# Initialize an empty list to store ground truth texts and their IDs
ground_truth_data = []

# Iterate through masked texts and ground truth texts
for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
    # Increment the masked token ID counter for each new masked token
    masked_token_id_counter += 1
    
    # Get predictions for the current masked text
    preds = pred_model(masked_text, top_k= 10)
    
    # Initialize an empty list to store predictions for the current masked text
    masked_text_preds = []

    # Iterate through predictions for the current masked text
    for rank, pred in enumerate(sorted(preds, key=lambda x: x['score'], reverse=True), start=1):
        # Create a dictionary for each prediction with required fields
        pred_dict = {
            'token_id': masked_token_id_counter,
            'rank': rank,
            'score': pred['score'],
            'token': pred['token'],
            'token_str': pred['token_str'],
            'sequence': pred['sequence']
        }
        # Append the prediction dictionary to the list of predictions for the current masked text
        masked_text_preds.append(pred_dict)

    # Append the list of predictions for the current masked text to the list of all predictions
    all_preds.extend(masked_text_preds)

    # Increment the ground truth line ID counter
    ground_truth_line_id_counter += 1

    # Store ground truth text and its ID
    ground_truth_data.append({'ground_truth_text': truth_text, 'token_id': ground_truth_line_id_counter})

    # Print ground truth text
    print("-----\nSample", ground_truth_line_id_counter)
    print("Ground truth:\n", truth_text.strip())
    print("Input:\n", masked_text.strip())

    # Print predicted results sorted by probability scores
    print("Predicted results (sorted by probability scores):")
    for result in masked_text_preds:
        print(f"   Result {result['rank']}: {result['token_str']}, {result['score']:.4f}, {result['sequence']}")
    print("-----")

# Convert the list of predictions into a DataFrame
preds_df = pd.DataFrame(all_preds)

# Convert the list of ground truth data into a DataFrame
ground_truth_df = pd.DataFrame(ground_truth_data)

# Print the DataFrame containing predictions
print("\nPredictions DataFrame:")
print(preds_df)

# import pandas as pd
# # Iterate through each masked text and its corresponding ground truth text
# pred_model = pipeline("fill-mask", model="shradha01/MLM_FinetunedModel_accel")

# # Path to the file containing masked texts and ground truth texts
# # masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/whole_func_strings2.txt"
# # ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/ground_truth.txt"
# masked_texts_file = "/home/user1-selab3/shradha_test/jsoninput/outputs.txt"
# ground_truth_file = "/home/user1-selab3/shradha_test/jsoninput/output_java.txt"

# # List to store reciprocal ranks for each masked text
# reciprocal_ranks = []

# # Read the masked texts and ground truth texts from their respective files
# with open(masked_texts_file, "r") as masked_file, open(ground_truth_file, "r") as truth_file:
#     masked_texts = masked_file.readlines()
#     ground_truth_texts = truth_file.readlines()
    
# # Initialize an empty list to store predictions
# all_preds = []

# # Initialize a counter for masked token IDs
# masked_token_id_counter = 0

# # Initialize a counter for ground truth text line IDs
# ground_truth_line_id_counter = 0

# # Initialize an empty list to store ground truth texts and their IDs
# ground_truth_data = []

# # Iterate through masked texts and ground truth texts
# for masked_text, truth_text in zip(masked_texts, ground_truth_texts):
#     # Increment the masked token ID counter for each new masked token
#     masked_token_id_counter += 1
    
#     # Get predictions for the current masked text
#     preds = pred_model(masked_text, top_k= 10)
    
#     # Initialize an empty list to store predictions for the current masked text
#     masked_text_preds = []
    
#     # Iterate through predictions for the current masked text
#     for rank, pred in enumerate(sorted(preds, key=lambda x: x['score'], reverse=True), start=1):
#         # Create a dictionary for each prediction with required fields
#         pred_dict = {
#             'token_id': masked_token_id_counter,
#             'rank': rank,
#             'score': pred['score'],
#             'token': pred['token'],
#             'token_str': pred['token_str'],
#             'sequence': pred['sequence']
#         }
#         # Append the prediction dictionary to the list of predictions for the current masked text
#         masked_text_preds.append(pred_dict)
    
#     # Append the list of predictions for the current masked text to the list of all predictions
#     all_preds.extend(masked_text_preds)
    
#     # Increment the ground truth line ID counter
#     ground_truth_line_id_counter += 1
    
#     # Store ground truth text and its ID
#     ground_truth_data.append({'ground_truth_text': truth_text, 'token_id': ground_truth_line_id_counter})

# # Convert the list of predictions into a DataFrame
# preds_df = pd.DataFrame(all_preds)

# # Convert the list of ground truth data into a DataFrame
# ground_truth_df = pd.DataFrame(ground_truth_data)

# # Print the DataFrame containing predictions
# print("Predictions DataFrame:")
# print(preds_df)

# # Print the DataFrame containing ground truth text
# print("\nGround Truth DataFrame:")
# print(ground_truth_df)

-----
Sample 1
Ground truth:
 public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}
Input:
 public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t<mask> n * factorial(n - 1);}
Predicted results (sorted by probability scores):
   Result 1: return, 0.9947, public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}

   Result 2:  return, 0.0040, public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t return n * factorial(n - 1);}

   Result 3: //, 0.0003, public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t// n * factorial(n - 1);}

   Result 4: Return, 0.0001, public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\tReturn n * factorial(n - 1);}

   Result 5: ++, 0.0001, public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\t++ n * factorial(n - 1);}

   Result 6: select, 0.0000, publi

In [29]:
results = preds_df.merge(ground_truth_df, how='left', on=['token_id'])

def fill_sequence(row):
    if row['sequence'] in row['ground_truth_text']:
        return row['sequence']
    else:
        return None

# Apply the function to the 'sequence' column
results['match_sequence'] = results.apply(fill_sequence, axis=1)

# Display the results DataFrame
print(results['match_sequence'])

0      public static int factorial(int n) {\n\tif (n ...
1                                                   None
2                                                   None
3                                                   None
4                                                   None
                             ...                        
555                                                 None
556                                                 None
557                                                 None
558                                                 None
559                                                 None
Name: match_sequence, Length: 560, dtype: object


In [30]:
len(results['match_sequence'])

560

In [31]:
results['match_sequence'].fillna('None', inplace=True)

# Define the file path where you want to save the values
output_file_path = "match_sequence_values.txt"

# Open the file in write mode
with open(output_file_path, 'w') as output_file:
    # Write each value in results['match_sequence'] to the file
    for value in results['match_sequence']:
        output_file.write(str(value) + '\n')

# Print confirmation message
print("Match sequence values have been saved to:", output_file_path)

Match sequence values have been saved to: match_sequence_values.txt


In [32]:
results['match_sequence'].fillna('None', inplace=True)
# Group by 'token_id' and 'match_sequence', taking the minimum rank
relevances_rank = results.groupby(['token_id', 'match_sequence'])['rank'].min()

print(relevances_rank)

token_id  match_sequence                                                                                                                                                                     
1         None                                                                                                                                                                                   2
          public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                            1
2         None                                                                                                                                                                                   2
          public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num % i == 0)\n\treturn false;}\n\treturn true;}\n    1
3         None                

In [33]:
ranks = relevances_rank[relevances_rank.index.get_level_values('match_sequence') != 'None']

print(ranks)

token_id  match_sequence                                                                                                                                                                                                                                                                                                                                                                                                    
1         public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                                                                                                                                                                                                                                                           1
2         public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num % i == 0)\n\treturn false;

In [34]:
reciprocal_ranks = 1 / (ranks)
reciprocal_ranks

token_id  match_sequence                                                                                                                                                                                                                                                                                                                                                                                                    
1         public static int factorial(int n) {\n\tif (n == 0)\n\treturn 1;\n\telse\n\treturn n * factorial(n - 1);}\n                                                                                                                                                                                                                                                                                                           1.0
2         public static boolean isPrime(int num) {\n\tif (num <= 1)\n\treturn false;\n\tfor (int i = 2; i <= Math.sqrt(num); i++) {\n\tif (num % i == 0)\n\treturn fals

In [35]:
reciprocal_ranks.mean()

0.9444444444444444