In [1]:
from collections import defaultdict
from tqdm import tqdm
from datasets import Dataset, load_dataset, DatasetDict
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
import torch
from transformers import pipeline
from torch.nn import CrossEntropyLoss
from torch.utils.data.dataloader import DataLoader
from torch.optim import AdamW
from accelerate import Accelerator,notebook_launcher
from transformers import get_scheduler
from huggingface_hub import Repository, get_full_repo_name
from transformers import AutoModelForMaskedLM
from transformers import default_data_collator
import math
import time
import argparse

In [2]:
codesearchnet_dataset = load_dataset("code_search_net", "java")
codesearchnet_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [3]:
# ds_train = load_dataset("code_search_net", "java", split="train")
# ds_test = load_dataset("code_search_net", "java", split="test")
# ds_valid = load_dataset("code_search_net", "java", split="validation")
# raw_datasets = DatasetDict(
#     {
#         "train": ds_train.shuffle().select(range(12000)), #train_size)), # "train": ds_train,  # .shuffle().select(range(50000)),
#         "test": ds_test.shuffle().select(range(1500)),
#         "valid": ds_valid.shuffle().select(range(1500)) # "valid": ds_valid,  # .shuffle().select(range(500))
#     }
# )

In [4]:
sample = codesearchnet_dataset["train"].shuffle(seed=42).select(range(3))

for row in sample:
    print(f"\n'>>> code: {row['whole_func_string']}'")


'>>> code: public Boolean isWriteLocked(K token) {
	RWLock<K> lock = locks.get(token);
	if (lock == null) return null;
	return lock.isWriteLocked();
    }'

'>>> code: @Override
    public int getLevel() {
        Level level = log4jLogger.getLevel();
        if (level == null)
            level = Logger.getRootLogger().getLevel();
        switch (level.toInt()) {
            case Level.TRACE_INT:
                return TRACE;
            case Level.DEBUG_INT:
                return DEBUG;
            case Level.INFO_INT:
                return INFO;
            case Level.WARN_INT:
                return WARN;
            case Level.ERROR_INT:
                return ERROR;
            case Level.FATAL_INT:
                return FATAL;
            default:
                throw new IllegalArgumentException("Unsupported log4j level: " + level);
        }
    }'

'>>> code: public TerminalEmulatorDeviceConfiguration withCursorBlinking(boolean cursorBlinking) {
        if(this.cursorBli

In [8]:
model_checkpoint = "microsoft/codebert-base-mlm"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
codesearchnet_dataset

DatasetDict({
    train: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url'],
        num_rows: 15328
    })
})

In [10]:
def tokenize_function(examples):
    result = tokenizer(examples["whole_func_string"])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = codesearchnet_dataset.map(
    tokenize_function, batched=True, remove_columns=['repository_name', 'func_path_in_repository', 'func_name', 'whole_func_string', 'language', 'func_code_string', 'func_code_tokens', 'func_documentation_string', 'func_documentation_tokens', 'split_name', 'func_code_url']
)
tokenized_datasets

Map:   0%|          | 0/26909 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (955 > 512). Running this sequence through the model will result in indexing errors


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 454451
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 26909
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 15328
    })
})

In [11]:
# #define tokenize function to tokenize the dataset
# def tokenize_function(data):
#     result = tokenizer(data["whole_func_string"])
#     if tokenizer.is_fast:
#         result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
#     return result

# # batched is set to True to activate fast multithreading!
# tokenize_dataset = raw_datasets.map(tokenize_function, batched = True, remove_columns = raw_datasets["train"].column_names)
# tokenize_dataset

In [12]:
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> code {idx} length: {len(sample)}'")

'>>> code 0 length: 65'
'>>> code 1 length: 84'
'>>> code 2 length: 168'


In [13]:
concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated code length: {total_length}'")

'>>> Concatenated code length: 317'


In [14]:
chunk_size = 128
chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

'>>> Chunk length: 128'
'>>> Chunk length: 128'
'>>> Chunk length: 61'


In [15]:
def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [16]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/26909 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 1132440
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61821
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 31402
    })
})

In [17]:
tokenizer.decode(lm_datasets["train"][1]["input_ids"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [18]:
tokenizer.decode(lm_datasets["train"][1]["labels"])

'Name, resourceGroupName, fabricName, containerName), serviceCallback);\n    }</s><s>public Observable<Void> inquireAsync(String vaultName, String resourceGroupName, String fabricName, String containerName, String filter) {\n        return inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName, filter).map(new Func1<ServiceResponse<Void>, Void>() {\n            @Override\n            public'

In [19]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [20]:
samples = [lm_datasets["train"][i] for i in range(2)]
for sample in samples:
    _ = sample.pop("word_ids")

for chunk in data_collator(samples)["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.



'>>> <s>public void inquire(String vaultName, String resourceGroupName<mask> String fabricName<mask> String containerName) {
 <mask>   <mask>  inquireWithServiceResponseAsync(vaultName, resourceGroupName, fabricName, containerName).to<mask>ocking().single().body();
 <mask>  }</s><s><mask> ServiceFuture<mask><mask>oid<mask> inquireAsync(String vaultName, String resourceGroupName, String fabric<mask>, String containerName, final ServiceCallback<V<mask>> serviceCallback) {
<mask>  <mask>    return ServiceFuture.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> Name, resourceGroupName<mask> fabricName, containerName),<mask>Callback);73    }</s><s>public Observable<mask> Rankingoid> inquire<mask>(<mask> vault<mask>, String resourceGroupName, String fabric<mask>, String containerName<mask> String filter) {
 <mask>      return inquireWithServiceResponseAsync(v congenName, resourceGroupName, fabricName, containerName, filter).<mask>(new Func1<ServiceResponse<V<mask><mask> Void>() {
 G

In [21]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

In [22]:
samples = [lm_datasets["train"][i] for i in range(2)]
batch = whole_word_masking_data_collator(samples)

for chunk in batch["input_ids"]:
    print(f"\n'>>> {tokenizer.decode(chunk)}'")


'>>> <s><mask><mask> inquire(String vaultName, String resourceGroupName, String<mask><mask>, String containerName)<mask>
        inquireWithServiceResponseAsync<mask>vaultName,<mask><mask><mask><mask> fabricName, containerName).toBlocking().single<mask><mask>();
    }</s><s><mask> ServiceFuture<Void<mask> inquireAsync<mask>String vaultName, String resourceGroupName<mask> String fabricName, String containerName<mask> final ServiceCallback<Void<mask> serviceCallback) {
       <mask> ServiceFuture.fromResponse(inquireWithServiceResponseAsync(vault'

'>>> Name<mask> resourceGroupName<mask> fabricName, containerName), serviceCallback);
    }</s><s><mask> Observable<Void<mask> inquireAsync(String<mask><mask>, String resourceGroupName<mask> String fabricName, String containerName<mask> String filter<mask><mask>
        return inquireWithServiceResponseAsync<mask>vaultName, resourceGroupName, fabricName,<mask><mask>,<mask>).map<mask>new<mask><mask><mask><ServiceResponse<Void>,<mask>>() {
    

In [None]:
# train_size = 5000
# test_size = int(0.1 * train_size)
# valid_size = int(0.1 * train_size)

# downsampled_dataset = lm_datasets["train"].train_test_split(
#     train_size=train_size, test_size=test_size, seed=42
# )
# downsampled_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 500
    })
})

In [24]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [25]:
from datasets import Dataset

train_size = 4000
valid_size = 500
test_size = 500

# Split the dataset into train, validation, and test sets
train_dataset = lm_datasets["train"].shuffle(seed=42).select(range(train_size))
remaining_dataset = lm_datasets["train"].shuffle(seed=42).select(range(train_size, len(lm_datasets["train"])))
valid_dataset = remaining_dataset.shuffle(seed=42).select(range(valid_size))
test_dataset = remaining_dataset.shuffle(seed=42).select(range(valid_size, valid_size + test_size))

# Print sizes of each split
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")


Train dataset size: 4000
Validation dataset size: 500
Test dataset size: 500


In [26]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(train_dataset) // batch_size
# print(train_dataset["train"])
# print(len(train_dataset["train"]))
# print(train_dataset)

training_args = TrainingArguments(
    output_dir="MLM_FinetunedModel",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [27]:
from transformers import Trainer

trainer = Trainer(
    model= AutoModelForMaskedLM.from_pretrained(model_checkpoint),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

print (train_dataset)
print (test_dataset)

Some weights of the model checkpoint at microsoft/codebert-base-mlm were not used when initializing RobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 4000
})
Dataset({
    features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
    num_rows: 500
})


In [28]:
import os
os.environ['WANDB_NOTEBOOK_NAME'] = 'MLM_wholewordmask.ipynb'
os.environ['WANDB_MODE'] = 'disabled'
#  WANDB_MODE=disabled

In [29]:
# import math

# eval_results = trainer.evaluate()
# print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
import math

# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])


# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")



>>> Perplexity: 3.50
>>> Loss: 1.25


In [30]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.553422
2,0.677300,0.514992
3,0.677300,0.512817


TrainOutput(global_step=96, training_loss=0.6415141224861145, metrics={'train_runtime': 80.5957, 'train_samples_per_second': 148.891, 'train_steps_per_second': 1.191, 'total_flos': 789796389888000.0, 'train_loss': 0.6415141224861145, 'epoch': 3.0})

In [31]:
# Evaluate the model
eval_results = trainer.evaluate()

# Calculate perplexity
perplexity = math.exp(eval_results['eval_loss'])

# Calculate loss
loss = eval_results['eval_loss']

print(f">>> Perplexity: {perplexity:.2f}")
print(f">>> Loss: {loss:.2f}")


>>> Perplexity: 1.63
>>> Loss: 0.49


In [32]:
model = "MLM_FinetunedModel"

pred_model = pipeline("fill-mask", model = "MLM_FinetunedModel")

text = "public Evaluation create(SimpleNode node, Object source)\n    {\n        return <mask>(node, source, false);\n    }"

preds = pred_model(text)
print(preds)

[{'score': 0.9904773235321045, 'token': 1045, 'token_str': ' create', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return create(node, source, false);\n    }'}, {'score': 0.0020682169124484062, 'token': 10516, 'token_str': ' evaluate', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return evaluate(node, source, false);\n    }'}, {'score': 0.0005082740099169314, 'token': 146, 'token_str': ' make', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return make(node, source, false);\n    }'}, {'score': 0.00044398324098438025, 'token': 1119, 'token_str': ' build', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return build(node, source, false);\n    }'}, {'score': 0.00033154641278088093, 'token': 120, 'token_str': ' get', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return get(node, source, false);

In [33]:
model = "MLM_FinetunedModel"
pred_model = pipeline("fill-mask", model=model)
text = "public Evaluation create(SimpleNode node, Object source)\n    {\n        return <mask>(node, source, false);\n    }"

# Get predictions
preds = pred_model(text)
print(preds)

# Sort predictions by score in descending order
sorted_preds = sorted(preds, key=lambda x: x['score'], reverse=True)
print(sorted_preds)
# Determine the rank of the correct answer
correct_answer = preds[0]['token_str']
print(correct_answer)
correct_rank = next(i+1 for i, pred in enumerate(sorted_preds) if pred['token_str'] == correct_answer)
print(correct_rank)

# Compute the reciprocal ranks
reciprocal_ranks = [1 / rank for rank in range(1, len(sorted_preds) + 1)]
print(reciprocal_ranks)

# Calculate Mean Reciprocal Rank
mrr = sum(reciprocal_ranks) / len(reciprocal_ranks)

print("Mean Reciprocal Rank (MRR):", mrr)


[{'score': 0.9904773235321045, 'token': 1045, 'token_str': ' create', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return create(node, source, false);\n    }'}, {'score': 0.0020682169124484062, 'token': 10516, 'token_str': ' evaluate', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return evaluate(node, source, false);\n    }'}, {'score': 0.0005082740099169314, 'token': 146, 'token_str': ' make', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return make(node, source, false);\n    }'}, {'score': 0.00044398324098438025, 'token': 1119, 'token_str': ' build', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return build(node, source, false);\n    }'}, {'score': 0.00033154641278088093, 'token': 120, 'token_str': ' get', 'sequence': 'public Evaluation create(SimpleNode node, Object source)\n    {\n        return get(node, source, false);

In [None]:
import pandas as pd

# Get the correct answer (assuming it's the first mask prediction)
correct_answer = preds[0]['token_str']
print(correct_answer)

# Create a DataFrame from the predictions
df = pd.DataFrame(preds)

# Sort the DataFrame by score in descending order
df_sorted = df.sort_values(by='score', ascending=False)
print(df_sorted)

# Reset the index of the sorted DataFrame
df_sorted.reset_index(drop=True, inplace=True)

# Determine the rank of the correct answer
correct_rank = df_sorted.index[df_sorted['token_str'] == correct_answer].tolist()[0] + 1  # Add 1 to start ranks from 1
print(correct_rank)

# Calculate the reciprocal ranks
df_sorted['rank'] = df_sorted.index + 1
print(df_sorted['rank'])
df_sorted['reciprocal_rank'] = 1 / df_sorted['rank']

# Calculate Mean Reciprocal Rank (MRR)
mrr = df_sorted['reciprocal_rank'].mean()

print("Mean Reciprocal Rank (MRR):", mrr)

#The correct answer ("create") is at index 0 after sorting. So, its rank is 1.
# The ranks would be [1, 2, 3, 4, 5], and the reciprocal ranks would be [1.0, 0.5, 0.333, 0.25, 0.2].
# MRR is the mean of the reciprocal ranks, which is (1.0 + 0.5 + 0.333 + 0.25 + 0.2) / 5 = 0.45666666666666667.

 create
      score  token  token_str  \
0  0.948937   1045     create   
1  0.018123  10516   evaluate   
2  0.002204  37131       eval   
3  0.001642   1119      build   
4  0.001627    146       make   

                                            sequence  
0  public Evaluation create(SimpleNode node, Obje...  
1  public Evaluation create(SimpleNode node, Obje...  
2  public Evaluation create(SimpleNode node, Obje...  
3  public Evaluation create(SimpleNode node, Obje...  
4  public Evaluation create(SimpleNode node, Obje...  
1
0    1
1    2
2    3
3    4
4    5
Name: rank, dtype: int64
Mean Reciprocal Rank (MRR): 0.45666666666666667


In [34]:
import pandas as pd

# Sample outputs
data = {
    'score': [0.948937, 0.018123, 0.002204, 0.001642, 0.001627],
    'token': [1045, 10516, 37131, 1119, 146],
    'token_str': ['create', 'evaluate', 'eval', 'build', 'make'],
}

df = pd.DataFrame(data)

# Assuming 'create' is the correct token for the query
ground_truth = 'create'

# Sort the outputs by score
sorted_df = df.sort_values(by='score', ascending=False)
print(sorted_df)

# Find the rank of the correct token
rank = sorted_df.index[sorted_df['token_str'] == ground_truth][0] + 1
print(rank)

# Calculate reciprocal rank
RR = 1 / rank

print("Reciprocal Rank (RR):", RR)

# Calculate Mean Reciprocal Rank (MRR)
MRR = df.apply(lambda row: 1 / (sorted_df.index[sorted_df['token_str'] == row['token_str']][0] + 1), axis=1).mean()

print("Mean Reciprocal Rank (MRR):", MRR)


      score  token token_str
0  0.948937   1045    create
1  0.018123  10516  evaluate
2  0.002204  37131      eval
3  0.001642   1119     build
4  0.001627    146      make
1
Reciprocal Rank (RR): 1.0
Mean Reciprocal Rank (MRR): 0.45666666666666667
