In [None]:
import torch
import datasets
import pandas as pd

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [None]:
# Checkpoint id used for both the model weights and the matching tokenizer;
# named once so the two loads always refer to the same checkpoint.
CHECKPOINT = "cross-encoder/ms-marco-TinyBERT-L-2-v2"

model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

In [None]:
# Only a 1% slice of the train split is requested (see the split expression).
dataset = load_dataset(
    path="Tevatron/msmarco-passage",
    split="train[:1%]",
)

In [None]:
# Prefer the GPU, but fall back to CPU so the notebook still runs on machines
# without CUDA instead of failing inside model.to().
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# Put the module in evaluation mode (inference behaviour for layers such as dropout).
model.eval()

In [None]:
def get_scores(sample, query, passage_type):
    """Attach a model-produced ``teacher_score`` to each passage of one kind.

    Every passage text under ``sample[passage_type]`` is paired with ``query``,
    tokenized in a single batch and run through the module-level ``model``
    (using the module-level ``tokenizer`` and ``device``).

    Args:
        sample: Dict for one dataset row. ``sample[passage_type]`` must be a list
            of dicts that each have a ``"text"`` entry.
        query: Query string paired with every passage.
        passage_type: Key in ``sample`` that selects the passages to score.

    Returns:
        A shallow copy of ``sample`` in which ``passage_type`` maps to a new list
        of new passage dicts, each carrying an extra ``"teacher_score"`` float.
        ``sample`` and its passage dicts are left unmodified.

    Raises:
        RuntimeError: If the model does not return exactly one logit per
            (query, passage) pair.
    """
    scored_sample = sample.copy()
    passages = sample[passage_type]

    # Nothing to score: avoid handing an empty batch to the tokenizer and model.
    if not passages:
        scored_sample[passage_type] = list(passages)
        return scored_sample

    texts = [passage["text"] for passage in passages]
    features = tokenizer(
        [query] * len(texts),
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )
    features = features.to(device)

    with torch.no_grad():
        logits = model(**features).logits

    # reshape() enforces one logit per pair; a single tolist() then moves all
    # scores to Python floats at once instead of one .item() call per passage.
    scores = logits.reshape(len(texts)).tolist()

    # Build fresh passage dicts: sample.copy() is shallow, so writing into the
    # existing dicts would also modify the caller's sample.
    scored_sample[passage_type] = [
        {**passage, "teacher_score": score}
        for passage, score in zip(passages, scores)
    ]
    return scored_sample

In [None]:
def update_dataset(dataset):
    """Score the positive and negative passages of every sample in ``dataset``.

    Each sample is updated in place with the result of ``get_scores``, first for
    ``"positive_passages"`` and then for ``"negative_passages"``, and collected.

    Args:
        dataset: Iterable of dict samples, each with a ``"query"`` entry plus the
            two passage entries named above.

    Returns:
        list: The updated samples, in iteration order.
    """
    passage_keys = ("positive_passages", "negative_passages")
    scored_samples = []

    for row in dataset:
        question = row["query"]
        for key in passage_keys:
            # Merge the scored copy back into the row before the next pass.
            row.update(get_scores(row, question, key))
        scored_samples.append(row)

    return scored_samples

In [None]:
# Run the scoring pass over every row; update_dataset returns a plain list of row dicts.
dataset_w_teacher_scores = update_dataset(dataset=dataset)

In [None]:
# Rebind the name from the list of row dicts to a datasets.Dataset, converting
# through a pandas DataFrame.
# NOTE(review): because the name is rebound, this cell is not safe to re-run on
# its own; re-run the scoring cell first.
dataset_w_teacher_scores = datasets.Dataset.from_pandas(
    pd.DataFrame(dataset_w_teacher_scores)
)

In [None]:
# Persist the scored dataset; the path is relative to the current working directory.
dataset_w_teacher_scores.save_to_disk(dataset_path="new_dataset/")