In [10]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset

# First checkpoint: question-generation BART whose hub name marks it as the SQuAD variant.
# Tokenizer and model are loaded from the same checkpoint so their vocabularies match.
model_name_1 = "alinet/bart-base-squad-qg"
bart_squad_tokenizer, bart_squad = (
  BartTokenizer.from_pretrained(model_name_1),
  BartForConditionalGeneration.from_pretrained(model_name_1),
)

# Second checkpoint: the "balanced" variant it is compared against.
model_name_2 = "alinet/bart-base-balanced-qg"
bart_balanced_ra_tokenizer, bart_balanced_ra = (
  BartTokenizer.from_pretrained(model_name_2),
  BartForConditionalGeneration.from_pretrained(model_name_2),
)

In [11]:
def run_model(input_string, model, tokenizer, **generator_args):
  """Encode ``input_string``, run ``model.generate`` on it and decode the result.

  Args:
    input_string: Text to feed to the model.
    model: Object exposing ``generate(input_ids, **kwargs)``; if it also has a
      ``device`` attribute the encoded input is moved there first.
    tokenizer: Object exposing ``encode`` and ``batch_decode``.
    **generator_args: Forwarded unchanged to ``model.generate``
      (for example ``max_length`` or ``num_beams``).

  Returns:
    The value of ``tokenizer.batch_decode`` for the generated sequences, with
    special tokens stripped.
  """
  # Truncate to the tokenizer's configured maximum length so an over-long input
  # is clipped instead of overflowing the model's position embeddings.
  input_ids = tokenizer.encode(input_string, return_tensors="pt", truncation=True)

  # Keep the input on the same device as the model; without this the call
  # fails as soon as the model has been moved off the CPU.
  device = getattr(model, "device", None)
  if device is not None:
    input_ids = input_ids.to(device)

  res = model.generate(input_ids, **generator_args)
  output = tokenizer.batch_decode(res, skip_special_tokens=True)
  return output

def generate_questions(data):
  """Attach one generated question per model to a dataset example.

  Reads ``data['source']``, runs it through both BART checkpoints with the
  same beam-search settings (4 beams, ``max_length`` 100) and stores the first
  decoded sequence of each under ``'squad_question4b'`` and
  ``'balanced_question4b'``. The example is modified in place and returned.
  """
  context = data['source']
  beam_search = {"max_length": 100, "num_beams": 4}

  # Run both models before touching `data`; order is squad first, then balanced.
  decoded = [
    run_model(context, model, tokenizer, **beam_search)
    for model, tokenizer in (
      (bart_squad, bart_squad_tokenizer),
      (bart_balanced_ra, bart_balanced_ra_tokenizer),
    )
  ]
  squad_out, balanced_out = decoded

  data['squad_question4b'] = squad_out[0]
  data['balanced_question4b'] = balanced_out[0]
  return data

def contain_unique_question_context(data, unique_sources):
  """Filter predicate: True only the first time a given ``source`` is seen.

  Args:
    data: Example mapping with a ``'source'`` entry.
    unique_sources: Mutable set of sources seen so far; a source that is not
      yet present is added to it as a side effect.

  Returns:
    ``False`` if ``data['source']`` was already in ``unique_sources``,
    otherwise ``True``.
  """
  source = data['source']
  first_time = source not in unique_sources
  if first_time:
    unique_sources.add(source)
  return first_time
  

In [12]:
# Load the validation split of the balanced question-generation dataset.
balanced_ds = load_dataset("alinet/balanced_qg", "default", split='validation')

# Keep only the first row for each distinct `source`. The predicate mutates
# `unique_sources`, so the set is re-created on every run of this cell.
# NOTE(review): this relies on a single process visiting rows in order; do not
# add `num_proc` here without replacing the shared set.
unique_sources = set()
balanced_ds = balanced_ds.filter(contain_unique_question_context, fn_kwargs={"unique_sources": unique_sources})

# `generate_questions` returns each example with 'squad_question4b' and
# 'balanced_question4b' set, and `Dataset.map` creates columns for keys that
# are new, so no placeholder columns of None have to be added beforehand.
balanced_ds = balanced_ds.map(generate_questions)

Map:   0%|          | 0/1346 [00:00<?, ? examples/s]

In [13]:
# Write the sources together with both models' generated questions to CSV.
# The call is left as the cell's last expression so its return value is shown.
comparison_csv_path = "../../data/compare_baseline_balanced_4b.csv"
balanced_ds.to_csv(comparison_csv_path)

Creating CSV from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

1954505