In [1]:
from datasets import load_dataset

# Load the Med-EASi dataset by its identifier; the result holds the
# train / validation / test splits used throughout the rest of the notebook.
ds = load_dataset("cbasu/Med-EASi")

# Summarise every split (column names and row counts).
print(ds)
print("Train dataset sample: ")
# Bare last expression so the notebook displays the first training record.
ds['train'][0]

  from .autonotebook import tqdm as notebook_tqdm


DatasetDict({
    train: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 196
    })
    test: Dataset({
        features: ['Expert', 'Simple', 'Annotation', 'sim', 'sentence_sim', 'compression', 'expert_fk_grade', 'expert_ari', 'layman_fk_grade', 'layman_ari', 'umls_expert', 'umls_layman', 'expert_terms', 'layman_terms', 'idx'],
        num_rows: 300
    })
})
Train dataset sample: 


{'Expert': '75-90 % of the affected people have mild intellectual disability.',
 'Simple': "People with syndromic intellectual disabilities may have a `` typical look. ''",
 'Annotation': "<del>75-90 % of the</del> <rep>affected people  have mild intellectual disability.<by>People with syndromic intellectual disabilities</rep> <ins>may have a `` typical look. ''</ins>",
 'sim': 0.48951049,
 'sentence_sim': 0.639872432,
 'compression': 1.2,
 'expert_fk_grade': 12.7,
 'expert_ari': 12.4,
 'layman_fk_grade': 13.1,
 'layman_ari': 15.1,
 'umls_expert': "[[{'start': 41, 'end': 64, 'ngram': 'intellectual disability', 'term': 'intellectual disability', 'cui': 'C3714756', 'similarity': 1.0, 'semtypes': {'T048'}, 'preferred': 1, 'preferred_term': None}, {'start': 41, 'end': 64, 'ngram': 'intellectual disability', 'term': 'Intellectual disability', 'cui': 'C3714756', 'similarity': 0.9090909090909091, 'semtypes': {'T048'}, 'preferred': 0, 'preferred_term': None}, {'start': 41, 'end': 64, 'ngram': 

In [2]:
from transformers import AutoTokenizer


# Tokenizer for the "google/flan-t5-base" checkpoint; read as a module-level
# name by preprocess_and_tokenizer below.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")


def preprocess_and_tokenizer(batch):
    """Tokenize a batch of expert/simple text pairs for seq2seq training.

    Args:
        batch: Mapping with an 'Expert' entry (source texts) and a 'Simple'
            entry (target texts), as supplied by a batched ``map`` call.

    Returns:
        The tokenizer output for the 'Expert' texts, with an extra "labels"
        entry holding the "input_ids" produced for the 'Simple' texts.
    """
    # Both sides are truncated with the same settings.
    tokenize_kwargs = {"truncation": True, "max_length": 256}
    expert_texts, simple_texts = batch['Expert'], batch['Simple']

    encoded = tokenizer(expert_texts, **tokenize_kwargs)
    # Targets go through `text_target`; only their token ids are kept.
    encoded["labels"] = tokenizer(text_target=simple_texts, **tokenize_kwargs)["input_ids"]
    return encoded


# Column names of the train split; these are dropped by the map call so only
# the fields returned by preprocess_and_tokenizer remain.
col_names = ds["train"].column_names

# Batched map over the whole DatasetDict.
tokenized_ds = ds.map(
    preprocess_and_tokenizer,
    batched=True,
    remove_columns=col_names,
)

# Display the resulting splits and their features.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1397
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 196
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})

In [3]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = "google/flan-t5-base"

# Pretrained encoder-decoder checkpoint that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Collator constructed from the tokenizer and model; the Trainer uses it to
# assemble batches.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Schedule-defining hyperparameters, named so the warmup length can be derived
# from them instead of being hard-coded.
train_batch_size = 8
grad_accum_steps = 1
num_epochs = 5

# The previous fixed `warmup_steps=1000` was longer than the entire run
# (~175 optimizer steps per epoch * 5 epochs = ~875 steps), so the learning
# rate never reached its configured peak. Warm up over ~10% of the schedule
# instead.
# NOTE(review): this estimate assumes a single training process; with several
# devices the number of optimizer steps per epoch is smaller - confirm.
examples_per_update = train_batch_size * grad_accum_steps
updates_per_epoch = -(-len(tokenized_ds["train"]) // examples_per_update)  # ceiling division
total_updates = updates_per_epoch * num_epochs
warmup_steps = max(1, total_updates // 10)

training_args = Seq2SeqTrainingArguments(
    output_dir="checkpoints/flan_t5_baseline",
    # Evaluate and checkpoint once per epoch. The strategies must match for
    # `load_best_model_at_end`. The former `save_steps` / `eval_steps` values
    # were dropped because they only apply to the "steps" strategy.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=grad_accum_steps,
    learning_rate=3e-5,
    num_train_epochs=num_epochs,
    warmup_steps=warmup_steps,
    logging_steps=50,
    # With no `metric_for_best_model` given, the Trainer's default selection
    # metric is used to pick the checkpoint reloaded at the end.
    load_best_model_at_end=True,
    # `tpu_num_cores` was removed: it is meant to be supplied by the TPU
    # launcher script rather than hard-coded in the notebook.
    predict_with_generate=True,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    # `tokenizer=` is deprecated on the Trainer classes in recent transformers
    # releases; `processing_class=` is its replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)


trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 