Note - random split of evaluation and training tasks determined at download, divided as follows as a static option:
Evaluation:
1. subtask002_quoref_answer_generation  --Answer Generation(AG)                          
2. subtask003_mctaco_question_generation_event_duration -- Question Generation(QG)
3. subtask005_mctaco_wrong_answer_generation_event_duration -- Incorrect Answer Generation(IAG)
4. subtask008_mctaco_wrong_answer_generation_transient_stationary -- IAG
5. subtask022_cosmosqa_passage_inappropriate_binary -- Classification(CF)
6. subtask033_winogrande_answer_generation -- AG
7. subtask034_winogrande_question_modification_object -- Minimal Text Modification(MM)
8. subtask039_qasc_find_overlapping_words -- Verification(VF)
9. subtask040_qasc_question_generation -- QG
10. subtask044_essential_terms_identifying_essential_words -- VF
11. subtask045_miscellaneous_sentence_paraphrasing -- MM
12. subtask052_multirc_identify_bad_question -- CF
Currently randomly generates a subset

In [1]:
import random
import copy
import torch

device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

categories = {'QG': ['subtask001_quoref_question_generation', 
                     'subtask003_mctaco_question_generation_event_duration', 
                     'subtask006_mctaco_question_generation_transient_stationary', 
                     'subtask009_mctaco_question_generation_event_ordering',
                     'subtask012_mctaco_question_generation_absolute_timepoint',
                     'subtask015_mctaco_question_generation_frequency',
                     'subtask023_cosmosqa_question_generation',
                     'subtask026_drop_question_generation',
                     'subtask031_winogrande_question_generation_object',
                     'subtask032_winogrande_question_generation_person',
                     'subtask040_qasc_question_generation',
                     'subtask048_multirc_question_generation',
                     'subtask060_ropes_question_generation4'],
              'AG': ['subtask002_quoref_answer_generation', 
                     'subtask004_mctaco_answer_generation_event_duration', 
                     'subtask007_mctaco_answer_generation_transient_stationary',
                     'subtask010_mctaco_answer_generation_event_ordering',
                     'subtask013_mctaco_answer_generation_absolute_timepoint',
                     'subtask016_mctaco_answer_generation_frequency',
                     'subtask024_cosmosqa_answer_generation',
                     'subtask028_drop_answer_generation',
                     'subtask033_winogrande_answer_generation',
                     'subtask041_qasc_answer_generation',
                     'subtask043_essential_terms_answering_incomplete_questions',
                     'subtask047_misc_answering_science_questions',
                     'subtask051_multirc_correct_answer_single_sentence',
                     'subtask054_multirc_write_correct_answer',
                     'subtask058_multirc_question_answering',
                     'subtask061_ropes_answer_generation4'],
              'IAG': ['subtask005_mctaco_wrong_answer_generation_event_duration', 
                      'subtask008_mctaco_wrong_answer_generation_transient_stationary',
                      'subtask011_mctaco_wrong_answer_generation_event_ordering',
                      'subtask014_mctaco_wrong_answer_generation_absolute_timepoint',
                      'subtask017_mctaco_wrong_answer_generation_frequency',
                      'subtask025_cosmosqa_incorrect_answer_generation',
                      'subtask042_qasc_incorrect_option_generation',
                      'subtask055_multirc_write_incorrect_answer'],
              'CF': ['subtask018_mctaco_temporal_reasoning_presence',
                     'subtask019_mctaco_temporal_reasoning_category',
                     'subtask020_mctaco_span_based_question',
                     'subtask021_mctaco_grammatical_logical',
                     'subtask022_cosmosqa_passage_inappropriate_binary',
                     'subtask027_drop_answer_type_generation',
                     'subtask046_miscellaenous_question_typing',
                     'subtask049_multirc_questions_needed_to_answer',
                     'subtask050_multirc_answerability',
                     'subtask052_multirc_identify_bad_question',
                     'subtask056_multirc_classify_correct_answer',
                     'subtask057_multirc_classify_incorrect_answer',
                     ],
              'MM': ['subtask029_winogrande_full_object',
                     'subtask030_winogrande_full_person',
                     'subtask034_winogrande_question_modification_object',
                     'subtask035_winogrande_question_modification_person',
                     'subtask036_qasc_topic_word_to_generate_related_fact',
                     'subtask037_qasc_generate_related_fact',
                     'subtask038_qasc_combined_fact',
                     'subtask045_miscellaneous_sentence_paraphrasing',
                     'subtask053_multirc_correct_bad_question',
                     'subtask059_ropes_story_generation4'],
              'VF': ['subtask039_qasc_find_overlapping_words',
                     'subtask044_essential_terms_identifying_essential_words',
                     ],
              }

# Move two random subtasks from each category into the evaluation subtasks
trainingPrompts = copy.deepcopy(categories)
evaluationPrompts = {'QG': [], 'AG': [], 'IAG': [], 'CF': [], 'MM': [], 'VF': []}
for key in trainingPrompts.keys():
    subtask = random.choice(trainingPrompts[key])
    trainingPrompts[key].remove(subtask)
    evaluationPrompts[key].append(subtask)
    subtask = random.choice(trainingPrompts[key])
    trainingPrompts[key].remove(subtask)
    evaluationPrompts[key].append(subtask)


In [22]:
# Run this block after to use preset subtasks, do not run to use random subtasks for evaluation
trainingPrompts = copy.deepcopy(categories)
evaluationPrompts = {'QG': ['subtask003_mctaco_question_generation_event_duration',
                            'subtask040_qasc_question_generation'],
                     'AG': ['subtask002_quoref_answer_generation',
                            'subtask033_winogrande_answer_generation'],
                     'IAG': ['subtask005_mctaco_wrong_answer_generation_event_duration',
                             'subtask008_mctaco_wrong_answer_generation_transient_stationary'],
                     'CF': ['subtask022_cosmosqa_passage_inappropriate_binary',
                            'subtask052_multirc_identify_bad_question'],
                     'MM': ['subtask034_winogrande_question_modification_object',
                            'subtask045_miscellaneous_sentence_paraphrasing'],
                     'VF': ['subtask039_qasc_find_overlapping_words',
                            'subtask044_essential_terms_identifying_essential_words']}

for key in trainingPrompts.keys():
    for subtask in evaluationPrompts[key]:
        trainingPrompts[key].remove(subtask)

In [2]:
# Instructions Encoding - add pos/neg examples later

def no_examples_encoding(task, inp):
    return f"""Definition: {task['Definition']}
Prompt: {task['Prompt']}
Things to Avoid: {task['Things to Avoid']}
Emphasis&Caution: {task['Emphasis & Caution']}
Input: {inp}
Output:"""

In [5]:
# Models
from transformers import BartTokenizer, BartModel, GPT2Tokenizer, GPT2Model, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration

random_number_model = (lambda **x: random.choice(['One', 'Two', 'Three', 'Four', 'Five', 'Six', 'Seven', 'Eight', 'Nine']))
random_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base') #just using this so it takes the same inputs, output not important for random

# Needs to be pretrained, takes a long time when untrained(might also be bad code)
bart_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
bart_tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

# For testing a slightly better baseline than random, should look at GPT3
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
gpt2_model = GPT2Model.from_pretrained('gpt2')

# Second fine-tuned model to test
t5_tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-small")
t5_model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-small')

bart_model.to(device)
gpt2_model.to(device)
t5_model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
              (wo): 

In [6]:
from transformers import TrainerCallback

class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path
        
    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")


In [11]:
# Create pandas dataframe of samples, using no_examples_encoding, and process data into usable form
import pandas as pd
from datasets import Dataset, DatasetDict
import json

training_dict = {'Instructions': [], 'Outputs': []}
testing_dict = {'Instructions': [], 'Outputs': []}

for category in trainingPrompts.keys():
    for task in trainingPrompts[category]:
        with open('./app_static_tasks_sample/' + task + '.json') as json_file:
            subtask = json.load(json_file)
            for instance in subtask['Instances']:
                string_encoding = no_examples_encoding(subtask, instance['input'])
                training_dict['Instructions'].append(string_encoding)
                training_dict['Outputs'].append(instance['output'][0])

for category in evaluationPrompts.keys():
    for task in evaluationPrompts[category]:
        with open('./app_static_tasks_sample/' + task + '.json') as json_file:
            subtask = json.load(json_file)
            for instance in subtask['Instances']:
                string_encoding = no_examples_encoding(subtask, instance['input'])
                testing_dict['Instructions'].append(string_encoding)
                testing_dict['Outputs'].append(instance['output'][0])
                
df_training = pd.DataFrame(training_dict)
df_testing = pd.DataFrame(testing_dict)

df_training = df_training.sample(5000)
df_testing = df_testing.sample(5000)

training_dataset = Dataset.from_pandas(df_training)
testing_dataset = Dataset.from_pandas(df_testing)

def convert_to_features(example_batch):
    input_encodings = t5_tokenizer.batch_encode_plus(example_batch['Instructions'], pad_to_max_length=True, max_length=128, truncation=True)
    target_encodings = t5_tokenizer.batch_encode_plus(example_batch['Outputs'], pad_to_max_length=True, max_length=128, truncation=True)
    
    labels = target_encodings['input_ids']
    
    encodings = {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': labels,
    }

    return encodings
    
trainingDelimiter = int(len(df_training) * (3/4))

# Create Dataset in huggingface acceptable format, for now using all tasks(will probably take way too long)
for_finetuning = DatasetDict(
    train=training_dataset.shuffle(seed=1111).select(range(trainingDelimiter)),
    val=training_dataset.shuffle(seed=1111).select(range(trainingDelimiter, len(df_training)))
)

tokenized_data_t5 = for_finetuning.map(
    convert_to_features,
    batched=True,
    batch_size=16
)

tokenized_data_t5 = tokenized_data_t5.remove_columns(["Instructions"])
tokenized_data_t5.set_format("torch")

  0%|          | 0/235 [00:00<?, ?ba/s]



  0%|          | 0/79 [00:00<?, ?ba/s]

In [12]:
# all of this needs much more work - figure out fine-tuning seq2seq models, dataset probably also needs to be processed better
from torch.utils.data import DataLoader
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

#train_dataloader = DataLoader(tokenized_data_bart['train'], batch_size=1)
#eval_dataloader = DataLoader(tokenized_data_bart['val'], batch_size=1)

arguments = Seq2SeqTrainingArguments(
    output_dir="t5_trainer_0",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=2e-5,
    load_best_model_at_end=True,
    seed=224,
)

# def compute_metrics(eval_pred):
#     """Called at the end of validation. Gives accuracy"""
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     # calculates the accuracy
#     return {"accuracy": np.mean(predictions == labels)}


trainer = Seq2SeqTrainer(
    model=t5_model,
    args=arguments,
    train_dataset=tokenized_data_t5['train'],
    eval_dataset=tokenized_data_t5['val'],
    tokenizer=t5_tokenizer,
)

trainer.add_callback(LoggingCallback("t5_trainer_0/log.jsonl"))

train_result = trainer.train()
#trainer.save_model()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: Outputs, __index_level_0__. If Outputs, __index_level_0__ are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3750
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 2814
  Number of trainable parameters = 76961152


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [10]:
test = training_dict['Instructions'][0]

t5_model = T5ForConditionalGeneration.from_pretrained('t5_trainer_0/checkpoint-2814')

print(test)

output = t5_model.generate(**t5_tokenizer(test, return_tensors="pt"), max_length=128)

print(t5_tokenizer.batch_decode(output)[0])

loading configuration file t5_trainer_0/checkpoint-2814\config.json
Model config T5Config {
  "_name_or_path": "google/flan-t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "d_ff": 1024,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 8,
  "num_heads": 6,
  "num_layers": 8,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "no_repeat_ngram_size": 3,
      "num_beams": 4,
      "prefix": "summarize: "
    },
    "translation_

Definition: In this task, you're given passages that contain mentions of names of people, places, or things. Some of these mentions refer to the same person, place, or thing. Your job is to write questions that evaluate one's understanding of such references. Good questions are expected to link pronouns (she, her, him, his, their, etc.) or other mentions to people, places, or things to which they may refer. 
Prompt: Write a question about the passage.
Things to Avoid: 1. Avoid questions that can be answered correctly without actually understanding the paragraph. 2. Avoid questions that do not link phrases referring to the same entity, 3. Avoid questions that have multiple answers. 

Emphasis&Caution: 1. For each of your questions the answer should be one or more phrases in the paragraph, 2. The answer for each question should be unambiguous. 
Output:
<pad> What is the name of the person who was part of the Geological Survey of Canada at the time of the report?</s>


In [48]:
# Process dataset into better form

# class WritingTasksDataset(torch.utils.data.Dataset):
    
#     def __init__(self, prompts):
#         #super(WritingTasksDataset).__init__()
#         self.entries = prompts
        
#     def __len__(self):
#         return len(self.entries)
    
#     def __getitem__(self, idx):
#         if (torch.is_tensor(idx)):
#             idx = idx.tolist()
        
#         return {'input': self.entries.iloc[idx, 0], 'output': self.entries.iloc[idx, 1]}

    
# trainingDelemiter = int(trainingData.__len__() * (3/4))
# indecies = 

# trainingData = WritingTasksDataset(df_training)
# evalData = WritingTasksDataset(df_eval)
# testingData = WritingTasksDataset(df_testing)


In [1]:
# Fine-tune BART (could probably be done much more efficiently)
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

num_epochs = 3
# Calculate total number of prompts


In [4]:
# Evaluation(currently for models trained on no_examples_encoding)
import numpy as np
import json
from torchmetrics.text.rouge import ROUGEScore

#models = {'Random': random_number_model, 'Bart': bart_model}
#tokenizers = {'Random': random_tokenizer, 'Bart': bart_tokenizer}
models = {'GPT2': gpt2_model}
tokenizers = {'GPT2': gpt2_tokenizer}
scores = {}
scorer = ROUGEScore(rouge_keys=('rougeL'))

for category in evaluationPrompts.keys():
    scoresForCategory = {}
    for model in models.keys():
        scoresForCategory[model] = []
    for task in evaluationPrompts[category]:
        with open('./app_static_tasks_sample/' + task + '.json') as json_file:
            subtask = json.load(json_file)
            for instance in subtask['Instances']:
                string_encoding = no_examples_encoding(subtask, instance['input'])
                for model in models.keys():
                    tokenizer = tokenizers[model]
                    inputs = tokenizer(string_encoding, return_tensors="pt") # this might need to be more general
                    if (len(inputs) >= 1024):
                        print('ignored prompt')
                        continue #temporary fix to avoid prompts which are too long
                    try: #FIX THIS LATER
                        outputs = models[model](**inputs)
                        rgeScores = scorer(instance['output'], outputs)
                        scoresForCategory[model].append(rgeScores['rougeL_fmeasure'].item())
                    except:
                        continue
    for model in models.keys():
        scoresForCategory[model] = sum(scoresForCategory[model]) / len(scoresForCategory[model])
    scores[category] = scoresForCategory
    print(category + " complete")

print(scores)

  warn(f"Failed to load image Python extension: {e}")


QG complete
AG complete
IAG complete
CF complete
MM complete
VF complete
{'QG': {'GPT2': 0.045351071792734045}, 'AG': {'GPT2': 0.06611021189375639}, 'IAG': {'GPT2': 0.04102925021885609}, 'CF': {'GPT2': 0.0}, 'MM': {'GPT2': 0.06392074578889599}, 'VF': {'GPT2': 0.006320624665350138}}
