In [22]:
import torch
import json
import re
import numpy as np
from dataclasses import dataclass, field
import random

import pandas as pd
from datasets import Dataset, load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import (
    AutoModelForCausalLM,
    AutoModelForMultipleChoice, 
    AutoTokenizer,
    AutoConfig
)

from transformers import (
    EvalPrediction,
    Trainer,
    default_data_collator,
    TrainingArguments,
    HfArgumentParser
)
from generate import DatasetGenerate, AdversarialFiltering
from config import decoding_options
from utils import compute_metrics

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import os

# Process-wide environment switches, applied in one call before the later
# cells build models and trainers.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",  # see issue #152
        "CUDA_VISIBLE_DEVICES": "0",        # expose only device index 0 to this process
        "WANDB_DISABLED": "true",           # presumably turns off Weights & Biases reporting — confirm
    }
)

In [4]:
# Local JSON file read through pandas, then wrapped as a Hugging Face Dataset.
# The frame is expected to provide the columns that preprocess_function reads
# (context, marker, ground_truth, option_0..option_2).
df = pd.read_json('data/ctrl_main.json')
dataset = Dataset.from_pandas(df)

# Leading 7% slice of the train split of the "discovery" dataset/config.
original_dataset = load_dataset(
    path='discovery',
    name='discovery',
    split='train[:7%]',
)

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [16]:
def preprocess_function(examples):
    """Encode one record as a four-way multiple-choice example.

    Despite the plural parameter name, the body treats ``examples`` as a single
    record: it reads ``context``, ``marker``, ``ground_truth`` and
    ``option_0`` .. ``option_2`` from it.

    The first segment is ``context + '</s>' + marker``; it is paired with each of
    the four candidate strings (ground truth plus the three options), which are
    shuffled in place with the global ``random`` state. No seed is set in this
    function, so the position of the correct answer varies between runs.

    Relies on the notebook-level ``tokenizer`` being defined before this
    function is called.

    Returns:
        The tokenizer output for the four (first segment, candidate) pairs,
        padded/truncated to 96 tokens, with an extra ``"label"`` entry holding
        the index of the ground-truth string in the shuffled candidate list.
    """
    first_segment = examples['context'] + '</s>' + examples['marker']

    answer = str(examples['ground_truth'])
    distractors = [str(examples[key]) for key in ('option_0', 'option_1', 'option_2')]

    candidates = [answer, *distractors]
    random.shuffle(candidates)

    encoding = tokenizer(
        [first_segment] * 4,
        candidates,
        return_tensors='pt',
        max_length=96,
        padding='max_length',
        truncation=True,
    )
    # index() returns the first match, so a distractor identical to the answer
    # still yields a label that points at the correct text.
    encoding["label"] = candidates.index(answer)
    return encoding

In [17]:
# A single checkpoint identifier feeds the config, tokenizer and model so the
# three cannot drift apart when the checkpoint is changed.
roberta_checkpoint = 'roberta-large'

config = AutoConfig.from_pretrained(roberta_checkpoint, num_labels=4)
tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)
model = AutoModelForMultipleChoice.from_pretrained(roberta_checkpoint, config=config)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for pre

In [18]:
# Raw text columns are dropped once preprocess_function has turned them into
# model inputs plus a label.
remove_column_names = [
    'option_0',
    'option_1',
    'option_2',
    'ground_truth',
    'marker',
    'context',
]
# NOTE(review): `dataset` is rebound to the encoded version, so re-running this
# cell without reloading the data will fail (the columns read by
# preprocess_function are gone after the first run).
dataset = dataset.map(
    preprocess_function,
    remove_columns=remove_column_names,
)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [24]:
# Training configuration handed to the Trainer: checkpoint directory,
# per-device batch size and number of passes over the data.
training_args = TrainingArguments(
    output_dir='/home/nlp/apex/experiment/roberta/',
    per_device_train_batch_size=32,
    num_train_epochs=1,
)

In [20]:
# Freeze the RoBERTa encoder: every parameter under `model.roberta` stops
# receiving gradients, so only parameters outside that submodule are trained.
# Module.requires_grad_ applies the flag to all parameters in one call and
# avoids leaking a loop variable into the notebook namespace; the trailing
# semicolon keeps the returned module's repr out of the cell output.
model.roberta.requires_grad_(False);

In [23]:
# Bundle model, training configuration and data into a Trainer.
# No eval_dataset is supplied here; compute_metrics is still registered so it
# is available to later evaluation/prediction calls.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [25]:
# Run training with the Trainer built above.
# NOTE(review): the recorded output reports 'epoch': 3.0 while the visible
# TrainingArguments cell sets num_train_epochs=1, and the execution counts show
# the Trainer cell ran before that TrainingArguments cell. The saved output
# therefore seems to come from an earlier configuration — re-run top to bottom
# to confirm.
trainer.train()

Step,Training Loss


TrainOutput(global_step=189, training_loss=1.3175509821170222, metrics={'train_runtime': 136.7218, 'train_samples_per_second': 1.382, 'total_flos': 4912507270656000, 'epoch': 3.0})

In [26]:
# Score the examples with the trained model. `dataset` is the same object that
# was passed as train_dataset, so these are predictions on the training data.
preds = trainer.predict(dataset)

In [28]:
# Switch to the CTRL generator: tokenizer from the 'ctrl' checkpoint, causal LM
# weights from a local experiment directory.
# NOTE(review): this rebinds the notebook-wide `tokenizer` and `model` that
# previously held the RoBERTa objects. preprocess_function reads the global
# `tokenizer`, so calling it after this cell would use the CTRL tokenizer.
tokenizer = AutoTokenizer.from_pretrained('ctrl')
model = AutoModelForCausalLM.from_pretrained('/home/nlp/apex/experiment/ctrl/')

In [29]:
# Build the generation helper from the discovery slice, the marker labels and
# the tokenizer/model currently bound in the notebook (the CTRL objects, if the
# cells are run in order). DatasetGenerate is a project class from generate.py;
# what it does with these arguments is not visible here.
# A stray `af.generate` fragment after the closing parenthesis made this line a
# SyntaxError; it has been removed.
generate_dataset = DatasetGenerate(original_dataset, LABELS, tokenizer, model, decoding_options)

In [30]:
# Set up adversarial filtering from the generation helper, the current model
# and the earlier predictions.
# NOTE(review): when the cells run in order, `model` is the CTRL causal LM
# loaded just before, while `preds` came from the RoBERTa trainer — confirm
# this is the model AdversarialFiltering expects.
af = AdversarialFiltering(generate_dataset, model, preds, decoding_options)

In [31]:
# Long-running step: in the recorded run the second progress bar was at
# 11/1791 after 51 s (about 4.66 s/it, over two hours projected) when the cell
# was interrupted by hand.
af.generate_new_samples()

100%|██████████| 1791/1791 [00:00<00:00, 78335.66it/s]
  1%|          | 11/1791 [00:51<2:18:09,  4.66s/it]


KeyboardInterrupt: 