In [20]:
import torch
import json
import re
import numpy as np
from dataclasses import dataclass, field
import random

import pandas as pd
from datasets import Dataset, load_dataset
from tqdm import tqdm
from data.discovery_con import LABELS

from transformers import (
    AutoModelForCausalLM,
    AutoModelForMultipleChoice, 
    AutoTokenizer,
    AutoConfig
)

from transformers import (
    EvalPrediction,
    Trainer,
    default_data_collator,
    TrainingArguments,
    HfArgumentParser
)
from generate import DatasetGenerate, AdversarialFiltering
from config import decoding_options
from utils import compute_metrics

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [21]:
import os
# Process-level settings, applied in the order listed:
#   CUDA_DEVICE_ORDER    -> "PCI_BUS_ID" (see issue #152)
#   CUDA_VISIBLE_DEVICES -> "0"
#   WANDB_DISABLED       -> "true"
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "WANDB_DISABLED": "true",
})

In [4]:
# Read the records from data/ctrl_main.json and wrap them in a `datasets.Dataset`.
df = pd.read_json('data/ctrl_main.json')
dataset = Dataset.from_pandas(df)

# First 7% of the train split of the `discovery` dataset (config name `discovery`).
original_dataset = load_dataset('discovery', 'discovery', split='train[:7%]')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [3]:
# Full validation split of the `discovery` dataset (config name `discovery`).
valid_ds = load_dataset('discovery', 'discovery', split='validation')

Reusing dataset discovery (/home/nlp/.cache/huggingface/datasets/discovery/discovery/1.0.0/f08ced5950fb93854c70d20fc70d1583766d7219cda67e65d197dfe9ec3775ca)


In [4]:
# Read data/ctrl_valid_1.json and wrap it in a `datasets.Dataset`.
# Note: this rebinds `df`, which an earlier cell used for data/ctrl_main.json.
df = pd.read_json('data/ctrl_valid_1.json')
gen_ds = Dataset.from_pandas(df)

# original_dataset = load_dataset('discovery', 'discovery', split='train[:7%]')

In [19]:
# Validation row at the position of the last gen_ds row; its sentence1/sentence2 can be
# compared with the context/ground_truth shown by gen_ds[-1].
valid_ds[len(gen_ds)-1]

{'idx': 2609,
 'label': 95,
 'sentence1': 'the parties in favor of negotiation, on the other hand, survived their defeat, and drew some strength from the weakness of the parties opposed to it.',
 'sentence2': 'In this party in favor of negotiation there were many intellectuals, whose perspicacity and depth of thought no longer need demonstration.'}

In [16]:
# Last row of gen_ds: context, marker, ground_truth and the option_* fields.
gen_ds[-1]

{'context': 'the parties in favor of negotiation, on the other hand, survived their defeat, and drew some strength from the weakness of the parties opposed to it.',
 'marker': 'naturally',
 'ground_truth': 'In this party in favor of negotiation there were many intellectuals, whose perspicacity and depth of thought no longer need demonstration.',
 'option_0': 'The negotiations were renewed at once',
 'option_1': 'It was not long before the negotiations were resumed but they were conducted on very different principles from those which had guided them in the former instance At length it was agreed that the terms of surrender should be submitted to the arbitration of Great Britain and the United States with the assurance of their respective ministers that they would abide by the decision',
 'option_2': 'It is with these two tendencies that the present controversy has been principally waged in the North and the South At a certain period the natural policy of the North was to treat propositi

In [21]:
# valid_o = load_dataset('discovery', 'discovery', split='validation')

In [24]:
# Spot-check one validation example. `valid_ds` is used because it holds the validation
# split loaded earlier in this notebook; `valid_o` is only assigned in a commented-out
# line, so referencing it raises NameError on a fresh kernel.
valid_ds[10]

{'idx': 10,
 'label': 149,
 'sentence1': 'Both Brown and Pavarotti have passed on since this performance, though not as a direct result of it.',
 'sentence2': 'This duet-which took place in Modena in 2002-somehow managed to put both outsized talents into one massively melodramatic song without spiraling out of control.'}

In [16]:
def preprocess_function(examples):
    """Turn one generated record into a shuffled 4-way multiple-choice encoding.

    Args:
        examples: mapping with the keys 'context', 'marker', 'ground_truth',
            'option_0', 'option_1' and 'option_2'.

    Returns:
        Whatever the module-level `tokenizer` returns for the four
        (prompt, choice) pairs, with an extra "label" entry holding the
        position of the ground-truth text among the shuffled choices.
    """
    # The same prompt is paired with every candidate continuation.
    question = examples['context'] + '</s>' + examples['marker']

    # Ground truth first, then the three options, all coerced to str.
    answer = str(examples['ground_truth'])
    candidates = [answer] + [
        str(examples[key]) for key in ('option_0', 'option_1', 'option_2')
    ]

    # In-place shuffle using the global `random` state, so the gold position varies.
    random.shuffle(candidates)

    encoding = tokenizer(
        [question] * 4,
        candidates,
        return_tensors='pt',
        max_length=96,
        padding='max_length',
        truncation=True,
    )

    # index() returns the first position whose text equals the ground truth.
    encoding["label"] = candidates.index(answer)
    return encoding

In [4]:
# Load the pretrained `ctrl` tokenizer. The RoBERTa multiple-choice config/model lines
# around it are commented out, so this cell does not define `model`.
# config = AutoConfig.from_pretrained('roberta-large', num_labels=4)
tokenizer = AutoTokenizer.from_pretrained('ctrl')
# model = AutoModelForMultipleChoice.from_pretrained('roberta-large', config=config)

In [11]:
# Show the tokenizer's control-code table (name -> integer id).
tokenizer.control_codes

{'Pregnancy': 168629,
 'Christianity': 7675,
 'Explain': 106423,
 'Fitness': 63440,
 'Saving': 63163,
 'Ask': 27171,
 'Ass': 95985,
 'Joke': 163509,
 'Questions': 45622,
 'Thoughts': 49605,
 'Retail': 52342,
 'Feminism': 164338,
 'Writing': 11992,
 'Atheism': 192263,
 'Netflix': 48616,
 'Computing': 39639,
 'Opinion': 43213,
 'Alone': 44967,
 'Funny': 58917,
 'Gaming': 40358,
 'Human': 4088,
 'India': 1331,
 'Joker': 77138,
 'Diet': 36206,
 'Legal': 11859,
 'Norman': 4939,
 'Tip': 72689,
 'Weight': 52343,
 'Movies': 46273,
 'Running': 23425,
 'Science': 2090,
 'Horror': 37793,
 'Confession': 60572,
 'Finance': 12250,
 'Politics': 16360,
 'Scary': 191985,
 'Support': 12654,
 'Technologies': 32516,
 'Teenage': 66160,
 'Event': 32769,
 'Learned': 67460,
 'Notion': 182770,
 'Wikipedia': 37583,
 'Books': 6665,
 'Extract': 76050,
 'Confessions': 102701,
 'Conspiracy': 75932,
 'Links': 63674,
 'Narcissus': 150425,
 'Relationship': 54766,
 'Relationships': 134796,
 'Reviews': 41671,
 'News': 4

In [12]:
# Lower-case 'pregnancy' encodes to a single id that differs from the id listed for the
# 'Pregnancy' control code.
tokenizer.encode('pregnancy')

[30520]

In [6]:
# Sample passage used to exercise the tokenizer; it ends with the literal markers
# `<eod> </s> <eos>`. Backslash continuations keep it a single-line string.
prompt = "In 1991, the remains of Russian Tsar Nicholas II and his family \
(except for Alexei and Maria) are discovered. \
The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the \
remainder of the story. 1883 Western Siberia, \
a young Grigori Rasputin is asked by his father and a group of men to perform magic. \
Rasputin has a vision and denounces one of the men as a horse thief. Although his \
father initially slaps him for making such an accusation, Rasputin watches as the \
man is chased outside and beaten. Twenty years later, Rasputin sees a vision of \
the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, \
with people, even a bishop, begging for his blessing. <eod> </s> <eos>"

In [8]:
# Encode the sample prompt to token ids, passing add_special_tokens=False.
encoded_prompt = tokenizer.encode(prompt, add_special_tokens=False)

In [10]:
# Decode the ids back to text to eyeball the round trip, including the trailing
# `<eod> </s> <eos>` markers.
tokenizer.decode(encoded_prompt)

"In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision and denounces one of the men as a horse thief. Although his father initially slaps him for making such an accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, begging for his blessing. <eod> </s> <eos>"

In [13]:
# Capitalised "Pregnancy" encodes to the id listed for it in tokenizer.control_codes.
tokenizer.encode("Pregnancy")

[168629]

In [5]:
# Show the control-code table again (name -> integer id).
tokenizer.control_codes

{'Pregnancy': 168629,
 'Christianity': 7675,
 'Explain': 106423,
 'Fitness': 63440,
 'Saving': 63163,
 'Ask': 27171,
 'Ass': 95985,
 'Joke': 163509,
 'Questions': 45622,
 'Thoughts': 49605,
 'Retail': 52342,
 'Feminism': 164338,
 'Writing': 11992,
 'Atheism': 192263,
 'Netflix': 48616,
 'Computing': 39639,
 'Opinion': 43213,
 'Alone': 44967,
 'Funny': 58917,
 'Gaming': 40358,
 'Human': 4088,
 'India': 1331,
 'Joker': 77138,
 'Diet': 36206,
 'Legal': 11859,
 'Norman': 4939,
 'Tip': 72689,
 'Weight': 52343,
 'Movies': 46273,
 'Running': 23425,
 'Science': 2090,
 'Horror': 37793,
 'Confession': 60572,
 'Finance': 12250,
 'Politics': 16360,
 'Scary': 191985,
 'Support': 12654,
 'Technologies': 32516,
 'Teenage': 66160,
 'Event': 32769,
 'Learned': 67460,
 'Notion': 182770,
 'Wikipedia': 37583,
 'Books': 6665,
 'Extract': 76050,
 'Confessions': 102701,
 'Conspiracy': 75932,
 'Links': 63674,
 'Narcissus': 150425,
 'Relationship': 54766,
 'Relationships': 134796,
 'Reviews': 41671,
 'News': 4

In [18]:
# Apply preprocess_function to every row and drop the raw text columns it consumed.
# NOTE: `dataset` is rebound to the mapped result, so this cell is not idempotent:
# after it runs, the columns preprocess_function reads are no longer present.
remove_column_names = ['option_0', 'option_1', 'option_2', 'ground_truth', 'marker', 'context']
dataset = dataset.map(preprocess_function, remove_columns=remove_column_names)

HBox(children=(FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [24]:
# Trainer configuration: outputs go to a hard-coded absolute directory, batch size 32
# per device, one epoch.
# NOTE(review): the recorded trainer.train() output reports 'epoch': 3.0, so that output
# appears to come from a run with different arguments - confirm before trusting it.
training_args = TrainingArguments(output_dir = '/home/nlp/apex/experiment/roberta/',
                                 per_device_train_batch_size=32, num_train_epochs=1)

In [20]:
# Disable gradients for every parameter under `model.roberta`, so only parameters
# outside that submodule remain trainable.
# NOTE(review): no earlier visible cell defines `model` (the roberta-large load is
# commented out), so this relies on leftover kernel state - confirm which model is meant.
for param in model.roberta.parameters():
    param.requires_grad = False

In [23]:
# Build the Trainer from the model, the training arguments and the mapped dataset.
# Only a train_dataset is supplied (no eval_dataset); compute_metrics comes from utils.
trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
)

In [25]:
# Run training; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=189, training_loss=1.3175509821170222, metrics={'train_runtime': 136.7218, 'train_samples_per_second': 1.382, 'total_flos': 4912507270656000, 'epoch': 3.0})

In [26]:
# Run prediction over the same dataset that was used for training; `preds` is later
# passed to AdversarialFiltering.
preds = trainer.predict(dataset)

In [28]:
# Rebind `tokenizer` to the pretrained `ctrl` tokenizer and `model` to a causal LM loaded
# from a local checkpoint directory (hard-coded absolute path). From here on `model` no
# longer refers to the model that was passed to the Trainer.
tokenizer = AutoTokenizer.from_pretrained('ctrl')
model = AutoModelForCausalLM.from_pretrained('/home/nlp/apex/experiment/ctrl/')

In [29]:
# Build the DatasetGenerate object from the `discovery` train subset, the LABELS list,
# the tokenizer/model currently bound in the kernel, and the decoding options from config.
# (A stray trailing `af.generate` token made this line a SyntaxError; it is removed.)
generate_dataset = DatasetGenerate(original_dataset, LABELS, tokenizer, model, decoding_options)

In [30]:
# Construct the adversarial-filtering helper from the generation dataset, the current
# `model`, the Trainer predictions and the decoding options.
af = AdversarialFiltering(generate_dataset, model, preds, decoding_options)

In [31]:
# Long-running step: the recorded run was stopped by KeyboardInterrupt after 11 of 1791
# items (about 4.7 s per item), so the output below is incomplete.
af.generate_new_samples()

100%|██████████| 1791/1791 [00:00<00:00, 78335.66it/s]
  1%|          | 11/1791 [00:51<2:18:09,  4.66s/it]


KeyboardInterrupt: 