# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizing

In [2]:
nonTokenizedWsubj_docs = read_json("18_docs_with_subjects.json")
#nonTokenizedWsubj_docs

In [3]:
varied_set_adjectives = read_json("14_varied_set_adjectives_definitions.json")
#varied_set_adjectives

In [4]:
training_sents = list(map(lambda x : x["doc"], nonTokenizedWsubj_docs))
#training_sents

## Transformer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

2023-01-24 16:32:57.371341: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-24 16:32:57.444756: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-24 16:32:57.444767: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-24 16:32:57.864495: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
model_checkpoint = "distilgpt2"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
tokenize_fn = lambda doc : tokenizer(
        doc,
        truncation=True,
        max_length=1024
    )

tokenized_dataset = list(map(tokenize_fn, training_sents))

In [10]:
tokenized_dataset = pd.DataFrame(tokenized_dataset)
tokenized_dataset = Dataset.from_pandas(tokenized_dataset)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

In [12]:
device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"

In [13]:
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-docs-coref-black-clover",
    num_train_epochs=2,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,    
    learning_rate=2e-5,
    weight_decay=0.01,
    do_eval=True, # eval en validation set
    evaluation_strategy="steps", # eval en validation set
    eval_steps=100,
    save_steps=100, # checkpoint model every 500 steps
    logging_dir='./logs', # logging
    logging_strategy="steps",
    logging_steps=1,
    fp16=False, # float16 en training (only on CUDA)
    push_to_hub=False,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset, #.select(range(0, 128)),
    eval_dataset=tokenized_dataset, #.select(range(0, 128)),
)

In [15]:
train_output = trainer.train()

***** Running training *****
  Num examples = 52
  Num Epochs = 2
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 12
  0%|          | 0/12 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  8%|▊         | 1/12 [00:30<05:33, 30.35s/it]

{'loss': 3.8829, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.17}


 17%|█▋        | 2/12 [01:05<05:33, 33.36s/it]

{'loss': 3.9362, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.33}


 25%|██▌       | 3/12 [01:35<04:44, 31.57s/it]

{'loss': 3.9006, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.5}


 33%|███▎      | 4/12 [02:02<04:00, 30.03s/it]

{'loss': 3.6338, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.67}


 42%|████▏     | 5/12 [02:22<03:03, 26.16s/it]

{'loss': 3.7165, 'learning_rate': 1.1666666666666668e-05, 'epoch': 0.83}


 50%|█████     | 6/12 [02:27<01:55, 19.17s/it]

{'loss': 3.6796, 'learning_rate': 1e-05, 'epoch': 1.0}


 58%|█████▊    | 7/12 [02:47<01:36, 19.25s/it]

{'loss': 3.8655, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.17}


 67%|██████▋   | 8/12 [03:05<01:15, 19.00s/it]

{'loss': 3.6831, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.33}


 75%|███████▌  | 9/12 [03:35<01:06, 22.31s/it]

{'loss': 3.8989, 'learning_rate': 5e-06, 'epoch': 1.5}


 83%|████████▎ | 10/12 [04:03<00:48, 24.13s/it]

{'loss': 3.763, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.67}


 92%|█████████▏| 11/12 [04:32<00:25, 25.54s/it]

{'loss': 3.4472, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.83}


100%|██████████| 12/12 [04:37<00:00, 19.47s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 12/12 [04:37<00:00, 23.15s/it]

{'loss': 3.6197, 'learning_rate': 0.0, 'epoch': 2.0}
{'train_runtime': 277.8513, 'train_samples_per_second': 0.374, 'train_steps_per_second': 0.043, 'train_loss': 3.7522533535957336, 'epoch': 2.0}





In [16]:
def generate(
    prompt=None, max_length=1024, max_new_tokens=20, greedy=True, model=model, tokenizer=tokenizer, device=device
):
    """None stands for beggining of sequence.
    NOTE si bien parece que GPT2 puede generar a partir de BOS token, la 
    documentacion es poco clara. Ademas hicimos nuestro finetuning sin BOS token.
    Entonces solo vamos a usar la funcion pasandole un contexto.

    Ver:
    https://github.com/huggingface/transformers/issues/3311#issuecomment-601264426
    https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/generate_unconditional_samples.py#L60
    """
    do_sample = False if greedy else True
    # model.eval() to set dropout and batch normalization layers to evaluation 
    # mode before running inference
    if prompt:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        model.eval()
        outputs = model.generate(input_ids, do_sample=do_sample, max_new_tokens=max_new_tokens)
    else:
        model.eval()
        outputs = model.generate(do_sample=do_sample, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Summarize

In [17]:
generate("Microsoft's CEO is ", greedy=False)

["Microsoft's CEO is ick. ick.. I know where this is, but I\u202am not kidding.\n"]

In [18]:
import re

In [19]:
non_greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=False)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            non_greedy_generated_descriptions.append(generated_desc)

In [20]:
greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=True)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            greedy_generated_descriptions.append(generated_desc)

In [21]:
re.split("\.", "hol.")

['hol', '']

In [22]:
non_greedy_generated_descriptions

['Yuno can be described as icky, but it is not clear what type of magic the devil possesses',
 'Asta can be described as erythematic',
 'Lily can be described as vernacular',
 'Yuno can be described as ichyunzu, or an average white man',
 "Orsi can be described as iced at night by the magic knight and the girls' school",
 'Asta can be described as ersatz',
 'Lily can be described as 게륿다',
 'Finral can be described as ikinyo, but he cannot distinguish himself from the blue-haired black wizard',
 'Nozel can be described as 价博镶。 While at the start of the test, the students are',
 'Gordon can be described as ссорлань, a type of mage',
 'Rill can be described as 伕人身因慜',
 'Jack can be described as ive heartless, not a good fighter',
 'Dorothy can be described as urchinish, who makes an offensive move',
 'Yami can be described as 三即。 The first few candidates, one of which is not an actual wizard, show great',
 'Fuegoleon can be described as _____',
 'Yuno can be described as ikinyukan, but As

In [23]:
greedy_generated_descriptions

['Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Lily can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Orsi can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Lily can be described as iced tea, but the boys are not very good at it',
 'Finral can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Nozel can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Gordon can be described as',
 'Rill can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Jack can be described as',
 'Dorothy can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yami can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Fuegoleon can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Sekke can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Charlotte can be described as',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'William can be described as',
 'Yuno can be described as 『『『『『『『『『

In [24]:
non_greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=False)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        non_greedy_generated_descriptions_simple.append(generated_desc)

In [25]:
greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=True)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        greedy_generated_descriptions_simple.append(generated_desc)

In [26]:
non_greedy_generated_descriptions_simple

['Yuno can be described as icky',
 'Asta can be described as --------------,›,',
 'Lily can be described as 『Won 不陻』',
 'Yuno can be described as 今慒展事。 I can think of them as 今',
 'Orsi can be described as a super superman, the hero of the movie, the character of the film, and the',
 'Asta can be described as ˈɛn̄jɒdəˈsɭnəz',
 'Lily can be described as 七足, but 七足 makes a huge difference',
 'Finral can be described as "Cerberus\'s light-haired red mage apprentice',
 'Nozel can be described as iced and hot, sweet and creamy',
 'Gordon can be described as',
 'Rill can be described as ____',
 'Jack can be described as icky',
 'Dorothy can be described as eryxic, the term eryxic, or the term eryxal',
 'Yami can be described as iced coffee',
 'Fuegoleon can be described as ____ ____ ____ ____ ____ ____ ____ ____ ____ ____',
 'Yuno can be described as 企田奇毇吉, or 們丅吉 (',
 'Sekke can be described as iano',
 'Charlotte can be described as urchin, although her expression has not been taken into ac

In [27]:
greedy_generated_descriptions_simple

['Yuno can be described as 『『『『『『『『『『『『『『『『『『『』',
 'Asta can be described as a “a “a “a “a “a “a',
 'Lily can be described as icky, but she is also a very good person',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『』',
 'Orsi can be described as icky, but he is also a very good player',
 'Asta can be described as a “a “a “a “a “a “a',
 'Lily can be described as icky, but she is also a very good person',
 'Finral can be described as a “a “a “a “a “a “a',
 'Nozel can be described as iced with a very strong, strong, strong, and very strong',
 'Gordon can be described as a “a “a “a “a “a “a',
 'Rill can be described as a “a “a “a “a “a “a',
 'Jack can be described as a “a “a “a “a “a “a',
 'Dorothy can be described as a “a “a “a “a “a “a',
 'Yami can be described as',
 'Fuegoleon can be described as ichthyroid',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『』',
 'Sekke can be described as a “a “a “a “a “a “a',
 'Charlotte can be described as a “a “a “a “a “a “a',
 'Asta can be described as