# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizing

In [2]:
nonTokenizedWsubj_docs = read_json("18_docs_with_subjects_non_coref.json")
#nonTokenizedWsubj_docs

In [3]:
varied_set_adjectives = read_json("14_varied_set_adjectives_definitions.json")
#varied_set_adjectives

In [4]:
training_sents = list(map(lambda x : x["doc"], nonTokenizedWsubj_docs))
#training_sents

## Transformer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

2023-01-24 16:19:32.464451: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-24 16:19:32.686827: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-24 16:19:32.686841: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-24 16:19:33.354570: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
model_checkpoint = "distilgpt2"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
tokenize_fn = lambda doc : tokenizer(
        doc,
        truncation=True,
        max_length=1024
    )

tokenized_dataset = list(map(tokenize_fn, training_sents))

In [10]:
tokenized_dataset = pd.DataFrame(tokenized_dataset)
tokenized_dataset = Dataset.from_pandas(tokenized_dataset)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

In [12]:
device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"

In [13]:
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-docs-non-coref-black-clover",
    num_train_epochs=2,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,    
    learning_rate=2e-5,
    weight_decay=0.01,
    do_eval=True, # eval en validation set
    evaluation_strategy="steps", # eval en validation set
    eval_steps=100,
    save_steps=100, # checkpoint model every 500 steps
    logging_dir='./logs', # logging
    logging_strategy="steps",
    logging_steps=1,
    fp16=False, # float16 en training (only on CUDA)
    push_to_hub=False,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset, #.select(range(0, 128)),
    eval_dataset=tokenized_dataset, #.select(range(0, 128)),
)

In [15]:
train_output = trainer.train()

***** Running training *****
  Num examples = 52
  Num Epochs = 2
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 12
  0%|          | 0/12 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  8%|▊         | 1/12 [00:23<04:16, 23.36s/it]

{'loss': 4.0201, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.17}


 17%|█▋        | 2/12 [00:47<03:55, 23.57s/it]

{'loss': 4.0488, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.33}


 25%|██▌       | 3/12 [01:07<03:20, 22.30s/it]

{'loss': 3.9715, 'learning_rate': 1.5000000000000002e-05, 'epoch': 0.5}


 33%|███▎      | 4/12 [01:26<02:46, 20.78s/it]

{'loss': 3.8043, 'learning_rate': 1.3333333333333333e-05, 'epoch': 0.67}


 42%|████▏     | 5/12 [01:41<02:12, 18.89s/it]

{'loss': 3.9545, 'learning_rate': 1.1666666666666668e-05, 'epoch': 0.83}


 50%|█████     | 6/12 [01:46<01:23, 13.97s/it]

{'loss': 3.8902, 'learning_rate': 1e-05, 'epoch': 1.0}


 58%|█████▊    | 7/12 [02:03<01:15, 15.16s/it]

{'loss': 3.9044, 'learning_rate': 8.333333333333334e-06, 'epoch': 1.17}


 67%|██████▋   | 8/12 [02:19<01:00, 15.19s/it]

{'loss': 3.8203, 'learning_rate': 6.666666666666667e-06, 'epoch': 1.33}


 75%|███████▌  | 9/12 [02:41<00:52, 17.54s/it]

{'loss': 4.1155, 'learning_rate': 5e-06, 'epoch': 1.5}


 83%|████████▎ | 10/12 [03:03<00:37, 18.91s/it]

{'loss': 3.9303, 'learning_rate': 3.3333333333333333e-06, 'epoch': 1.67}


 92%|█████████▏| 11/12 [03:24<00:19, 19.30s/it]

{'loss': 3.6333, 'learning_rate': 1.6666666666666667e-06, 'epoch': 1.83}


100%|██████████| 12/12 [03:28<00:00, 14.75s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 12/12 [03:28<00:00, 17.37s/it]

{'loss': 3.7667, 'learning_rate': 0.0, 'epoch': 2.0}
{'train_runtime': 208.3832, 'train_samples_per_second': 0.499, 'train_steps_per_second': 0.058, 'train_loss': 3.904994567235311, 'epoch': 2.0}





In [16]:
def generate(
    prompt=None, max_length=1024, max_new_tokens=20, greedy=True, model=model, tokenizer=tokenizer, device=device
):
    """None stands for beggining of sequence.
    NOTE si bien parece que GPT2 puede generar a partir de BOS token, la 
    documentacion es poco clara. Ademas hicimos nuestro finetuning sin BOS token.
    Entonces solo vamos a usar la funcion pasandole un contexto.

    Ver:
    https://github.com/huggingface/transformers/issues/3311#issuecomment-601264426
    https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/generate_unconditional_samples.py#L60
    """
    do_sample = False if greedy else True
    # model.eval() to set dropout and batch normalization layers to evaluation 
    # mode before running inference
    if prompt:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        model.eval()
        outputs = model.generate(input_ids, do_sample=do_sample, max_new_tokens=max_new_tokens)
    else:
        model.eval()
        outputs = model.generate(do_sample=do_sample, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Summarize

In [17]:
generate("Microsoft's CEO is ", greedy=False)

['Microsoft\'s CEO is icky with the idea of using the name "Cynical". "It\'s a joke," he']

In [18]:
import re

In [19]:
non_greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=False)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            non_greedy_generated_descriptions.append(generated_desc)

In [20]:
greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=True)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            greedy_generated_descriptions.append(generated_desc)

In [21]:
re.split("\.", "hol.")

['hol', '']

In [22]:
non_greedy_generated_descriptions

['Yuno can be described as ixix (x) or ix (y) and the devil (y) appear',
 'Asta can be described as Âslightly taller than the rest - but that is not true',
 'Lily can be described as ~~ Yuki Chiba',
 'Yuno can be described as izumi, a soft, kind, outgoing and very kind, with a quiet, gentle tone,',
 'Orsi can be described as 不鬼, the strongest of Magic Knights, although that nickname is also also not true',
 'Asta can be described as a dark, bearded man with a golden eyes and a long red beard',
 'Lily can be described as 경우',
 'Finral can be described as † \ue752 \ue752 \ue752 여�',
 'Nozel can be described as iced, but he is sure he has other problems',
 'Gordon can be described as iced at being possessed by all of the creatures and that he is an intelligent spirit',
 'Rill can be described as ichikine, but that does not mean he cannot become the Sorcerer King',
 'Jack can be described as iced over by his father and his son',
 'Dorothy can be described as ichai-chan or as igo-chan',
 'Y

In [23]:
greedy_generated_descriptions

['Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Lily can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Orsi can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as iced and very strong, but he is not strong enough to fight',
 'Lily can be described as iced and very strong, but he is not strong enough to fight',
 'Finral can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Nozel can be described as',
 'Gordon can be described as',
 'Rill can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Jack can be described as',
 'Dorothy can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yami can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Fuegoleon can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Sekke can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Charlotte can be described as',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'William can be described as 『『『『『『『『『

In [24]:
non_greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=False)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        non_greedy_generated_descriptions_simple.append(generated_desc)

In [25]:
greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=True)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        greedy_generated_descriptions_simple.append(generated_desc)

In [26]:
non_greedy_generated_descriptions_simple

['Yuno can be described as icky,\u202e️,\u202e️,\u202e️,\u202e️,\u202e',
 'Asta can be described as erylic acid, which does not necessarily indicate the presence of an enzyme in the bane or in',
 'Lily can be described as ike‽, with an eye twinkling out at him',
 'Yuno can be described as 一为来一踰, a character that takes on different roles as a person',
 'Orsi can be described as iced up and dressed as an elf',
 'Asta can be described as 《 〇〇 , or a \u200c々, or a t',
 'Lily can be described as icky, cold, and a very thin-skinned being',
 'Finral can be described as ˞ləm, but the only thing to consider here is that no real-world',
 'Nozel can be described as iced like a cactus',
 'Gordon can be described as (Laughter) “ At the same time, if you can call yourself an',
 'Rill can be described as __________________',
 'Jack can be described as 『Miku›››』, but he is already quite happy about his own',
 'Dorothy can be described as _______ or _______ with a long, heavy fur tail',
 'Yami can be d

In [27]:
greedy_generated_descriptions_simple

['In Hage, a priest finds two babies abandoned outside his church',
 'In Hage, a priest finds two babies abandoned outside his church',
 'In Hage, a priest finds two babies abandoned outside his church',
 'A city is being attacked by a giant demon',
 'A city is being attacked by a giant demon',
 'A city is being attacked by a giant demon',
 'A city is being attacked by a giant demon',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entrance Exam',
 'The boys arrive in Kikka and register for the Magic Knights Entr