# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Tokenizing

In [2]:
nonTokenizedWsubj_docs = read_json("18_docs_with_subjects.json")
#nonTokenizedWsubj_docs

In [3]:
varied_set_adjectives = read_json("14_varied_set_adjectives_definitions.json")
#varied_set_adjectives

In [4]:
training_sents = list(map(lambda x : x["doc"], nonTokenizedWsubj_docs))
#training_sents

## Transformer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

2023-01-24 16:46:52.330660: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-24 16:46:52.552869: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-24 16:46:52.552882: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-24 16:46:53.256468: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
model_checkpoint = "distilgpt2"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
tokenize_fn = lambda doc : tokenizer(
        doc,
        truncation=True,
        max_length=1024
    )

tokenized_dataset = list(map(tokenize_fn, training_sents))

In [10]:
tokenized_dataset = pd.DataFrame(tokenized_dataset)
tokenized_dataset = Dataset.from_pandas(tokenized_dataset)

In [11]:
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

In [12]:
device = f"cuda:{torch.cuda.current_device()}" if torch.cuda.is_available() else "cpu"

In [13]:
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-docs-coref-black-clover",
    num_train_epochs=10,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,    
    learning_rate=2e-5,
    weight_decay=0.01,
    do_eval=True, # eval en validation set
    evaluation_strategy="steps", # eval en validation set
    eval_steps=100,
    save_steps=100, # checkpoint model every 500 steps
    logging_dir='./logs', # logging
    logging_strategy="steps",
    logging_steps=1,
    fp16=False, # float16 en training (only on CUDA)
    push_to_hub=False,
)

In [14]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset, #.select(range(0, 128)),
    eval_dataset=tokenized_dataset, #.select(range(0, 128)),
)

In [15]:
train_output = trainer.train()

***** Running training *****
  Num examples = 52
  Num Epochs = 10
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 60
  0%|          | 0/60 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  2%|▏         | 1/60 [00:31<31:02, 31.57s/it]

{'loss': 3.8829, 'learning_rate': 1.9666666666666666e-05, 'epoch': 0.17}


  3%|▎         | 2/60 [01:04<31:27, 32.55s/it]

{'loss': 3.9362, 'learning_rate': 1.9333333333333333e-05, 'epoch': 0.33}


  5%|▌         | 3/60 [01:34<29:42, 31.28s/it]

{'loss': 3.8997, 'learning_rate': 1.9e-05, 'epoch': 0.5}


  7%|▋         | 4/60 [02:02<27:47, 29.78s/it]

{'loss': 3.6292, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.67}


  8%|▊         | 5/60 [02:21<23:58, 26.16s/it]

{'loss': 3.7096, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.83}


 10%|█         | 6/60 [02:26<17:06, 19.01s/it]

{'loss': 3.6744, 'learning_rate': 1.8e-05, 'epoch': 1.0}


 12%|█▏        | 7/60 [02:46<17:03, 19.32s/it]

{'loss': 3.8449, 'learning_rate': 1.7666666666666668e-05, 'epoch': 1.17}


 13%|█▎        | 8/60 [03:05<16:32, 19.10s/it]

{'loss': 3.664, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.33}


 15%|█▌        | 9/60 [03:36<19:23, 22.82s/it]

{'loss': 3.869, 'learning_rate': 1.7e-05, 'epoch': 1.5}


 17%|█▋        | 10/60 [04:03<20:06, 24.13s/it]

{'loss': 3.7245, 'learning_rate': 1.6666666666666667e-05, 'epoch': 1.67}


 18%|█▊        | 11/60 [04:30<20:29, 25.09s/it]

{'loss': 3.4008, 'learning_rate': 1.6333333333333335e-05, 'epoch': 1.83}


 20%|██        | 12/60 [04:36<15:14, 19.06s/it]

{'loss': 3.5753, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.0}


 22%|██▏       | 13/60 [05:03<16:51, 21.52s/it]

{'loss': 3.5645, 'learning_rate': 1.5666666666666667e-05, 'epoch': 2.17}


 23%|██▎       | 14/60 [05:25<16:36, 21.67s/it]

{'loss': 3.6017, 'learning_rate': 1.5333333333333334e-05, 'epoch': 2.33}


 25%|██▌       | 15/60 [05:44<15:42, 20.95s/it]

{'loss': 3.8356, 'learning_rate': 1.5000000000000002e-05, 'epoch': 2.5}


 27%|██▋       | 16/60 [06:13<17:01, 23.21s/it]

{'loss': 3.6651, 'learning_rate': 1.4666666666666666e-05, 'epoch': 2.67}


 28%|██▊       | 17/60 [06:38<17:05, 23.85s/it]

{'loss': 3.4161, 'learning_rate': 1.4333333333333334e-05, 'epoch': 2.83}


 30%|███       | 18/60 [06:41<12:18, 17.59s/it]

{'loss': 3.5534, 'learning_rate': 1.4e-05, 'epoch': 3.0}


 32%|███▏      | 19/60 [07:02<12:42, 18.61s/it]

{'loss': 3.5786, 'learning_rate': 1.3666666666666667e-05, 'epoch': 3.17}


 33%|███▎      | 20/60 [07:26<13:27, 20.19s/it]

{'loss': 3.8228, 'learning_rate': 1.3333333333333333e-05, 'epoch': 3.33}


 35%|███▌      | 21/60 [07:55<14:50, 22.83s/it]

{'loss': 3.4739, 'learning_rate': 1.3000000000000001e-05, 'epoch': 3.5}


 37%|███▋      | 22/60 [08:17<14:17, 22.56s/it]

{'loss': 3.4674, 'learning_rate': 1.2666666666666667e-05, 'epoch': 3.67}


 38%|███▊      | 23/60 [08:43<14:39, 23.78s/it]

{'loss': 3.4522, 'learning_rate': 1.2333333333333334e-05, 'epoch': 3.83}


 40%|████      | 24/60 [08:47<10:44, 17.90s/it]

{'loss': 3.2695, 'learning_rate': 1.2e-05, 'epoch': 4.0}


 42%|████▏     | 25/60 [09:14<12:00, 20.60s/it]

{'loss': 3.3497, 'learning_rate': 1.1666666666666668e-05, 'epoch': 4.17}


 43%|████▎     | 26/60 [09:43<12:57, 22.86s/it]

{'loss': 3.5882, 'learning_rate': 1.1333333333333334e-05, 'epoch': 4.33}


 45%|████▌     | 27/60 [10:07<12:48, 23.27s/it]

{'loss': 3.7039, 'learning_rate': 1.1000000000000001e-05, 'epoch': 4.5}


 47%|████▋     | 28/60 [10:29<12:11, 22.87s/it]

{'loss': 3.3551, 'learning_rate': 1.0666666666666667e-05, 'epoch': 4.67}


 48%|████▊     | 29/60 [10:54<12:12, 23.64s/it]

{'loss': 3.3746, 'learning_rate': 1.0333333333333335e-05, 'epoch': 4.83}


 50%|█████     | 30/60 [10:58<08:53, 17.79s/it]

{'loss': 3.607, 'learning_rate': 1e-05, 'epoch': 5.0}


 52%|█████▏    | 31/60 [11:24<09:40, 20.03s/it]

{'loss': 3.3889, 'learning_rate': 9.666666666666667e-06, 'epoch': 5.17}


 53%|█████▎    | 32/60 [11:49<10:05, 21.64s/it]

{'loss': 3.5756, 'learning_rate': 9.333333333333334e-06, 'epoch': 5.33}


 55%|█████▌    | 33/60 [12:11<09:46, 21.73s/it]

{'loss': 3.5836, 'learning_rate': 9e-06, 'epoch': 5.5}


 57%|█████▋    | 34/60 [12:33<09:26, 21.77s/it]

{'loss': 3.2211, 'learning_rate': 8.666666666666668e-06, 'epoch': 5.67}


 58%|█████▊    | 35/60 [13:01<09:54, 23.77s/it]

{'loss': 3.4518, 'learning_rate': 8.333333333333334e-06, 'epoch': 5.83}


 60%|██████    | 36/60 [13:05<07:03, 17.66s/it]

{'loss': 3.4252, 'learning_rate': 8.000000000000001e-06, 'epoch': 6.0}


 62%|██████▏   | 37/60 [13:33<07:59, 20.83s/it]

{'loss': 3.4691, 'learning_rate': 7.666666666666667e-06, 'epoch': 6.17}


 63%|██████▎   | 38/60 [13:59<08:11, 22.34s/it]

{'loss': 3.3795, 'learning_rate': 7.333333333333333e-06, 'epoch': 6.33}


 65%|██████▌   | 39/60 [14:25<08:15, 23.59s/it]

{'loss': 3.4241, 'learning_rate': 7e-06, 'epoch': 6.5}


 67%|██████▋   | 40/60 [14:50<08:00, 24.03s/it]

{'loss': 3.3497, 'learning_rate': 6.666666666666667e-06, 'epoch': 6.67}


 68%|██████▊   | 41/60 [15:09<07:08, 22.54s/it]

{'loss': 3.5161, 'learning_rate': 6.333333333333333e-06, 'epoch': 6.83}


 70%|███████   | 42/60 [15:14<05:09, 17.19s/it]

{'loss': 3.3075, 'learning_rate': 6e-06, 'epoch': 7.0}


 72%|███████▏  | 43/60 [15:40<05:38, 19.94s/it]

{'loss': 3.2266, 'learning_rate': 5.666666666666667e-06, 'epoch': 7.17}


 73%|███████▎  | 44/60 [16:10<06:04, 22.75s/it]

{'loss': 3.4985, 'learning_rate': 5.333333333333334e-06, 'epoch': 7.33}


 75%|███████▌  | 45/60 [16:29<05:28, 21.87s/it]

{'loss': 3.4848, 'learning_rate': 5e-06, 'epoch': 7.5}


 77%|███████▋  | 46/60 [16:52<05:09, 22.11s/it]

{'loss': 3.3181, 'learning_rate': 4.666666666666667e-06, 'epoch': 7.67}


 78%|███████▊  | 47/60 [17:17<04:58, 22.95s/it]

{'loss': 3.4876, 'learning_rate': 4.333333333333334e-06, 'epoch': 7.83}


 80%|████████  | 48/60 [17:21<03:26, 17.18s/it]

{'loss': 3.0792, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


 82%|████████▏ | 49/60 [17:45<03:31, 19.23s/it]

{'loss': 3.2952, 'learning_rate': 3.6666666666666666e-06, 'epoch': 8.17}


 83%|████████▎ | 50/60 [18:11<03:32, 21.26s/it]

{'loss': 3.5206, 'learning_rate': 3.3333333333333333e-06, 'epoch': 8.33}


 85%|████████▌ | 51/60 [18:33<03:12, 21.44s/it]

{'loss': 3.3299, 'learning_rate': 3e-06, 'epoch': 8.5}


 87%|████████▋ | 52/60 [18:59<03:04, 23.00s/it]

{'loss': 3.2996, 'learning_rate': 2.666666666666667e-06, 'epoch': 8.67}


 88%|████████▊ | 53/60 [19:28<02:52, 24.69s/it]

{'loss': 3.4809, 'learning_rate': 2.3333333333333336e-06, 'epoch': 8.83}


 90%|█████████ | 54/60 [19:32<01:51, 18.58s/it]

{'loss': 2.9002, 'learning_rate': 2.0000000000000003e-06, 'epoch': 9.0}


 92%|█████████▏| 55/60 [19:58<01:44, 20.83s/it]

{'loss': 3.5191, 'learning_rate': 1.6666666666666667e-06, 'epoch': 9.17}


 93%|█████████▎| 56/60 [20:28<01:33, 23.39s/it]

{'loss': 3.4286, 'learning_rate': 1.3333333333333334e-06, 'epoch': 9.33}


 95%|█████████▌| 57/60 [20:50<01:09, 23.20s/it]

{'loss': 3.4148, 'learning_rate': 1.0000000000000002e-06, 'epoch': 9.5}


 97%|█████████▋| 58/60 [21:17<00:48, 24.10s/it]

{'loss': 3.2324, 'learning_rate': 6.666666666666667e-07, 'epoch': 9.67}


 98%|█████████▊| 59/60 [21:37<00:23, 23.01s/it]

{'loss': 3.2888, 'learning_rate': 3.3333333333333335e-07, 'epoch': 9.83}


100%|██████████| 60/60 [21:42<00:00, 17.63s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 60/60 [21:42<00:00, 21.71s/it]

{'loss': 3.2367, 'learning_rate': 0.0, 'epoch': 10.0}
{'train_runtime': 1302.6924, 'train_samples_per_second': 0.399, 'train_steps_per_second': 0.046, 'train_loss': 3.493728176752726, 'epoch': 10.0}





In [16]:
def generate(
    prompt=None, max_length=1024, max_new_tokens=20, greedy=True, model=model, tokenizer=tokenizer, device=device
):
    """None stands for beggining of sequence.
    NOTE si bien parece que GPT2 puede generar a partir de BOS token, la 
    documentacion es poco clara. Ademas hicimos nuestro finetuning sin BOS token.
    Entonces solo vamos a usar la funcion pasandole un contexto.

    Ver:
    https://github.com/huggingface/transformers/issues/3311#issuecomment-601264426
    https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/generate_unconditional_samples.py#L60
    """
    do_sample = False if greedy else True
    # model.eval() to set dropout and batch normalization layers to evaluation 
    # mode before running inference
    if prompt:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        model.eval()
        outputs = model.generate(input_ids, do_sample=do_sample, max_new_tokens=max_new_tokens)
    else:
        model.eval()
        outputs = model.generate(do_sample=do_sample, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Summarize

In [17]:
generate("Microsoft's CEO is ", greedy=False)

["Microsoft's CEO is iktkej, as do three other business partners:\n\n\n-\nAnd more,"]

In [18]:
import re

In [19]:
non_greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=False)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            non_greedy_generated_descriptions.append(generated_desc)

In [20]:
greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=True)
        if len(re.split(doc["doc"], generated_doc[0])) >= 2:
            generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]",list(map(lambda x : x.strip(), re.split(doc["doc"], generated_doc[0])))[1]))[0])
            greedy_generated_descriptions.append(generated_desc)

In [21]:
re.split("\.", "hol.")

['hol', '']

In [22]:
non_greedy_generated_descriptions

['Yuno can be described as 【〈〉」',
 'Asta can be described as iliar',
 'Lily can be described as Rentura',
 'Yuno can be described as iced-up, strong, but extremely well-rounded, although he is not that talented now',
 'Orsi can be described as ~~an arachronistic',
 'Asta can be described as iced and has a huge appetite for the little things',
 'Lily can be described as iced up at night',
 'Finral can be described as 하검',
 'Nozel can be described as Â a strong peasant, but the only problem is that Asta must be defeated quickly before they have',
 'Gordon can be described as iced or strong',
 'Rill can be described as ersatz, but not being able to wield the magic magic to stop Yuno',
 'Jack can be described as ikiny, a man with powers that are difficult to control',
 'Dorothy can be described as vernacular',
 'Yami can be described as 【Magic Mage‡】',
 'Fuegoleon can be described as urchin prince, as having a dark, brown eyes',
 'Yuno can be described as 仝花期, the best magician ever, but t

In [23]:
greedy_generated_descriptions

['Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Lily can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Orsi can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Lily can be described as',
 'Finral can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Nozel can be described as',
 'Gordon can be described as',
 'Rill can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Jack can be described as',
 'Dorothy can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yami can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Fuegoleon can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Sekke can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Charlotte can be described as',
 'Asta can be described as 『『『『『『『『『『『『『『『『『『『『',
 'William can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yuno can be described as 『『『『『『『『『『『『『『『『『『『『',
 'Yami can be described as 『『『『『『

In [24]:
non_greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=False)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        non_greedy_generated_descriptions_simple.append(generated_desc)

In [25]:
greedy_generated_descriptions_simple = []
for doc in nonTokenizedWsubj_docs[:10]:
    for subject in doc["subjects"]:
        generated_doc = generate(subject + " can be described as ", max_length=1024, greedy=True)
        generated_desc = re.sub(r"\s+", " ", (re.split(r"[.;:!?]", generated_doc[0].strip())[0]))
        greedy_generated_descriptions_simple.append(generated_desc)

In [26]:
non_greedy_generated_descriptions_simple

['Yuno can be described as o arachnodora',
 'Asta can be described as 겝는면 춃을 겝�',
 'Lily can be described as للان سـل دلله شي ولا',
 'Yuno can be described as 귽버벅 as having a small but important role in that role',
 'Orsi can be described as the most amazing teacher in the school',
 'Asta can be described as',
 'Lily can be described as icky as ‐‘‘‘‘‘‘‘‘',
 'Finral can be described as ɼpɼpɹ',
 'Nozel can be described as icky looking',
 'Gordon can be described as 『Aqua Nation) a large force of elves that could annihilate any opponent, except by killing',
 'Rill can be described as well',
 'Jack can be described as iced and a cheerful face, but there are no easy facts to pin down',
 'Dorothy can be described as vernacular',
 'Yami can be described as 他幖幖人。 In a world where there are 2 classes, a',
 'Fuegoleon can be described as Ânélangei and ‘hélinnie’',
 'Yuno can be described as 『Mika』 or 駆騻呼『Nagami』 or �',
 'Sekke can be described as icky, but he is actually a normal, decent guy',
 

In [27]:
greedy_generated_descriptions_simple

['Yuno can be described as 『自己』 and is a member of the group that is also a member of',
 'Asta can be described as a “a“a“a“a“a“a',
 'Lily can be described as icky, but she is also a bit more like a little girl',
 'Yuno can be described as 『自己』 and is a member of the group that is also a member of',
 'Orsi can be described as icky, but he is also a very good person',
 'Asta can be described as a “a“a“a“a“a“a',
 'Lily can be described as icky, but she is also a bit more like a little girl',
 'Finral can be described as a “a “a “a “a “a “a',
 'Nozel can be described as iced up and a bit of a bit of a bit of a bit of a bit of a bit',
 'Gordon can be described as a “a “a “a “a “a “a',
 'Rill can be described as a “a “a “a “a “a “a',
 'Jack can be described as a “a “a “a “a “a “a',
 'Dorothy can be described as icky, but she is also a very good person',
 'Yami can be described as 『自己』 and is a member of the group that is also a member of',
 'Fuegoleon can be described as ichai-like',
 'Yuno