# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json
from datasets import Dataset
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


## Loading data and building training sentences

In [2]:
# Load the source documents. Each entry is later read through its "doc" key
# (the document text) and its "subjects" key (the subject names iterated per document).
nonTokenizedWsubj_docs = read_json("18_docs_with_subjects_non_coref.json")

In [3]:
# Load the adjective -> definition mapping that the fine-tuning sentences are built from.
varied_set_adjectives = read_json("14_varied_set_adjectives_definitions.json")

In [4]:
# One fine-tuning sentence per adjective, shaped as
# "Someone can be described as <adjective>: <definition>".
training_sents = [
    "Someone can be described as " + adjective + ": " + definition
    for adjective, definition in varied_set_adjectives.items()
]

## Transformer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments
import torch

2023-01-23 23:05:12.349620: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-23 23:05:12.540630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-23 23:05:12.540643: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-23 23:05:13.141959: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
# Checkpoint name shared by the tokenizer, the model and the training output directory.
model_checkpoint = "gpt2"

In [7]:
# Load the tokenizer that matches the checkpoint, requesting the fast implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
# Reuse the end-of-sequence token as the padding token so batches can be padded.
# NOTE(review): presumably needed because the GPT-2 tokenizer defines no pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal (next-token) language modeling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [9]:
def tokenize_fn(doc):
    """Tokenize a single text, truncating it to at most 1024 tokens."""
    return tokenizer(doc, truncation=True, max_length=1024)


# Tokenize every training sentence; the result is a plain Python list.
tokenized_dataset = [tokenize_fn(sentence) for sentence in training_sents]

In [10]:
# Turn the list of tokenizer outputs into a Dataset, going through an
# intermediate pandas DataFrame.
tokenized_dataset = Dataset.from_pandas(pd.DataFrame(tokenized_dataset))

In [11]:
# Load the pretrained causal language model for the checkpoint. The pad token id
# is set to the EOS token id, mirroring the tokenizer's pad_token = eos_token setup.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

In [12]:
# Target device string: the current CUDA device when one is available, else the CPU.
if torch.cuda.is_available():
    device = f"cuda:{torch.cuda.current_device()}"
else:
    device = "cpu"

In [13]:
# Fine-tuning hyperparameters. The first positional argument is the output
# directory, named after the checkpoint.
training_args = TrainingArguments(
    f"{model_checkpoint}-finetuned-variedadj-black-clover",
    num_train_epochs=100,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    do_eval=True, # run evaluation on the eval dataset
    evaluation_strategy="steps", # evaluate every `eval_steps` steps
    eval_steps=100,
    save_steps=100, # checkpoint the model every 100 steps
    logging_dir='./logs', # logging
    logging_strategy="steps",
    logging_steps=1, # log the loss at every optimization step
    fp16=False, # float16 training (only on CUDA)
    push_to_hub=False,
)

In [14]:
# NOTE: the same dataset is passed as train_dataset and eval_dataset, so the
# evaluation loss is measured on the training data, not on held-out data.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset, # append .select(range(0, 128)) to train on a subset
    eval_dataset=tokenized_dataset, # append .select(range(0, 128)) to evaluate on a subset
)

In [15]:
# Run the fine-tuning and keep the value returned by trainer.train().
# NOTE: the recorded run below ends in a KeyboardInterrupt (around step 82 of 500);
# when training is interrupted like that, train_output is never assigned.
train_output = trainer.train()

***** Running training *****
  Num examples = 50
  Num Epochs = 100
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 10
  Gradient Accumulation steps = 1
  Total optimization steps = 500
  0%|          | 0/500 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 1/500 [00:01<15:17,  1.84s/it]

{'loss': 4.9619, 'learning_rate': 1.9960000000000002e-05, 'epoch': 0.2}


  0%|          | 2/500 [00:03<14:05,  1.70s/it]

{'loss': 4.8097, 'learning_rate': 1.9920000000000002e-05, 'epoch': 0.4}


  1%|          | 3/500 [00:04<13:30,  1.63s/it]

{'loss': 4.2993, 'learning_rate': 1.9880000000000003e-05, 'epoch': 0.6}


  1%|          | 4/500 [00:06<12:59,  1.57s/it]

{'loss': 4.3009, 'learning_rate': 1.9840000000000003e-05, 'epoch': 0.8}


  1%|          | 5/500 [00:07<12:45,  1.55s/it]

{'loss': 3.9344, 'learning_rate': 1.98e-05, 'epoch': 1.0}


  1%|          | 6/500 [00:09<12:59,  1.58s/it]

{'loss': 3.6921, 'learning_rate': 1.976e-05, 'epoch': 1.2}


  1%|▏         | 7/500 [00:11<12:46,  1.56s/it]

{'loss': 3.3057, 'learning_rate': 1.972e-05, 'epoch': 1.4}


  2%|▏         | 8/500 [00:12<12:35,  1.53s/it]

{'loss': 3.4156, 'learning_rate': 1.968e-05, 'epoch': 1.6}


  2%|▏         | 9/500 [00:14<13:12,  1.62s/it]

{'loss': 3.2616, 'learning_rate': 1.9640000000000002e-05, 'epoch': 1.8}


  2%|▏         | 10/500 [00:15<12:49,  1.57s/it]

{'loss': 2.8447, 'learning_rate': 1.9600000000000002e-05, 'epoch': 2.0}


  2%|▏         | 11/500 [00:17<12:57,  1.59s/it]

{'loss': 2.6928, 'learning_rate': 1.9560000000000002e-05, 'epoch': 2.2}


  2%|▏         | 12/500 [00:19<13:28,  1.66s/it]

{'loss': 2.9024, 'learning_rate': 1.9520000000000003e-05, 'epoch': 2.4}


  3%|▎         | 13/500 [00:20<12:58,  1.60s/it]

{'loss': 2.8498, 'learning_rate': 1.948e-05, 'epoch': 2.6}


  3%|▎         | 14/500 [00:22<12:42,  1.57s/it]

{'loss': 2.4809, 'learning_rate': 1.944e-05, 'epoch': 2.8}


  3%|▎         | 15/500 [00:23<12:21,  1.53s/it]

{'loss': 2.5776, 'learning_rate': 1.94e-05, 'epoch': 3.0}


  3%|▎         | 16/500 [00:25<12:55,  1.60s/it]

{'loss': 2.5543, 'learning_rate': 1.936e-05, 'epoch': 3.2}


  3%|▎         | 17/500 [00:27<12:43,  1.58s/it]

{'loss': 2.2065, 'learning_rate': 1.932e-05, 'epoch': 3.4}


  4%|▎         | 18/500 [00:28<12:26,  1.55s/it]

{'loss': 2.4275, 'learning_rate': 1.9280000000000002e-05, 'epoch': 3.6}


  4%|▍         | 19/500 [00:30<12:44,  1.59s/it]

{'loss': 2.2961, 'learning_rate': 1.9240000000000002e-05, 'epoch': 3.8}


  4%|▍         | 20/500 [00:31<12:34,  1.57s/it]

{'loss': 2.4181, 'learning_rate': 1.9200000000000003e-05, 'epoch': 4.0}


  4%|▍         | 21/500 [00:33<12:31,  1.57s/it]

{'loss': 2.4554, 'learning_rate': 1.916e-05, 'epoch': 4.2}


  4%|▍         | 22/500 [00:34<12:17,  1.54s/it]

{'loss': 2.1241, 'learning_rate': 1.912e-05, 'epoch': 4.4}


  5%|▍         | 23/500 [00:36<12:36,  1.59s/it]

{'loss': 2.144, 'learning_rate': 1.908e-05, 'epoch': 4.6}


  5%|▍         | 24/500 [00:37<12:21,  1.56s/it]

{'loss': 2.2665, 'learning_rate': 1.904e-05, 'epoch': 4.8}


  5%|▌         | 25/500 [00:39<12:53,  1.63s/it]

{'loss': 2.1256, 'learning_rate': 1.9e-05, 'epoch': 5.0}


  5%|▌         | 26/500 [00:41<12:51,  1.63s/it]

{'loss': 1.8361, 'learning_rate': 1.896e-05, 'epoch': 5.2}


  5%|▌         | 27/500 [00:42<12:44,  1.62s/it]

{'loss': 1.9749, 'learning_rate': 1.8920000000000002e-05, 'epoch': 5.4}


  6%|▌         | 28/500 [00:44<12:13,  1.55s/it]

{'loss': 2.1573, 'learning_rate': 1.8880000000000002e-05, 'epoch': 5.6}


  6%|▌         | 29/500 [00:46<12:44,  1.62s/it]

{'loss': 1.9688, 'learning_rate': 1.884e-05, 'epoch': 5.8}


  6%|▌         | 30/500 [00:47<12:15,  1.57s/it]

{'loss': 2.1615, 'learning_rate': 1.88e-05, 'epoch': 6.0}


  6%|▌         | 31/500 [00:49<12:10,  1.56s/it]

{'loss': 2.0908, 'learning_rate': 1.876e-05, 'epoch': 6.2}


  6%|▋         | 32/500 [00:50<11:58,  1.54s/it]

{'loss': 1.7803, 'learning_rate': 1.8720000000000004e-05, 'epoch': 6.4}


  7%|▋         | 33/500 [00:52<12:13,  1.57s/it]

{'loss': 1.6771, 'learning_rate': 1.8680000000000004e-05, 'epoch': 6.6}


  7%|▋         | 34/500 [00:53<11:53,  1.53s/it]

{'loss': 1.8444, 'learning_rate': 1.864e-05, 'epoch': 6.8}


  7%|▋         | 35/500 [00:55<12:28,  1.61s/it]

{'loss': 2.0885, 'learning_rate': 1.86e-05, 'epoch': 7.0}


  7%|▋         | 36/500 [00:56<12:08,  1.57s/it]

{'loss': 1.7445, 'learning_rate': 1.8560000000000002e-05, 'epoch': 7.2}


  7%|▋         | 37/500 [00:58<11:57,  1.55s/it]

{'loss': 2.0763, 'learning_rate': 1.8520000000000002e-05, 'epoch': 7.4}


  8%|▊         | 38/500 [00:59<11:46,  1.53s/it]

{'loss': 1.8169, 'learning_rate': 1.8480000000000003e-05, 'epoch': 7.6}


  8%|▊         | 39/500 [01:01<11:52,  1.55s/it]

{'loss': 1.8133, 'learning_rate': 1.8440000000000003e-05, 'epoch': 7.8}


  8%|▊         | 40/500 [01:03<12:30,  1.63s/it]

{'loss': 1.7231, 'learning_rate': 1.8400000000000003e-05, 'epoch': 8.0}


  8%|▊         | 41/500 [01:05<12:36,  1.65s/it]

{'loss': 1.6839, 'learning_rate': 1.8360000000000004e-05, 'epoch': 8.2}


  8%|▊         | 42/500 [01:06<12:59,  1.70s/it]

{'loss': 1.8367, 'learning_rate': 1.832e-05, 'epoch': 8.4}


  9%|▊         | 43/500 [01:08<12:32,  1.65s/it]

{'loss': 1.737, 'learning_rate': 1.828e-05, 'epoch': 8.6}


  9%|▉         | 44/500 [01:09<12:21,  1.63s/it]

{'loss': 1.6827, 'learning_rate': 1.824e-05, 'epoch': 8.8}


  9%|▉         | 45/500 [01:11<12:01,  1.59s/it]

{'loss': 1.7066, 'learning_rate': 1.8200000000000002e-05, 'epoch': 9.0}


  9%|▉         | 46/500 [01:13<12:02,  1.59s/it]

{'loss': 1.7072, 'learning_rate': 1.8160000000000002e-05, 'epoch': 9.2}


  9%|▉         | 47/500 [01:14<11:48,  1.56s/it]

{'loss': 1.6174, 'learning_rate': 1.8120000000000003e-05, 'epoch': 9.4}


 10%|▉         | 48/500 [01:16<12:20,  1.64s/it]

{'loss': 1.4696, 'learning_rate': 1.8080000000000003e-05, 'epoch': 9.6}


 10%|▉         | 49/500 [01:18<12:24,  1.65s/it]

{'loss': 1.644, 'learning_rate': 1.8040000000000003e-05, 'epoch': 9.8}


 10%|█         | 50/500 [01:19<12:06,  1.61s/it]

{'loss': 1.4829, 'learning_rate': 1.8e-05, 'epoch': 10.0}


 10%|█         | 51/500 [01:21<12:32,  1.67s/it]

{'loss': 1.3173, 'learning_rate': 1.796e-05, 'epoch': 10.2}


 10%|█         | 52/500 [01:22<12:01,  1.61s/it]

{'loss': 1.3602, 'learning_rate': 1.792e-05, 'epoch': 10.4}


 11%|█         | 53/500 [01:24<11:36,  1.56s/it]

{'loss': 1.6781, 'learning_rate': 1.788e-05, 'epoch': 10.6}


 11%|█         | 54/500 [01:25<11:25,  1.54s/it]

{'loss': 1.3046, 'learning_rate': 1.7840000000000002e-05, 'epoch': 10.8}


 11%|█         | 55/500 [01:27<11:20,  1.53s/it]

{'loss': 1.6019, 'learning_rate': 1.7800000000000002e-05, 'epoch': 11.0}


 11%|█         | 56/500 [01:28<11:15,  1.52s/it]

{'loss': 1.4191, 'learning_rate': 1.7760000000000003e-05, 'epoch': 11.2}


 11%|█▏        | 57/500 [01:30<11:02,  1.50s/it]

{'loss': 1.5522, 'learning_rate': 1.7720000000000003e-05, 'epoch': 11.4}


 12%|█▏        | 58/500 [01:31<11:22,  1.54s/it]

{'loss': 1.4792, 'learning_rate': 1.768e-05, 'epoch': 11.6}


 12%|█▏        | 59/500 [01:33<11:17,  1.54s/it]

{'loss': 1.3421, 'learning_rate': 1.764e-05, 'epoch': 11.8}


 12%|█▏        | 60/500 [01:35<11:57,  1.63s/it]

{'loss': 1.1951, 'learning_rate': 1.76e-05, 'epoch': 12.0}


 12%|█▏        | 61/500 [01:36<11:38,  1.59s/it]

{'loss': 1.4385, 'learning_rate': 1.756e-05, 'epoch': 12.2}


 12%|█▏        | 62/500 [01:38<11:25,  1.56s/it]

{'loss': 1.3177, 'learning_rate': 1.752e-05, 'epoch': 12.4}


 13%|█▎        | 63/500 [01:39<11:12,  1.54s/it]

{'loss': 1.1673, 'learning_rate': 1.7480000000000002e-05, 'epoch': 12.6}


 13%|█▎        | 64/500 [01:41<11:40,  1.61s/it]

{'loss': 1.2288, 'learning_rate': 1.7440000000000002e-05, 'epoch': 12.8}


 13%|█▎        | 65/500 [01:43<12:02,  1.66s/it]

{'loss': 1.2777, 'learning_rate': 1.7400000000000003e-05, 'epoch': 13.0}


 13%|█▎        | 66/500 [01:44<11:21,  1.57s/it]

{'loss': 1.345, 'learning_rate': 1.736e-05, 'epoch': 13.2}


 13%|█▎        | 67/500 [01:46<11:07,  1.54s/it]

{'loss': 1.304, 'learning_rate': 1.732e-05, 'epoch': 13.4}


 14%|█▎        | 68/500 [01:48<11:53,  1.65s/it]

{'loss': 1.3821, 'learning_rate': 1.728e-05, 'epoch': 13.6}


 14%|█▍        | 69/500 [01:49<11:59,  1.67s/it]

{'loss': 1.2378, 'learning_rate': 1.724e-05, 'epoch': 13.8}


 14%|█▍        | 70/500 [01:51<11:35,  1.62s/it]

{'loss': 1.1521, 'learning_rate': 1.72e-05, 'epoch': 14.0}


 14%|█▍        | 71/500 [01:52<11:20,  1.59s/it]

{'loss': 1.1679, 'learning_rate': 1.7160000000000002e-05, 'epoch': 14.2}


 14%|█▍        | 72/500 [01:54<11:44,  1.65s/it]

{'loss': 1.2431, 'learning_rate': 1.7120000000000002e-05, 'epoch': 14.4}


 15%|█▍        | 73/500 [01:56<11:19,  1.59s/it]

{'loss': 1.0745, 'learning_rate': 1.7080000000000002e-05, 'epoch': 14.6}


 15%|█▍        | 74/500 [01:57<11:24,  1.61s/it]

{'loss': 1.3521, 'learning_rate': 1.704e-05, 'epoch': 14.8}


 15%|█▌        | 75/500 [01:59<11:00,  1.56s/it]

{'loss': 1.2117, 'learning_rate': 1.7e-05, 'epoch': 15.0}


 15%|█▌        | 76/500 [02:00<10:51,  1.54s/it]

{'loss': 1.1343, 'learning_rate': 1.696e-05, 'epoch': 15.2}


 15%|█▌        | 77/500 [02:02<10:55,  1.55s/it]

{'loss': 1.0197, 'learning_rate': 1.692e-05, 'epoch': 15.4}


 16%|█▌        | 78/500 [02:03<10:52,  1.55s/it]

{'loss': 1.0895, 'learning_rate': 1.688e-05, 'epoch': 15.6}


 16%|█▌        | 79/500 [02:05<10:48,  1.54s/it]

{'loss': 1.3487, 'learning_rate': 1.684e-05, 'epoch': 15.8}


 16%|█▌        | 80/500 [02:07<11:30,  1.64s/it]

{'loss': 1.1743, 'learning_rate': 1.6800000000000002e-05, 'epoch': 16.0}


 16%|█▌        | 81/500 [02:09<12:05,  1.73s/it]

{'loss': 0.9679, 'learning_rate': 1.6760000000000002e-05, 'epoch': 16.2}


 16%|█▋        | 82/500 [02:10<12:13,  1.76s/it]

{'loss': 0.9699, 'learning_rate': 1.672e-05, 'epoch': 16.4}


KeyboardInterrupt: 

In [None]:
def generate(
    prompt=None, max_length=1024, max_new_tokens=20, greedy=True, model=model, tokenizer=tokenizer, device=device
):
    """Generate a continuation of ``prompt`` with the language model.

    NOTE: although GPT-2 seems able to generate starting from the BOS token
    alone, the documentation is unclear about it, and the fine-tuning here was
    done without a BOS token. The function is therefore only meant to be
    called with a context.

    See:
    https://github.com/huggingface/transformers/issues/3311#issuecomment-601264426
    https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/generate_unconditional_samples.py#L60

    Args:
        prompt: Context to continue. ``None`` (or an empty string) stands for
            the beginning of sequence, i.e. generation without a context.
        max_length: Upper bound on the total sequence length (prompt plus
            generated tokens). A prompt that would not leave room for
            ``max_new_tokens`` is trimmed from the left, so the end of the
            prompt (the cue to be continued) is always kept.
        max_new_tokens: Number of tokens to generate after the prompt.
        greedy: ``True`` for greedy decoding, ``False`` for sampling.
        model: Model used for generation.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the encoded prompt is moved to before generation.

    Returns:
        The decoded output sequences (special tokens removed) as a list of
        strings, as returned by ``tokenizer.batch_decode``.
    """
    do_sample = not greedy
    # model.eval() sets dropout and batch normalization layers to evaluation
    # mode before running inference.
    model.eval()
    if not prompt:
        outputs = model.generate(do_sample=do_sample, max_new_tokens=max_new_tokens)
    else:
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        # Token budget left for the prompt once the new tokens are accounted for.
        prompt_budget = max_length - max_new_tokens
        if prompt_budget > 0 and input_ids.shape[-1] > prompt_budget:
            # Keep the trailing tokens: the text to be continued is at the end.
            input_ids = input_ids[:, -prompt_budget:]
        outputs = model.generate(input_ids.to(device), do_sample=do_sample, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Summarize

In [None]:
# Smoke test: sample (non-greedy) a short continuation for an arbitrary prompt.
generate("Microsoft's CEO is ", greedy=False)

["Microsoft's CEO is \xa0familiar with the state of the smartphone industry - how we learn.\nToday's post looks"]

In [None]:
import re

In [None]:
# Sampled (non-greedy) descriptions: for every subject of every document, prompt
# the model with "<doc> <subject> can be described as " and keep the first clause
# that follows the document text in the generated output.
non_greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs:
    # The document text must be matched literally, so it is escaped: used raw as
    # a pattern, characters such as "(", "?" or "." would be read as regex syntax.
    doc_pattern = re.escape(doc["doc"])
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=False)
        pieces = re.split(doc_pattern, generated_doc[0])
        # Skip outputs in which the document text cannot be found.
        if len(pieces) >= 2:
            continuation = pieces[1].strip()
            # Keep the text up to the first sentence/clause delimiter.
            first_clause = re.split(r"[.;:!?]", continuation)[0]
            # Collapse runs of whitespace (including newlines) into single spaces.
            generated_desc = re.sub(r"\s+", " ", first_clause)
            non_greedy_generated_descriptions.append(generated_desc)

In [None]:
# Greedy descriptions for the first 10 documents: for every subject, prompt the
# model with "<doc> <subject> can be described as " and keep the first clause
# that follows the document text in the generated output.
greedy_generated_descriptions = []
for doc in nonTokenizedWsubj_docs[:10]:
    # The document text must be matched literally, so it is escaped: used raw as
    # a pattern, characters such as "(", "?" or "." would be read as regex syntax.
    doc_pattern = re.escape(doc["doc"])
    for subject in doc["subjects"]:
        generated_doc = generate(doc["doc"] + " " + subject + " can be described as ", max_length=1024, greedy=True)
        pieces = re.split(doc_pattern, generated_doc[0])
        # Skip outputs in which the document text cannot be found.
        if len(pieces) >= 2:
            continuation = pieces[1].strip()
            # Keep the text up to the first sentence/clause delimiter.
            first_clause = re.split(r"[.;:!?]", continuation)[0]
            # Collapse runs of whitespace (including newlines) into single spaces.
            generated_desc = re.sub(r"\s+", " ", first_clause)
            greedy_generated_descriptions.append(generated_desc)

In [None]:
# Scratch check: splitting on a literal dot yields a trailing empty string when
# the text ends with the separator. Raw string avoids the invalid "\." escape.
re.split(r"\.", "hol.")

['hol', '']

In [None]:
# Display the descriptions produced with sampling (greedy=False).
non_greedy_generated_descriptions

['Yuno can be described as 『Darkness of the Shadow』',
 'Asta can be described as urchin-like',
 'Lily can be described as 『I Am Jin',
 'Yuno can be described as very quiet and composed',
 'Orsi can be described as 『Kung Fu Artist』or even is the "Chosen One of Magic Kingdom"',
 'Asta can be described as disappointed, even annoyed with the way the boys and girls fare',
 'Lily can be described as ~~very good~~',
 'Finral can be described as iced, as he does little but sing, as he is not strong and is often not present with',
 'Nozel can be described as a calm and calm type of man, whose calmness means at ease',
 'Gordon can be described as ________',
 'Rill can be described as 仙平巟翼九, a common form of Chinese 来',
 'Jack can be described as a charismatic man of the heart whose personal charisma can rival that of a high priest',
 'Dorothy can be described as 『Little Evil』',
 'Yami can be described as pessimistic and pessimistic',
 'Fuegoleon can be described as a cunning sorcerer who was abl

In [None]:
# Display the descriptions produced with greedy decoding (greedy=True).
greedy_generated_descriptions

['Yuno can be described as as strong as a lion, and has a strong sense of smell',
 'Asta can be described as a kind of devilish, but not evil',
 'Lily can be described as a kind of devilish spirit',
 'Yuno can be described as a very good student, and he is able to learn a lot from his classmates',
 'Orsi can be described as a very good student, and is very knowledgeable about the Magic Knights',
 'Asta can be described as a very strong and strong man',
 'Lily can be described as very cheerful and cheerful',
 'Finral can be described as a strong and experienced fighter',
 'Nozel can be described as as strong as he is, and he is able to defeat Sekke with ease',
 'Gordon can be described as a strong man',
 'Rill can be described as a strong and experienced fighter',
 'Jack can be described as a strong man',
 'Dorothy can be described as a strong woman',
 'Yami can be described as a strong and powerful man',
 'Fuegoleon can be described as a strong and powerful magician',
 'Yuno can be des