# Summarizing (w/ GPT-2)

## Packages

In [1]:
from utils.json_utils import read_json

## Tokenizing

In [2]:
# Load the pre-tokenized Black Clover sentences. Judging by the displayed
# output, this is a list of dicts with a "subjects" list and a "tokens" list.
sentences_w_subjects_tokenized = read_json("9_non_lemmatized_tokenized_sentences_black_clover.json")
# Preview only the first few records instead of dumping the whole corpus into
# the notebook output.
sentences_w_subjects_tokenized[:4]

[{'subjects': ['Yuno'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   

In [3]:
# Collapse each token list into a single space-separated string, keeping the
# subjects untouched. The variable is rebound under the same name because the
# later cells read it, so the isinstance guard keeps this cell idempotent:
# re-running it on already-joined strings would otherwise insert a space
# between every character.
sentences_w_subjects_tokenized = [
    {
        "subjects": sentence["subjects"],
        "tokens": (
            sentence["tokens"]
            if isinstance(sentence["tokens"], str)
            else " ".join(sentence["tokens"])
        ),
    }
    for sentence in sentences_w_subjects_tokenized
]
# Show a short preview rather than the full list.
sentences_w_subjects_tokenized[:8]

[{'subjects': ['Yuno'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Asta'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Lily'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Asta'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Yuno'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Yuno'],
  'tokens': 'Asta tries to show off Asta skills , but Yuno outshines Asta with Asta magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Asta t

In [4]:
# Largest len() among the "tokens" entries (0 when there are no sentences).
# When "tokens" holds joined strings this is a character count, not a token count.
max_length = max(
    (len(entry["tokens"]) for entry in sentences_w_subjects_tokenized),
    default=0,
)
max_length

610

## Transformer

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Checkpoint identifier passed to both the tokenizer and the model loaders below.
model_checkpoint = "gpt2"

In [7]:
# Load the tokenizer that matches the checkpoint; use_fast=True requests the
# fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [8]:
# Load the causal language model for the same checkpoint. The tokenizer's EOS
# token id is passed as pad_token_id so the model has a padding id configured
# for generation.
model = AutoModelForCausalLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.eos_token_id)

In [9]:
# Use the current CUDA device when a GPU is visible; otherwise fall back to CPU.
if torch.cuda.is_available():
    device = f"cuda:{torch.cuda.current_device()}"
else:
    device = "cpu"

In [10]:
def generate(
    prompt=None, max_new_tokens=20, greedy=True, model=model, tokenizer=tokenizer, device=device
):
    """Generate a continuation with the language model and decode it to text.

    Args:
        prompt: Context string to continue. ``None`` (or any other falsy value,
            such as an empty string) stands for "beginning of sequence": the
            model is asked to generate without any input ids.
        max_new_tokens: Value forwarded to ``model.generate`` as
            ``max_new_tokens``.
        greedy: When true, decoding is deterministic (``do_sample=False``);
            otherwise the model samples (``do_sample=True``).
        model: Model used for generation; moved to ``device`` and put in eval
            mode on every call.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the model and the encoded prompt are placed on.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated sequences,
        with special tokens skipped.

    NOTE: although GPT-2 appears to be able to generate from the BOS token, the
    documentation is unclear about it. In addition, our fine-tuning was done
    without a BOS token, so we only use this function with a context.

    See:
    https://github.com/huggingface/transformers/issues/3311#issuecomment-601264426
    https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/generate_unconditional_samples.py#L60
    """
    # The encoded prompt is moved to `device`, so the weights must live there
    # too; otherwise generation fails with a device mismatch on CUDA machines.
    model.to(device)
    # Switch dropout and batch-normalization layers to evaluation mode before
    # running inference.
    model.eval()
    generation_kwargs = {"do_sample": not greedy, "max_new_tokens": max_new_tokens}
    if prompt:
        input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)
        outputs = model.generate(input_ids, **generation_kwargs)
    else:
        # No context: let the model start from its own beginning-of-sequence.
        outputs = model.generate(**generation_kwargs)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

## Summarize

In [11]:
# Sampling (greedy=False) is stochastic; seed PyTorch's RNG first so that
# Restart & Run All reproduces the same sample.
torch.manual_seed(0)
generate("Microsoft's CEO is ", greedy=False)

2023-01-28 15:26:46.037387: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-28 15:26:46.908981: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-28 15:26:46.909028: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


["Microsoft's CEO is uring a future of free and open source solutions for PCs based on the Linux kernel.\n\nA"]

In [12]:
# For every sentence and each of its subjects, build a prompt that quotes the
# sentence and asks for a description of the subject, then print the greedy
# completion between separator lines.
separator = "\n\n--------\n\n"
for sentence in sentences_w_subjects_tokenized:
    for subject in sentence["subjects"]:
        predict = "".join((
            'In Black Clover, "',
            sentence["tokens"],
            '". In the previous text, "',
            subject,
            '" from Black Clover can be described as ',
        ))
        completion = generate(predict, greedy=True)[0]
        print(separator, completion, separator)



--------

 In Black Clover, "a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta.". In the previous text, "Yuno" from Black Clover can be described as  a priest who takes two babies abandoned outside his church.
In Black Clover, "a priest 

--------




--------

 In Black Clover, "a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta.". In the previous text, "Asta" from Black Clover can be described as  a "young girl" who is "a little girl with a big head and a big heart 

--------




--------

 In Black Clover, "Fifteen years later, Asta proposes to Sister Lily, who refuses repeatedly.". In the previous text, "Lily" from Black Clover can be described as  a woman who is a "strong, strong, strong woman" who is "strong, strong 

--------




--------

 In Black Clover, "Fifteen years later, Asta proposes to S