# Summarizing (with DistilBERT)

## Packages

In [1]:
import numpy as np

In [2]:
from utils.json_utils import read_json

## Tokenizing

In [3]:
# Load the tokenized sentences; each record holds a "subjects" list and the
# sentence's "tokens" list.
sentences_w_subjects_tokenized = read_json(
    "9_non_lemmatized_tokenized_sentences_black_clover.json"
)
sentences_w_subjects_tokenized

[{'subjects': ['Yuno'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['a',
   'priest',
   'takes',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'inside',
   'and',
   'discovers',
   'two',
   'babies',
   'abandoned',
   'outside',
   'his',
   'church',
   'names',
   'to',
   'be',
   'Yuno',
   'and',
   'Asta',
   '.']},
 {'subjects': ['Lily'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   ',',
   'who',
   'refuses',
   'repeatedly',
   '.']},
 {'subjects': ['Asta'],
  'tokens': ['Fifteen',
   'years',
   'later',
   ',',
   'Asta',
   'proposes',
   'to',
   'Sister',
   'Lily',
   

In [4]:
# Collapse each token list into a single space-separated string so it can be
# concatenated into a prompt later on.
# The name is rebound in place, so guard against re-running this cell: joining
# an already-joined string would insert a space between every character.
sentences_w_subjects_tokenized = [
    {
        "subjects": sentence["subjects"],
        "tokens": (
            sentence["tokens"]
            if isinstance(sentence["tokens"], str)
            else " ".join(sentence["tokens"])
        ),
    }
    for sentence in sentences_w_subjects_tokenized
]
sentences_w_subjects_tokenized

[{'subjects': ['Yuno'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Asta'],
  'tokens': 'a priest takes two babies abandoned outside his church inside and discovers two babies abandoned outside his church names to be Yuno and Asta .'},
 {'subjects': ['Lily'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Asta'],
  'tokens': 'Fifteen years later , Asta proposes to Sister Lily , who refuses repeatedly .'},
 {'subjects': ['Yuno'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Yuno and the other orphans criticize Asta and point out Yuno lack of magic .'},
 {'subjects': ['Yuno'],
  'tokens': 'Asta tries to show off Asta skills , but Yuno outshines Asta with Asta magic .'},
 {'subjects': ['Asta'],
  'tokens': 'Asta t

In [5]:
# Length of the longest joined sentence; 0 when there are no sentences.
# NOTE(review): "tokens" is a joined string at this point, so this counts
# characters rather than tokens -- confirm that is the intended measure.
max_length = max(
    (len(entry["tokens"]) for entry in sentences_w_subjects_tokenized),
    default=0,
)
max_length

610

## Transformer

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Checkpoint name passed to both the tokenizer and the model loaders below.
model_checkpoint = "distilbert-base-cased"

In [8]:
# Tokenizer matching the checkpoint; the fast implementation is requested.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [9]:
# Load the checkpoint with its masked-language-modelling head.
# BERT-style tokenizers define no EOS token, so `tokenizer.eos_token_id` is
# None and passing it would overwrite the checkpoint's padding id with None.
# Use the tokenizer's actual padding id instead.
model = AutoModelForMaskedLM.from_pretrained(
    model_checkpoint, pad_token_id=tokenizer.pad_token_id)

In [10]:
# Use the currently selected CUDA device when one is available; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = f"cuda:{torch.cuda.current_device()}"
else:
    device = "cpu"

In [11]:
def predict_mask(input_str, top_k=5):
    """Return the most likely fillers for the first mask token in ``input_str``.

    We take the long route (tokenizer + explicit forward pass) instead of
    using a pipeline.

    Args:
        input_str: Text containing at least one mask token of the module-level
            ``tokenizer``. When several are present only the first is scored.
        top_k: Number of candidates to return. Defaults to 5.

    Returns:
        A list of ``top_k`` dicts ordered by decreasing probability. Each has
        the decoded ``"token"`` (str) and its ``"prob"`` (a 0-d CPU tensor that
        carries no autograd graph).

    Raises:
        ValueError: If the tokenized input contains no mask token.
    """
    # Put the inputs on the model's device so the call works on CPU and GPU.
    inputs = tokenizer(input_str, return_tensors="pt").to(model.device)
    # Boolean tensor marking the mask positions. Indexing with it directly
    # avoids np.where, which cannot read CUDA tensors.
    is_mask = inputs["input_ids"] == tokenizer.mask_token_id
    if not is_mask.any():
        raise ValueError(
            f"input_str contains no {tokenizer.mask_token} token: {input_str!r}"
        )
    # .eval() to set dropout and batch normalization layers to evaluation mode
    model.eval()
    # Inference only: without no_grad every returned probability would keep
    # the whole autograd graph alive (and show a grad_fn in its repr).
    with torch.no_grad():
        logits = model(**inputs).logits
    # One row of vocabulary scores per mask token; keep the first mask only.
    first_mask_probs = torch.softmax(logits[is_mask], dim=1)[0]
    top = first_mask_probs.topk(top_k)
    return [
        {"token": tokenizer.decode(int(token_id)), "prob": prob}
        for token_id, prob in zip(top.indices.cpu(), top.values.cpu())
    ]

## Summarize

In [12]:
# Quick sanity check of predict_mask on a standalone sentence.
predict_mask("Microsoft's CEO is [MASK].")

2023-01-23 17:39:52.346354: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-23 17:39:53.240105: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-01-23 17:39:53.240148: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


[{'token': 'unknown', 'prob': tensor(0.1012, grad_fn=<SelectBackward0>)},
 {'token': 'gay', 'prob': tensor(0.0408, grad_fn=<SelectBackward0>)},
 {'token': 'male', 'prob': tensor(0.0215, grad_fn=<SelectBackward0>)},
 {'token': 'female', 'prob': tensor(0.0147, grad_fn=<SelectBackward0>)},
 {'token': 'white', 'prob': tensor(0.0134, grad_fn=<SelectBackward0>)}]

In [13]:
# For every sentence, ask the masked LM to describe each of its subjects and
# collect the predictions in a dict keyed by subject (one dict per sentence).
predicted_by_sentence_by_character = []
for sentence in sentences_w_subjects_tokenized:
    text = sentence["tokens"]
    predicted_by_sentence_by_character.append({
        subject: predict_mask(f'{text} "{subject}" can be described as [MASK].')
        for subject in sentence["subjects"]
    })

: 

: 