In [38]:
import evaluate
import json
import librosa
import pytesseract
import soundfile

import numpy as np
import tensorflow as tf

from datasets import load_dataset
from huggingface_hub import notebook_login
from pytesseract import Output
from transformers import pipeline
from transformers import AutoTokenizer, DataCollatorForSeq2Seq, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import create_optimizer, AdamWeightDecay, TFAutoModelForSeq2SeqLM
from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
from transformers.pipelines.pt_utils import KeyDataset

In [51]:
f = open('config.json')
data = json.load(f)
f.close()
tokenizer = data['write_token']

In [2]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### [Load Dataset](https://huggingface.co/docs/transformers/tasks/summarization#load-billsum-dataset)

In [3]:
billsum = load_dataset("billsum", split="ca_test")

In [4]:
billsum = billsum.train_test_split(test_size=0.2)

### [Preprocess](https://huggingface.co/docs/transformers/tasks/summarization#preprocess)

In [5]:
checkpoint = "google-t5/t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
prefix = "summarize: "


def preprocess_function(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [7]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [16]:
### pytorch
# data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

### tensorflow
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")

### [Evaluate](https://huggingface.co/docs/transformers/tasks/summarization#load-billsum-dataset)

In [17]:
rouge = evaluate.load("rouge")

In [18]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

### [Train](https://huggingface.co/docs/transformers/tasks/summarization#train)

Pytorch

In [11]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # fp16=True,
    push_to_hub=True,
)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_billsum["train"],
#     eval_dataset=tokenized_billsum["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


AttributeError: 'TFT5ForConditionalGeneration' object has no attribute 'to'

In [None]:
# trainer.push_to_hub()

Tensorflow

In [23]:
optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)


All PyTorch model weights were used when initializing TFT5ForConditionalGeneration.

All the weights of TFT5ForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [24]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_billsum["train"],
    shuffle=True,
    batch_size=16,
    collate_fn=data_collator,
)

tf_test_set = model.prepare_tf_dataset(
    tokenized_billsum["test"],
    shuffle=False,
    batch_size=16,
    collate_fn=data_collator,
)

In [26]:
model.compile(optimizer=optimizer)  # No loss argument!

In [29]:
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)

In [57]:
push_to_hub_callback = PushToHubCallback(
    output_dir="my_awesome_billsum_model",
    tokenizer=tokenizer
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Cloning https://huggingface.co/sanoosha94/my_awesome_bi

In [58]:
callbacks = [metric_callback, push_to_hub_callback]

In [59]:
model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)

Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported

ValueError: in user code:

    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 2436, in predict_function  *
        return step_function(self, iterator)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 2409, in run_step  *
        outputs = model.predict_step(data)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 2377, in predict_step  *
        return self(x, training=False)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 588, in __call__  *
        return super().__call__(*args, **kwargs)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
        return fn(*args, **kwargs)
    File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
        outputs = call_fn(inputs, *args, **kwargs)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_file98v_l434.py", line 162, in error_handler
        raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_file98v_l434.py", line 34, in error_handler
        retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_filettvklrq4.py", line 37, in tf__run_call_with_unpacked_inputs
        retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 65, in tf__call
        ag__.if_stmt(ag__.and_(lambda : ag__.ld(input_ids) is not None, lambda : ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 62, in else_body_2
        ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 59, in else_body_1
        ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
    File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 56, in else_body
        raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)

    ValueError: Exception encountered when calling layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration).
    
    in user code:
    
        File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1404, in run_call_with_unpacked_inputs  *
            return func(self, **unpacked_inputs)
        File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/transformers/models/t5/modeling_tf_t5.py", line 1464, in call  *
            decoder_outputs = self.decoder(
        File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 558, in error_handler  *
            return fn(*args, **kwargs)
        File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/tf_keras/src/engine/base_layer.py", line 1136, in __call__  *
            outputs = call_fn(inputs, *args, **kwargs)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_file98v_l434.py", line 162, in error_handler  **
            raise ag__.converted_call(ag__.ld(new_e).with_traceback, (ag__.ld(e).__traceback__,), None, fscope_1) from None
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_file98v_l434.py", line 34, in error_handler
            retval__1 = ag__.converted_call(ag__.ld(fn), tuple(ag__.ld(args)), dict(**ag__.ld(kwargs)), fscope_1)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_filettvklrq4.py", line 37, in tf__run_call_with_unpacked_inputs  **
            retval_ = ag__.converted_call(ag__.ld(func), (ag__.ld(self),), dict(**ag__.ld(unpacked_inputs)), fscope)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 65, in tf__call  **
            ag__.if_stmt(ag__.and_(lambda : ag__.ld(input_ids) is not None, lambda : ag__.ld(inputs_embeds) is not None), if_body_2, else_body_2, get_state_2, set_state_2, ('input_ids', 'input_shape'), 2)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 62, in else_body_2
            ag__.if_stmt(ag__.ld(input_ids) is not None, if_body_1, else_body_1, get_state_1, set_state_1, ('input_ids', 'input_shape'), 2)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 59, in else_body_1
            ag__.if_stmt(ag__.ld(inputs_embeds) is not None, if_body, else_body, get_state, set_state, ('input_shape',), 1)
        File "/var/folders/tm/6zqjkbdn2256qmx7vjl849240000gn/T/__autograph_generated_fileoa_8znbw.py", line 56, in else_body
            raise ag__.converted_call(ag__.ld(ValueError), (f'You have to specify either {ag__.ld(err_msg_prefix)}input_ids or {ag__.ld(err_msg_prefix)}inputs_embeds',), None, fscope)
    
        ValueError: Exception encountered when calling layer 'decoder' (type TFT5MainLayer).
        
        in user code:
        
            File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1404, in run_call_with_unpacked_inputs  *
                return func(self, **unpacked_inputs)
            File "/Users/sindhumadhavan/GitHub/transformers/.venv/lib/python3.9/site-packages/transformers/models/t5/modeling_tf_t5.py", line 763, in call  *
                raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
        
            ValueError: You have to specify either decoder_input_ids or decoder_inputs_embeds
        
        
        Call arguments received by layer 'decoder' (type TFT5MainLayer):
          • input_ids=None
          • attention_mask=None
          • encoder_hidden_states=tf.Tensor(shape=(16, 1024, 512), dtype=float32)
          • encoder_attention_mask=tf.Tensor(shape=(16, 1024), dtype=int32)
          • inputs_embeds=None
          • head_mask=None
          • encoder_head_mask=None
          • past_key_values=None
          • use_cache=True
          • output_attentions=False
          • output_hidden_states=False
          • return_dict=True
          • training=False
    
    
    Call arguments received by layer 'tft5_for_conditional_generation' (type TFT5ForConditionalGeneration):
      • input_ids={'input_ids': 'tf.Tensor(shape=(16, 1024), dtype=int64)', 'attention_mask': 'tf.Tensor(shape=(16, 1024), dtype=int64)'}
      • attention_mask=None
      • decoder_input_ids=None
      • decoder_attention_mask=None
      • head_mask=None
      • decoder_head_mask=None
      • encoder_outputs=None
      • past_key_values=None
      • inputs_embeds=None
      • decoder_inputs_embeds=None
      • labels=None
      • use_cache=None
      • output_attentions=None
      • output_hidden_states=None
      • return_dict=None
      • training=False


### [Inference](https://huggingface.co/docs/transformers/tasks/summarization#inference)

In [60]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [61]:
summarizer = pipeline("summarization", model="sanoosha94/my_awesome_billsum_model")
# summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model")
summarizer(text)

OSError: sanoosha94/my_awesome_billsum_model does not appear to have a file named config.json. Checkout 'https://huggingface.co/sanoosha94/my_awesome_billsum_model/main' for available files.