In [None]:
!pip install transformers datasets evaluate rouge_score



In [None]:
from datasets import load_dataset

# Load the "train" split of BillSum, then carve a held-out set out of it.
billsum_raw = load_dataset("billsum", split="train")

# A fixed seed makes the 70/30 split identical on every Restart & Run All, so
# metrics from different runs are computed on the same held-out examples.
# Keeping the unsplit data under its own name also makes this cell safe to re-run.
billsum = billsum_raw.train_test_split(test_size=0.3, seed=42)

# Peek at one training record to inspect its fields.
billsum["train"][5]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


{'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nThe Legislature finds and declares all of the following:\n(a) Educators and policymakers have long acknowledged that the skills and competencies needed to be an effective teacher are supported through early and structured mentoring and assessment.\n(b) Induction programs help beginning teachers transition into the profession by providing standards-based, individualized assistance that combines the application of theory with intensive mentor-based support and formative assessment.\n(c) In 1998, California created its two-tiered teaching credential system and established the completion of a statewide, standards-based induction program, Beginning Teacher Support and Assessment (BTSA), as a path toward a clear credential.\n(d) Until 2009, the state provided $4,000 per participating teacher to BTSA providers as part of the Teacher Credentialing Block Grant.\n(e) In order to receive state funding, a local e

In [None]:
from transformers import AutoTokenizer
# Hub identifier of the pretrained checkpoint; reused below when loading the model.
# NOTE(review): the recorded upload output shows a 242M model.safetensors, which
# looks small for a "large" checkpoint — confirm the saved outputs came from this one.
checkpoint = "google-t5/t5-large"
# Tokenizer loaded from the same checkpoint identifier.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [None]:
# Task prompt prepended to every input document before tokenization.
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of documents and their reference summaries.

    Every entry of ``examples["text"]`` gets ``prefix`` prepended and is
    tokenized with ``max_length=1024`` and truncation enabled. Every entry of
    ``examples["summary"]`` is tokenized as a target with ``max_length=512``
    and truncation enabled.

    Returns:
        The tokenizer output for the documents, with the summaries'
        ``input_ids`` stored under the ``"labels"`` key.
    """
    prompted_docs = []
    for doc in examples["text"]:
        prompted_docs.append(prefix + doc)

    model_inputs = tokenizer(prompted_docs, truncation=True, max_length=1024)

    target_encoding = tokenizer(
        text_target=examples["summary"],
        truncation=True,
        max_length=512,
    )
    model_inputs["labels"] = target_encoding["input_ids"]

    return model_inputs

In [None]:
# Run preprocess_function over the split dataset; batched=True passes it
# batches of examples (lists of texts/summaries) rather than single records.
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [None]:
from transformers import DataCollatorForSeq2Seq

# Batch collator handed to the trainer below.
# NOTE(review): `model` receives the checkpoint name string here, not the loaded
# model object (which is only created in a later cell) — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [None]:
import evaluate

# Load the ROUGE metric; compute_metrics below calls rouge.compute on decoded text.
rouge = evaluate.load("rouge")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays supplied
            by the trainer. ``predictions`` may also arrive as a tuple whose
            first item holds the generated token ids.

    Returns:
        dict: The ROUGE scores plus ``gen_len`` (mean number of non-padding
        tokens per prediction), every value rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Keep only the token ids if extra outputs were returned alongside them.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is an "ignore" marker, not a real vocabulary id, so it cannot be
    # decoded. Labels carry it for masked positions; predictions can carry it
    # too when sequences of different lengths are padded together, so both are
    # mapped back to the pad id before decoding (and before counting lengths).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each generated summary, ignoring padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
from transformers import BigBirdPegasusModel, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch

In [None]:
# Load the pretrained sequence-to-sequence model from the same checkpoint
# identifier that was used for the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
!pip install accelerate -U



In [None]:
!pip install transformers[torch]



In [None]:
!pip install -U "huggingface_hub"



In [None]:
# Log in to the Hugging Face Hub interactively (prompts for an access token).
# The training arguments below set push_to_hub=True and the notebook later calls
# trainer.push_to_hub(), so a token with write permission is presumably required.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write)

In [None]:
# Fine-tuning configuration. Outputs are written under `output_dir`, and
# push_to_hub=True enables uploading to the Hugging Face Hub.
training_args = Seq2SeqTrainingArguments(
    output_dir="summarization_model",
    evaluation_strategy="epoch",
    # Report the training loss once per epoch, alongside the per-epoch evaluation.
    # The recorded run finished in 248 steps and its results table shows "No log"
    # for Training Loss on every epoch, i.e. the step-based logging interval was
    # never reached.
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    # Evaluate with generated sequences so compute_metrics can decode and score them.
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

# Wire together the model, the tokenized splits, the batch collator and the metric function.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression, the returned training summary is displayed.
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,No log,3.032908,0.0884,0.0328,0.0763,0.0764,19.0
2,No log,2.734574,0.0862,0.0292,0.0738,0.0739,19.0
3,No log,2.647223,0.0914,0.0341,0.078,0.0781,19.0
4,No log,2.62415,0.0932,0.0346,0.0794,0.0794,19.0




TrainOutput(global_step=248, training_loss=3.363183790637601, metrics={'train_runtime': 312.0978, 'train_samples_per_second': 12.676, 'train_steps_per_second': 0.795, 'total_flos': 1070824333246464.0, 'train_loss': 3.363183790637601, 'epoch': 4.0})

In [None]:
# Upload the trainer's outputs to the Hugging Face Hub. The recorded output
# shows the model weights, tokenizer files, training args and event logs being
# uploaded and a commit being created.
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

events.out.tfevents.1711642732.be666c03dea9.15905.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1711642823.be666c03dea9.15905.1:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1711642942.be666c03dea9.17584.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

Upload 10 LFS files:   0%|          | 0/10 [00:00<?, ?it/s]

events.out.tfevents.1711643110.be666c03dea9.18598.0:   0%|          | 0.00/8.01k [00:00<?, ?B/s]

events.out.tfevents.1711644194.be666c03dea9.23213.0:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

events.out.tfevents.1711644321.be666c03dea9.24082.0:   0%|          | 0.00/8.00k [00:00<?, ?B/s]

events.out.tfevents.1711644224.be666c03dea9.23213.1:   0%|          | 0.00/4.18k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.05k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/iamsid47/summarization_model/commit/22d909d2561d67ab6b2b4e7cc9a944f997c78319', commit_message='End of training', commit_description='', oid='22d909d2561d67ab6b2b4e7cc9a944f997c78319', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
text = """
 Alright, okay. Software is known for producing a lot of scams. Think Nigerian Prince, Tai Lopez, Crypto NFT, AAA Gaming Industry. But I ain't talking about all that. I'm talking about the lies sold to developers. And it's a pretty long list so buckle up. Clean code. You see, the way Charles Ponzi put the Ponzi in Ponzi scheme, Uncle Bob has put the B in Bakwas. Clean code has gone through a reputational renaissance similar to Ellen, Jimmy and James. Bro's whole book is a filler episode. When you take the energy of a junior engineer and combine it with such riveting advice, that's how you get a pull request the size of Epstein's list. Ain't nobody reviewing that, bro. I barely read my own code. And that's Jamie Oliver's recipe for a 3AM Christmas Eve production outage. And off the back of the last famous manifesto, Uncle Bob decides to write his own on the topic of agile development. Now this is the manifesto that deserved US military intervention because it invested the whole world. From Brazil to India, Orlando to Ohio, we're all super happy sitting in another sprint planning. Agile was a 9000 IQ chest move by the boomers to get into tech without coding. But quite honestly, we're the plebs because we listened to the boomers and fell for the next scam. CS degrees. Now although degrees are a society level scam, leave it to programmers to overengineer education. Like in any other occupation, you have simple linear career paths. You want to become a plumber? Go to trade school. You want to become an accountant? Get a degree. But the path to software engineering is like a fidget spinner. There's like three options and none of them work anymore and it's also out of style. Despite your education, all developers can agree. Leak code is a massive scam. Look man, it's been discussed to death but all I'm gonna say is, do you think mechanics need to build a car from scratch to get a job? All this grinding just to code some front end. But wait, that's another scam. 
Front end is easy. Front end being called easy is a scam perpetrated by insecure back end engineers because they think gluing together some crappy spring boot to postgres is real engineering. They're probably coding that mess in Neo Vim and preaching the next scam to unsuspecting developers. Vim productivity. Vim productivity is stage one of toxic developer disease, otherwise known as TDD, which often develops into stage two I use arch by the way. And the final stage is when you spend half your salary to buy a keyboard split up like Bangladesh and Pakistan to hook up to your overpriced thinkpad only to be paid half as much as your coworker with actual hobbies a girlfriend and a life outside of programming. But wait, you're having a maladaptive daydream because you don't have any coworkers because tech jobs are literally a scam. Sure, you're highly paid, but the amount of time you'll spend unemployed from layoffs averages out to working at McDonald's. The biggest pain of being laid off isn't even losing money. It's overhearing your mom say, oh, Bobby, what to do? No job, no, you see, your beta fell for the biggest scam. Everyone should learn to code a scam peddled by politicians and their body businessmen when they outsource your job to a developing country. No, no, no, it's for the greater good of the economy. Tech pros love spouting on and on about how anyone can learn to code and build a startup. My dude, Mark Zuckerberg dropped out of Harvard. You're dropping out of Udemy and getting kicked out of free code camp design patterns. At this point, a good rule of thumb is that if Uncle Bob has a book on it, you really shouldn't be using it. Design patterns like many aspects of software engineering, great in practice, impossible to implement. Similar to best practices. And if you disagree, you can kiss my dry, yagny ass because your code is always going to be slow and unreadable. Not that it matters if it's fast because code performance is literally a scam. 
Code performance is like astrology for developers. I mean, it could have an impact on your code, but more often than not, it's just to sell you a course. Your real performance is actually how quickly you ship features. This whole industry is full of scams, man. But you know what isn't a scam? The sponsor of today's video, Brilliant.org. Brilliant is an app and a website with an amazing collection of lessons on math and science. I love learning new things, but learning math and science has always been super tricky. Most of the material online is very drawn out and not as engaging. I've also recently been trying to get into game development and needed to brush up on my physics fundamentals. With the lack of time and resources, I needed a solution. And that's where Brilliant came in. You see, Brilliant uses a combination of techniques such as visual learning with interactive diagrams and reinforcement learning with experiments. But the more important thing is that it's delivered in super concise and fun lessons. I've been using my Commute as a key trigger to use Brilliant, and in the past few months I've gotten to the point where I can understand and think critically about basic physics problems. Don't believe me? You can get started with Brilliant for free for 30 days. That's Brilliant.org slash Big Box S.W.I.E. The link is in the description. You'll also get 20% off the annual subscription. Thank you so much to Brilliant for sponsoring this video, and thank you for your time. I am Big Box.
 """

In [None]:
from transformers import pipeline

# Build a summarization pipeline from the model that was pushed to the Hub above.
summarizer = pipeline("summarization", model="iamsid47/summarization_model")
# Summarize the transcript defined in the previous cell.
# NOTE(review): the recorded output warns that this input tokenizes to 1342 tokens,
# above the 512-token maximum the tokenizer reports — consider truncating or
# chunking the transcript; confirm the desired behavior.
summarizer(text)

config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/20.7k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1342 > 512). Running this sequence through the model will result in indexing errors


[{'summary_text': "I'm talking about the lies sold to developers. And it's a pretty long list so buckle up. You see, the way Charles Ponzi put the Ponzi in Ponzi scheme, Uncle Bob has put the B in Bakwas. You're the plebs because we listened to the boomers and fell for the next scam. The path to software engineering is like a fidget spinner. There's like three options and none of them work anymore."}]