# T5 Model for Text Summarization

## Dependencies and Load Data

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the dataset CSVs and the saved
# model used further down are read from / written to paths under this mount.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
!pip install evaluate
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, GenerationConfig
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, Trainer, EarlyStoppingCallback
from datasets import Dataset, DatasetDict
import evaluate
import evaluate
import torch
import os
from os import listdir
from os.path import isfile, join
import json
import re
import numpy as np
import pandas as pd

In [None]:
# Pick the GPU when one is visible and fall back to the CPU otherwise, so
# `device` never names a backend that is not actually available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Displayed as the cell output: True when CUDA can be used.
torch.cuda.is_available()

True

In [None]:
# Read the three prepared splits (train / dev / test) from Google Drive.
DATA_DIR = '/content/drive/MyDrive/datasets'
train_set, valid_set, test_set = (
    pd.read_csv(f'{DATA_DIR}/ready_data_{split}.csv')
    for split in ('train', 'dev', 'test')
)

In [None]:
# (rows, columns) of each split, in train / validation / test order.
tuple(frame.shape for frame in (train_set, valid_set, test_set))

((3000, 2), (1000, 2), (1000, 2))

In [None]:
def _seeded_sample(frame, n_rows, seed=88):
    """Draw `n_rows` rows from `frame` with a fixed seed and renumber the index from 0."""
    return frame.sample(n=n_rows, random_state=seed).reset_index(drop=True)


# Fixed-seed sampling of each split; the result replaces the original frame.
train_set = _seeded_sample(train_set, 3000)
valid_set = _seeded_sample(valid_set, 1000)
test_set = _seeded_sample(test_set, 1000)

In [None]:
# Count fully duplicated rows in each split (train, validation, test).
tuple(frame.duplicated().sum() for frame in (train_set, valid_set, test_set))

(0, 0, 0)

In [None]:
# Convert each pandas split into a `datasets.Dataset` ...
train_dataset, valid_dataset, test_dataset = (
    Dataset.from_pandas(frame) for frame in (train_set, valid_set, test_set)
)

# ... and bundle them under the split names used by the rest of the notebook.
split_names = ('train', 'validation', 'test')
dataset_dict = DatasetDict(
    dict(zip(split_names, (train_dataset, valid_dataset, test_dataset)))
)

## Define T5 Model

In [None]:
# Tokenizer and weights both come from the same Hugging Face Hub checkpoint.
PRETRAINED_CHECKPOINT = "panggi/t5-base-indonesian-summarization-cased"
tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(PRETRAINED_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/793k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [None]:
# Batch collator for the trainer, built from the tokenizer and model loaded above.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Task prefix prepended to every source article before tokenization.
# It is empty here, so the tokenized inputs are exactly the raw articles.
prefix = ""

def preprocess_function(examples):
    """Tokenize a batch of articles and summaries for seq2seq training.

    Args:
        examples: mapping with a 'clean_article' and a 'clean_summary' entry,
            as passed by `Dataset.map` (a list of strings per key when
            `batched=True`, a single string otherwise).

    Returns:
        The tokenizer output for the (prefixed) articles, truncated to 512
        tokens, with an extra "labels" entry holding the token ids of the
        summaries truncated to 128 tokens.
    """
    articles = examples['clean_article']
    targets = examples['clean_summary']

    # Apply the module-level prefix; handle both the batched (list) and the
    # single-example (str) calling conventions.
    if isinstance(articles, str):
        inputs = prefix + articles
    else:
        inputs = [prefix + article for article in articles]

    # Tokenize the source articles.
    model_inputs = tokenizer(inputs, max_length=512, truncation=True)

    # Tokenize the target summaries with the tokenizer's target-side handling.
    labels = tokenizer(text_target=targets, max_length=128, truncation=True)

    # The trainer reads the target token ids from the "labels" key.
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

# Tokenize every split in batches; the result keeps the same split names.
tokenized_dataset = dataset_dict.map(function=preprocess_function, batched=True)


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Training

In [None]:
# Optimisation: 4 examples per device step x 2 accumulation steps per update.
optimisation_settings = dict(
    learning_rate=5e-5,
    weight_decay=0.03,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=10,
)

# Evaluation: once per epoch, scoring generated text rather than logits.
evaluation_settings = dict(
    eval_strategy="epoch",
    per_device_eval_batch_size=4,
    predict_with_generate=True,
)

# Checkpointing: save each epoch, keep at most 3 checkpoints, and reload the
# checkpoint with the highest ROUGE-2 when training ends.
checkpoint_settings = dict(
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="rouge2",
    greater_is_better=True,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    **optimisation_settings,
    **evaluation_settings,
    **checkpoint_settings,
)

In [None]:
# ROUGE metric object used by compute_metrics during evaluation.
rouge = evaluate.load(path='rouge')
def compute_metrics(eval_pred):
    """Decode generated ids and reference ids and score them with ROUGE.

    Args:
        eval_pred: a (predictions, labels) pair. `predictions` may itself be a
            tuple, in which case its first element is used.

    Returns:
        dict mapping each key returned by `rouge.compute` to its value rounded
        to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # Fixed: this branch used to read the undefined name `preds`.
        predictions = predictions[0]

    # -100 marks positions to ignore and is not a valid token id, so swap it
    # for the pad token before decoding (labels and predictions alike).
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # NOTE(review): this puts every whitespace-separated token on its own line
    # (not every sentence). Kept as-is so reported scores stay comparable;
    # confirm whether sentence-level splitting was intended for rougeLsum.
    decoded_preds = ["\n".join(decoded_pred.split()) for decoded_pred in decoded_preds]
    decoded_labels = ["\n".join(decoded_label.split()) for decoded_label in decoded_labels]

    # NOTE(review): use_stemmer=True is applied to Indonesian text here —
    # verify that the metric's stemmer is appropriate for this language.
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
# Stop training once the monitored metric has failed to improve for
# 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    callbacks=[early_stopping],
)

In [None]:
# Release cached GPU memory held by PyTorch's allocator before training starts.
torch.cuda.empty_cache()

In [None]:
# Run fine-tuning with the trainer configured above; the returned training
# summary is displayed as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.50662,0.2884,0.1454,0.2505,0.2886
2,1.968700,2.494175,0.2942,0.1492,0.2547,0.2944




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,No log,2.50662,0.2884,0.1454,0.2505,0.2886
2,1.968700,2.494175,0.2942,0.1492,0.2547,0.2944
3,1.681800,2.496251,0.2978,0.1496,0.257,0.298
4,1.526100,2.532616,0.2971,0.1505,0.2579,0.2974
5,1.526100,2.568656,0.3003,0.1502,0.2586,0.3005
6,1.364500,2.592018,0.2993,0.1509,0.2588,0.2999
7,1.269400,2.624082,0.2996,0.1499,0.2579,0.3
8,1.207400,2.642818,0.302,0.1522,0.2601,0.3022
9,1.207400,2.64859,0.3001,0.1503,0.2581,0.3005
10,1.164400,2.661786,0.3007,0.1506,0.2586,0.3011


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3750, training_loss=1.4325258544921875, metrics={'train_runtime': 5209.9499, 'train_samples_per_second': 5.758, 'train_steps_per_second': 0.72, 'total_flos': 1.38782692629504e+16, 'train_loss': 1.4325258544921875, 'epoch': 10.0})

In [None]:
# Score the trained model on the held-out test split and show the metrics.
test_split = tokenized_dataset["test"]
eval_results = trainer.evaluate(eval_dataset=test_split)
print(eval_results)



{'eval_loss': 2.4019715785980225, 'eval_rouge1': 0.3267, 'eval_rouge2': 0.1807, 'eval_rougeL': 0.2857, 'eval_rougeLsum': 0.3269, 'eval_runtime': 159.6713, 'eval_samples_per_second': 6.263, 'eval_steps_per_second': 1.566, 'epoch': 10.0}


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Persist the fine-tuned weights to Google Drive, with the tokenizer files in
# a "tokenizer" sub-folder of the same directory.
SAVED_MODEL_DIR = "/content/drive/MyDrive/datasets/t5_indo_sum"

model.save_pretrained(SAVED_MODEL_DIR)
tokenizer.save_pretrained(f"{SAVED_MODEL_DIR}/tokenizer")

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-7-740e19a840d7>", line 2, in <cell line: 2>
    model.save_pretrained("/content/drive/MyDrive/datasets/t5_indo_sum")
  File "/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py", line 2793, in save_pretrained
    safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"})
  File "/usr/local/lib/python3.10/dist-packages/safetensors/torch.py", line 286, in save_file
    serialize_file(_flatten(tensors), filename, metadata=metadata)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 2099, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardIn

TypeError: object of type 'NoneType' has no len()

In [None]:
# Recursively archive the saved model directory; the relative output path means
# t5_indo_sum.zip is written to the current working directory, not to Drive.
!zip -r t5_indo_sum.zip /content/drive/MyDrive/datasets/t5_indo_sum

  adding: content/drive/MyDrive/datasets/t5_indo_sum/ (stored 0%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/config.json (deflated 48%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/generation_config.json (deflated 29%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/tokenizer/ (stored 0%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/tokenizer/tokenizer_config.json (deflated 94%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/tokenizer/special_tokens_map.json (deflated 85%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/tokenizer/added_tokens.json (deflated 83%)
  adding: content/drive/MyDrive/datasets/t5_indo_sum/tokenizer/spiece.model (deflated 49%)


## Inference

In [None]:
from transformers import T5ForConditionalGeneration, AutoTokenizer
import torch

# Reload the fine-tuned artefacts from Google Drive: the tokenizer lives in the
# "tokenizer" sub-folder of the model directory.
FINETUNED_DIR = "/content/drive/MyDrive/datasets/t5_indo_sum"

tokenizer = AutoTokenizer.from_pretrained(f"{FINETUNED_DIR}/tokenizer", use_fast=False)
model = T5ForConditionalGeneration.from_pretrained(FINETUNED_DIR)

# Use the GPU when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Last expression: moves the model and displays its module tree.
model.to(device)


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [None]:
def clean_article(text):
    """Strip news-site boilerplate from an article before summarization.

    Removes, in order: the "Liputan6.com, <City>:" dateline, a word followed by
    a "(dd/mm)" date, a "(AGENCY/Reporter Name)." credit, "[baca: ...]"
    cross-references and http(s) URLs. Surrounding whitespace is trimmed;
    whitespace left inside the text by a removal is not collapsed.

    Args:
        text: the raw article string.

    Returns:
        The cleaned article string.
    """
    # Dateline such as "Liputan6.com, Jakarta:". Whitespace after the dot is
    # optional so both "Liputan6.com" and the spaced form "Liputan6. com"
    # are removed (previously only the spaced form matched).
    text = re.sub(r'Liputan6\.\s*com, [A-Za-z\s]+:', '', text)

    # A word followed by a date in the format (dd/mm) or (d/m).
    text = re.sub(r'\w+\s*\(\d{1,2}/\d{1,2}\)', '', text)

    # Parenthesised credit like "(UPI/Reporter Name)." with an optional
    # second name joined by 'dan'.
    text = re.sub(r'\([A-Z]+/[A-Za-z\s]+(?: dan [A-Za-z\s]+)?\)\.', '', text)

    # Square-bracket cross-references that start with 'baca:'.
    text = re.sub(r'\[baca: .*?\]', '', text)

    # URLs starting with http or https.
    text = re.sub(r'https?://\S+', '', text)

    return text.strip()

In [None]:
%%time
ARTICLE_TO_SUMMARIZE = """Bank Mandiri, sebagaimana bank umumnya, menyediakan layanan kartu debit bagi nasabahnya. Kartu debit Mandiri dapat digunakan oleh nasabah untuk melakukan berbagai transaksi di mesin ATM atau mesin EDC. Fungsi dari Kartu Debit Mandiri ini sangat beragam, mulai dari tarik tunai, setor tunai, transfer uang, cek saldo rekening, hingga membayar berbagai tagihan melalui mesin ATM.
Penting bagi Anda yang ingin membuka rekening tabungan di Bank Mandiri untuk memahami jenis Kartu Debit Mandiri agar tidak salah memilih. Setiap kartu debit Mandiri memiliki kelebihan dan kekurangannya masing-masing, sehingga penting bagi nasabah untuk memilih yang sesuai dengan kebutuhan dan preferensi mereka.

Dalam memilih jenis Kartu Debit Mandiri, nasabah perlu mempertimbangkan kebutuhan dan gaya hidup mereka. Apakah mereka membutuhkan manfaat tambahan seperti asuransi atau akses ke airport lounge, ataukah mereka menginginkan kartu debit yang sederhana namun praktis. Dengan mengetahui jenis Kartu Debit Mandiri yang sesuai, nasabah dapat memaksimalkan manfaat yang mereka dapatkan dari penggunaan kartu debit tersebut.
"""

# Generate a summary for ARTICLE_TO_SUMMARIZE.
# The cleaned article is truncated to 512 tokens, the same input limit used
# when tokenizing the training data, so over-long articles are not fed to the
# model at a length it was never trained on.
input_ids = tokenizer.encode(
    clean_article(ARTICLE_TO_SUMMARIZE),
    return_tensors='pt',
    max_length=512,
    truncation=True,
)

# Decoding settings: 10-beam search with sampling enabled at a low temperature,
# output length bounded to 20-128 tokens, and repetition discouraged.
generation_kwargs = dict(
    min_length=20,
    max_length=128,
    num_beams=10,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
    no_repeat_ngram_size=2,
    use_cache=True,
    do_sample=True,
    temperature=0.1,
    top_k=50,
    top_p=0.95,
)
summary_ids = model.generate(input_ids.to(model.device), **generation_kwargs)

# Timing for the whole cell is reported by the %%time magic at the top.
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

Bank Mandiri menyediakan layanan kartu debit bagi nasabahnya. Kartu debit Mandiri dapat digunakan untuk melakukan berbagai transaksi di mesin ATM atau mesin EDC.
CPU times: user 13 s, sys: 1.37 s, total: 14.3 s
Wall time: 21 s


In [None]:
# parent_folder = "/content/" # Use the copy in the "clean_data" folder on Google Drive
# test_set = pd.read_csv(parent_folder+'final_test_set.csv')

In [None]:
# Pick one random row of the test set to inspect.
df_sample = test_set.sample(n=1)

NameError: name 'test_set' is not defined

In [None]:
# Show the article text of the sampled row. The test set loaded in this
# notebook stores articles under 'clean_article' (the column the training
# preprocessing reads), not 'final_clean_article'.
df_sample["clean_article"].values

NameError: name 'df_sample' is not defined

In [None]:
%%time
ARTICLE_TO_SUMMARIZE = """Iran menganggap pengerahan sistem rudal THAAD Amerika Serikat (AS) ke Israel sebagai perang psikologis. Itu disampaikan Menteri Pertahanan Iran Aziz Nasirzadeh ketika situasi Timur Tengah tetap tegang karena Tel Aviv bersiap melakukan serangan balasan terhadap Teheran. Pentagon mengatakan pada Selasa lalu bahwa pasukan AS telah tiba di Israel sebagai bagian dari penempatan sistem rudal Terminal High Altitude Area Defense (THAAD). Baterai sistem rudal canggih tersebut akan meningkatkan pertahanan Israel terhadap potensi serangan balik Iran. Baca Juga Jenderal Iran Klaim Negaranya Miliki Senjata Rahasia Lebih Kuat dari Bom Nuklir “Kami melihat penempatan itu sebagai bagian dari perang psikologis. Itu tidak menjadi masalah besar bagi kami," tulis kantor berita pemerintah Iran; IRNA, Kamis (17/10/2024), mengutip pernyataan Nasirzadeh. Juru bicara Pentagon Mayor Jenderal Pat Ryder mengatakan pada Selasa bahwa tim pendahulu personel AS dan komponen awal yang dibutuhkan untuk baterai sistem rudal THAAD telah tiba di Israel, dengan baterai penuh diharapkan akan segera beroperasi. "Penempatan tersebut menegaskan komitmen Amerika Serikat untuk membela Israel dan untuk membela warga Amerika di Israel dari serangan rudal balistik apa pun oleh Iran," kata Ryder. Sistem THAAD dioperasikan oleh 95 tentara dan terdiri dari enam peluncur yang dipasang di truk dengan masing-masing delapan pencegat, radar, dan komponen pengendali tembakan, menurut Layanan Riset Kongres AS. Sistem tersebut dirancang untuk mencegat rudal balistik jarak pendek, sedang, dan menengah. Pada 1 Oktober, Iran meluncurkan lebih dari 180 rudal ke Israel sebagai balasan atas terbunuhnya kepala Hizbullah Hassan Nasrallah dan komandan Iran Abbas Nilforoushan di Beirut bulan lalu.

Artikel ini telah diterbitkan di halaman SINDOnews.com pada Kamis, 17 Oktober 2024 - 09:40 WIB oleh Muhaimin dengan judul "Iran: AS Kerahkan Sistem Rudal THAAD ke Israel Merupakan Perang Psikologis". Untuk selengkapnya kunjungi:
https://international.sindonews.com/read/1474071/43/iran-as-kerahkan-sistem-rudal-thaad-ke-israel-merupakan-perang-psikologis-1729131048#goog_rewarded

Untuk membaca berita lebih mudah, nyaman, dan tanpa banyak iklan, silahkan download aplikasi SINDOnews.
- Android: https://sin.do/u/android
- iOS: https://sin.do/u/ios"""

# Generate a summary for ARTICLE_TO_SUMMARIZE.
# The cleaned article is truncated to 512 tokens, the same input limit used
# when tokenizing the training data, so over-long articles are not fed to the
# model at a length it was never trained on.
input_ids = tokenizer.encode(
    clean_article(ARTICLE_TO_SUMMARIZE),
    return_tensors='pt',
    max_length=512,
    truncation=True,
)

# Decoding settings: 10-beam search with sampling enabled at a low temperature,
# output length bounded to 20-128 tokens, and repetition discouraged.
generation_kwargs = dict(
    min_length=20,
    max_length=128,
    num_beams=10,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
    no_repeat_ngram_size=2,
    use_cache=True,
    do_sample=True,
    temperature=0.1,
    top_k=50,
    top_p=0.95,
)
summary_ids = model.generate(input_ids.to(model.device), **generation_kwargs)

# Timing for the whole cell is reported by the %%time magic at the top.
summary_text = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print(summary_text)

Iran menganggap pengerahan sistem rudal THAAD Amerika Serikat ke Israel sebagai perang psikologis. Itu disampaikan Menteri Pertahanan Iran Aziz Nasirzadeh ketika situasi Timur Tengah tetap tegang karena Tel Aviv bersiap melakukan serangan balasan terhadap Teheran.
CPU times: user 19.3 s, sys: 1.68 s, total: 21 s
Wall time: 27.8 s


In [None]:
pip install transformers gradio torch




In [None]:
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the stock 't5-base' checkpoint. Note that this rebinds the global
# `tokenizer` and `model` names used by the cells above.
BASE_CHECKPOINT = 't5-base'
tokenizer = T5Tokenizer.from_pretrained(BASE_CHECKPOINT)
model = T5ForConditionalGeneration.from_pretrained(BASE_CHECKPOINT)

# Function to summarize text
def summarize_text(text):
    """Summarize `text` with the globally loaded `tokenizer` and `model`.

    Args:
        text: the string typed into the Gradio text box.

    Returns:
        The decoded summary string (special tokens removed).
    """
    # Encode the user's text with the "summarize: " task prefix, truncated to
    # 512 tokens.
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Fixed: generation now runs on the encoded user input. It previously
    # ignored `text` and summarized the notebook-global ARTICLE_TO_SUMMARIZE.
    summary_ids = model.generate(inputs.to(model.device),
            min_length=20,
            max_length=128,
            num_beams=10,
            repetition_penalty=2.5,
            length_penalty=1.0,
            early_stopping=True,
            no_repeat_ngram_size=2,
            use_cache=True,
            do_sample=True,
            temperature=0.1,
            top_k=50,
            top_p=0.95)

    # Decode the first (only) generated sequence and return it.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Describe the web UI: one text box in, one text box out, backed by
# summarize_text, with a single example prompt.
interface_options = dict(
    fn=summarize_text,
    inputs="text",
    outputs="text",
    title="Text Summarizer",
    description="Enter a long piece of text to get a concise summary generated by the T5 model.",
    examples=["Artificial Intelligence (AI) is rapidly transforming multiple industries..."],
)
interface = gr.Interface(**interface_options)

# Start the app and request a public share link.
interface.launch(share=True)


IndentationError: unexpected indent (<ipython-input-10-8b9db013d3b0>, line 30)

In [None]:
import gradio as gr
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Fine-tuned model directory on Google Drive; its tokenizer files sit in the
# "tokenizer" sub-folder.
model_path = "/content/drive/MyDrive/datasets/t5_indo_sum"
tokenizer_path = f"{model_path}/tokenizer"

tokenizer = T5Tokenizer.from_pretrained(tokenizer_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Function to summarize text
def summarize_text(text):
    """Summarize Indonesian `text` with the fine-tuned T5 model.

    Args:
        text: the string typed into the Gradio text box.

    Returns:
        The decoded summary string, or "" when `text` is empty or whitespace.
    """
    # Nothing to summarize: avoid forcing the model to generate at least
    # `min_length` tokens from an empty input.
    if not text or not text.strip():
        return ""

    # Encode the raw text, truncated to 512 tokens. No "summarize: " prefix is
    # added: the training preprocessing and the inference cells in this
    # notebook feed articles to this model without any prefix.
    inputs = tokenizer.encode(text, return_tensors="pt", max_length=512, truncation=True)

    # Generate summary
    summary_ids = model.generate(inputs.to(model.device),
                                 min_length=20,
                                 max_length=128,
                                 num_beams=10,
                                 repetition_penalty=2.5,
                                 length_penalty=1.0,
                                 early_stopping=True,
                                 no_repeat_ngram_size=2,
                                 use_cache=True,
                                 do_sample=True,
                                 temperature=0.1,
                                 top_k=50,
                                 top_p=0.95)

    # Decode the first (only) generated sequence and return it.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Describe the web UI: one text box in, one text box out, backed by
# summarize_text, with a single Indonesian example prompt.
interface_options = dict(
    fn=summarize_text,
    inputs="text",
    outputs="text",
    title="Indonesian Text Summarizer",
    description="Enter a long piece of Indonesian text to get a concise summary generated by the custom T5 model.",
    examples=["Artificial Intelligence (AI) cepat mengubah berbagai industri..."],
)
interface = gr.Interface(**interface_options)

# Start the app and request a public share link.
interface.launch(share=True)


## Inference in Gradio UI

### Gradio dependencies and loading the pretrained model

In [None]:
#Installing Gradio for colab if not exist
!pip install gradio

Collecting gradio
  Downloading gradio-5.1.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.0-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata

In [None]:
#Dependencies
from transformers import BertModel, AutoTokenizer, BertTokenizer, BertForSequenceClassification, EncoderDecoderModel
import torch
import gradio as gr

# Load the custom BERT-to-BERT model and its tokenizer from Google Drive.
model_path = "/content/drive/MyDrive/datasets/bert2bert_indo_sum"
tokenizer_path = "/content/drive/MyDrive/datasets/tokenizer"
model = EncoderDecoderModel.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path)

# Reuse BERT's [CLS] / [SEP] tokens as the begin- and end-of-sequence tokens.
tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

# Use the GPU when one is available; fall back to the CPU instead of failing
# on a runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

### Setting up the Gradio interface

In [None]:
# Function to summarize text
def summarize_text(text):
    """Summarize `text` with the globally loaded `tokenizer` and `model`.

    Args:
        text: the string typed into the Gradio text box.

    Returns:
        The decoded summary string (special tokens removed).
    """
    # The input is prefixed with "summarize: " and truncated to 512 tokens.
    prompt = "summarize: " + text
    encoded = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)

    decoding_options = {
        # Bounds on the length of the generated summary, in tokens.
        "min_length": 20,
        "max_length": 512,
        "num_beams": 10,
        # How strongly repeating an already generated token is discouraged.
        "repetition_penalty": 1.1,
        "length_penalty": 1.0,
        "early_stopping": True,
        "no_repeat_ngram_size": 2,
        "use_cache": True,
        "do_sample": True,
        # Low temperature keeps the output close to the source; raise it
        # (e.g. 0.7) for more varied wording.
        "temperature": 0.1,
        # Sample only among the k most probable next tokens.
        "top_k": 45,
        # Sample only among the smallest token set reaching this cumulative probability.
        "top_p": 0.95,
    }
    generated = model.generate(encoded.to(model.device), **decoding_options)

    # Decode the first (only) generated sequence.
    return tokenizer.decode(generated[0], skip_special_tokens=True)

# Describe the web UI: one text box in, one text box out, backed by
# summarize_text, with a single Indonesian example prompt.
interface_options = dict(
    fn=summarize_text,
    inputs="text",
    outputs="text",
    title="Indonesian Text Summarizer",
    description="Enter a long piece of Indonesian text to get a concise summary generated by the custom BERT-to-BERT model.",
    examples=["Artificial Intelligence (AI) cepat mengubah berbagai industri..."],
)
interface = gr.Interface(**interface_options)

# Start the app with a public share link; debug=True keeps the cell running so
# errors and logs stay visible.
interface.launch(share=True, debug=True)


Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://8a09a209bb46eb8437.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
