In [None]:
!pip install datasets transformers rouge-score nltk -q

In [None]:
!pip install torch==1.7.1 -q

## IMPORT LIBRARY

In [None]:
import numpy as np
import pandas as pd
import torch
import datasets
from datasets import Dataset
from datasets import load_metric
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, DistilBertTokenizerFast, TFDistilBertModel
import matplotlib.pyplot as plt

## LOAD DATASET

In [None]:
# Do not truncate long text columns when a frame is displayed.
pd.set_option('display.max_colwidth', None)

# Read the raw training CSV and preview the first five rows.
train = pd.read_csv('/kaggle/input/indosum-unclean/train_uncleaned.csv')
train.head()

## PRAPROSES DATASET

In [None]:
import re

# News-site names that are stripped from every text (matched after lowercasing).
web = ["cnn indonesia", "merdeka.com", "antara news", "rimanews", "juara.net", "suara.com"]

def text_processing(data):
  """Clean every text in `data` in place.

  Each entry is lowercased, every character that is not a word character,
  a dot or whitespace is removed, and the site names listed in `web` are
  deleted.

  Args:
    data: a mutable sequence of strings (e.g. a list) or a pandas Series of
      strings. Entries are overwritten by position, so a Series whose index
      does not start at 0 is handled as well.

  Returns:
    None. The cleaned texts are written back into `data`.
  """
  cleaned = []
  for text in data:
    # Dots survive the punctuation filter, so names such as "merdeka.com"
    # can still be matched by the site-name removal below.
    text = re.sub(r'[^\w.\s]', '', text.lower())
    for w in web:
      text = text.replace(w, "")
    cleaned.append(text)

  # Write back by position: label-based `data[i]` raises KeyError on a Series
  # whose index is not 0..n-1 (for example a column of a sliced DataFrame).
  if hasattr(data, "iloc"):
    data.iloc[:] = cleaned
  else:
    data[:] = cleaned

In [None]:
# Clean both text columns of the training frame in place.
for column in ("paragraphs", "summary"):
    text_processing(train[column])

In [None]:
# The cleaning pass already ran in the previous cell, so it is not repeated here.
# Instead, verify its result: no character outside [\w.\s] may remain in either column.
for column in ("paragraphs", "summary"):
    assert not train[column].str.contains(r'[^\w.\s]', na=False).any(), \
        f"column '{column}' still contains punctuation after cleaning"

In [None]:
# Positional split of the cleaned frame: rows [0, 12218) for training,
# [12218, 13963) for validation and the remainder for testing.
df_train = train.iloc[:12218]
df_valid = train.iloc[12218:13963]
df_test = train.iloc[13963:]

df_train.shape, df_valid.shape, df_test.shape

## TOKENISASI

In [None]:
# Name of the pretrained checkpoint; the tokenizer and the seq2seq model are both loaded from it,
# and it is also used to build the training output directory name.
model_checkpoint ='t5-small'

In [None]:
# Load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# True when the tokenizer pads on the right; not referenced anywhere else in the visible notebook.
pad_on_right = tokenizer.padding_side == "right"

In [None]:
# Token limits used by preprocess_function: longer articles / summaries are truncated to these lengths.
max_input_length = 500
max_target_length = 200

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of articles and reference summaries for seq2seq training.

    Args:
        examples: a batch (mapping of column name -> list) that provides the
            "paragraphs" (source text) and "summary" (target text) columns.

    Returns:
        The tokenizer output for the prefixed articles, truncated and padded
        to `max_input_length`, with an extra "labels" entry holding the token
        ids of the summaries truncated to `max_target_length`.
    """
    # T5 task prefix; the trailing space keeps the prefix from being glued
    # to the first word of the article.
    prefix = 'summarize: '
    inputs = [prefix + doc for doc in examples["paragraphs"]]

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Wrap the train/validation frames as `datasets.Dataset` objects.
# Note: from here on `train` refers to the Dataset, no longer to the pandas DataFrame loaded earlier.
train, valid = (Dataset.from_pandas(frame) for frame in (df_train, df_valid))

In [None]:
# Tokenize both splits batch-wise with preprocess_function.
tokenized_train, tokenized_valid = (
    split.map(preprocess_function, batched=True) for split in (train, valid)
)

In [None]:
# Guard: stop here unless the loaded tokenizer is a PreTrainedTokenizerFast instance.
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

## FINE TUNING

In [None]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# Per-device batch size, used for both training and evaluation in the training arguments.
batch_size = 16
# Collator handed to the trainer to assemble batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
import nltk
import numpy as np

def compute_metrics(eval_pred):
    """Compute ROUGE F-measures and the mean generated length for an evaluation run.

    Relies on the module-level `tokenizer` and `metric` objects, so the ROUGE
    metric must be loaded before the first evaluation happens.

    Args:
        eval_pred: pair `(predictions, labels)` of token-id arrays. Positions
            equal to -100 are treated as padding in both arrays.

    Returns:
        dict mapping each ROUGE key to its mid F-measure in percent, plus
        "gen_len" (mean number of non-pad tokens per prediction); all values
        are rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some trainer versions hand over a tuple whose first element holds the generated ids.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Replace -100 in predictions and labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    # NOTE(review): use_stemmer=True is kept from the original; confirm it is appropriate for Indonesian text.
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Keep only the mid F-measure of every ROUGE variant, as a percentage.
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length (pad tokens are not counted).
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
import gc
# Force a garbage-collection pass before training starts; being the last expression
# of the cell, the value returned by gc.collect() is displayed as the cell output.
gc.collect()

In [None]:
# determine the device we will be using for training
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Only query the GPU name when CUDA is present: torch.cuda.get_device_name(0)
# raises on a CPU-only machine, which would defeat the fallback above.
device_label = torch.cuda.get_device_name(0) if DEVICE == "cuda" else "cpu"
print("[INFO] training using {}".format(device_label))
if DEVICE == "cuda":
    # Release cached GPU memory before training starts.
    torch.cuda.empty_cache()
# Set the WANDB_DISABLED environment variable for this kernel.
# NOTE(review): presumably this turns off Weights & Biases logging in the Trainer - confirm for the installed transformers version.
%env WANDB_DISABLED=True

In [None]:
# The output directory is named after the last path component of the checkpoint name.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Training configuration: 10 epochs, evaluation after every epoch,
# generation-based evaluation and mixed-precision (fp16) training.
args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-indosum",
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    predict_with_generate=True,
    save_total_limit=3,
    fp16=True,
)

In [None]:
# Assemble the trainer from the model, its arguments, the tokenized splits,
# the batch collator and the ROUGE-based metric callback.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Load the ROUGE metric. compute_metrics reads this module-level `metric`,
# so this cell has to run before training/evaluation starts.
metric = load_metric("rouge")

In [None]:
# Run fine-tuning with the configured arguments (10 epochs, evaluation after every epoch).
trainer.train()

In [None]:
# Per-epoch losses entered by hand as literals; they are not read from the trainer,
# so they do not update when training is re-run.
train_loss = [1.063800, 0.726400, 0.708100, 0.648800, 0.660700, 0.641900, 0.632600, 0.628600, 0.638700, 0.627500]
val_loss = [0.724180, 0.682274, 0.669473, 0.650324, 0.643708, 0.634155, 0.630862, 0.626022, 0.626776, 0.625583]
epoch = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# Plot against the epoch numbers so the x-axis reads 1..10 instead of the list positions 0..9.
plt.plot(epoch, train_loss, label="train")
# NOTE(review): this curve is built from val_loss although the legend entry says "test".
plt.plot(epoch, val_loss, label="test")
plt.xticks(epoch)
plt.xlabel("epochs")
plt.ylabel("loss")
plt.title("Grafik Loss T5 (Tanpa Praproses)")
plt.legend()
plt.savefig("t5model_unclean.png")
plt.show()

## INFERENCE PHASE

In [None]:
# Build the tokenized test split in one chain, then generate summaries for it
# with beam search (3 beams, at most 500 generated tokens).
eval_dataset = Dataset.from_pandas(df_test).map(preprocess_function, batched=True)

predict_results = trainer.predict(eval_dataset, num_beams=3, max_length=500)

In [None]:
# Show the metrics returned by trainer.predict for the test split.
predict_results.metrics

In [None]:
# Turn the generated token ids back into text; only possible when generation was enabled.
if args.predict_with_generate:
    decoded_texts = tokenizer.batch_decode(
        predict_results.predictions,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    predictions = list(map(str.strip, decoded_texts))

In [None]:
# Put the first ten test articles next to their reference and generated summaries.
news = df_test["paragraphs"].iloc[:10]
ori = df_test["summary"].iloc[:10]
pred = predictions[:10]

data = dict(berita=news, summary=ori, prediksi=pred)

# Save the comparison table and display it.
result = pd.DataFrame(data)
result.to_csv("t5model_clean2.csv")
result

## SAVE MODEL WEIGHTS

In [None]:
import os
import subprocess
from IPython.display import FileLink, display

def download_file(path):
    """Zip `path` recursively and display a download link to the archive.

    The working directory is changed to /kaggle/working/ and the archive is
    written to /kaggle/working/t5model_clean_eksperimen2.zip.

    Args:
        path: file or directory to add to the archive.

    Returns:
        None. On failure of the zip command a message and the error output
        are printed and no link is shown.
    """
    os.chdir('/kaggle/working/')
    zip_name = "/kaggle/working/t5model_clean_eksperimen2.zip"
    try:
        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to zip unchanged.
        result = subprocess.run(["zip", "-r", zip_name, path], capture_output=True, text=True)
    except FileNotFoundError as exc:
        # The zip executable itself is missing.
        print("Unable to run zip command!")
        print(exc)
        return
    if result.returncode != 0:
        print("Unable to run zip command!")
        print(result.stderr)
        return
    # Link to the archive that was actually written (relative to the working directory set above).
    display(FileLink(os.path.basename(zip_name)))

In [None]:
# Archive the training output directory (named "<checkpoint>-finetuned-indosum" by the training arguments)
# and show a download link for it.
download_file('/kaggle/working/t5-small-finetuned-indosum')