In [None]:
# Experiment: summarizing news articles and documents with T5

In [2]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

In [3]:
from datasets import load_dataset

In [4]:
# Load the first 5 test examples of the CNN/DailyMail ("3.0.0") news summarization dataset
dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:5]")

In [5]:
# Take the first example so its fields can be inspected
sample = dataset[0]

In [None]:
# Show every field of the example, cut off after its first 300 characters
print("Data Structure and Contents:")
for field_name, field_value in sample.items():
    preview = field_value[:300]
    print(f"\n🔑 {field_name}:\n{preview}...")

In [None]:
#Tokenizer

In [None]:
# Checkpoint name passed to both the tokenizer and the model loaders
model_name = "t5-small"

In [None]:
# Load the T5 tokenizer for the chosen checkpoint
tokenizer = T5Tokenizer.from_pretrained(model_name)

In [None]:
# Load the pretrained T5 conditional-generation model for the chosen checkpoint
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# summarize(text): takes a news article (long text) as the `text` parameter and returns a string containing the summarized version of that text

In [None]:
def summarize(text, max_input_length=512, max_length=150, min_length=30,
              num_beams=4, length_penalty=2.0):
    """Generate a summary of `text` with the notebook-level T5 model.

    Args:
        text: The article (or any text) to summarize.
        max_input_length: Maximum number of input tokens; longer inputs are truncated.
        max_length: Upper bound on the length of the generated summary, in tokens.
        min_length: Lower bound on the length of the generated summary, in tokens.
        num_beams: Number of beams used for beam search.
        length_penalty: Length penalty applied during beam search.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # T5 selects the task from a text prefix
    input_text = "summarize: " + text
    inputs = tokenizer.encode(
        input_text, return_tensors="pt", max_length=max_input_length, truncation=True
    )
    # The model may live on an accelerator (e.g. after Trainer has trained it);
    # generation fails unless the input ids are on the same device.
    inputs = inputs.to(model.device)
    summary_ids = model.generate(
        inputs,
        max_length=max_length,
        min_length=min_length,
        length_penalty=length_penalty,
        num_beams=num_beams,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [None]:
# Short single-sentence input for a quick check of summarize()
test_text = "The stock market fell by more than 500 points today amid economic uncertainty."

In [None]:
# Summarize the short test sentence
print("Summary:", summarize(test_text))

In [None]:
# Article text of the first dataset example
article_text = dataset[0]["article"]

In [None]:
# Summarize the full news article
print("Summary:", summarize(article_text))

In [None]:
# Print the generated summary next to the reference highlights for each example.
# The loop variable is deliberately not named `sample`, so the notebook-level
# `sample` defined earlier is not overwritten as a side effect of this cell.
for article_number, example in enumerate(dataset, start=1):
    print(f"\nArticle #{article_number} Summary:")
    print(summarize(example["article"]))
    print("Reference Summary:")
    print(example["highlights"])
    print("=" * 50)


In [None]:
# Evaluate text summarization using ROUGE

In [None]:
!pip install rouge_score

In [None]:
import evaluate
# Load the ROUGE metric object used for scoring
rouge = evaluate.load("rouge")

In [None]:
# Gather the reference highlights and the model's summary for every example
references = [row["highlights"] for row in dataset]
predictions = [summarize(row["article"]) for row in dataset]

In [None]:
# Score the generated summaries against the reference highlights
scores = rouge.compute(predictions=predictions, references=references)
print(scores)

In [None]:
# Result: interpretation of the ROUGE scores before fine-tuning (recorded as a plain string)
"""
The bigram overlap (rouge2) is relatively low at around 7.7%, indicating that the contextual flow could be improved.
The sentence-level similarity scores (rougeL and rougeLsum) are approximately between 20% and 26%.
"""

In [None]:
# Fine-tuning to improve text summarization quality

In [None]:
!pip install transformers datasets evaluate accelerate

In [None]:
# Slices of CNN/DailyMail used for the fine-tuning experiment
train_dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:1%]")  # first 1% of the training split
test_dataset = load_dataset("cnn_dailymail", "3.0.0", split="test[:5]")     # first 5 examples of the test split

In [None]:
#Tokenizer and model load

In [None]:
# Load the tokenizer and model again from the pretrained checkpoint before fine-tuning
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Define the data preprocessing function

In [None]:
def preprocess_function(examples, max_input_length=512, max_target_length=150):
    """Tokenize a batch of articles and their highlights for seq2seq training.

    Args:
        examples: Mapping with an "article" list and a "highlights" list of
            equal length (one entry per example).
        max_input_length: Articles are truncated/padded to this many tokens.
        max_target_length: Highlights are truncated/padded to this many tokens.

    Returns:
        The tokenizer output for the prefixed articles, with an added "labels"
        entry holding the token ids of the highlights.
    """
    # T5 selects the task from a text prefix
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(
        inputs, max_length=max_input_length, truncation=True, padding="max_length"
    )
    # `text_target=` replaces the deprecated `tokenizer.as_target_tokenizer()`
    # context manager for encoding the target side.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=max_target_length,
        truncation=True,
        padding="max_length",
    )
    # NOTE(review): padded label positions keep the pad token id (they are not
    # replaced by -100), so they presumably count towards the training loss.
    # Masking them here would require the cell that decodes the labels to skip
    # the -100 entries first — confirm before changing.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Hand-written one-example batch (one list per column) for trying out preprocess_function
sample = {
    "article": ["The stock market fell by more than 500 points today amid economic uncertainty."],
    "highlights": ["Stock market drops over 500 points due to economic concerns."]
}


In [None]:
# Run the preprocessing on the hand-written batch
output = preprocess_function(sample)

In [None]:

# First 20 token ids of the encoded input
print("Input IDs:", output["input_ids"][0][:20])

In [None]:
# First 20 token ids of the encoded labels
print("Labels IDs:", output["labels"][0][:20]) 

In [None]:
# Complete list of input token ids for the example
print(output["input_ids"][0])

In [None]:
# Decode the input ids back to text, dropping special tokens
print("Decoded Input:", tokenizer.decode(output["input_ids"][0], skip_special_tokens=True))

In [None]:
# Decode the label ids back to text, dropping special tokens
print("Decoded Labels:", tokenizer.decode(output["labels"][0], skip_special_tokens=True))

In [None]:
#Tokenizing

In [None]:
# Apply preprocess_function to both splits; batched=True hands it batches of examples
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

In [None]:
!pip install --upgrade transformers

In [None]:
#set training_args

In [None]:
from transformers import TrainingArguments, IntervalStrategy, Trainer

In [None]:

# Minimal training configuration intended for a quick test run
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,   # very small batch size
    num_train_epochs=1,              # run a single epoch only
    logging_steps=10,                # log every 10 steps
    save_strategy="no",              # do not save checkpoints (quick test run)
    disable_tqdm=False               # keep the progress bar enabled
)


In [None]:
import inspect

# Newer transformers releases take the tokenizer through `processing_class` and
# deprecate the `tokenizer` keyword; older releases only know `tokenizer`.
# Pick whichever keyword the installed Trainer accepts.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_init_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train.select(range(10)),  # quick run on the first 10 training examples
    eval_dataset=tokenized_test.select(range(5)),     # adjust after checking the size of tokenized_test
    **{tokenizer_kwarg: tokenizer},
)
trainer.train()

In [None]:
# Run evaluation on the fine-tuned text summarization model using ROUGE

In [None]:
def compute_rouge(preds, refs):
    """Return the ROUGE scores of `preds` measured against `refs`.

    Both lists are handed unchanged to the notebook-level `rouge` metric
    as its `predictions` and `references` arguments.
    """
    return rouge.compute(references=refs, predictions=preds)

In [None]:
# 예시: 예측 요약 리스트(preds)와 정답 요약 리스트(refs)
preds = ["The stock market fell today..."]  # 모델이 생성한 요약들
refs = ["The market experienced a significant drop today..."]  # 정답 요약들

results = compute_rouge(preds, refs)
print(results)

In [None]:
#Result

In [None]:
import pandas as pd

data = {
    "Metric": ["rouge1", "rouge2", "rougeL", "rougeLsum"],
    "Before Fine-tuning": ["33.2%", "7.7%", "21.1%", "26.4%"],
    "After Fine-tuning": ["50%", "0%", "50%", "50%"]
}

df = pd.DataFrame(data)
df


In [None]:
"""
The significant increase in rouge1, rougeL, and rougeLsum indicates that word-level overlap and sentence similarity have improved a lot.
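However, the rouge2 dropping to 0% means that the 2-gram overlap — reflecting contextual flow and phrase continuity — has worsened.
Caveat: the 50% / 0% "after" figures were obtained from a single hand-written prediction/reference pair rather than from summaries generated by the fine-tuned model, so this comparison should be re-checked against ROUGE computed on the model's actual outputs.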
"""

In [None]:
#improve text summarization functionality 
"""
Plan A: Check the amount of data => increase from 1% to 10% of CNN/DailyMail and re-evaluate
Plan B: If problems persist, adjust training settings -> a learning rate that is too high can cause instability + increase the number of epochs to allow sufficient training

+ increase batch sizes
Plan C: Use additional evaluation metrics such as BERTScore alongside ROUGE for a more comprehensive quality assessment
"""
