<a href="https://colab.research.google.com/github/subin6985/20242R0136COSE47402/blob/main/FinalProject/Deep_Learning_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install datasets
!pip install transformers
!pip install rouge_score



In [2]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments

In [3]:
# Fetch the "3.0.0" configuration of the CNN/DailyMail corpus and show its split layout.
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [4]:
# Carve an 8:1:1 train/validation/test partition out of the original "train" split.
# First hold out 20% of the rows ...
train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]

# ... then cut that hold-out in half to obtain the validation and test portions.
val_test_split = train_test_split["test"].train_test_split(test_size=0.5, seed=42)
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]

# Report the resulting partition sizes.
print(f"Train: {len(train_dataset)}, Validation: {len(val_dataset)}, Test: {len(test_dataset)}")

Train: 229690, Validation: 28711, Test: 28712


In [5]:
# 모델과 토크나이저 준비
model_name = "facebook/bart-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [6]:
# Preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of articles and their reference summaries.

    Args:
        examples: Batch mapping that provides an "article" list and a
            "highlights" list of strings.

    Returns:
        The tokenizer output for the prompted articles (truncated/padded to
        512 tokens) with an added "labels" entry: the token ids of the
        highlights (truncated/padded to 128 tokens) where every padding id
        has been replaced by -100.
    """
    # Model input: prepend the "Summarize: " prompt to every article.
    inputs = ["Summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    # Tokenize the targets through `text_target=` instead of the deprecated
    # `as_target_tokenizer()` context manager. This is a separate call so the
    # targets keep their own max_length of 128.
    labels = tokenizer(
        text_target=examples["highlights"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    # Replace padding token ids in the labels with -100.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

In [7]:
# Take a small fixed-size sample from each split and tokenize it right away.
# Training portion: first 500 rows.
train_dataset_sampled = train_dataset.select(range(500))
train_dataset = train_dataset_sampled.map(preprocess_function, batched=True)

# Validation portion: first 100 rows.
val_dataset_sampled = val_dataset.select(range(100))
val_dataset = val_dataset_sampled.map(preprocess_function, batched=True)

# Test portion: first 100 rows.
test_dataset_sampled = test_dataset.select(range(100))
test_dataset = test_dataset_sampled.map(preprocess_function, batched=True)


Map:   0%|          | 0/100 [00:00<?, ? examples/s]



In [8]:
# Training hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    # `eval_strategy` is the current name of the deprecated
    # `evaluation_strategy` argument; evaluate once per epoch.
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    weight_decay=0.01,
    # Checkpoint on the same schedule as evaluation, keeping at most two.
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True,
    report_to="none"
)



In [9]:
# Collator that pads each batch of tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `processing_class` supersedes the deprecated `tokenizer` argument,
    # which triggers a FutureWarning when the Trainer is constructed.
    processing_class=tokenizer,
    data_collator=data_collator
)

# Train the model
trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,2.7286,2.25465
2,2.1814,2.218722




TrainOutput(global_step=250, training_loss=2.3572749938964845, metrics={'train_runtime': 6099.5343, 'train_samples_per_second': 0.164, 'train_steps_per_second': 0.041, 'total_flos': 304868229120000.0, 'train_loss': 2.3572749938964845, 'epoch': 2.0})

In [10]:
# Evaluate the trained model on the tokenized test sample and show the metrics.
metrics = trainer.evaluate(eval_dataset=test_dataset)
print(metrics)

{'eval_loss': 2.1323797702789307, 'eval_runtime': 160.3784, 'eval_samples_per_second': 0.624, 'eval_steps_per_second': 0.156, 'epoch': 2.0}


In [11]:
# Generate summaries for examples from the test set
def generate_summary(examples):
    """Generate one summary per article with the module-level model.

    Args:
        examples: Object whose "article" entry yields the article strings
            (a batch mapping or a dataset slice).

    Returns:
        A list with one decoded summary string per article, special tokens
        removed. An empty list when there are no articles.
    """
    prompts = ["Summarize: " + doc for doc in examples["article"]]
    if not prompts:
        # Nothing to summarize; skip tokenizing and generating on an empty batch.
        return []

    encoded = tokenizer(prompts, return_tensors="pt", truncation=True, padding=True, max_length=512)
    # Put the tensors on the same device as the model so generation also
    # works when the model sits on an accelerator after training.
    encoded = encoded.to(model.device)

    outputs = model.generate(
        input_ids=encoded["input_ids"],
        # The batch is padded, so hand over the mask explicitly rather than
        # leaving it for generate() to work out.
        attention_mask=encoded["attention_mask"],
        max_length=128,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [12]:
# Summarize the first five test examples and print each result next to its reference.
test_sample = test_dataset.select(range(5))
generated_summaries = generate_summary(test_sample)
separator = "-" * 80
for reference, summary in zip(test_sample["highlights"], generated_summaries):
    print(f"Original: {reference}")
    print(f"Generated: {summary}")
    print(separator)

Original: Aung San Suu Kyi is released Saturday .
She has been under house arrest for much of the past two decades .
She has defiantly challenged the authority of the military junta .
She likens Myanmar's plight to South African apartheid .
Generated: Aung San Suu Kyi is the very embodiment of Myanmar's long struggle for democracy .
The 65-year-old human rights activist has endured house arrest for much of the past two decades and, perhaps, has become the world's most recognizable political prisoner .
--------------------------------------------------------------------------------
Original: An energized conservative electorate helps Republicans to historic gain in midterms .
Republicans nab at least 60 more House seats, based on CNN analysis of exit poll data .
Senate Majority Leader Harry Reid defeats Republican Sharron Angle in Nevada .
President Obama calls House Minority Leader John Boehner to congratulate him .
Generated: President Barack Obama called House Minority Leader John Bo