# Projekt ZUM - Streszczenie tekstu

## Wstępna analiza

In [None]:
! pip install datasets

In [None]:
from datasets import load_dataset

# Load CNN/DailyMail (configuration "3.0.0") and convert each split to a pandas DataFrame.
ds = load_dataset("abisee/cnn_dailymail", "3.0.0")
df_train, df_val, df_test = (
    ds[split_name].to_pandas() for split_name in ("train", "validation", "test")
)

In [None]:
# Bare last expression: the notebook renders the repr of the loaded dataset object.
ds

In [None]:
import pandas as pd

# Allow up to 500 characters per column when pandas renders a frame,
# so a larger part of each article is visible.
pd.set_option('display.max_colwidth', 500)

# Display the training split as a DataFrame.
df_train

In [None]:
# Show one example (article, summary) pair
df_train.head(1)

In [None]:
# średnia długość tekstu i średnia długość streszczeń
import numpy as np

def words_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

# Word counts per training example: one list for articles, one for summaries.
text_len = list(map(words_count, df_train["article"]))
summary_len = list(map(words_count, df_train["highlights"]))

# Report the mean of each list.
print(f"średnia długość tekstu (wyrazy): {np.mean(text_len)}")
print(f"średnia długość streszczeń (wyrazy): {np.mean(summary_len)}")

In [None]:
# Summary-to-article length ratio for every example. Examples whose article has
# zero words are skipped, because dividing by their length would raise ZeroDivisionError.
ratios = [s_len / t_len for s_len, t_len in zip(summary_len, text_len) if t_len > 0]
print(f"Średnia proporcja długości streszczeń do tekstów: {np.mean(ratios):.2f}")


In [None]:
import matplotlib.pyplot as plt

# Overlaid histograms of article lengths and summary lengths (in words)
fig, ax = plt.subplots(figsize=(12, 6))
for lengths, series_label in (
    (text_len, "Długości tekstów"),
    (summary_len, "Długości streszczeń"),
):
    ax.hist(lengths, bins=50, alpha=0.7, label=series_label)
ax.legend()
ax.set_title("Histogram długości tekstów i streszczeń")
ax.set_xlabel("Długość")
ax.set_ylabel("Liczba przykładów")
plt.show()


In [None]:
# Longest and shortest article, ranked by word count
articles = df_train["article"]
longest_article = max(articles, key=words_count)
shortest_article = min(articles, key=words_count)

# Only the first 500 characters of each text are printed.
print(f"Najdłuższy artykuł (liczba słów: {words_count(longest_article)}):\n{longest_article[:500]}...\n")
print(f"Najkrótszy artykuł (liczba słów: {words_count(shortest_article)}):\n{shortest_article[:500]}\n")

# Longest and shortest summary, ranked the same way
highlights = df_train["highlights"]
longest_summary = max(highlights, key=words_count)
shortest_summary = min(highlights, key=words_count)

print(f"Najdłuższe streszczenie (liczba słów: {words_count(longest_summary)}):\n{longest_summary[:500]}...\n")
print(f"Najkrótsze streszczenie (liczba słów: {words_count(shortest_summary)}):\n{shortest_summary[:500]}\n")

In [None]:
from collections import Counter

# Store the per-example word counts as DataFrame columns
df_train['article_word_count'] = df_train['article'].apply(words_count)
df_train['summary_word_count'] = df_train['highlights'].apply(words_count)

# Most frequent words in the summaries. Each summary is fed to the Counter on its
# own, so the corpus is never concatenated into a single huge string / token list.
# Tokens are visited in the same order as with a join-then-split, so the counts
# and the tie order of most_common() are the same.
word_counts = Counter()
for summary in df_train['highlights']:
    word_counts.update(summary.split())
print(word_counts.most_common(10))


## Inżynieria cech

In [None]:
import re

def clean_text(text):
    """Remove http(s) URLs from ``text`` and normalise its whitespace.

    Leading/trailing whitespace is dropped and every run of whitespace
    (including newlines and carriage returns) becomes a single space.
    """
    url_free = re.sub(r'http[s]?://\S+', '', text)
    # The \s+ pattern already covers '\n' and '\r', so one substitution is enough.
    return re.sub(r'\s+', ' ', url_free.strip())

def _clean_example(example):
    """Return the cleaned ``article`` and ``highlights`` fields of one dataset row."""
    return {
        'article': clean_text(example['article']),
        'highlights': clean_text(example['highlights']),
    }

# One shared row-level function keeps the cleaning identical across all splits.
train_data = ds['train'].map(_clean_example)
val_data = ds['validation'].map(_clean_example)
test_data = ds['test'].map(_clean_example)

# Preview the first two cleaned rows of each split
train_data[:2], val_data[:2], test_data[:2]

In [None]:
! pip install transformers torch

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is used later to load the model weights.
model_name = "t5-small"
# Tokenizer that belongs to this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_data(examples):
    """Tokenize a batch of article/summary pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with ``'article'`` and ``'highlights'`` entries,
            each a list of strings (this function is applied with
            ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the prefixed articles (padded/truncated to
        512 tokens) with an added ``'labels'`` entry holding the summary token
        ids (padded/truncated to 150 tokens). Padding positions in the labels
        are set to -100 so that the loss ignores them.
    """
    # Every article is prefixed with the textual task instruction.
    inputs = [f"summarize: {article}" for article in examples['article']]
    targets = examples['highlights']

    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # `text_target=` is the replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=150,
        truncation=True,
        padding="max_length"
    )

    # The labels are padded to a fixed length. Left as pad ids they would be
    # scored by the cross-entropy loss; -100 is the index the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs['labels'] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels['input_ids']
    ]
    return model_inputs

def _tokenize_split(split):
    """Tokenize one cleaned split in batches of 1000 and return it in torch format."""
    tokenized = split.map(
        tokenize_data,
        batched=True,
        remove_columns=['article', 'highlights'],
        batch_size=1000
    )
    return tokenized.with_format("torch")

train_data_tokenized = _tokenize_split(train_data)
val_data_tokenized = _tokenize_split(val_data)
test_data_tokenized = _tokenize_split(test_data)

# Sanity check: print the first tokenized training example
print(train_data_tokenized[0])

## Model

In [None]:
! pip install -U accelerate

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq weights for the checkpoint named in `model_name`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Hyperparameters gathered in one mapping and unpacked into the arguments object.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm against the installed version.
training_config = dict(
    # output / logging locations
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=500,
    save_total_limit=3,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    # evaluation
    evaluation_strategy="epoch",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
)
training_args = Seq2SeqTrainingArguments(**training_config)

# NOTE(review): no `compute_metrics` is passed here, so evaluation presumably
# reports the loss only even though generation is enabled — consider adding ROUGE.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data_tokenized,
    eval_dataset=val_data_tokenized,
    tokenizer=tokenizer,
)

trainer.train()

In [None]:
# Evaluate the trained model on the tokenized validation split and print the returned metrics.
results = trainer.evaluate(eval_dataset=val_data_tokenized)
print(results)

In [None]:
# Same evaluation on the tokenized test split.
test_results = trainer.evaluate(eval_dataset=test_data_tokenized)
print(test_results)

In [None]:
# Generate a summary for the first example of the test split
test_sample = test_data_tokenized[0]
input_ids = test_sample['input_ids']

# The dataset yields CPU tensors, while the model may have been moved to an
# accelerator during training; generation needs both on the same device.
device = model.device

model.eval()  # inference: make sure dropout is switched off
outputs = model.generate(
    input_ids=input_ids.unsqueeze(0).to(device),  # unsqueeze adds the batch dimension
    # The inputs were padded to a fixed length, so the padding is masked explicitly.
    attention_mask=test_sample['attention_mask'].unsqueeze(0).to(device),
    max_length=150,
)

# Decode the generated token ids back into text
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)

In [None]:
# Save the model and the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_model")