In [None]:
import numpy as np
import pandas as pd
import os
import re
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2TokenizerFast, Trainer, TrainingArguments, default_data_collator
from datasets import Dataset, load_metric
import torch
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import contractions

In [None]:
# Dataset source (Kaggle: newspaper text summarization, CNN / DailyMail)
# https://www.kaggle.com/datasets/gowrishankarp/newspaper-text-summarization-cnn-dailymail/data

def load_input_dataset_files(dataset_dir='./dataset'):
    """Load every CSV file found under ``dataset_dir`` into a DataFrame.

    The directory tree is walked recursively. Each file whose extension is
    ``.csv`` (case-insensitive) is read with ``pd.read_csv`` and stored under
    its file name without the extension, e.g. ``train.csv`` -> ``'train'``
    and ``val.v2.csv`` -> ``'val.v2'``.

    Args:
        dataset_dir: Root directory to search. Defaults to ``'./dataset'``.

    Returns:
        dict mapping file stem to the loaded ``pd.DataFrame``. The dict is
        empty when no CSV file is found. When two files share a stem, the
        one visited last by ``os.walk`` replaces the earlier one.
    """
    files = {}
    for dirname, _, filenames in os.walk(dataset_dir):
        for filename in filenames:
            # splitext keeps inner dots in the stem and reports an empty
            # extension for names without a dot (e.g. a file called "csv").
            stem, extension = os.path.splitext(filename)
            if extension.lower() == ".csv":
                files[stem] = pd.read_csv(os.path.join(dirname, filename))
                print(f"Loaded file: {filename}")
    return files

# Read the dataset CSVs once; later cells look the frames up by file stem.
input_files = load_input_dataset_files()

In [None]:
# Keys are CSV file names without their extension, so these lookups raise
# KeyError when the loader did not find train.csv or validation.csv.
raw_train_all_df = input_files['train']
raw_validation_all_df = input_files['validation']

In [None]:
# Sanity check: (rows, columns) of the unfiltered splits.
print(raw_train_all_df.shape)
print(raw_validation_all_df.shape)

In [None]:
# Fetch the NLTK resources used by this notebook. On the original author's
# Windows machine they were stored under
# C:\Users\Administrator\AppData\Roaming\nltk_data
nltk.download('stopwords')
# NOTE(review): 'punkt_tab' is presumably the tokenizer data that
# nltk.word_tokenize (used in remove_stopwords) needs -- confirm against the
# installed NLTK version.
nltk.download('punkt_tab')
# Kept as a set so remove_stopwords can do fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
# Upper bound (in whitespace-separated words) for both articles and summaries.
MAX_WORD_COUNT = 900


def _add_word_counts(df):
    """Add ``article_length`` / ``summary_length`` word-count columns to ``df`` in place.

    Words are counted with ``str.split()``, i.e. runs of non-whitespace
    characters. ``len`` already yields integers, so no numeric coercion is
    needed afterwards.
    """
    df['article_length'] = df['article'].apply(lambda x: len(x.split()))
    df['summary_length'] = df['highlights'].apply(lambda x: len(x.split()))


def _within_word_limit(df, limit=MAX_WORD_COUNT):
    """Return the rows of ``df`` whose article and summary both have at most ``limit`` words."""
    return df[(df['article_length'] <= limit) & (df['summary_length'] <= limit)]


# The raw frames keep the new length columns so they can be inspected later.
for _raw_df in (raw_train_all_df, raw_validation_all_df):
    _add_word_counts(_raw_df)
print(raw_train_all_df[['article_length', 'summary_length']].describe())

# Drop over-long examples, then show the length statistics after filtering.
train_all_df = _within_word_limit(raw_train_all_df)
validation_all_df = _within_word_limit(raw_validation_all_df)
print(train_all_df[['article_length', 'summary_length']].describe())

In [None]:
# Sanity check: (rows, columns) of the splits after the length filter.
print(train_all_df.shape)
print(validation_all_df.shape)

In [None]:
# Draw fixed-size subsets; random_state pins the draw so reruns pick the same rows.
# NOTE(review): sampling without replacement presumably raises when a split
# has fewer rows than n -- confirm the filtered splits are large enough.
train_df = train_all_df.sample(n=40000, random_state=42)
validation_df = validation_all_df.sample(n=2000, random_state=42)

In [None]:
# Count rows whose article text repeats an earlier row, then preview one record.
print("Duplicate articles in training set:", train_df.duplicated(subset=['article']).sum())
train_df.head(1)

In [None]:
def normalize_text(text):
    """Trim surrounding whitespace and squeeze every whitespace run into one space.

    Args:
        text: Input string.

    Returns:
        The string without leading/trailing whitespace and with each run of
        whitespace characters (spaces, tabs, newlines, ...) replaced by a
        single space.
    """
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

def remove_unwanted_characters(text):
    """Strip markup, links and noisy symbols from ``text``.

    The substitutions run in this order:
      1. HTML-style tags (``<...>``) are deleted.
      2. Any ``http``, ``www`` or ``https`` followed by non-whitespace
         characters is deleted up to the next whitespace.
      3. A run of the same ``!``, ``?`` or ``.`` collapses to one character.
      4. The symbols ``|``, ``~`` and ``^`` are deleted.

    Digits are kept. Deleting a span does not close the gap it leaves, so
    the whitespace that surrounded it stays in the result.

    Args:
        text: Input string.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement, flags) triples, applied top to bottom.
    substitutions = (
        (r'<[^>]+>', '', 0),
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),
        (r'([!?.])\1+', r'\1', 0),
        (r'[|~^]', '', 0),
    )
    for pattern, replacement, flags in substitutions:
        text = re.sub(pattern, replacement, text, flags=flags)
    return text

def expand_contractions_text(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Used by the cleaning pipeline to expand contracted forms such as
    "it's" into "it is".
    """
    return contractions.fix(text)

def standardize_possessives(text):
    """Rewrite every ``<word>'s`` in ``text`` as ``<word> is``.

    NOTE(review): the pattern cannot tell a contraction from a genuine
    possessive, so "John's car" becomes "John is car". This is
    context-dependent and should be used carefully.

    Args:
        text: Input string.

    Returns:
        The string with each trailing ``'s`` replaced by `` is``.
    """
    apostrophe_s = re.compile(r"(\b\w+)'s\b")
    return apostrophe_s.sub(r"\1 is", text)

def remove_stopwords(text):
    """Drop stop words from ``text``.

    The text is tokenized with ``nltk.word_tokenize``; a token is discarded
    when its lower-cased form is in the module-level ``stop_words`` set.
    The surviving tokens are joined with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token.lower() not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def format_all(df_list):
    """Deduplicate and clean the ``article`` / ``highlights`` columns in place.

    For every frame in ``df_list``:
      1. Rows that repeat an earlier (article, highlights) pair are dropped.
      2. Both text columns are run through the same cleaning steps:
         contraction expansion, ``'s`` rewriting, removal of unwanted
         characters, lower-casing, and finally whitespace normalization.

    Whitespace normalization runs last on purpose: deleting tags, URLs or
    symbols can leave double or trailing spaces behind, and normalizing
    afterwards removes them.

    Stop-word removal (``remove_stopwords``) is intentionally not applied.

    Args:
        df_list: Iterable of DataFrames with string columns ``article`` and
            ``highlights``. Each frame is modified in place.

    Returns:
        None.
    """
    text_columns = ('article', 'highlights')
    # Per-string steps applied before lower-casing, in order.
    string_steps = (
        expand_contractions_text,    # change it's to it is etc
        standardize_possessives,
        remove_unwanted_characters,
    )

    for df in df_list:
        df.drop_duplicates(subset=['article', 'highlights'], inplace=True)

        for column in text_columns:
            cleaned = df[column]
            for step in string_steps:
                cleaned = cleaned.apply(step)
            # Lower-case, then tidy whitespace once nothing else is removed.
            df[column] = cleaned.str.lower().apply(normalize_text)


In [None]:
# Work on copies so the in-place cleaning below leaves train_df and
# validation_df untouched.
train_df_copy = train_df.copy()
validation_df_copy = validation_df.copy()
# Smaller subsets for quick experiments:
# train_df_copy = pd.DataFrame(train_df.head(10000))
# validation_df_copy = pd.DataFrame(validation_df.head(500))

# format_all mutates both frames in place (deduplication + text cleaning).
format_all([train_df_copy, validation_df_copy])

In [None]:
# Widen the column display limit so the cleaned text is shown with far less
# truncation, then preview one cleaned record.
pd.options.display.max_colwidth = 9999
train_df_copy.head(1)

In [None]:
def add_special_tokens():
    """Load the pretrained ``gpt2`` tokenizer and register pad/sep tokens.

    Returns:
        The ``GPT2Tokenizer`` with ``<|pad|>`` set as its padding token and
        ``<|sep|>`` set as its separator token.
    """
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    gpt2_tokenizer.add_special_tokens({'pad_token': '<|pad|>', 'sep_token': '<|sep|>'})
    return gpt2_tokenizer

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained GPT-2 language model and the tokenizer extended with
# the <|pad|> and <|sep|> special tokens.
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = add_special_tokens()
# The tokenizer gained new tokens, so the embedding matrix is resized to the
# new vocabulary size.
model.resize_token_embeddings(len(tokenizer))
# Id of the <|pad|> token.
ignore_idx = tokenizer.pad_token_id
# Move the model to the selected device.
model.to(device)

def prepare_input_text(article, summary):
    """Build one training string: the article, the `` </s> `` separator, then the summary."""
    training_text = f"{article} </s> {summary}"
    return training_text

def generate_input_text_for_all(df_list):
    """Add an ``input_text`` column to every frame in ``df_list`` in place.

    Each row's value is ``prepare_input_text(article, highlights)``. The two
    columns are zipped directly instead of using a row-wise
    ``DataFrame.apply(axis=1)``, which builds a Series object per row.

    Args:
        df_list: Iterable of DataFrames with ``article`` and ``highlights``
            columns.

    Returns:
        None.
    """
    for df in df_list:
        df['input_text'] = [
            prepare_input_text(article, summary)
            for article, summary in zip(df['article'], df['highlights'])
        ]

def tokenize_function(df, max_length=800):
    """Tokenize a batch of ``input_text`` strings for causal-LM fine-tuning.

    Each text is expected to look like ``"<article> </s> <summary>"`` (the
    format produced by ``prepare_input_text``). Instead of truncating the
    whole string from the right, which would cut off the summary for long
    articles, the summary side is tokenized first and the article is
    truncated to whatever room is left. An end-of-sequence token is appended
    after the summary so the model sees where a summary ends.

    Args:
        df: Batch mapping with an ``input_text`` list, as passed by
            ``Dataset.map(..., batched=True)``.
        max_length: Fixed length of every returned sequence.

    Returns:
        dict with ``input_ids``, ``attention_mask`` and ``labels``, each a
        list of ``max_length``-long lists. Padded positions hold the pad
        token id in ``input_ids``, 0 in ``attention_mask`` and -100 in
        ``labels`` so they are ignored by the loss.
    """
    separator = " </s> "  # must match the separator used by prepare_input_text
    pad_id = tokenizer.pad_token_id
    eos_id = tokenizer.eos_token_id

    batch_input_ids, batch_attention, batch_labels = [], [], []
    for text in df['input_text']:
        article, found, summary = text.rpartition(separator)
        if found:
            # Separator + summary + EOS always survive (summary truncated
            # only if it alone exceeds the length budget).
            tail_ids = tokenizer.encode(
                separator + summary, truncation=True, max_length=max_length - 1
            ) + [eos_id]
        else:
            # No separator present: treat the whole text as the article.
            article, tail_ids = text, []

        room_for_article = max_length - len(tail_ids)
        if room_for_article > 0:
            head_ids = tokenizer.encode(
                article, truncation=True, max_length=room_for_article
            )
        else:
            head_ids = []

        token_ids = head_ids + tail_ids
        padding = max_length - len(token_ids)
        batch_input_ids.append(token_ids + [pad_id] * padding)
        batch_attention.append([1] * len(token_ids) + [0] * padding)
        # Labels mirror the inputs; padded positions are set to -100.
        batch_labels.append(token_ids + [-100] * padding)

    return {
        'input_ids': batch_input_ids,
        'attention_mask': batch_attention,
        'labels': batch_labels,
    }

def compute_metrics(eval_pred):
    """Compute the mean ROUGE-2 F-measure of teacher-forced predictions.

    Args:
        eval_pred: ``(logits, labels)`` pair.
            NOTE(review): assumes ``logits`` is (batch, seq, vocab) and
            ``labels`` is (batch, seq) with -100 at ignored positions --
            confirm against the Trainer output.

    Returns:
        ``{"rouge2": <float>}`` -- a flat mapping so the value can be used
        directly as a scalar metric (e.g. ``metric_for_best_model="rouge2"``).
        0.0 is returned when there is nothing to score.
    """
    rouge = load_metric("rouge")
    logits, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # A causal LM's logits at position i score the token at position i + 1,
    # so drop the last prediction and the first label to line them up.
    predictions = np.argmax(logits, axis=-1)[:, :-1]
    labels = labels[:, 1:]

    # Blank out ignored positions on BOTH sides; otherwise predictions made
    # at padded positions would be decoded as noise.
    scored = labels != -100
    predictions = np.where(scored, predictions, pad_id)
    labels = np.where(scored, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    batch_size = 10  # Adjust based on memory capacity
    batch_scores = []

    # Process ROUGE calculation in smaller batches
    for start in range(0, len(decoded_preds), batch_size):
        result = rouge.compute(
            predictions=decoded_preds[start:start + batch_size],
            references=decoded_labels[start:start + batch_size],
            use_stemmer=True,
        )
        batch_scores.append(result["rouge2"].mid.fmeasure)

    # Aggregate results: mean of the per-batch scores.
    rouge2 = float(np.mean(batch_scores)) if batch_scores else 0.0
    return {"rouge2": rouge2}


In [None]:
generate_input_text_for_all([train_df_copy, validation_df_copy])


def _build_tokenized_dataset(frame):
    """Turn the ``input_text`` column of ``frame`` into a tokenized ``Dataset``.

    The dataset is tokenized in batches with ``tokenize_function`` and set to
    return torch tensors for ``input_ids``, ``attention_mask`` and ``labels``.
    """
    dataset = Dataset.from_pandas(frame[['input_text']])
    dataset = dataset.map(tokenize_function, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


train_dataset = _build_tokenized_dataset(train_df_copy)
validation_dataset = _build_tokenized_dataset(validation_df_copy)

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="steps",  # evaluate on the validation set every `eval_steps` steps
    eval_steps=1000,
    learning_rate=7e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # NOTE(review): mixed precision presumably needs a CUDA device -- confirm for CPU-only runs
    # Best-model selection by ROUGE-2 is switched off together with
    # compute_metrics in the Trainer below.
    # load_best_model_at_end=True,
    # metric_for_best_model="rouge2",
    # greater_is_better=True,
    remove_unused_columns=False
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    # ROUGE evaluation is disabled.
    # compute_metrics=compute_metrics
)

# Start training
trainer.train()

In [None]:
# Persist the fine-tuned weights and the extended tokenizer into the same
# directory so both can be reloaded together.
model.save_pretrained('./fine_tuned_40k_model_LR2')
tokenizer.save_pretrained('./fine_tuned_40k_model_LR2')

In [None]:
def prepare_input_for_generation(article):
    """Build the generation prompt: the article followed by the `` </s>`` separator."""
    prompt = f"{article} </s>"
    return prompt

# Assuming `model` is your fine-tuned model and `tokenizer` is your tokenizer
def generate_summary(article, device, max_new_tokens=280):
    """Generate a summary for ``article`` with the module-level model and tokenizer.

    The prompt is ``prepare_input_for_generation(article)``. If the prompt
    plus ``max_new_tokens`` would not fit into the model's context window,
    the article part is truncated and the separator is re-attached, so the
    prompt always ends with the separator.

    Sampling is enabled explicitly: ``top_k`` / ``top_p`` only take effect
    with ``do_sample=True``, so results differ between calls unless a seed
    is set.

    Args:
        article: Article text to summarize.
        device: torch device the model lives on.
        max_new_tokens: Maximum number of tokens to generate after the prompt.

    Returns:
        The generated text (prompt excluded), stripped of surrounding
        whitespace.

    Raises:
        ValueError: If ``max_new_tokens`` leaves no room for any prompt
            inside the model's context window.
    """
    # Token ids of the bare separator suffix, needed to re-attach it after
    # truncating an over-long article.
    separator_ids = tokenizer.encode(prepare_input_for_generation(""))
    prompt_ids = tokenizer.encode(prepare_input_for_generation(article))

    # Prompt and generated tokens share the model's context window.
    max_prompt_tokens = model.config.n_positions - max_new_tokens
    if max_prompt_tokens <= len(separator_ids):
        raise ValueError(
            f"max_new_tokens={max_new_tokens} leaves no room for the prompt"
        )
    if len(prompt_ids) > max_prompt_tokens:
        kept = max_prompt_tokens - len(separator_ids)
        prompt_ids = prompt_ids[:kept] + separator_ids

    input_ids = torch.tensor([prompt_ids], device=device)
    attention_mask = torch.ones_like(input_ids)

    # Make sure dropout is off while generating.
    model.eval()

    # Generate summary
    summary_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,  # budget for the summary only, prompt excluded
        do_sample=True,                 # required for top_k / top_p to be used
        top_k=10,                       # Limit the number of highest probability tokens
        top_p=0.7,                      # Use nucleus sampling
        pad_token_id=tokenizer.eos_token_id  # Ensure padding is handled correctly
    )

    # Decode only the newly generated tokens, i.e. everything after the prompt.
    generated_ids = summary_ids[0][input_ids.shape[1]:]
    generated_summary = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    return generated_summary

In [None]:
# Smoke test: summarize one lower-cased news article with the fine-tuned model.
input_article = "the bishop of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a virus in late september and early october. the state health department has issued an advisory of exposure for anyone who attended five churches and took communion. bishop john folda (pictured) of the fargo catholic diocese in north dakota has exposed potentially hundreds of church members in fargo, grand forks and jamestown to the hepatitis a . state immunization program manager molly howell says the risk is low, but officials feel it is important to alert people to the possible exposure. the diocese announced on monday that bishop john folda is taking time off after being diagnosed with hepatitis a. the diocese says he contracted the infection through contaminated food while attending a conference for newly ordained bishops in italy last month. symptoms of hepatitis a include fever, tiredness, loss of appetite, nausea and abdominal discomfort. fargo catholic diocese in north dakota (pictured) is where the bishop is located ."
generate_summary(input_article, device)