In [1]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load the data
data = pd.read_csv("summery_training.csv", encoding="latin1")
print(data.head())

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(data)

# Step 2: Tokenize the data
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

                    country  \
0      United Arab Emirates   
1                Kazakhstan   
2              Saudi Arabia   
3  United States of America   
4                   Finland   

                                  policy_description  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activities which may have a...   
2  Saudi Arabia aims to increase the share of nat...   
3              Focused on better lighting solutions.   
4  Investment proposal for areas with coal-fired ...   

                                             Summary  \
0  The National Hydrogen Strategy 2050 aims to po...   
1  A code requiring the largest polluters to adop...   
2  Saudi Arabia aims for 50% of electricity from ...   
3  Focus on enhancing lighting solutions for ener...   
4  Proposal for investing in regions with coal pl...   

                             facebook/bart-large-cnn  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activ

In [3]:
def preprocess_function(examples):
    # Tokenize inputs (prefix input with "summarize: " for T5 models)
    inputs = ["summarize: " + text for text in examples["policy_description"]]
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    # Tokenize targets (summary)
    labels = tokenizer(
        examples["Summary"],
        max_length=150,
        truncation=True,
        padding="max_length",
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 28.29 examples/s]


In [4]:
# Step 3: Load the T5 model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Step 4: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./t5-summarization",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    fp16=True,
)



In [5]:
# Step 5: Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Step 6: Train and Evaluate
trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,15.053608
2,No log,13.826735
3,No log,13.389071


TrainOutput(global_step=9, training_loss=14.587103949652779, metrics={'train_runtime': 45.7765, 'train_samples_per_second': 0.59, 'train_steps_per_second': 0.197, 'total_flos': 3654228639744.0, 'train_loss': 14.587103949652779, 'epoch': 3.0})

In [6]:
trainer.evaluate()

{'eval_loss': 13.389071464538574,
 'eval_runtime': 2.7934,
 'eval_samples_per_second': 3.222,
 'eval_steps_per_second': 1.074,
 'epoch': 3.0}

In [7]:
# Step 7: Save the Model
model.save_pretrained("./t5-summarization")
tokenizer.save_pretrained("./t5-summarization")

('./t5-summarization/tokenizer_config.json',
 './t5-summarization/special_tokens_map.json',
 './t5-summarization/tokenizer.json')

In [8]:
# Load the trained model and tokenizer
model_name = "./t5-summarization"  # Path to the saved model
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Function to generate a summary for a given text
def generate_summary(text):
    # Prefix input with "summarize: " as required by T5
    inputs = tokenizer(
        "summarize: " + text,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding="max_length",
    )

    # Generate the summary
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,
        num_beams=4,
        early_stopping=True,
    )

    # Decode and return the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example usage
policy_description = "Resolution 1-E/2017 establishes a discount of up to 20% on electricity prices for energy-intensive industries. Provision 3/2018 (2018, as part of Joint Resolution 1-E/2017) Companies that want to benefit from reduced electicity price have to implement the ISO norm 50001 on energy management systems (i.e. develop a plan of action for energy management, establish targets for energy performance, and define indicators to monitor progress)"
summary = generate_summary(policy_description)
print("Generated Summary:", summary)


Generated Summary: resolution 1-E/2017 establishes a discount of up to 20% on electricity prices. companies that want to benefit from reduced electicity price have to implement the ISO norm 50001 on energy management systems.
