In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, BartForConditionalGeneration, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load the data
data = pd.read_csv('summery_training.csv', encoding='latin1')
print(data.head())

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(data)

# Step 2: Tokenize the data
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)

                    country  \
0      United Arab Emirates   
1                Kazakhstan   
2              Saudi Arabia   
3  United States of America   
4                   Finland   

                                  policy_description  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activities which may have a...   
2  Saudi Arabia aims to increase the share of nat...   
3              Focused on better lighting solutions.   
4  Investment proposal for areas with coal-fired ...   

                                             Summary  \
0  The National Hydrogen Strategy 2050 aims to po...   
1  A code requiring the largest polluters to adop...   
2  Saudi Arabia aims for 50% of electricity from ...   
3  Focus on enhancing lighting solutions for ener...   
4  Proposal for investing in regions with coal pl...   

                             facebook/bart-large-cnn  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activ

In [3]:
def preprocess_function(examples):
    # Tokenize the policy description (input)
    model_inputs = tokenizer(
        examples["policy_description"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    
    # Tokenize the summary (target labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["Summary"],
            max_length=150,
            truncation=True,
            padding="max_length"
        )

    # Assign labels to the tokenized input
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|██████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 27.24 examples/s]


In [4]:
# Step 3: Load the model and set up training arguments
model = BartForConditionalGeneration.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./bart-policy-finetune",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=3,
    fp16=True,
)



In [5]:
# Step 4: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,No log,10.984097


TrainOutput(global_step=0, training_loss=34040.03143310547, metrics={'train_runtime': 76.7722, 'train_samples_per_second': 0.352, 'train_steps_per_second': 0.039, 'total_flos': 6501313806336.0, 'train_loss': 34040.03143310547, 'epoch': 0})

In [6]:
trainer.evaluate()

{'eval_loss': 10.98409652709961,
 'eval_runtime': 13.5388,
 'eval_samples_per_second': 0.665,
 'eval_steps_per_second': 0.369,
 'epoch': 0}

In [7]:
# Step 6: Save the trained model
model.save_pretrained("./bart-policy-finetune")
tokenizer.save_pretrained("./bart-policy-finetune")



('./bart-policy-finetune/tokenizer_config.json',
 './bart-policy-finetune/special_tokens_map.json',
 './bart-policy-finetune/vocab.json',
 './bart-policy-finetune/merges.txt',
 './bart-policy-finetune/added_tokens.json',
 './bart-policy-finetune/tokenizer.json')

In [8]:
model = BartForConditionalGeneration.from_pretrained("./bart-policy-finetune")
tokenizer = AutoTokenizer.from_pretrained("./bart-policy-finetune")

# Function to generate a policy summary
def generate_policy_summary(policy_description):
    inputs = tokenizer(
        policy_description,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Generate summaries
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,          # Adjust the max length as needed
        num_beams=5,             # Increase beams for better quality
        early_stopping=True,
        temperature=0.7,         # Adjust for diversity in the output
        num_return_sequences=1
    )
    
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example policy description
policy_description = "Resolution 1-E/2017 establishes a discount of up to 20% on electricity prices for energy-intensive industries. Provision 3/2018 (2018, as part of Joint Resolution 1-E/2017) Companies that want to benefit from reduced electicity price have to implement the ISO norm 50001 on energy management systems (i.e. develop a plan of action for energy management, establish targets for energy performance, and define indicators to monitor progress)"

# Generate the summary
summary = generate_policy_summary(policy_description)

print(summary)



Resolution 1-E/2017 establishes a discount of up to 20% on electricity prices for energy-intensive industries. Companies that want to benefit from reduced electicity price have to implement the ISO norm 50001 on energy management systems.
