In [4]:
!pip install pytesseract transformers datasets rouge-score nltk tensorboard py7zr --upgrade




In [5]:
# install git-fls for pushing model and logs to the hugging face hub
!sudo apt-get install git-lfs --yes

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.3).
0 upgraded, 0 newly installed, 0 to remove and 60 not upgraded.


In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Step 1: Load the data
data = pd.read_csv('summery_training.csv', encoding='latin1')
print(data.head())

# Convert to Hugging Face Dataset format
dataset = Dataset.from_pandas(data)

# Step 2: Tokenize the data
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

                    country  \
0      United Arab Emirates   
1                Kazakhstan   
2              Saudi Arabia   
3  United States of America   
4                   Finland   

                                  policy_description  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activities which may have a...   
2  Saudi Arabia aims to increase the share of nat...   
3              Focused on better lighting solutions.   
4  Investment proposal for areas with coal-fired ...   

                                             Summary  \
0  The National Hydrogen Strategy 2050 aims to po...   
1  A code requiring the largest polluters to adop...   
2  Saudi Arabia aims for 50% of electricity from ...   
3  Focus on enhancing lighting solutions for ener...   
4  Proposal for investing in regions with coal pl...   

                             facebook/bart-large-cnn  \
0  The National Hydrogen Strategy 2050 aims to su...   
1  The code regulates activ

In [3]:
# Step 3: Preprocess the data
def preprocess_function(examples):
    # Tokenize the policy description (input)
    model_inputs = tokenizer(
        examples["policy_description"],
        max_length=512,
        truncation=True,
        padding="max_length"
    )
    
    # Tokenize the summary (target labels)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["Summary"],
            max_length=150,  # Limit the length of the summary to avoid excessively long labels
            truncation=True,
            padding="max_length"
        )

    # Assign labels to the tokenized input
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map: 100%|█████████████████████| 9/9 [00:00<00:00, 130.27 examples/s]


In [4]:
# Step 4: Load the model and set up training arguments
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

training_args = TrainingArguments(
    output_dir="./flan-t5-policy-finetune",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    weight_decay=0.01,
    num_train_epochs=3,
    save_total_limit=3,
    fp16=True,
)



In [5]:
# Step 5: Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
0,No log,39.849087


TrainOutput(global_step=0, training_loss=112451.98822021484, metrics={'train_runtime': 195.8287, 'train_samples_per_second': 0.138, 'train_steps_per_second': 0.015, 'total_flos': 4108544114688.0, 'train_loss': 112451.98822021484, 'epoch': 0})

In [6]:
# Step 7: Evaluate trained model
trainer.evaluate()

{'eval_loss': 39.84908676147461,
 'eval_runtime': 9.1868,
 'eval_samples_per_second': 0.98,
 'eval_steps_per_second': 0.544,
 'epoch': 0}

In [7]:
# Step 7: Save the trained model
model.save_pretrained("./flan-t5-policy-finetune")
tokenizer.save_pretrained("./flan-t5-policy-finetune")


('./flan-t5-policy-finetune/tokenizer_config.json',
 './flan-t5-policy-finetune/special_tokens_map.json',
 './flan-t5-policy-finetune/tokenizer.json')

In [8]:
model = AutoModelForSeq2SeqLM.from_pretrained("./flan-t5-policy-finetune")
tokenizer = AutoTokenizer.from_pretrained("./flan-t5-policy-finetune")

# Function to generate a policy summary
def generate_policy_summary(policy_description):
    inputs = tokenizer(
        policy_description,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    
    # Generate summaries
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=150,          # Adjust the max length as needed
        num_beams=5,             # Increase beams for better quality
        early_stopping=True,
        temperature=0.7,         # Adjust for diversity in the output
        num_return_sequences=1
    )
    
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return summary

# Example policy description
policy_description = "Resolution 1-E/2017 establishes a discount of up to 20% on electricity prices for energy-intensive industries. Provision 3/2018 (2018, as part of Joint Resolution 1-E/2017) Companies that want to benefit from reduced electicity price have to implement the ISO norm 50001 on energy management systems (i.e. develop a plan of action for energy management, establish targets for energy performance, and define indicators to monitor progress)"

# Generate the summary
summary = generate_policy_summary(policy_description)

print(summary)



Revision 1-E/2017 establishes a discount of up to 20% on electricity prices for energy-intensive industries.
