In [2]:
!pip install evaluate
!pip install optuna
!pip install datasets
!pip install bert_score
!pip install rouge_score

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.18-py311-none-any.whl.metadata (7.5 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m 

In [3]:
import json
import torch
from datasets import load_dataset
import nltk
import optuna
from transformers import BartTokenizer, BartForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
from evaluate import load

# Download NLTK data for sentence tokenization
nltk.download("punkt")

# Step 1: Load the ClimateFever dataset using Hugging Face datasets
print("Loading ClimateFever dataset...")
climatefever_dataset = load_dataset("climate_fever", split="test")

# Step 2: Extract and adapt (problem, approach) pairs
dataset = []

env_keywords = [
    "climate change", "carbon emission", "pollution", "biodiversity",
    "deforestation", "renewable energy", "sustainability", "ocean acidification"
]

def is_env_science(claim):
    claim_lower = claim.lower()
    return any(keyword in claim_lower for keyword in env_keywords)

def synthesize_approach(claim, evidence_list):
    evidence_text = " ".join([evidence["evidence"] for evidence in evidence_list])
    if "carbon emission" in claim.lower() or "global warming" in claim.lower():
        approach = f"To address the issue of {claim.lower()}, a multi-step strategy can be implemented: 1. Promote renewable energy adoption by offering incentives such as tax credits for solar and wind energy installations. 2. Expand public transportation systems to reduce reliance on fossil fuel-based vehicles, especially in urban areas. 3. Implement stricter regulations on industrial emissions, requiring companies to adopt cleaner technologies and report emissions annually. Additionally, public awareness campaigns can educate communities about sustainable practices, such as reducing energy consumption and supporting green policies. International collaboration with organizations like the UN can help secure funding and coordinate efforts across countries, ensuring a unified approach to tackling this issue. {evidence_text} This approach aims to mitigate the environmental impact while fostering long-term sustainability."
    elif "pollution" in claim.lower():
        approach = f"To mitigate {claim.lower()}, a comprehensive plan can be adopted: 1. Enforce regulations banning single-use plastics and promoting biodegradable alternatives. 2. Enhance waste management systems by increasing recycling facilities and ensuring proper disposal in affected regions. 3. Launch cleanup initiatives, such as deploying technologies to remove debris from ecosystems. 4. Educate communities about the impact of pollution through school programs and media campaigns, encouraging reduced waste production. Collaboration with global organizations can help secure funding and coordinate efforts across regions, ensuring a unified approach to tackling this issue. {evidence_text} This strategy aims to reduce pollution while promoting sustainable practices."
    else:
        approach = f"To address {claim.lower()}, the following approach can be implemented: 1. Develop policies to protect ecosystems, such as establishing protected areas and regulating resource extraction. 2. Promote sustainable practices among communities through education and incentives. 3. Invest in research to better understand the issue and develop innovative solutions. 4. Foster international cooperation to address global aspects of the problem. {evidence_text} This approach seeks to balance environmental protection with sustainable development, ensuring long-term benefits for both nature and society."

    word_count = len(approach.split())
    if not (150 <= word_count <= 300):
        if word_count < 150:
            approach += " Furthermore, engaging stakeholders at all levels—from local communities to international policymakers—ensures that solutions are both practical and widely supported, maximizing their impact over time."
        elif word_count > 300:
            approach = " ".join(approach.split()[:300])

    return approach

claim_to_evidence = {}
for entry in climatefever_dataset:
    claim = entry["claim"]
    evidence = entry["evidences"]
    if not is_env_science(claim):
        continue
    if claim not in claim_to_evidence:
        claim_to_evidence[claim] = []
    claim_to_evidence[claim].extend(evidence)

for claim, evidence_list in claim_to_evidence.items():
    if not evidence_list:
        continue
    approach = synthesize_approach(claim, evidence_list)
    dataset.append({"problem": claim, "approach": approach})
    if len(dataset) >= 500:
        break

with open("environmental_science_climatefever_dataset.json", "w") as f:
    json.dump(dataset, f, indent=4)

print(f"Dataset created with {len(dataset)} pairs. Saved to environmental_science_climatefever_dataset.json")

with open("environmental_science_climatefever_dataset.json", "r") as f:
    data = json.load(f)

inputs = ["problem: " + item["problem"] for item in data]
targets = [item["approach"] for item in data]

dataset = Dataset.from_dict({"input_text": inputs, "target_text": targets})

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def preprocess_function(examples):
    inputs = examples["input_text"]
    targets = examples["target_text"]
    model_inputs = tokenizer(inputs, max_length=64, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)

train_test_split = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = train_test_split["train"]
eval_dataset = train_test_split["test"]

def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [4, 8, 16])
    num_train_epochs = trial.suggest_int("num_train_epochs", 3, 10)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    training_args = TrainingArguments(
        output_dir=f"./bart_env_science_trial_{trial.number}",
        eval_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        save_strategy="epoch",
        logging_dir=f"./logs/trial_{trial.number}",
        logging_steps=10,
        report_to="none",
    )

    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(device)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    trainer.train()

    rouge = load("rouge")
    predictions = []
    references = []

    for example in eval_dataset:
        input_text = example["input_text"]
        inputs = tokenizer(input_text, return_tensors="pt", max_length=64, truncation=True)
        inputs = {key: val.to(device) for key, val in inputs.items()}
        outputs = model.generate(inputs["input_ids"], max_length=256, num_beams=4, early_stopping=True)
        generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(generated)
        references.append(example["target_text"])

    rouge_results = rouge.compute(predictions=predictions, references=references)
    return rouge_results["rougeL"]

print("Starting hyperparameter optimization with Optuna...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=5)

best_trial = study.best_trial
print("Best trial:")
print(f"  ROUGE-L: {best_trial.value}")
print("  Best hyperparameters: ", best_trial.params)

best_learning_rate = best_trial.params["learning_rate"]
best_batch_size = best_trial.params["batch_size"]
best_num_train_epochs = best_trial.params["num_train_epochs"]

final_training_args = TrainingArguments(
    output_dir="./bart_env_science_final",
    eval_strategy="epoch",
    learning_rate=best_learning_rate,
    per_device_train_batch_size=best_batch_size,
    per_device_eval_batch_size=best_batch_size,
    num_train_epochs=best_num_train_epochs,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs/final",
    logging_steps=10,
)

final_model = BartForConditionalGeneration.from_pretrained("facebook/bart-base").to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))
final_trainer = Trainer(
    model=final_model,
    args=final_training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Training final model with best hyperparameters...")
final_trainer.train()

final_model.save_pretrained("./bart_env_science_final_model")
tokenizer.save_pretrained("./bart_env_science_final_model")

print("Final model training complete and saved to ./bart_env_science_final_model")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Loading ClimateFever dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.09k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/869k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/1535 [00:00<?, ? examples/s]

Dataset created with 173 pairs. Saved to environmental_science_climatefever_dataset.json


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/173 [00:00<?, ? examples/s]

[I 2025-04-25 02:19:47,191] A new study created in memory with name: no-name-b98470e0-1263-4065-b011-bc04067a9c9e


Starting hyperparameter optimization with Optuna...


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss
1,2.754,2.005248
2,1.8574,1.794053
3,1.234,1.798009
4,1.0749,1.825169
5,0.7719,1.919367
6,0.5884,1.962801
7,0.4751,2.024443
8,0.368,2.032342
9,0.3258,2.066427
10,0.2742,2.065082




Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

[I 2025-04-25 02:24:10,491] Trial 0 finished with value: 0.5054513463120134 and parameters: {'learning_rate': 0.00018079396908209555, 'batch_size': 4, 'num_train_epochs': 10}. Best is trial 0 with value: 0.5054513463120134.


Epoch,Training Loss,Validation Loss
1,4.0116,3.513608
2,2.4878,2.585549
3,1.831,2.57375
4,1.4089,2.590152
5,1.026,2.704611
6,0.8159,2.767109


[I 2025-04-25 02:27:03,638] Trial 1 finished with value: 0.40695922576725796 and parameters: {'learning_rate': 0.0006612786959225536, 'batch_size': 8, 'num_train_epochs': 6}. Best is trial 0 with value: 0.5054513463120134.


Epoch,Training Loss,Validation Loss
1,4.4676,2.536941
2,2.4853,2.009352
3,1.7639,1.853535
4,1.4252,1.819193
5,1.2371,1.819258


[I 2025-04-25 02:29:27,123] Trial 2 finished with value: 0.373181757873953 and parameters: {'learning_rate': 0.00040062571307323506, 'batch_size': 16, 'num_train_epochs': 5}. Best is trial 0 with value: 0.5054513463120134.


Epoch,Training Loss,Validation Loss
1,3.7956,2.391702
2,2.5339,2.102465
3,2.1808,1.976437
4,1.9285,1.868206
5,1.7299,1.812015
6,1.6021,1.768495
7,1.5201,1.753811
8,1.4711,1.741342
9,1.4275,1.741447


[I 2025-04-25 02:34:37,063] Trial 3 finished with value: 0.32426426200959363 and parameters: {'learning_rate': 8.759694625613774e-05, 'batch_size': 16, 'num_train_epochs': 9}. Best is trial 0 with value: 0.5054513463120134.


Epoch,Training Loss,Validation Loss
1,3.2255,2.517161
2,2.3076,2.446448
3,1.3566,2.460692
4,1.0915,2.532108


[I 2025-04-25 02:38:39,932] Trial 4 finished with value: 0.2357724425941916 and parameters: {'learning_rate': 0.00047849192746592936, 'batch_size': 4, 'num_train_epochs': 4}. Best is trial 0 with value: 0.5054513463120134.


Best trial:
  ROUGE-L: 0.5054513463120134
  Best hyperparameters:  {'learning_rate': 0.00018079396908209555, 'batch_size': 4, 'num_train_epochs': 10}
Training final model with best hyperparameters...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msqp6023[0m ([33msqp6023-penn-state[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.7166,1.959965
2,1.8447,1.794456
3,1.228,1.795447
4,1.0344,1.815832
5,0.776,1.922184
6,0.5726,1.964445
7,0.4703,2.02774
8,0.3671,2.034924
9,0.3158,2.06718
10,0.2746,2.065202




Final model training complete and saved to ./bart_env_science_final_model


In [5]:
# Step 6: Evaluate the final model
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load metrics
rouge = load("rouge")
bertscore = load("bertscore")

# Experiment 1: Standard input format
predictions_standard = []
references = []

for example in eval_dataset:
    input_text = example["input_text"]
    inputs = tokenizer(input_text, return_tensors="pt", max_length=64, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = final_model.generate(inputs["input_ids"], max_length=256, num_beams=4, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions_standard.append(generated)
    references.append(example["target_text"])

# ✅ Experiment 2: Keyword-augmented format
predictions_keywords = []

for example in eval_dataset:
    problem_text = example["input_text"].replace("problem: ", "")
    input_text_with_keywords = f"problem: {problem_text} [climate change, sustainability]"
    inputs = tokenizer(input_text_with_keywords, return_tensors="pt", max_length=64, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}
    outputs = final_model.generate(inputs["input_ids"], max_length=256, num_beams=4, early_stopping=True)
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    predictions_keywords.append(generated)

# Compute ROUGE-L and BERTScore for input with keywords
rouge_results_keywords = rouge.compute(predictions=predictions_keywords, references=references)
bertscore_results_keywords = bertscore.compute(predictions=predictions_keywords, references=references, lang="en")
print("\nEvaluation with keywords in input format:")
print("ROUGE-L:", rouge_results_keywords["rougeL"])
print("BERTScore (F1):", sum(bertscore_results_keywords["f1"]) / len(bertscore_results_keywords["f1"]))

# Manual evaluation: Print a few examples
print("\nManual Evaluation (First 3 Examples):")
for i in range(min(3, len(eval_dataset))):
    print(f"\nProblem: {eval_dataset[i]['input_text']}")
    print(f"Generated Approach (Standard): {predictions_standard[i]}")
    print(f"Generated Approach (With Keywords): {predictions_keywords[i]}")
    print(f"Ground Truth: {references[i]}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Evaluation with keywords in input format:
ROUGE-L: 0.47465975289251106
BERTScore (F1): 0.8966621789667342

Manual Evaluation (First 3 Examples):

Problem: problem: During a state House debate on a jobs and energy bill this week, Democrats offered an amendment that would put the Legislature on record saying that climate change is real and that humans are causing it.
Generated Approach (Standard): To address during a state house debate on a jobs and energy bill this week, Democrats offered an amendment that would put the Legislature on record saying that climate change is real and that humans are causing it., the following approach can be implemented: 1. Develop policies to protect ecosystems, such as establishing protected areas and regulating resource extraction. 2. Promote sustainable practices among communities through education and incentives. 3. Invest in research to better understand the issue and develop innovative solutions. 4. Foster international cooperation to address global