In [None]:

!pip install transformers datasets torch


import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Trainer, TrainingArguments
from datasets import load_dataset


# Base checkpoint to fine-tune; tokenizer and weights are loaded from the same id.
model_id = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Fetch the dataset from the Hugging Face Hub; only its 'train' split is used here.
dataset = load_dataset("Ram20307/carbonetralpathway")

# Hold out 10% of the rows for validation. The fixed seed makes the split
# identical after a kernel restart, so validation losses stay comparable
# between runs instead of being measured on a different random subset each time.
dataset_split = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_split['train']
eval_dataset = dataset_split['test']


def preprocess_function(examples):
    """Tokenize source/target text and build loss-masked labels for seq2seq training.

    Accepts either a batch (``examples['input']`` / ``examples['output']`` are
    lists of strings, as produced by ``Dataset.map(..., batched=True)``) or a
    single example (both are plain strings).

    Args:
        examples: Mapping with an ``'input'`` entry (source text) and an
            ``'output'`` entry (target text).

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` for the source text,
        ``'labels'`` (target token ids with every padding position replaced by
        -100) and ``'labels_attention_mask'`` (attention mask of the target).
    """
    inputs = tokenizer(examples['input'], truncation=True, padding="max_length", max_length=512)
    outputs = tokenizer(examples['output'], truncation=True, padding="max_length", max_length=512)

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the label value the loss ignores, so padding adds nothing to it.
        return [-100 if token == pad_id else token for token in token_ids]

    labels = outputs['input_ids']
    if labels and isinstance(labels[0], (list, tuple)):
        # Batched call: one id sequence per example. The mask must be applied
        # per token; comparing a whole sequence to pad_id is never true and
        # would leave every padding id in the labels.
        labels = [_mask_padding(sequence) for sequence in labels]
    else:
        # Single example: a flat list of token ids.
        labels = _mask_padding(labels)

    return {
        'input_ids': inputs['input_ids'],
        'attention_mask': inputs['attention_mask'],
        'labels': labels,
        'labels_attention_mask': outputs['attention_mask']
    }

# Tokenize both splits in batches; each mapped dataset replaces its raw counterpart
# (training split first, then the validation split).
train_dataset, eval_dataset = (
    split.map(preprocess_function, batched=True)
    for split in (train_dataset, eval_dataset)
)

# Batch collator for seq2seq training; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Run configuration: three epochs, with a checkpoint saved and an evaluation
# pass run at the end of every epoch.
training_args = TrainingArguments(
    output_dir="./flan_t5_finetuned",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    learning_rate=1e-4,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

# Wire the model, data, tokenizer and collator into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)

# Start fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed.
trainer.train()


Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/144 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2700 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss
1,0.7002,0.057459
2,0.0615,0.048568


Epoch,Training Loss,Validation Loss
1,0.7002,0.057459
2,0.0615,0.048568
3,0.0516,0.045856


TrainOutput(global_step=2025, training_loss=0.2149973287700135, metrics={'train_runtime': 2983.3575, 'train_samples_per_second': 2.715, 'train_steps_per_second': 0.679, 'total_flos': 5546534554828800.0, 'train_loss': 0.2149973287700135, 'epoch': 3.0})

In [None]:
# Re-create the tokenizer and model from the base Hub checkpoint.
# NOTE(review): this rebinds `model` to whatever weights `model_id` points at
# (the base checkpoint), not to weights produced by the training cell above.
model_id = "google/flan-t5-base"
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_id),
    AutoModelForSeq2SeqLM.from_pretrained(model_id),
)

In [None]:
import os

# Directory the Trainer writes its checkpoints into (the `output_dir` given to
# TrainingArguments). Defined here so the cell runs on a fresh kernel instead
# of relying on a `model_path` left over from an earlier session.
model_path = "./flan_t5_finetuned"

print(os.listdir(model_path))


['checkpoint-675', 'checkpoint-1350', 'checkpoint-2025']


In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook: it would be saved with the
# file and leak when the notebook is shared. Read it from the HF_TOKEN
# environment variable instead; when that is unset, token=None makes login()
# ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint directories live under ./flan_t5_finetuned and are named
# checkpoint-<global step>; point this at the one to load.
checkpoint_path = "./flan_t5_finetuned/checkpoint-2025"

# Both the weights and the tokenizer files are read from the checkpoint directory.
model, tokenizer = (
    AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path),
    AutoTokenizer.from_pretrained(checkpoint_path),
)



In [None]:
# Prompt to test the loaded model with.
input_text = "carbon neutral pathway suggestion for 70000 tonnes of co2e"

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(input_text, return_tensors="pt")

# Beam-search decoding: 5 beams, early stopping, output capped at 150 tokens
# (raise max_length if answers get cut off).
outputs = model.generate(
    num_beams=5,
    early_stopping=True,
    max_length=150,
    **inputs,
)

# Turn the first returned sequence back into text, dropping special tokens.
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)


to address the carbon emissions of 70000 tonnes of co2e, it is recommended to plant approximately 3,227,409 trees over an area of 1786.00 hectares. this afforestation effort is expected to yield around 1,220,000 carbon credits. by implementing this strategy, you will significantly contribute to carbon offset initiatives through co2 sequestration, thereby advancing environmental sustainability goals.


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from huggingface_hub import HfApi, HfFolder

# Load the fine-tuned model and tokenizer from the checkpoint directory
checkpoint_path = "./flan_t5_finetuned/checkpoint-2025"
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Define repository information
model_name = "samplefinetunedsih"  # Replace with your desired model repository name
model_repo = f"Ram20307/{model_name}"  # Replace 'Ram20307' with your Hugging Face username

# Create the repository. exist_ok=True makes this a no-op when the repository
# already exists, so re-running the cell (e.g. to upload a newer checkpoint)
# does not fail on the creation step.
api = HfApi()
api.create_repo(repo_id=model_repo, repo_type="model", exist_ok=True)

# Push the model weights and the tokenizer files to the same repository
model.push_to_hub(model_repo)
tokenizer.push_to_hub(model_repo)

print(f"Model and tokenizer pushed to {model_repo}")


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Model and tokenizer pushed to Ram20307/samplefinetunedsih


In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# --- Route 1: high-level pipeline backed by the published Hub repository ---
pipe = pipeline(task="text2text-generation", model="Ram20307/samplefinetunedsih")

input_text = "carbon neutral pathway suggestion for 70000 tonnes of co2e"

# Sampled generation (temperature 0.7), one sequence, at most 150 tokens.
# The result is kept in `output` and is not printed; the generated string is
# available as output[0]['generated_text'].
output = pipe(
    input_text,
    do_sample=True,
    temperature=0.7,
    num_return_sequences=1,
    max_length=150,
)

# --- Route 2: load tokenizer and model explicitly and call generate() ---
tokenizer = AutoTokenizer.from_pretrained("Ram20307/samplefinetunedsih")
model = AutoModelForSeq2SeqLM.from_pretrained("Ram20307/samplefinetunedsih")

inputs = tokenizer(input_text, return_tensors="pt")

# Sampling is enabled, so the generated text can differ from run to run.
outputs = model.generate(
    do_sample=True,   # sample instead of greedy decoding
    temperature=0.7,  # sampling temperature
    top_k=50,         # top-k sampling
    top_p=0.95,       # top-p sampling
    max_length=150,   # cap on the generated length
    **inputs,
)

# Decode the first generated sequence, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Manual Model Output:", generated_text)


Manual Model Output: to address the carbon emissions of 70000 tonnes of co2e, it is recommended to plant approximately 3,045,409 trees over an area of 1783.00 hectares. this afforestation effort is expected to yield around 1,220,000 carbon credits. by implementing this strategy, you will significantly contribute to carbon offset initiatives through co2 sequestration, thereby advancing environmental sustainability goals.
