In [14]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
import chromadb
from pathlib import Path
import os
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import torch


In [2]:
# Set the absolute path for the Chroma database
ABS_PATH = Path().resolve().joinpath('Chroma1')
DB_DIR = os.path.join(ABS_PATH, "env_policy")
print('DB_DIR:', DB_DIR)

DB_DIR: /Users/rahul/Projects/AI-Project/ClimateActionPolicy-RAG/Chroma1/env_policy


In [3]:
# Define client settings for Chroma
client_settings = chromadb.config.Settings(
    is_persistent=True,
    persist_directory=DB_DIR,
    anonymized_telemetry=False,
)

In [4]:
# Initialize the HuggingFaceBgeEmbeddings with the correct model
embedder = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-small-en")


In [5]:
# Load the Chroma vector store
bge_vectorstore = Chroma(
    embedding_function=embedder,
    client_settings=client_settings,
    collection_name="env_policy_bge",
    collection_metadata={"hnsw": "cosine"}
)


In [6]:
# Initialize the retriever
retriever = bge_vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "include_metadata": True}
)

In [7]:
# Define the query
query = "Give me the top three things to consider when writing my climate action plan."



In [10]:
# Retrieve documents
retrieved_docs = retriever.invoke(query)
print(retrieved_docs)


[Document(page_content='Timestamp: 14/06/2024 19:50:18\nTitle: How to prioritise actions for your climate action plan\nContent: At the heart of a city’s climate action plan (CAP) is a set of actions that have been selected, prioritised and defined to ensure that the vision, goals and targets of the plan can be achieved. There is no one correct set of actions for achieving your city’s climate change goals. Ultimately, the actions in your plan need to be underpinned by high ambition, a strong evidence base, robust engagement and governance structures. They should have in-built flexibility to allow the city to learn from successes and failures and to adapt the approach over time.\nOur CAP guide will be updated shortly in line with the Cities Climate Transition Framework, released in December 2023.\nThere are many ways to develop a set of actions for your city’s CAP. Whether your city is developing its first CAP, or consolidating or strengthening existing plans, this article offers guidanc

In [11]:
# Function to create a context from retrieved documents
def create_context_from_docs(docs):
    return " ".join([doc.page_content for doc in docs])


In [12]:
# Create context from retrieved documents
context = create_context_from_docs(retrieved_docs)
print(context)


Timestamp: 14/06/2024 19:50:18
Title: How to prioritise actions for your climate action plan
Content: At the heart of a city’s climate action plan (CAP) is a set of actions that have been selected, prioritised and defined to ensure that the vision, goals and targets of the plan can be achieved. There is no one correct set of actions for achieving your city’s climate change goals. Ultimately, the actions in your plan need to be underpinned by high ambition, a strong evidence base, robust engagement and governance structures. They should have in-built flexibility to allow the city to learn from successes and failures and to adapt the approach over time.
Our CAP guide will be updated shortly in line with the Cities Climate Transition Framework, released in December 2023.
There are many ways to develop a set of actions for your city’s CAP. Whether your city is developing its first CAP, or consolidating or strengthening existing plans, this article offers guidance on the process of selectin

facebook/bart-large-cnn

In [11]:
# Initialize the text generation pipeline with a summarization model
generator = pipeline('summarization', model='facebook/bart-large-cnn')

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
def generate_with_bart(context, question, max_length=1024):
    input_text = f"{context} {question}"
    # Split the input_text into chunks of max_length
    input_chunks = [input_text[i:i + max_length] for i in range(0, len(input_text), max_length)]
    summaries = []
    for chunk in input_chunks:
        result = generator(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(result[0]['summary_text'])
    return " ".join(summaries)

In [17]:
# Define the context and question

question = "Give me the top three things to consider when writing my climate action plan."


In [18]:
# Generate the answer using the context and the question
answer = generate_with_bart(context, question)

print("Answer:", answer)

Answer: There is no one correct set of actions for achieving your city’s climate change goals. The actions in your plan need to be underpinned by high ambition. They should have in-built flexibility to allow the city to learn from successes and failures and to adapt the approach over time. Actions include any policy, project, programme, strategy, partnership, investment or infrastructure that leads to emission reduction and climate resilience. They can be at any size, scale or stage of implementation, from concept to implementation and monitoring. Identify and give structure to a ‘longlist’ of actions. These actions should present a clear, coherent and robust approach to meeting your city’s climate change vision, goals and targets. There are multiple, feasible and effective options to reduce greenhouse gas emissions and adapt to human-caused climate change. Cities can use the Climate Action Planning Guide to develop effective climate action plans. Taking right action now could result i

In [12]:
# Define the question
question = "Give me the top three things to consider when writing my climate action plan."


distilgpt2

In [19]:
# Initialize the Hugging Face text generation pipeline
generator = pipeline('text-generation', model='distilgpt2')

In [20]:
# Function to generate an answer using the Hugging Face generator
def generate_with_huggingface(context, question):
    input_text = f"{context} {question}"
    # Split input_text into chunks that the model can handle
    max_length = 512  # Adjust this length according to the model's capacity
    input_chunks = [input_text[i:i + max_length] for i in range(0, len(input_text), max_length)]
    generated_text = ""
    for chunk in input_chunks:
        result = generator(chunk, max_length=max_length, num_return_sequences=1)
        generated_text += result[0]['generated_text']
    return generated_text


In [21]:
# Generate the answer using the context and the question
answer = generate_with_huggingface(context, question)

print("Answer:", answer)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

Answer: Timestamp: 14/06/2024 19:50:18
Title: How to prioritise actions for your climate action plan
Content: At the heart of a city’s climate action plan (CAP) is a set of actions that have been selected, prioritised and defined to ensure that the vision, goals and targets of the plan can be achieved. There is no one correct set of actions for achieving your city’s climate change goals. Ultimately, the actions in your plan need to be underpinned by high ambition, a strong evidence base, robust engagement and goverment to ensure that actions for all areas and regions are implemented.
It is crucial to follow the vision outlined here as the following:
To ensure the success of our global climate negotiations, we are taking action now to achieve each and every change that our people and the world should address. Together, we will win in the next elections. Our vision is not the only way forward, but we need to be very clear where each and every action at this stage needs to be prioritised,

google/flan-t5-large

In [20]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('google/flan-t5-large')
model = T5ForConditionalGeneration.from_pretrained('google/flan-t5-large')

In [25]:
# Function to generate an answer using the T5 model
def generate_with_t5(context, question):
    input_text = f"context: {context} question: {question}"
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    
    # Generate the output
    outputs = model.generate(
        inputs.input_ids,
        max_length=1000,
        min_length=500,
        num_beams=5,
        early_stopping=True
    )
    
    # Decode the generated text
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return answer

In [26]:
# Define the context and question
question = "Give me the top three things to consider when writing my climate action plan."


In [27]:
# Generate the answer using the context and the question
answer = generate_with_t5(context, question)

print("Answer:", answer)

Answer: How to select, prioritise and define actions for your city’s climate action plan - Cities Climate Transition Guide - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - Greenhouse Gases and Climate Change - G

BERT

In [51]:
# Initialize the Hugging Face question-answering pipeline
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad")




In [52]:
# Function to generate an answer using the BERT model for question-answering
def generate_with_bert(context, question):
    result = qa_pipeline(question=question, context=context)
    return result['answer']


In [54]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

# Initialize the tokenizer, retriever, and model
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-sequence-nq", index_name="exact")
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq")

# Function to generate an answer using RAG
def generate_with_rag(question, context_docs):
    # Tokenize the input
    inputs = tokenizer(question, return_tensors="pt")
    
    # Retrieve relevant documents
    input_ids = inputs["input_ids"]
    question_hidden_states = model.question_encoder(input_ids)[0]
    docs_dict = retriever(input_ids.cpu().numpy(), question_hidden_states.cpu().detach().numpy(), return_tensors="pt")
    
    doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].transpose(1, 2)).squeeze(1)
    retrieved_doc_ids = docs_dict["context_input_ids"]
    retrieved_doc_attention_mask = docs_dict["context_attention_mask"]

    # Generate the response
    outputs = model.generate(
        input_ids,
        context_input_ids=retrieved_doc_ids,
        context_attention_mask=retrieved_doc_attention_mask,
        doc_scores=doc_scores,
        max_length=200
    )

    # Decode the generated response
    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return answer

# Generate the answer using the context and the question
answer = generate_with_rag(question, context)

print("Answer:", answer)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ImportError: 
RagRetriever requires the 🤗 Datasets library but it was not found in your environment. You can install it with:
```
pip install datasets
```
In a notebook or a colab, you can install it by executing a cell with
```
!pip install datasets
```
then restarting your kernel.

Note that if you have a local folder named `datasets` or a local python file named `datasets.py` in your current
working directory, python may try to import this instead of the 🤗 Datasets library. You should rename this folder or
that python file if that's the case. Please note that you may need to restart your runtime after installation.


facebook/rag-token-base

In [21]:
from transformers import pipeline

# Example prompt for RAG generation
prompt = "Generate a summary based on the top three considerations for writing a climate action plan:"

# Combine prompt and retrieved context
input_text = f"{prompt} {context}"

print("Input Text:")
print(input_text)



Input Text:
Generate a summary based on the top three considerations for writing a climate action plan: Timestamp: 14/06/2024 19:50:18
Title: How to prioritise actions for your climate action plan
Content: At the heart of a city’s climate action plan (CAP) is a set of actions that have been selected, prioritised and defined to ensure that the vision, goals and targets of the plan can be achieved. There is no one correct set of actions for achieving your city’s climate change goals. Ultimately, the actions in your plan need to be underpinned by high ambition, a strong evidence base, robust engagement and governance structures. They should have in-built flexibility to allow the city to learn from successes and failures and to adapt the approach over time.
Our CAP guide will be updated shortly in line with the Cities Climate Transition Framework, released in December 2023.
There are many ways to develop a set of actions for your city’s CAP. Whether your city is developing its first CAP, o

In [23]:
# Load the RAG pipeline
generator = pipeline("text-generation", model="EleutherAI/gpt-neo-2.7B")

# Generate text using RAG
generated_text = generator(input_text, max_length=100, num_return_sequences=1)[0]['generated_text']

print("Generated Text:")
print(generated_text)

Token indices sequence length is longer than the specified maximum sequence length for this model (2137 > 2048). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 2137, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


IndexError: index out of range in self

In [15]:
# Load a Hugging Face model and tokenizer for generation
model_name = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at facebook/bart-large-cnn and are newly initialized: ['model.shared.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Set up the generation pipeline
generator = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


In [19]:
# Function to handle long input by splitting it into manageable chunks
def generate_with_bart(context, question, max_length=1024):
    input_text = f"{context} {question}"
    input_chunks = [input_text[i:i + max_length] for i in range(0, len(input_text), max_length)]
    summaries = []
    for chunk in input_chunks:
        result = generator(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(result[0]['generated_text'])
    return " ".join(summaries)


In [22]:
# Define a function to generate a response based on retrieved documents
def generate_response(query):
    response = generate_with_bart(context, query)
    return response

In [23]:
response = generate_response(query)
print(response)


There is no one correct set of actions for achieving your city’s climate change goals. The actions in your plan need to be underpinned by high ambition. They should have in-built flexibility to allow the city to learn from successes and failures and to adapt the approach over time. Actions include any policy, project, programme, strategy, partnership, investment or infrastructure that leads to emission reduction and climate resilience. They can be at any size, scale or stage of implementation, from concept to implementation and monitoring. Identify and give structure to a ‘longlist’ of actions. These actions should present a clear, coherent and robust approach to meeting your city’s climate change vision, goals and targets. There are multiple, feasible and effective options to reduce greenhouse gas emissions and adapt to human-caused climate change. Cities can use the Climate Action Planning Guide to develop effective climate action plans. Taking right action now could result in the tr

In [24]:
# Change the model name to use different models
model_name = "google/pegasus-xsum"  # Example: Using Pegasus for summarization
# Other options:
# model_name = "gpt2-large"
# model_name = "t5-large"
# model_name = "sshleifer/distilbart-cnn-12-6"
# model_name = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Set up the generation pipeline
generator = pipeline('text2text-generation', model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

In [25]:
# Function to handle long input by splitting it into manageable chunks
def generate_with_model(context, question, max_length=1024):
    input_text = f"{context} {question}"
    input_chunks = [input_text[i:i + max_length] for i in range(0, len(input_text), max_length)]
    summaries = []
    for chunk in input_chunks:
        result = generator(chunk, max_length=150, min_length=40, do_sample=False)
        summaries.append(result[0]['generated_text'])
    return " ".join(summaries)

In [26]:
# Define a function to generate a response based on retrieved documents
def generate_response(query):
    response = generate_with_model(context, query)
    return response


In [27]:
response = generate_response(query)
print(response)

A city’s climate action plan (CAP) is a set of actions that have been selected, prioritised and defined to ensure that the vision, goals and targets of the plan can be achieved. A climate action is any activity your city commits to implementing to achieve its vision and strategies for adapting to climate change and reducing emissions. a climate action is any activity your city commits to implementing to achieve its vision and strategies for adapting to climate change and reducing emissions. A city’s climate action plan (CAP) should include actions that strengthen or facilitate other aspects of a CAP, for example, by building capacity, demonstrating leadership, strengthening the evidence base, enhancing governance or seeking finance. The Intergovernmental Panel on Climate Change (IPCC) has warned that the world is heading towards dangerous levels of climate change and that urgent action is needed to limit the rise in global temperatures to less than 2C above pre-industrial levels. The U