In [1]:
from datasets import load_dataset

# Hugging Face Hub identifier of the corpus used throughout this notebook.
DATASET_ID = "climatebert/climate_detection"

# Load every available split of the dataset into a single object.
ds = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 1300/1300 [00:00<00:00, 13095.33 examples/s]
Generating test split: 100%|██████████| 400/400 [00:00<00:00, 25111.83 examples/s]


In [2]:
# Display the loaded dataset object (its splits, feature names, and row counts).
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1300
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 400
    })
})

In [7]:
from langchain.schema import Document

# Bind the training split once so the lookups below read from the same object.
train_split = ds["train"]

# Sanity-check the schema before indexing into it.
print(train_split.column_names)  # ['text', 'label']

# Raw passages come from the 'text' column; the 'label' column is not used here.
texts = train_split["text"]

# Wrap each passage in a LangChain Document so the splitter / vector store can consume it.
documents = [Document(page_content=passage) for passage in texts]


['text', 'label']


In [9]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking configuration kept in one place so it is easy to tune.
splitter_options = {"chunk_size": 2000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break every Document into chunks according to the options above.
splits = text_splitter.split_documents(documents)

In [10]:
# Eyeball the first two chunks, one per line, to confirm the split looks sensible.
print(splits[0], splits[1], sep="\n")

page_content='− Scope 3: Optional scope that includes indirect emissions associated with the goods and services supply chain produced outside the organization. Included are emissions from the transport of products from our logistics centres to stores (downstream) performed by external logistics operators (air, land and sea transport) as well as the emissions associated with electricity consumption in franchise stores.'
page_content='The Group is not aware of any noise pollution that could negatively impact the environment, nor is it aware of any impact on biodiversity. With regards to land use, the Group is only a commercial user, and the Group is not aware of any local constraints with regards to water supply. The Group does not believe that it is at risk with regards to climate change in the near-or mid-term.'


In [11]:
import os
from getpass import getpass

from langchain_community.vectorstores import Chroma
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Never hardcode credentials in a notebook: they end up in version control and
# in shared copies. Read the key from the environment and only prompt for it
# (without echoing) when it has not been set.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

# Embedding model used to vectorise each chunk.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/text-embedding-004",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Embed every chunk and index the vectors in a Chroma vector store.
vectorstore = Chroma.from_documents(documents=splits, embedding=embeddings)

In [12]:
# Peek at a handful of stored records through the public `get` API instead of
# dumping the entire collection via the private `_collection` attribute, which
# floods the notebook output with every stored chunk.
print(vectorstore.get(limit=3))



In [13]:
# Wrap the vector store in a retriever so it can be piped into the RAG chain.
# No arguments are passed, so the library's default search settings apply.
retriever = vectorstore.as_retriever()

In [14]:
from langchain import hub
# Pull the "rlm/rag-prompt" prompt from the LangChain Hub; the chain built
# later feeds it a mapping with "context" and "question" entries.
prompt = hub.pull("rlm/rag-prompt")



In [15]:
import os
from getpass import getpass

from langchain_google_genai import ChatGoogleGenerativeAI

# Never hardcode credentials in a notebook: read the key from the environment
# and only prompt for it (without echoing) when it has not been set.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Google API key: ")

# Chat model that generates the final answer; temperature=0 is passed to make
# the output as repeatable as the model allows.
llm = ChatGoogleGenerativeAI(
    model="gemini-2.0-flash",
    temperature=0,
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

In [16]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [17]:
def format_docs(docs):
    """Concatenate the text of several documents into one newline-separated string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values joined with a single ``"\\n"``; an empty
        string when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n".join(contents)

In [18]:
# Inputs handed to the prompt: "context" is the retrieved chunks flattened to
# text, "question" is the caller's input passed through unchanged.
rag_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Pipeline: gather inputs -> fill prompt -> call the LLM -> parse the output with StrOutputParser.
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

In [20]:
# Input handed to the chain; it serves as both the retrieval query and the question.
query = "Carbon emissions are rising due to deforestation."

# Run the full RAG pipeline; the cell's last expression displays the answer.
rag_chain.invoke(query)

'Yes, carbon emissions have tripled since 1960, contributing to climate change. Deforestation contributes to this issue, with forests shrinking by 13 million hectares per year. This unsustainable consumption of natural resources drives land and biodiversity loss.'