<a href="https://colab.research.google.com/github/sleepyzzpanda/Environment-RAG-Chatbot/blob/main/Climate_RAG_Chatbot_with_gradio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-4.1 RAG Chatbot for Climate Information
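This notebook sets up a retrieval-augmented generation (RAG) chatbot for climate information: text passages are embedded with OpenAI's `text-embedding-3-small` model, indexed with FAISS for similarity search, and the retrieved context is passed to GPT-4.1 to generate answers through an interactive Gradio interface.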

In [None]:
!pip install torch transformers datasets faiss-cpu sentence-transformers ipywidgets
!pip install openai




In [None]:
import torch
import requests
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from ipywidgets import interact_manual, widgets
from datasets import load_dataset
import pandas as pd
import openai
import os

# Load secret from Colab
from google.colab import userdata
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY').strip()


In [None]:
# --- ClimateX statements from the Hugging Face Hub ---
climate_x = load_dataset("rlacombe/ClimateX")
print(climate_x['train'].column_names)
passages = [row["statement"] for row in climate_x["train"]]

# --- Guardian environment news articles from the Hugging Face Hub ---
nicky = load_dataset("NickyNicky/guardian_environment_news")
print(nicky['train'].column_names)
passages_n = [row["Article Text"] for row in nicky["train"]]

# Clean the passages: coerce to stripped strings, dropping None and blank entries
passages = [
    text
    for text in (str(raw).strip() for raw in passages if raw is not None)
    if text != ""
]
passages_n = [
    text
    for text in (str(raw).strip() for raw in passages_n if raw is not None)
    if text != ""
]






README.md: 0.00B [00:00, ?B/s]

ipcc_statements_dataset.tsv: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/8094 [00:00<?, ? examples/s]

['statement_idx', 'report', 'page_num', 'sent_num', 'statement', 'confidence', 'score', 'split']


README.md:   0%|          | 0.00/488 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/95.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/30059 [00:00<?, ? examples/s]

['Title', 'Intro Text', 'Authors', 'Article Text', 'Date Published']


In [None]:
def chunk_text(text, max_chars=4000):
    """
    Split `text` into consecutive pieces of at most `max_chars` characters.

    The split is purely positional (it may cut in the middle of a word).

    Args:
        text: The string to split.
        max_chars: Maximum length of each piece; must be at least 1.

    Returns:
        A list of substrings that concatenate back to `text`. An empty
        string yields an empty list.

    Raises:
        ValueError: If `max_chars` is smaller than 1. Without this check a
            negative value would silently return an empty list and drop the text.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")
    return [text[start:start + max_chars] for start in range(0, len(text), max_chars)]


In [None]:
# Replace each passage with its chunks (chunk_text's default size), flattening
# the result so both lists stay flat lists of strings.
passages = [piece for text in passages for piece in chunk_text(text)]

passages_n = [piece for text in passages_n for piece in chunk_text(text)]


In [None]:
# Mount Google Drive at /content/drive; the CSV files read in the following
# cells live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:

file_path = "/content/drive/MyDrive/IAT360FinalProject/climate_headlines_sentiment.csv"
news_df = pd.read_csv(file_path)
print(news_df.columns)
print(news_df.head())

# Replace NaNs with empty strings so the row-wise string join cannot fail on them
text_columns = ['Headline', 'Content', 'Justification']
news_df[text_columns] = news_df[text_columns].fillna('')

# One passage per row: the text columns joined with single spaces
news_passages = news_df[text_columns].agg(' '.join, axis=1).tolist()

# Trim each passage, drop the blank ones, and split the rest into chunks
news_passages = [
    piece
    for text in map(str.strip, news_passages)
    if text
    for piece in chunk_text(text)
]

print(f"{len(news_passages)} combined passages ready for embedding")
print(news_passages[:3])


Index(['Unnamed: 0', 'Headline', 'Link', 'Content', 'Sentiment',
       'Justification'],
      dtype='object')
   Unnamed: 0                                           Headline  \
0           0  Australia's year ahead in climate and environm...   
1           1  Projections reveal the vulnerability of freshw...   
2           2  Record heat in 2023 worsened global droughts, ...   
3           3  It's not just the total rainfall "“ why is eas...   
4           4  Expert Commentary: 2023 was the warmest year o...   

                                                Link  \
0  https://www.abc.net.au/news/science/2024-01-23...   
1  https://news.griffith.edu.au/2024/01/09/projec...   
2  https://www.anu.edu.au/news/all-news/record-he...   
3  https://www.theguardian.com/australia-news/202...   
4  https://www.csiro.au/en/news/all/news/2024/jan...   

                                             Content  Sentiment  \
0   The year has barely started and extreme weath...        0.0   
1   “Wat

In [None]:
# Load the climatecheck claim/abstract CSV from Drive.
# (Assigned to file_path2 only; the previous chained assignment also rebound the
# `file_path` variable that belongs to the headlines cell.)
file_path2 = "/content/drive/MyDrive/IAT360FinalProject/rabuahmad-climatecheck.csv"
news_df2 = pd.read_csv(file_path2)
print(news_df2.columns)
print(news_df2.head())

# Fill NaNs with empty strings so the string join below does not fail on them
text_columns2 = ['claim', 'abstract']
news_df2[text_columns2] = news_df2[text_columns2].fillna('')

# Combine columns row-wise into one space-separated passage per row
passages2 = news_df2[text_columns2].agg(' '.join, axis=1).tolist()

# Trim, remove empty or whitespace-only passages, and split into chunks
passages2 = [
    chunk
    for text in map(str.strip, passages2)
    if text
    for chunk in chunk_text(text)
]

print(f"{len(passages2)} combined passages ready for embedding")
print(passages2[:3])



Index(['claim', 'abstract', 'abstract_id', 'claim_id', 'annotation'], dtype='object')
                                               claim  \
0  Turns out, species that can adapt easily to di...   
1  Turns out, species that can adapt easily to di...   
2  Let's not forget the overwhelming evidence for...   
3  Let's not forget the overwhelming evidence for...   
4  Fossil fuel projects harm social harmony in lo...   

                                            abstract  abstract_id  claim_id  \
0  Local adaptation of plant species is a central...        50203         0   
1  Being faced with unknown environments is a con...       217080         0   
2  Summary \n1. The evidence for anthropogenicall...        29893         5   
3  Despite an overwhelming scientific consensus, ...        72797         5   
4  There is some concern that coal seam gas minin...       270804        10   

  annotation  
0   Supports  
1   Supports  
2   Supports  
3   Supports  
4   Supports  
475 combined

In [None]:
# Load the ClimateX statements CSV from Drive.
# (Assigned to file_path3 only; the previous chained assignment also rebound the
# `file_path` variable that belongs to the headlines cell.)
# NOTE(review): the file name and column names match the rlacombe/ClimateX dataset
# that is also loaded from the Hub earlier in this notebook, so these statements
# may end up indexed twice — confirm whether both sources are intended.
file_path3 = "/content/drive/MyDrive/IAT360FinalProject/rlacombe-ClimateX.csv"
news_df3 = pd.read_csv(file_path3)
print(news_df3.columns)
print(news_df3.head())

# Fill NaNs with empty strings so the string join below does not fail on them
text_columns3 = ['statement']
news_df3[text_columns3] = news_df3[text_columns3].fillna('')

# Combine columns row-wise (a single column here, so each passage is the statement)
passages3 = news_df3[text_columns3].agg(' '.join, axis=1).tolist()

# Trim, remove empty or whitespace-only passages, and split into chunks
passages3 = [
    chunk
    for text in map(str.strip, passages3)
    if text
    for chunk in chunk_text(text)
]

print(f"{len(passages3)} combined passages ready for embedding")
print(passages3[:3])



Index(['statement_idx', 'report', 'page_num', 'sent_num', 'statement',
       'confidence', 'score', 'split'],
      dtype='object')
   statement_idx   report  page_num  sent_num  \
0              0  AR6_WGI        20        22   
1              2  AR6_WGI        21        18   
2              3  AR6_WGI        24         2   
3              9  AR6_WGI        24        11   
4             11  AR6_WGI        24        17   

                                           statement confidence  score  split  
0  Since 2011 (measurements reported in AR5), con...       high      2  train  
1  The average rate of sea level rise was 1.3 [0....       high      2  train  
2  Since 1750, increases in CO2 (47%) and CH4 (15...  very high      3   test  
3  A long-term increase in surface open ocean pH ...       high      2  train  
4  Marine heatwaves have approximately doubled in...       high      2  train  
5289 combined passages ready for embedding
['Since 2011 (measurements reported in AR5), conc

In [None]:
# Load the climate_fever claims CSV from Drive.
# (Assigned to file_path4 only; the previous chained assignment also rebound the
# `file_path` variable that belongs to the headlines cell.)
file_path4 = "/content/drive/MyDrive/IAT360FinalProject/tdiggelm-climate_fever.csv"
_df4 = pd.read_csv(file_path4)
print(_df4.columns)
print(_df4.head())

# Fill NaNs with empty strings so the string join below does not fail on them
# NOTE(review): the printed head shows 'evidences' holding a serialized list of
# dicts ("[{'evidence_id': ..."), so the passages embed that raw representation
# (ids/labels included) rather than just evidence text — confirm this is intended.
text_columns4 = ['claim', 'evidences']
_df4[text_columns4] = _df4[text_columns4].fillna('')

# Combine columns row-wise into one space-separated passage per row
passages4 = _df4[text_columns4].agg(' '.join, axis=1).tolist()

# Trim, remove empty or whitespace-only passages, and split into chunks
passages4 = [
    chunk
    for text in map(str.strip, passages4)
    if text
    for chunk in chunk_text(text)
]

print(f"{len(passages4)} combined passages ready for embedding")
print(passages4[:3])



Index(['claim_id', 'claim', 'claim_label', 'evidences'], dtype='object')
   claim_id                                              claim  claim_label  \
0         0  Global warming is driving polar bears toward e...            0   
1         5  The sun has gone into ‘lockdown’ which could c...            0   
2        11  They tell us that we are the primary forces co...            0   
3        14  The Great Barrier Reef is experiencing the mos...            0   
4        28  Volcanoes Melting West Antarctic Glaciers, Not...            0   

                                           evidences  
0  [{'evidence_id': 'Extinction risk from global ...  
1  [{'evidence_id': 'Famine:386', 'evidence_label...  
2  [{'evidence_id': 'Carbon dioxide:183', 'eviden...  
3  [{'evidence_id': 'Coral bleaching:52', 'eviden...  
4  [{'evidence_id': 'Antarctica:375', 'evidence_l...  
654 combined passages ready for embedding
['Global warming is driving polar bears toward extinction [{\'evidence_id\': \'E

In [None]:
# Load the climatebert fact-check claim/evidence CSV from Drive.
# (Assigned to file_path5 only; the previous chained assignment also rebound the
# `file_path` variable that belongs to the headlines cell.)
file_path5 = "/content/drive/MyDrive/IAT360FinalProject/yoonseong-climatebert-factcheck.csv"
_df5 = pd.read_csv(file_path5)
print(_df5.columns)
print(_df5.head())

# Fill NaNs with empty strings so the string join below does not fail on them
text_columns5 = ['claim', 'evidence']
_df5[text_columns5] = _df5[text_columns5].fillna('')

# Combine columns row-wise into one space-separated passage per row
passages5 = _df5[text_columns5].agg(' '.join, axis=1).tolist()

# Trim, remove empty or whitespace-only passages, and split into chunks
passages5 = [
    chunk
    for text in map(str.strip, passages5)
    if text
    for chunk in chunk_text(text)
]

print(f"{len(passages5)} combined passages ready for embedding")
print(passages5[:3])



Index(['claim_id', 'claim', 'evidence', 'evidence_label', 'label', 'category'], dtype='object')
   claim_id                                              claim  \
0      1259  While the north-east, midwest and upper great ...   
1      2361  "Unquestionably, the U.N. Intergovernmental Pa...   
2       399  Climate Change ‘Heat Records’ Are a Huge Data ...   
3      2710  [Wind energy] is a more expensive way of produ...   
4      1350  Until last June, most scientists acknowledged ...   

                                            evidence evidence_label  \
0  By August 2014, a three-year drought was promp...       SUPPORTS   
1  In it, the IUGG concurs with the "comprehensiv...       SUPPORTS   
2  In February 2019, The Western Journal publishe...       SUPPORTS   
3  Costs of production from coal fired plants bui...       SUPPORTS   
4  The use of proxy indicators to get quantitativ...       SUPPORTS   

        label                                category  
0  entailment           

In [None]:
from openai import OpenAI
import numpy as np
import faiss

# OpenAI client used by embed_texts below, authenticated with OPENAI_API_KEY.
client = OpenAI(api_key=OPENAI_API_KEY)

# Helper function to embed a list of texts using text-embedding-3-small
def embed_texts(texts, model="text-embedding-3-small", batch_size=128):
    """
    Embed a list of texts with the OpenAI Embeddings API, one batch per request.

    Uses the module-level `client`.

    Args:
        texts: List of strings to embed.
        model: Name of the embedding model to request.
        batch_size: Maximum number of texts sent in one API request (>= 1).

    Returns:
        A 2-D float32 NumPy array with one row per input text, in input order.
        An empty `texts` yields an array of shape (0, 0) so callers can still
        rely on the result being two-dimensional.

    Raises:
        ValueError: If `batch_size` is smaller than 1 (a negative value would
            otherwise silently embed nothing).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    all_embeddings = []

    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]

        response = client.embeddings.create(
            model=model,
            input=batch
        )

        # Defensive: order by the index each result reports so that rows line up
        # with the positions of the texts in `batch`.
        ordered = sorted(response.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)

    if not all_embeddings:
        return np.empty((0, 0), dtype=np.float32)

    # FAISS stores float32 vectors; building the array as float32 up front avoids
    # a float64 intermediate that is twice the size.
    return np.asarray(all_embeddings, dtype=np.float32)


In [None]:
# ---- Embed first set of passages ----
embeddings = embed_texts(passages, batch_size=128)

# Create FAISS index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

# Track passages
# `passages` already contains exactly the texts that were just embedded, so row i
# of the index corresponds to passages[i]. Extending the list with itself here
# would double it and push every passage added afterwards out of line with its
# vector, making retrieval return the wrong text.
assert index.ntotal == len(passages), "FAISS index and passages list are out of sync"



In [None]:
# ---- Embed the Guardian news article passages (passages_n) ----
embeddings_n = embed_texts(passages_n, batch_size=128)

# Append the new vectors to the existing FAISS index
index.add(embeddings_n)

# Track passages: retrieve_passages looks texts up by index position, so the
# texts must be appended in the same order as their vectors were added above
passages.extend(passages_n)

In [None]:
# ---- Embed news passages (headlines CSV) and append them to the index ----
news_embeddings = embed_texts(news_passages, batch_size=128)
index.add(news_embeddings)

# Track passages: keep the list order identical to the order of index.add calls,
# because retrieve_passages maps index positions back to this list
passages.extend(news_passages)



In [None]:
# ---- Embed the climatecheck claim/abstract passages and append them to the index ----
embeddings_2 = embed_texts(passages2, batch_size=128)
index.add(embeddings_2)

# Track passages: keep the list order identical to the order of index.add calls
passages.extend(passages2)


In [None]:
# ---- Embed the ClimateX CSV statements and append them to the index ----
embeddings_3 = embed_texts(passages3, batch_size=128)
index.add(embeddings_3)

# Track passages: keep the list order identical to the order of index.add calls
passages.extend(passages3)


In [None]:
# ---- Embed the climate_fever claim/evidences passages and append them to the index ----
embeddings_4 = embed_texts(passages4, batch_size=128)
index.add(embeddings_4)

# Track passages: keep the list order identical to the order of index.add calls
passages.extend(passages4)


In [None]:
# ---- Embed the climatebert fact-check claim/evidence passages and append them to the index ----
embeddings_5 = embed_texts(passages5, batch_size=128)
index.add(embeddings_5)

# Track passages: keep the list order identical to the order of index.add calls
passages.extend(passages5)

In [None]:
# Retrieval function
def retrieve_passages(query, k=2):
    """
    Return the texts of the passages closest to `query` in the FAISS index.

    Uses the module-level `index` and `passages` and the `embed_texts` helper;
    position i in `passages` is expected to hold the text for row i of `index`.

    Args:
        query: The question or search string to embed.
        k: Number of nearest neighbours to request.

    Returns:
        A list of up to `k` passage strings, nearest first. Fewer are returned
        when the index cannot supply `k` results.
    """
    # Embed the query using OpenAI embeddings and shape it as a single-row
    # float32 matrix, which is the input layout FAISS searches on.
    query_emb = np.asarray(embed_texts([query])[0], dtype=np.float32).reshape(1, -1)

    # Search FAISS
    _, indices = index.search(query_emb, k)

    # FAISS pads missing results with -1; without this filter a -1 would be read
    # as passages[-1] and silently return the last passage. The upper bound also
    # guards against an index that holds more rows than there are tracked texts.
    return [passages[i] for i in indices[0] if 0 <= i < len(passages)]


In [None]:
import os
from IPython.display import display, clear_output
from ipywidgets import widgets
from openai import OpenAI
import re

# Create the OpenAI client using the API key loaded from the Colab secret
client = OpenAI(api_key=OPENAI_API_KEY)

# Store chat history as (speaker, text) tuples; appended to and re-printed by
# chat_interface_widget
chat_history = []


# ---------------------------
# CLEANUP: REMOVE LABEL TAGS
# ---------------------------
def remove_labels(text):
    """
    Remove tag-like tokens of the form ``I-<name>`` or ``B-<name>`` from `text`.

    A token is removed when it starts at a word boundary with ``I`` or ``B``,
    followed by ``-`` and one or more letters, digits, underscores or hyphens.
    Surrounding whitespace inside the text is left as is; only leading and
    trailing whitespace of the result is stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, stripped string.
    """
    # [IB] rather than [I|B]: inside a character class '|' is a literal pipe,
    # so the old pattern also deleted tokens such as "|-foo".
    return re.sub(r'\b[IB]-[A-Za-z0-9_-]+\b', '', text).strip()


# ---------------------------
# GPT-4 GENERATION USING RAG
# ---------------------------
def generate_answer_clean(query, k=2):
    """
    Answer `query` with GPT-4.1, using retrieved passages as context.

    Retrieves the top-k passages via `retrieve_passages`, places them in the
    prompt as context, sends the prompt through the module-level `client`,
    and cleans the reply with `remove_labels`.

    Args:
        query: The user's question.
        k: Number of context passages to retrieve.

    Returns:
        The model's answer as a cleaned string (empty if the model returned
        no text content).
    """

    # Retrieve top-k context passages
    context_passages = retrieve_passages(query, k)
    context = " ".join(context_passages)

    # Build prompt
    prompt = f"""
You are a helpful assistant specializing in knowledge about climate change and the environment. Use the context below to answer the question.
Ensure the answer ends with a complete sentence.

Context:
{context}

Question: {query}
Answer:
""".strip()

    # Send to GPT-4.1
    response = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )

    # message.content can be None; fall back to an empty string so that
    # remove_labels always receives a str.
    answer = response.choices[0].message.content or ""
    return remove_labels(answer)


# ---------------------------
# CHAT WIDGET LOGIC
# ---------------------------
def chat_interface_widget(user_input):
    """
    Handle one chat turn for the ipywidgets interface.

    Blank input is ignored. Otherwise the question is answered with
    generate_answer_clean, both sides of the exchange are added to
    chat_history, and the output area is redrawn with the full transcript
    followed by the input widgets.
    """
    # Nothing to do for empty / whitespace-only input
    if not user_input.strip():
        return

    reply = generate_answer_clean(user_input)

    # Record the question and the reply, in that order
    chat_history.extend([("You", user_input), ("Bot", reply)])

    # Redraw: wipe the previous output, print the transcript, re-show the controls
    clear_output(wait=True)
    for who, message in chat_history:
        print(f"{who}: {message}\n")
    display(input_widget, run_button)


# ---------------------------
# INPUT WIDGET
# ---------------------------
# Text box holding the user's question
input_widget = widgets.Text(
    description='Your Question:',
    placeholder='Ask something...',
    value=''
)

# Button that submits whatever is currently in the text box
run_button = widgets.Button(description="Send")


def on_button_click(b):
    """Submit the current text to the chat handler, then empty the text box."""
    question = input_widget.value
    chat_interface_widget(question)
    input_widget.value = ""


run_button.on_click(on_button_click)

# Show the controls for the first time
display(input_widget, run_button)


Text(value='', description='Your Question:', placeholder='Ask something...')

Button(description='Send', style=ButtonStyle())

In [None]:

# !pip install gradio
import gradio as gr

def chat_fn(message, history):
    """
    Gradio chat callback: answer `message` with the RAG pipeline.

    Args:
        message: The text the user submitted.
        history: Previous turns supplied by Gradio; not used, each question is
            answered independently.

    Returns:
        The answer string, or a short prompt when the message is blank.
    """
    # Mirror the blank-input guard of the ipywidgets interface so an empty
    # message never reaches the embedding / generation calls.
    if not message or not message.strip():
        return "Please enter a question about climate or the environment."
    return generate_answer_clean(message)

# Load the UI theme by its Hub identifier
my_theme = gr.Theme.from_hub("JohnSmith9982/small_and_pretty")

# Build the chat UI around chat_fn and start serving it. The captured output
# below shows that on Colab Gradio switches to share=True automatically and
# prints a temporary public URL.
gr.ChatInterface(
    fn=chat_fn,
    title="EVERGREEN: Simplifying Climate Conversations for Everyone",
    description="Ask me about climate science!",
    theme=my_theme,
).launch()


  self.chatbot = Chatbot(


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://5d4a35da5de565ff23.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


