<a href="https://colab.research.google.com/github/sleepyzzpanda/Environment-RAG-Chatbot/blob/main/Climate_RAG_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# GPT-2 RAG Chatbot for Climate Information
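This notebook sets up a retrieval-augmented generation (RAG) chatbot for climate information: passages are embedded with a sentence-transformer model and indexed with FAISS, GPT-2 generates the reply from the retrieved passages, and an ipywidgets text box and button provide the interactive chat interface.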

In [1]:
!pip install torch transformers datasets faiss-cpu sentence-transformers ipywidgets

Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, faiss-cpu
Successfully installed faiss-cpu-1.13.0 jedi-0.19.2


In [2]:
import torch
import requests
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from ipywidgets import interact_manual, widgets
from datasets import load_dataset
import pandas as pd
# !unzip archive.zip -d climate_news_data


In [3]:
# Load the pretrained GPT-2 tokenizer and language-model head from the
# 'gpt2' checkpoint. Both are module-level globals that the generation
# functions later in the notebook read directly.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [18]:
# # Load the ClimateBERT environmental claims dataset
# claims_dataset = load_dataset("climatebert/environmental_claims")
# print(claims_dataset['train'].column_names)
# # Load the NER dataset
# ner_dataset = load_dataset("ibm-research/Climate-Change-NER")
# print(ner_dataset['train'].column_names)


# news_df = pd.read_csv("climate_news_data/climate-change-news.csv")
# print(news_df.columns)
# print(news_df.head())

# Fetch the ClimateX dataset and show which columns its training split has.
climate_x = load_dataset("rlacombe/ClimateX")
print(climate_x['train'].column_names)

# Gather the text of every training statement, preserving dataset order.
passages = [row["statement"] for row in climate_x["train"]]

# for example in ner_dataset["train"]:
#     # If the dataset has 'tokens' and 'ner_tags'
#     if "tokens" in example:
#         sentence = " ".join(example["tokens"])  # join tokens into plain text
#         passages.append(sentence)
#     elif "text" in example:  # for datasets like ClimateBERT
#         passages.append(example["text"])

# Optional: trim surrounding whitespace and drop entries that end up empty
clean_passages = [trimmed for trimmed in (p.strip() for p in passages) if trimmed]




['statement_idx', 'report', 'page_num', 'sent_num', 'statement', 'confidence', 'score', 'split']


In [6]:
# Mount Google Drive so that files under /content/drive are readable.
# The cell that loads the news CSV reads from /content/drive/MyDrive/..., so
# on a fresh Colab runtime the mount has to happen before that cell runs.
try:
    from google.colab import drive
except ImportError:
    # Not running on Colab: there is nothing to mount. Say so explicitly
    # instead of failing here; the CSV path must then be made available
    # some other way.
    print("google.colab is not available; skipping Google Drive mount")
else:
    drive.mount('/content/drive')

Mounted at /content/drive


In [17]:

# Location of the sentiment-labelled climate headlines CSV on the mounted Drive
file_path = "/content/drive/MyDrive/IAT360FinalProject/climate_headlines_sentiment.csv"
news_df = pd.read_csv(file_path)
print(news_df.columns)
print(news_df.head())

# Replace missing text with '' so joining the columns never meets a NaN
text_columns = ['Headline', 'Content', 'Justification']
news_df[text_columns] = news_df[text_columns].fillna('')

# One passage per row: the text columns joined with single spaces, trimmed,
# with rows that are blank after trimming left out
news_passages = [
    trimmed
    for trimmed in (
        row_text.strip()
        for row_text in news_df[text_columns].apply(' '.join, axis=1).tolist()
    )
    if trimmed
]

print(f"{len(news_passages)} combined passages ready for embedding")
print(news_passages[:3])


Index(['Unnamed: 0', 'Headline', 'Link', 'Content', 'Sentiment',
       'Justification'],
      dtype='object')
   Unnamed: 0                                           Headline  \
0           0  Australia's year ahead in climate and environm...   
1           1  Projections reveal the vulnerability of freshw...   
2           2  Record heat in 2023 worsened global droughts, ...   
3           3  It's not just the total rainfall "“ why is eas...   
4           4  Expert Commentary: 2023 was the warmest year o...   

                                                Link  \
0  https://www.abc.net.au/news/science/2024-01-23...   
1  https://news.griffith.edu.au/2024/01/09/projec...   
2  https://www.anu.edu.au/news/all-news/record-he...   
3  https://www.theguardian.com/australia-news/202...   
4  https://www.csiro.au/en/news/all/news/2024/jan...   

                                             Content  Sentiment  \
0   The year has barely started and extreme weath...        0.0   
1   “Wat

In [19]:
# Create embeddings and FAISS index
embed_model = SentenceTransformer('all-MiniLM-L6-v2')

# `passages` is the lookup table for retrieval: FAISS returns row numbers, and
# row i of the index must correspond to passages[i]. Rebuild the list here in
# exactly the order the vectors are added below (ClimateX statements first,
# then news passages). Rebuilding rather than extending the existing list
# keeps the two in step and makes this cell safe to re-run.
passages = list(clean_passages) + list(news_passages)

embeddings = embed_model.encode(clean_passages)
news_embeddings = embed_model.encode(news_passages)

# Flat (exact) L2 index whose dimensionality is taken from the embeddings.
index = faiss.IndexFlatL2(embeddings.shape[1])
# FAISS works on float32 matrices; make the dtype explicit.
index.add(np.asarray(embeddings, dtype=np.float32))
index.add(np.asarray(news_embeddings, dtype=np.float32))

# Fail loudly if the text table and the vector index ever disagree in size.
assert index.ntotal == len(passages), (
    f"index has {index.ntotal} vectors but there are {len(passages)} passages"
)


In [20]:
# Retrieval function
def retrieve_passages(query, k=2):
    """Return the stored passages nearest to ``query`` in embedding space.

    Args:
        query: The user's question as a string.
        k: Maximum number of passages to return.

    Returns:
        A list of at most ``k`` passage strings, nearest first. The list is
        shorter than ``k`` (possibly empty) when the index holds fewer than
        ``k`` vectors.
    """
    # Never ask for more neighbours than the index contains.
    k = min(k, index.ntotal)
    if k <= 0:
        return []

    query_emb = embed_model.encode([query])
    _, indices = index.search(np.asarray(query_emb, dtype=np.float32), k=k)

    # FAISS marks "no neighbour found" with -1; indexing passages[-1] would
    # silently return the last passage, so drop those slots.
    return [passages[i] for i in indices[0] if i >= 0]

In [21]:
# RAG generation function
def generate_answer(query, k=2, max_new_tokens=75):
    """Generate a GPT-2 completion for ``query`` using retrieved context.

    Args:
        query: The user's question.
        k: Number of passages to retrieve and place in the prompt.
        max_new_tokens: Upper bound on the number of tokens GPT-2 generates.

    Returns:
        The decoded model output as a string. This is the full sequence,
        i.e. the prompt (question, instruction and context) followed by the
        generated continuation.
    """
    context_passages = retrieve_passages(query, k)
    context = ' '.join(context_passages)
    prompt = f"Question: {query}\nProvide accurate information concisely in 1-2 sentences based on the following context (in natural language): {context}"

    # GPT-2 has a fixed context window (model.config.n_positions). Retrieved
    # passages can be long, so cap the prompt so that prompt + generated
    # tokens still fit; otherwise generation fails on over-long inputs.
    max_prompt_tokens = max(1, model.config.n_positions - max_new_tokens)

    # Encode input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )

    # Generate output
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id  # avoids padding issues
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


In [22]:
from IPython.display import display, clear_output
from ipywidgets import widgets

# Running transcript of the conversation: a list of (speaker, text) tuples
# that the chat callback appends to and re-prints after every message.
# Re-running this cell resets the transcript to empty.
chat_history = []

import re

def remove_labels(text):
    """Strip I-/B- style tag tokens (e.g. ``B-LOC``, ``I-ORG``) from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every whole token of the form ``I-<name>`` or
        ``B-<name>`` removed and surrounding whitespace stripped.
    """
    # The class is [IB], not [I|B]: inside brackets '|' is a literal
    # character, so the latter would also delete tokens such as "|-foo".
    return re.sub(r'\b[IB]-[A-Za-z0-9_-]+\b', '', text).strip()


def generate_answer_clean(query, k=2, max_new_tokens=50):
    """
    Generate a GPT-2 reply for ``query`` grounded in retrieved context.

    The ``k`` retrieved passages are joined into one context string and
    appended to an instruction line; GPT-2 then continues that prompt by up
    to ``max_new_tokens`` tokens.

    Returns the decoded text that follows the instruction line, i.e. the
    retrieved context followed by the model's continuation, with surrounding
    whitespace stripped and I-/B- style tag tokens removed. The question and
    the instruction line themselves are not part of the result.
    """
    instruction = "Answer concisely based on the following context:"

    context_passages = retrieve_passages(query, k)
    context = ' '.join(context_passages)
    prompt = f"Question: {query}\n{instruction} {context}"

    # GPT-2 has a fixed context window (model.config.n_positions). Retrieved
    # passages can be long, so cap the prompt so that prompt + generated
    # tokens still fit; otherwise generation fails on over-long inputs.
    max_prompt_tokens = max(1, model.config.n_positions - max_new_tokens)
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_prompt_tokens,
    )

    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id
    )

    full_text = tokenizer.decode(output[0], skip_special_tokens=True)

    # Keep what follows the last occurrence of the instruction line. When the
    # instruction is absent, rpartition yields the whole text as its tail.
    answer = full_text.rpartition(instruction)[2].strip()

    return remove_labels(answer)

def chat_interface_widget(user_input):
    """
    Handle one chat turn: answer ``user_input``, record both sides of the
    exchange in ``chat_history`` and re-print the whole transcript.

    Input that is empty or only whitespace is ignored.
    """
    if not user_input.strip():
        return

    bot_reply = generate_answer_clean(user_input)

    # Record the question and the reply, in that order
    chat_history.extend([("You", user_input), ("Bot", bot_reply)])

    # Replace the previously shown transcript with the updated one
    clear_output(wait=True)
    for who, said in chat_history:
        print(f"{who}: {said}\n")

# Text box the user types a question into; it starts empty and shows the
# placeholder until something is entered.
input_widget = widgets.Text(
    value='',
    description='Your Question:',
    placeholder='Type your question here...'
)

# Button that submits the current contents of the text box.
run_button = widgets.Button(description="Send")

def on_button_click(b):
    """Send the text box's current contents to the chat, then empty the box.

    ``b`` is the argument passed in by the button's click event; it is unused.
    """
    question = input_widget.value
    chat_interface_widget(question)
    # Reset the box so the next question starts from a blank field
    input_widget.value = ''

# Register the click handler before the widgets are shown.
run_button.on_click(on_button_click)

# Render the text box followed by the Send button in this cell's output.
display(input_widget, run_button)


You: greenland ice sheets

Bot: The Greenland Ice Sheet was smaller than at present during the Last Interglacial period (high confidence) and the mid-Holocene CCP6 2329Polar Regions Cross-Chapter Paper 6 in Greenland Ice Sheet and the Greenland Ice Sheet (GIS) (http://www.geocities.org/geocities/gis/gis_gis_gis_gis_gis_gis_gis_

