In [2]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document

pandarallel.initialize(progress_bar=True, verbose=0)
tqdm.pandas()
import os
from openai import OpenAI
import numpy as np
import chromadb
from chromadb.config import Settings

with open("secrets.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith("openai"):
            secret = line.split("=")[1].strip()

os.environ["OPENAI_API_KEY"] = secret

In [3]:
df = pd.read_parquet("data/processed/chunked_sd_embedded.parquet")

In [4]:
with open("secrets.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith("api_token"):
            token = line.split("=")[1].strip()

embeddings = HuggingFaceEndpointEmbeddings(
    model="http://100.67.185.22:8080", huggingfacehub_api_token=token
)

In [4]:
# Flatten, pad/truncate, and convert each embedding to a consistent 1D np.float32 array
def prepare_embedding_for_chromadb(embedding):
    # Flatten the embedding if it's nested
    flat_embedding = (
        [float(val) for sublist in embedding for val in sublist]
        if isinstance(embedding[0], (list, np.ndarray))
        else embedding
    )

    # Ensure the embedding is exactly 2048 dimensions
    if len(flat_embedding) < 2048:
        flat_embedding.extend(
            [0.0] * (2048 - len(flat_embedding))
        )  # Pad with zeros if too short
    elif len(flat_embedding) > 2048:
        flat_embedding = flat_embedding[:2048]  # Truncate if too long

    # Convert to np.float32
    return np.array(flat_embedding, dtype=np.float32)


# Apply the function to prepare embeddings
tqdm.pandas()
df["embeddings"] = df["embeddings"].progress_apply(prepare_embedding_for_chromadb)

# Check the result
print(
    "Sample embedding type and shape:",
    type(df["embeddings"][0]),
    df["embeddings"][0].shape,
    df["embeddings"][0].dtype,
)

100%|██████████| 63708/63708 [00:08<00:00, 7829.54it/s]

Sample embedding type and shape: <class 'numpy.ndarray'> (2048,) float32





In [5]:
# Convert 'date' column to string format
df["date"] = df["date"].astype(str)

In [6]:
# Ensure all doc_ids are unique by adding a suffix to duplicates
df["doc_id"] = df["doc_id"].astype(str)  # Ensure IDs are strings
df["doc_id"] = df.groupby("doc_id").cumcount().astype(str) + "_" + df["doc_id"]

In [7]:
# load eval dataset
df_eval = pd.read_csv("data/eval_dataset/cleantech_rag_evaluation_data_2024-02-23.csv")

with open("secrets.txt", "r") as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith("openai"):
            secret = line.split("=")[1].strip()

os.environ["OPENAI_API_KEY"] = secret

ai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# Initialize ChromaDB client with persistent settings
client = chromadb.PersistentClient(path="./data/chromadb/")
collection_name = "energy_articles"

# Delete and recreate collection with a specified dimension of 1024 (or your intended dimension)
if collection_name in [col.name for col in client.list_collections()]:
    client.delete_collection(collection_name)
collection = client.get_or_create_collection(name=collection_name)

eval_data_index = df_eval.sample(n=1)
eval_question = eval_data_index.iloc[0]["question"]
article_url = eval_data_index.iloc[0]["article_url"]

# Query text
query_text = eval_question

# Generate query embedding using the Hugging Face endpoint
query_embedding = embeddings.embed_query(query_text)

# Retrieve top 5 most relevant documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,  # Number of similar documents to retrieve
)

# Prepare context with document references
retrieved_text = ""
for idx, doc in enumerate(results["documents"][0]):
    metadata = results["metadatas"][0][idx]  # Access metadata for each document
    doc_id = metadata.get(
        "doc_id", f"Document {idx + 1}"
    )  # Retrieve doc_id if available
    title = metadata.get("title", "Untitled Document")
    url = metadata.get("url", "URL not available")
    content_snippet = doc[:300] + "..."  # Take the first 300 characters as a snippet

    retrieved_text += (
        f"Document {idx + 1} - ID: {doc_id}\n"
        f"Title: {title}\n"
        f"URL: {url}\n"
        f"Content Snippet: {content_snippet}\n\n"
    )

# Create a system message with instructions for the assistant
system_message = """
You are a knowledgeable assistant. Based on the information from the documents provided by the user, answer the question in a detailed and informative way. In your answer, refer to specific documents by mentioning their titles, URLs, and IDs when relevant.

At the end of your answer, please provide a separate "Sources" section, listing all document titles, IDs, and URLs you referenced, even if they were only indirectly useful.
"""

# Construct the prompt as the user's message
prompt = f"""
Question: {query_text}

Documents:
{retrieved_text}

Please structure your answer as follows:
Answer:
(Your detailed answer here, with references to specific documents as needed)

Sources:
- Document 1: ID, Title, URL
- Document 2: ID, Title, URL
- Document 3: ID, Title, URL
(Include every document you referred to in the answer)
"""

# Generate a response with GPT-3.5-turbo
response = ai_client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ],
    model="gpt-3.5-turbo",
)

generated_response = response.choices[0].message.content

# Print the generated response
print("Response:", generated_response)
print("-" * 40)
print(f"Used question: {eval_question}, URL: {article_url}")

Response: Answer:
The turbines used in Icelandic geothermal power plants come from various manufacturers such as Toshiba Energy Systems & Solutions Corporation (ID: 3). Toshiba Energy Systems & Solutions Corporation has been involved in supplying turbines for geothermal power plants in Iceland, as shown in the document "Geothermal Finance & Development: June 2020" (ID: 3). This document discusses Toshiba's contract to supply turbines for the Þeistareykir Geothermal Power Plant in Iceland, highlighting their role in providing efficient and reliable equipment for geothermal power generation.

In addition to Toshiba, other companies like Mitsubishi Hitachi Power Systems Europe GmbH have also supplied turbines for Icelandic geothermal projects. The document "Geothermal in Iceland" (ID: 2) mentions the involvement of Mitsubishi Hitachi Power Systems Europe GmbH in supplying turbines for the Hellisheiði Geothermal Power Plant in Iceland. This indicates that a variety of turbine manufacturers

In [8]:
print(df_eval.head(2))

   example_id  question_id                                           question  \
0           1            1  What is the innovation behind Leclanché's new ...   
1           2            2       What is the EU’s Green Deal Industrial Plan?   

                                      relevant_chunk  \
0  Leclanché said it has developed an environment...   
1  The Green Deal Industrial Plan is a bid by the...   

                                         article_url  
0  https://www.sgvoice.net/strategy/technology/23...  
1  https://www.sgvoice.net/policy/25396/eu-seeks-...  


In [7]:
df_eval = pd.read_csv("data/eval_dataset/cleantech_rag_evaluation_data_2024-02-23.csv")

df_eval.head(2)

Unnamed: 0,example_id,question_id,question,relevant_chunk,article_url
0,1,1,What is the innovation behind Leclanché's new ...,Leclanché said it has developed an environment...,https://www.sgvoice.net/strategy/technology/23...
1,2,2,What is the EU’s Green Deal Industrial Plan?,The Green Deal Industrial Plan is a bid by the...,https://www.sgvoice.net/policy/25396/eu-seeks-...


In [8]:
# Function to generate answers using OpenAI
def get_answer(question):
    try:
        response = ai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=[{"role": "user", "content": question}],
        )
        return response.choices[0].message.content
    except Exception as e:
        print(
            f"Error while generating answer for question: {question}. Error: {str(e)}"
        )
        return None


# Generate answers for each question and store them
df_eval["predicted_answer"] = df_eval["question"].apply(get_answer)

# Optionally, use 'relevant_chunk' as reference answer if no other reference is available
df_eval["reference_answer"] = df_eval["relevant_chunk"]

# Save the updated DataFrame
df_eval.to_csv("data/eval_dataset/updated_eval_data.csv", index=False)

Error while generating answer for question: What is the innovation behind Leclanché's new method to produce lithium-ion batteries?. Error: name 'ai_client' is not defined
Error while generating answer for question: What is the EU’s Green Deal Industrial Plan?. Error: name 'ai_client' is not defined
Error while generating answer for question: What is the EU’s Green Deal Industrial Plan?. Error: name 'ai_client' is not defined
Error while generating answer for question: What are the four focus areas of the EU's Green Deal Industrial Plan?. Error: name 'ai_client' is not defined
Error while generating answer for question: When did the cooperation between GM and Honda on fuel cell vehicles start?. Error: name 'ai_client' is not defined
Error while generating answer for question: Did Colgate-Palmolive enter into PPA agreements with solar developers?. Error: name 'ai_client' is not defined
Error while generating answer for question: What is the status of ZeroAvia's hydrogen fuel cell electri

In [25]:
df_eval = pd.read_csv("data/eval_dataset/updated_eval_data.csv")

df_eval.head(2)

Unnamed: 0,example_id,question_id,question,relevant_chunk,article_url,predicted_answer,reference_answer
0,1,1,What is the innovation behind Leclanché's new ...,Leclanché said it has developed an environment...,https://www.sgvoice.net/strategy/technology/23...,Leclanché's new method to produce lithium-ion ...,Leclanché said it has developed an environment...
1,2,2,What is the EU’s Green Deal Industrial Plan?,The Green Deal Industrial Plan is a bid by the...,https://www.sgvoice.net/policy/25396/eu-seeks-...,The EU’s Green Deal Industrial Plan is a key c...,The Green Deal Industrial Plan is a bid by the...


In [None]:
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_correctness


# Convert 'retrieved_contexts' to a list of contexts per entry
df_eval["retrieved_contexts"] = df_eval["relevant_chunk"].apply(lambda x: [x])

# Ensure 'response' column is correctly named and already exists
df_eval["response"] = df_eval["predicted_answer"]

# Add a 'reference' column, modify this line as needed based on your dataset's correct answers
df_eval["reference"] = df_eval[
    "relevant_chunk"
]  # Adjust this if another column serves as a better reference

# Convert to Hugging Face's datasets format
dataset = Dataset.from_pandas(df_eval)

# Evaluate the dataset
score = evaluate(dataset, metrics=[faithfulness, answer_correctness])
score_df = score.to_pandas()

# Save the evaluation results
score_df.to_csv("data/eval_dataset/eval_results.csv", index=False)

Evaluating:   0%|          | 0/46 [00:00<?, ?it/s]

No statements were generated from the answer.
