In [2]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
pandarallel.initialize(progress_bar=True, verbose=0)
tqdm.pandas()
import os
from openai import OpenAI
import numpy as np
import chromadb
from chromadb.config import Settings

with open('secrets.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith('openai'):
            secret = line.split('=')[1].strip()

os.environ["OPENAI_API_KEY"] = secret

storage_path = './data/chromadb'

In [3]:
with open('secrets.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith('api_token'):
            token = line.split('=')[1].strip()

embeddings = HuggingFaceEndpointEmbeddings(
    model='http://100.67.185.22:8080',
    huggingfacehub_api_token=token
)

text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type='standard_deviation'
)

In [4]:
 # split the text into chunks
def split_text(documents: list[Document]):
    chunks = text_splitter.split_documents(documents)

    return chunks

In [5]:
# Flatten, pad/truncate, and convert each embedding to a consistent 1D np.float32 array
def prepare_embedding_for_chromadb(embedding):
    # Flatten the embedding if it's nested
    flat_embedding = [float(val) for sublist in embedding for val in sublist] if isinstance(embedding[0], (list, np.ndarray)) else embedding
    
    # Ensure the embedding is exactly 2048 dimensions
    if len(flat_embedding) < 2048:
        flat_embedding.extend([0.0] * (2048 - len(flat_embedding)))  # Pad with zeros if too short
    elif len(flat_embedding) > 2048:
        flat_embedding = flat_embedding[:2048]  # Truncate if too long
    
    # Convert to np.float32
    return np.array(flat_embedding, dtype=np.float32)

In [8]:
from model_m3 import EmbeddingModelM3

embed_local = EmbeddingModelM3()

In [9]:
query_result = embed_local.embed_query("Hello, world!") # local
#query_result = embeddings.embed_query("Hello, world!") # remote
query_result[:3]

[-0.026738807559013367, 0.42828133702278137, -0.6886834502220154]

# Chunking with Semantic Chunker from langchain
### Breakpoint: Standard Deviation

In [16]:
df = pd.read_parquet("data/clean_cleantech.parquet")

In [None]:
df.content[0][:100]

In [None]:
df['chunks'] = df['content'].parallel_apply(lambda content: split_text([Document(content)]))

In [None]:
df.head(3)

In [None]:
df['chunk_size'] = df['chunks'].progress_apply(len)

In [None]:
df.head(3)

In [None]:
df['chunks'] = df['chunks'].progress_apply(lambda x: [t.page_content for t in x])

In [None]:
df.sample(5)

In [None]:
df.to_parquet('data/processed/chunked_sd.parquet')

In [None]:
df_chunked = pd.read_parquet('data/processed/chunked_sd.parquet')

In [None]:
# remove empty chunks
df_chunked['chunks'] = df_chunked['chunks'].progress_apply(lambda x: [y for y in x if len(y) > 0])

# Embed the Chunks
### model: BAAI/bge-m3

In [None]:
# embed the chunks
df_chunked['embeddings'] = df_chunked['chunks'].parallel_apply(embeddings.embed_documents)

In [None]:
df_chunked.head(3)

In [None]:
# save the chunked and embedded data
df_chunked.to_parquet('data/processed/chunked_sd_embedded.parquet')

# Setting up the ChromaDB
preparing the embedded parquet fiel for ChromaDB

In [None]:
df = pd.read_parquet('data/processed/chunked_sd_embedded.parquet')

In [None]:
df.head(3)

In [None]:
df.embeddings[0]

In [None]:
type(df.embeddings[0]), type(df.embeddings[0][0])

### preparing the data for ChromaDB

In [None]:
# Apply the function to prepare embeddings
tqdm.pandas()
df['embeddings'] = df['embeddings'].progress_apply(prepare_embedding_for_chromadb)

# Check the result
print("Sample embedding type and shape:", type(df['embeddings'][0]), df['embeddings'][0].shape, df['embeddings'][0].dtype)

In [None]:
df.embeddings[0]

In [None]:
# Convert 'date' column to string format
df['date'] = df['date'].astype(str)

In [None]:
# Ensure all doc_ids are unique by adding a suffix to duplicates
df['doc_id'] = df['doc_id'].astype(str)  # Ensure IDs are strings
df['doc_id'] = df.groupby('doc_id').cumcount().astype(str) + '_' + df['doc_id']

#### saving

In [None]:
# Specify the storage path
settings = Settings()

# Initialize ChromaDB client with persistent settings
client = chromadb.PersistentClient(path=storage_path, settings=settings)
collection_name = "energy_articles"

# Delete and recreate collection
if collection_name in [col.name for col in client.list_collections()]:
    client.delete_collection(collection_name)
collection = client.get_or_create_collection(name=collection_name)

df['embeddings'] = df['embeddings'].progress_apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Insert data in batches
batch_size = 10000
for start in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[start:start + batch_size]
    
    ids = batch['doc_id'].astype(str).tolist()
    documents = batch['content'].tolist()
    embeds = [embed.tolist() if isinstance(embed, np.ndarray) else embed for embed in batch['embeddings']]
    metadatas = batch[['title', 'date', 'domain', 'url', 'language']].to_dict(orient='records')
    
    # Insert into ChromaDB collection
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeds,
        metadatas=metadatas
    )

print("Data successfully added to ChromaDB.")

In [None]:
test_db = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=1)
print(test_db)

In [None]:
# Query similar documents
question = 'In 2021, what were the top 3 states in the US in terms of total solar power generating capacity?'
query_test = embeddings.embed_query(question)
print(query_test[:3])

In [None]:
query_embedding = prepare_embedding_for_chromadb(query_test)
top_k = 5  # number of similar entries to retrieve

results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=top_k,
    include=['documents', 'metadatas']
)

print(results)

# Retrieval

In [13]:
ai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# load eval dataset
df_eval = pd.read_csv('data/eval_dataset/cleantech_rag_evaluation_data_2024-02-23.csv')

settings = Settings()

client = chromadb.PersistentClient(path=storage_path, settings=settings)

collection_name = "energy_articles"
collection = client.get_collection(collection_name)

eval_data_index = df_eval.sample(n=1)
eval_question = eval_data_index.iloc[0]['question']
article_url = eval_data_index.iloc[0]['article_url']

# Query text
query_text = eval_question

# Generate query embedding using the Hugging Face endpoint
#query_embedding = embeddings.embed_query(query_text) # remote
query_embedding = embed_local.embed_query(query_text) # local

prepared_embeddings = prepare_embedding_for_chromadb(query_embedding)

# Retrieve top 5 most relevant documents
results = collection.query(
    query_embeddings=[prepared_embeddings.tolist()],  # Query embedding
    n_results=5,  # Number of similar documents to retrieve
    include=['documents', 'metadatas']  # Include documents and metadata in the results
)

#print("Query Results:", results)

# Prepare context with document references
retrieved_text = ""
if 'documents' in results and results['documents']:
    for idx, doc in enumerate(results['documents'][0]):
        metadata = results['metadatas'][0][idx]  # Access metadata for each document
        doc_id = metadata.get("doc_id", f"Document {idx + 1}")  # Retrieve doc_id if available
        title = metadata.get("title", "Untitled Document")
        url = metadata.get("url", "URL not available")
        content_snippet = doc[:300] + "..."  # Take the first 300 characters as a snippet

        retrieved_text += (
            f"Document {idx + 1} - ID: {doc_id}\n"
            f"Title: {title}\n"
            f"URL: {url}\n"
            f"Content Snippet: {content_snippet}\n\n"
        )
else:
    print("No documents found in query results.")

# Debug: Print the retrieved_text to ensure it’s populated
#print("Retrieved Text:", retrieved_text)
# Create a system message with instructions for the assistant
system_message = """
You are a knowledgeable assistant. Based on the information from the documents provided by the user, answer the question in a detailed and informative way. In your answer, refer to specific documents by mentioning their titles, URLs, and IDs when relevant.

At the end of your answer, please provide a separate "Sources" section, listing all document titles, IDs, and URLs you referenced, even if they were only indirectly useful.
"""

# Construct the prompt as the user's message
prompt = f"""
Question: {query_text}

Documents:
{retrieved_text}

Please structure your answer as follows:
Answer:
(Your detailed answer here, with references to specific documents as needed)

Sources:
- Document N: documnet_id document_title, document_url
- Document N: documnet_id, document_title, document_url
- Document N: documnet_id, document_title, document_url
(Include every document you referred to in the answer)
"""

# Generate a response with GPT-3.5-turbo
response = ai_client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ],
    model="gpt-3.5-turbo",
)

generated_response = response.choices[0].message.content

# Print the generated response
print(f'Used question: {eval_question}\nURL: {article_url}')
print('-'*40)
print(generated_response)
print('-'*40)

Used question: Who develops quality control systems for ocean temperature in-situ profiles?
URL: https://www.azocleantech.com/news.aspx?newsID=32873
----------------------------------------
Answer:
The quality control system for ocean temperature in-situ profiles is developed by scientists from the Chinese Academy of Sciences' (CAS) Institute of Atmospheric Physics (IAP) and their associates. This system, known as the CAS Ocean Data Center Quality Control system, offers a novel climatological range-based automatic quality control mechanism for ocean temperature in situ profiles. The quality controlled and bias-corrected ocean in-situ profile data from CAS Ocean Data Center, part of the Global Ocean Science Database, are now accessible to the public, ensuring data reliability and accuracy (Document 1, Document 3).

The development of such quality control systems is crucial due to the vast amount of ocean temperature profiles collected over the last century, each with varying precision, 

In [47]:
import pandas as pd
from ragas import EvaluationDataset, evaluate
from ragas.metrics import ContextPrecision, Faithfulness, AnswerRelevancy, ContextRecall
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI
import os

# Initialize the LLM for metrics that require it
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))

# Prepare the data with required columns
data = [
    {
        "question": row["question"],
        "user_input": row["question"],  # Same as the question
        "context": [row["relevant_chunk"]],  # Relevant context or chunk
        "retrieved_contexts": [row["relevant_chunk"]],  # Retrieved context(s)
        "response": response.choices[0].message.content,  # Use actual response if available
        "reference": "Expected answer here"  # Replace with ground truth if available
    }
    for _, row in df_eval.iterrows()
]

# Create the EvaluationDataset
eval_dataset = EvaluationDataset.from_list(data)

# Define metrics to use for evaluation
metrics = [
    ContextPrecision(),
    Faithfulness(llm=evaluator_llm),
    AnswerRelevancy(llm=evaluator_llm),
    ContextRecall()
]

# Run the evaluation
results = evaluate(eval_dataset, metrics=metrics)

# Display the results
print("Evaluation Results:", results)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Evaluating:   0%|          | 0/92 [00:00<?, ?it/s]

Evaluation Results: {'context_precision': 1.0000, 'faithfulness': 0.0425, 'answer_relevancy': 0.7428, 'context_recall': 0.2572}


In [48]:
df_results = results.to_pandas()
df_results.head()

Unnamed: 0,user_input,retrieved_contexts,response,reference,context_precision,faithfulness,answer_relevancy,context_recall
0,What is the innovation behind Leclanché's new ...,[Leclanché said it has developed an environmen...,Answer:\nThe quality control system for ocean ...,Expected answer here,1.0,0.0,0.741779,0.75
1,What is the EU’s Green Deal Industrial Plan?,[The Green Deal Industrial Plan is a bid by th...,Answer:\nThe quality control system for ocean ...,Expected answer here,1.0,0.0,0.732744,1.0
2,What is the EU’s Green Deal Industrial Plan?,[The European counterpart to the US Inflation ...,Answer:\nThe quality control system for ocean ...,Expected answer here,1.0,0.0,0.732744,0.0
3,What are the four focus areas of the EU's Gree...,[The new plan is fundamentally focused on four...,Answer:\nThe quality control system for ocean ...,Expected answer here,1.0,0.0,0.730669,0.5
4,When did the cooperation between GM and Honda ...,[What caught our eye was a new hookup between ...,Answer:\nThe quality control system for ocean ...,Expected answer here,1.0,0.0,0.752416,0.0
