In [1]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain.document_loaders import DirectoryLoader
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
pandarallel.initialize(progress_bar=True, verbose=0)
tqdm.pandas()
import os
from openai import OpenAI
import numpy as np
import chromadb
from chromadb.config import Settings

with open('secrets.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith('openai'):
            secret = line.split('=')[1].strip()

os.environ["OPENAI_API_KEY"] = secret

In [2]:
df = pd.read_parquet("data/clean_cleantech.parquet")

In [8]:
df.content[0][:100]

'New US President Joe Biden took office this week with the US China relationship at its worst in deca'

In [4]:
with open('secrets.txt', 'r') as f:
    lines = f.readlines()
    for line in lines:
        if line.startswith('api_token'):
            token = line.split('=')[1].strip()

embeddings = HuggingFaceEndpointEmbeddings(
    model='http://100.67.185.22:8080',
    huggingfacehub_api_token=token
)

text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type='standard_deviation'
)

In [5]:
 # split the text into chunks
def split_text(documents: list[Document]):
    chunks = text_splitter.split_documents(documents)

    return chunks

In [6]:
query_result = embeddings.embed_query("Hello, world!")
query_result[:3]

[-0.015927615, 0.026893413, -0.042682685]

# Chunking with Semantic Chunker from langchain
### Breakpoint: Standard Deviation

In [None]:
df['chunks'] = df['content'].parallel_apply(lambda content: split_text([Document(content)]))

In [None]:
df.head(3)

In [None]:
df['chunk_size'] = df['chunks'].progress_apply(len)

In [None]:
df.head(3)

In [None]:
df['chunks'] = df['chunks'].progress_apply(lambda x: [t.page_content for t in x])

In [None]:
df.sample(5)

In [None]:
df.to_parquet('data/processed/chunked_sd_new.parquet')

In [None]:
df_chunked = pd.read_parquet('data/processed/chunked_sd.parquet')

In [None]:
# remove empty chunks
df_chunked['chunks'] = df_chunked['chunks'].progress_apply(lambda x: [y for y in x if len(y) > 0])

# Embed the Chunks
### model: BAAI/bge-m3

In [None]:
# embed the chunks
df_chunked['embeddings'] = df_chunked['chunks'].parallel_apply(embeddings.embed_documents)

In [None]:
df_chunked.head(3)

In [None]:
# save the chunked and embedded data
df_chunked.to_parquet('data/processed/chunked_sd_embedded.parquet')

# Setting up the ChromaDB
preparing the embedded parquet fiel for ChromaDB

In [9]:
df = pd.read_parquet('data/processed/chunked_sd_embedded.parquet')

In [10]:
df.head(3)

Unnamed: 0,doc_id,title,date,content,domain,url,language,classification_prediction,labels,scores,pos_score,ner,chunks,chunk_size,embeddings
0,1283,New Chapter for US-China Energy Trade,2021-01-20,New US President Joe Biden took office this we...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,en,"{'labels': ['Text Blog', 'Text Report', 'Text ...","[Text Blog, Text Report, Text Paragraph, Topic...","[0.2568563222885132, 0.19991767406463623, 0.19...",0.655545,"[{'end_pos': 6, 'labels': [{'confidence': 0.99...",[New US President Joe Biden took office this w...,1,"[[-0.050721478, 0.015130312, -0.07786225, -0.0..."
1,1284,Japan: Slow Restarts Cast Doubt on 2030 Energy...,2021-01-22,The slow pace of Japanese reactor restarts con...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,en,"{'labels': ['Text Report', 'Text Blog', 'Text ...","[Text Report, Text Blog, Text Paragraph, Topic...","[0.2845178246498108, 0.21136364340782166, 0.17...",0.667931,"[{'end_pos': 25, 'labels': [{'confidence': 0.9...",[The slow pace of Japanese reactor restarts co...,1,"[[-0.007802753, 0.010925719, 0.019688666, -0.0..."
2,1287,Biden Appointees Signal Progressive Engagement,2021-01-28,Oil and natural gas industry officials have be...,energyintel,https://www.energyintel.com/0000017b-a7dc-de4c...,en,"{'labels': ['Text Report', 'Text Blog', 'Text ...","[Text Report, Text Blog, Text Paragraph, Topic...","[0.2419334203004837, 0.24028132855892181, 0.18...",0.669386,"[{'end_pos': 108, 'labels': [{'confidence': 0....",[Oil and natural gas industry officials have b...,2,"[[-0.054734446, 0.01624517, -0.05236124, -0.01..."


In [11]:
df.embeddings[0]

array([array([-0.05072148,  0.01513031, -0.07786225, ..., -0.00130273,
              -0.01237413, -0.03548589])                              ],
      dtype=object)

In [12]:
type(df.embeddings[0]), type(df.embeddings[0][0])

(numpy.ndarray, numpy.ndarray)

### preparing the data for ChromaDB

In [13]:
# Flatten, pad/truncate, and convert each embedding to a consistent 1D np.float32 array
def prepare_embedding_for_chromadb(embedding):
    # Flatten the embedding if it's nested
    flat_embedding = [float(val) for sublist in embedding for val in sublist] if isinstance(embedding[0], (list, np.ndarray)) else embedding
    
    # Ensure the embedding is exactly 2048 dimensions
    if len(flat_embedding) < 2048:
        flat_embedding.extend([0.0] * (2048 - len(flat_embedding)))  # Pad with zeros if too short
    elif len(flat_embedding) > 2048:
        flat_embedding = flat_embedding[:2048]  # Truncate if too long
    
    # Convert to np.float32
    return np.array(flat_embedding, dtype=np.float32)

# Apply the function to prepare embeddings
tqdm.pandas()
df['embeddings'] = df['embeddings'].progress_apply(prepare_embedding_for_chromadb)

# Check the result
print("Sample embedding type and shape:", type(df['embeddings'][0]), df['embeddings'][0].shape, df['embeddings'][0].dtype)

100%|██████████| 63708/63708 [00:06<00:00, 10031.44it/s]

Sample embedding type and shape: <class 'numpy.ndarray'> (2048,) float32





In [14]:
df.embeddings[0]

array([-0.05072148,  0.01513031, -0.07786225, ...,  0.        ,
        0.        ,  0.        ], dtype=float32)

In [15]:
# Convert 'date' column to string format
df['date'] = df['date'].astype(str)

In [16]:
# Ensure all doc_ids are unique by adding a suffix to duplicates
df['doc_id'] = df['doc_id'].astype(str)  # Ensure IDs are strings
df['doc_id'] = df.groupby('doc_id').cumcount().astype(str) + '_' + df['doc_id']

#### saving

In [17]:
# Specify the storage path
storage_path = './data/chromadb'
settings = Settings()

# Initialize ChromaDB client with persistent settings
client = chromadb.PersistentClient(path=storage_path, settings=settings)
collection_name = "energy_articles"

# Delete and recreate collection
if collection_name in [col.name for col in client.list_collections()]:
    client.delete_collection(collection_name)
collection = client.get_or_create_collection(name=collection_name)

df['embeddings'] = df['embeddings'].progress_apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Insert data in batches
batch_size = 10000
for start in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[start:start + batch_size]
    
    ids = batch['doc_id'].astype(str).tolist()
    documents = batch['content'].tolist()
    embeds = [embed.tolist() if isinstance(embed, np.ndarray) else embed for embed in batch['embeddings']]
    metadatas = batch[['title', 'date', 'domain', 'url', 'language']].to_dict(orient='records')
    
    # Insert into ChromaDB collection
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeds,
        metadatas=metadatas
    )

print("Data successfully added to ChromaDB.")

100%|██████████| 63708/63708 [00:03<00:00, 18750.30it/s]
100%|██████████| 7/7 [01:55<00:00, 16.45s/it]

Data successfully added to ChromaDB.





In [18]:
test_db = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=1)
print(test_db)

{'ids': ['0_1283'], 'embeddings': array([[-0.05072148,  0.01513031, -0.07786225, ...,  0.        ,
         0.        ,  0.        ]]), 'documents': ['New US President Joe Biden took office this week with the US China relationship at its worst in decades. Energy has come to play a bigger role in that relationship than ever before, and rising Chinese imports of US oil and LNG could serve as the foundation for fresh discussions on trade one of the few areas where US China communications have not completely broken down. But tackling climate change, a priority for Biden unlike predecessor Donald Trump, may offer the easiest and biggest opportunity for cooperation between the two powers now. Due to a bipartisan perception that China\' s economic, geopolitical and technological rise poses an existential threat to the US, a Biden administration is unlikely to soften the tone on Beijing. Trump demanded bigger purchases of US energy products by China as part of a Phase 1 trade deal before he wo

In [19]:
# Query similar documents
question = 'In 2021, what were the top 3 states in the US in terms of total solar power generating capacity?'
query_test = embeddings.embed_query(question)
print(query_test[:3])

[-0.028192963, 0.024877222, 0.014737648]


In [20]:
# Get an embedding from the first entry to check the dimensions
test_db_2 = collection.get(include=['embeddings'], limit=1)
stored_embedding_dim = len(test_db_2['embeddings'][0])
query_embedding_dim = len(query_test)

print(f"Stored embedding dimension: {stored_embedding_dim}")
print(f"Query embedding dimension: {query_embedding_dim}")

Stored embedding dimension: 2048
Query embedding dimension: 1024


# Retrieval

[-0.028192963, 0.024877222, 0.014737648]


In [20]:
query_embedding = query_test  # replace with your embedding for similarity search
top_k = 5  # number of similar entries to retrieve

results = collection.query(
    query_embeddings=[query_embedding],
    n_results=top_k,
    include=['documents', 'metadatas']
)

print(results)

{'ids': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'data': None, 'metadatas': [[]], 'distances': None, 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}


In [17]:
ai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)

# load eval dataset
df_eval = pd.read_csv('data/eval_dataset/cleantech_rag_evaluation_data_2024-02-23.csv')

# Initialize ChromaDB client with persistent settings
client = chromadb.PersistentClient(path='./data/chromadb/')
collection_name = "energy_articles"

# Delete and recreate collection with a specified dimension of 1024 (or your intended dimension)
if collection_name in [col.name for col in client.list_collections()]:
    client.delete_collection(collection_name)
collection = client.get_or_create_collection(name=collection_name)

eval_data_index = df_eval.sample(n=1)
eval_question = eval_data_index.iloc[0]['question']
article_url = eval_data_index.iloc[0]['article_url']

# Query text
query_text = eval_question

# Generate query embedding using the Hugging Face endpoint
query_embedding = embeddings.embed_query(query_text)

# Retrieve a sample of documents directly from the collection
all_docs = collection.get(limit=5, include=['documents', 'metadatas', 'embeddings'])
print("Sample Documents in Collection:", all_docs)

# Retrieve top 5 most relevant documents
results = collection.query(
    query_embeddings=[query_embedding],
    n_results=5,  # Number of similar documents to retrieve
    include=['documents', 'metadatas', 'embeddings']  # Include documents and metadata in the results
)

print("Query Results:", results)

# Prepare context with document references
retrieved_text = ""
if 'documents' in results and results['documents']:
    for idx, doc in enumerate(results['documents'][0]):
        metadata = results['metadatas'][0][idx]  # Access metadata for each document
        doc_id = metadata.get("doc_id", f"Document {idx + 1}")  # Retrieve doc_id if available
        title = metadata.get("title", "Untitled Document")
        url = metadata.get("url", "URL not available")
        content_snippet = doc[:300] + "..."  # Take the first 300 characters as a snippet

        retrieved_text += (
            f"Document {idx + 1} - ID: {doc_id}\n"
            f"Title: {title}\n"
            f"URL: {url}\n"
            f"Content Snippet: {content_snippet}\n\n"
        )
else:
    print("No documents found in query results.")

# Debug: Print the retrieved_text to ensure it’s populated
print("Retrieved Text:", retrieved_text)
# Create a system message with instructions for the assistant
system_message = """
You are a knowledgeable assistant. Based on the information from the documents provided by the user, answer the question in a detailed and informative way. In your answer, refer to specific documents by mentioning their titles, URLs, and IDs when relevant.

At the end of your answer, please provide a separate "Sources" section, listing all document titles, IDs, and URLs you referenced, even if they were only indirectly useful.
"""

# Construct the prompt as the user's message
prompt = f"""
Question: {query_text}

Documents:
{retrieved_text}

Please structure your answer as follows:
Answer:
(Your detailed answer here, with references to specific documents as needed)

Sources:
- Document N: documnet_id document_title, document_url
- Document N: documnet_id, document_title, document_url
- Document N: documnet_id, document_title, document_url
(Include every document you referred to in the answer)
"""

# Generate a response with GPT-3.5-turbo
response = ai_client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ],
    model="gpt-3.5-turbo",
)

generated_response = response.choices[0].message.content

# Print the generated response
print("Response:", generated_response)
print('-'*40)
print(f'Used question: {eval_question}, URL: {article_url}')
print('-'*40)
print(f'Retrieved text preview: {retrieved_text}')
print('-'*40)
print(f'Result from db query: {results}')

Sample Documents in Collection: {'ids': [], 'embeddings': array([], dtype=float64), 'documents': [], 'uris': None, 'data': None, 'metadatas': [], 'included': [<IncludeEnum.embeddings: 'embeddings'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
Query Results: {'ids': [[]], 'embeddings': [array([], dtype=float64)], 'documents': [[]], 'uris': None, 'data': None, 'metadatas': [[]], 'distances': None, 'included': [<IncludeEnum.embeddings: 'embeddings'>, <IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
Retrieved Text: 


KeyboardInterrupt: 

In [None]:
from ragas import EvaluationDataset

# Convert df_eval into a list of dictionaries in the required format
data = [
    {
        "query": row["question"],
        "context": [row["relevant_chunk"]],  # Context as a list of one or more relevant chunks
        "response": "Placeholder response for evaluation"  # Use actual response if available
    }
    for _, row in df_eval.iterrows()
]

# Create the EvaluationDataset
eval_dataset = EvaluationDataset(data)