In [1]:
from src.vectorstorage import EmbeddingVectorStorage
from src.custom_embeddings import bge_m3_embed
from langchain.llms import OpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
import pandas as pd
from tqdm import tqdm
from src.evaluation import RAGEvaluation

def _read_secret(secret_lines, prefix):
    """Return the value of the last line that starts with `prefix`.

    Args:
        secret_lines: Iterable of raw text lines in ``key=value`` form.
        prefix: Text the line must start with (e.g. ``'api_token'``).

    Returns:
        The stripped text after the FIRST ``=`` of the last matching line, or
        ``None`` when no matching line contains an ``=``.
    """
    value = None
    for entry in secret_lines:
        if not entry.startswith(prefix):
            continue
        # partition keeps everything after the first '=', so values that
        # themselves contain '=' (e.g. base64 padding) are not truncated.
        _, separator, raw_value = entry.partition('=')
        if separator:
            value = raw_value.strip()
    return value


# Read the secrets file once; both tokens are looked up in the same lines.
with open('secrets.txt', 'r') as f:
    lines = f.readlines()

# Either name is None when the corresponding entry is missing from the file.
token = _read_secret(lines, 'api_token')
openai_token = _read_secret(lines, 'openai')
                     



In [2]:
from src.config import PERSISTENT_DIR

# Collection names noted by the author (set `collection_name` to one of them):
#   BAAI_bge_m3_RecursiveCharacterTextSplitter
#   Alibaba-NLP_gte-Qwen2-7B_RecursiveCharacterTextSplitter
#   nomic-ai_nomic-embed-text-v1_5_RecursiveCharacterTextSplitter
collection_name = "BAAI_bge_m3_RecursiveCharacterTextSplitter"

# Reload the vector store for the selected collection from PERSISTENT_DIR.
bge_m3_vectordb = EmbeddingVectorStorage(
    path_persistent=str(PERSISTENT_DIR),
    collection=collection_name,
    method_of_embedding=bge_m3_embed,
)

# Show the store, then run the two health checks in order:
# Chroma client heartbeat first, collection emptiness second.
print(bge_m3_vectordb)
for health_check in (bge_m3_vectordb.test_heartbeat, bge_m3_vectordb.collection_is_empty):
    print(health_check())

VectorStorage(method_of_embedding=CustomHuggingFaceEndpointEmbeddings, group=BAAI_bge_m3_RecursiveCharacterTextSplitter)
1732141997136561000
False


In [3]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load variables from a .env file into the process environment before the
# OpenAI client is built (presumably this is where the API key comes from — confirm).
load_dotenv()
# Hosted chat model; later handed to RAGEvaluation as `llm_model`.
llm = ChatOpenAI(model='gpt-4o-mini')

from langchain_ollama import OllamaLLM

# Ollama model tag for the local model used in the HyDE generation chain.
qwen2_5_3b = 'qwen2.5-coder:3b'

local_llm = OllamaLLM(model=qwen2_5_3b)

# System instruction for the HyDE step. The literal is passed to the prompt
# verbatim, including the line break and the indentation inside it.
system_description = """You are an expert about the Clean Technology Sector.
                        Answer the user question as best you can. Answer as though you were writing a tutorial that addressed the user question."""

# Two-message chat prompt: the fixed system instruction plus the user's question.
hyde_messages = [
    ("system", system_description),
    ("human", "{question}"),
]
hyde_prompt = ChatPromptTemplate.from_messages(hyde_messages)

# question -> prompt -> local LLM -> plain string (the hypothetical document).
gen_hypothetical_doc = hyde_prompt | local_llm | StrOutputParser()

In [4]:
# Retriever over the reloaded vector store, requested with k=20
# (the recorded run further down yields 20 ids per question).
base_retriever = bge_m3_vectordb.as_retriever(k=20)

# Setup retrieval chain: the generated hypothetical document, not the raw
# question, is what gets piped into the retriever.
hyde_retrieval_chain = (gen_hypothetical_doc | base_retriever)

In [5]:
# Setup RAG chain
def _format_context(docs):
    """Join the page_content of the retrieved documents into one prompt string."""
    return "\n\n".join(doc.page_content for doc in docs)


# Retrieval stage: maps the incoming question to {"question", "context"}.
base_rag_chain = RunnableParallel(
    {
        "question": RunnablePassthrough(),  # Directly passes the question through
        "context": hyde_retrieval_chain    # Retrieves contextual information
    }
)

# Prompt for the final answer, grounded in the retrieved context.
answer_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "Answer the question using only the provided context. "
            "If the context does not contain the answer, say that you do not know.",
        ),
        ("human", "Context:\n{context}\n\nQuestion: {question}"),
    ]
)

# Answer stage: consumes the {"question", "context"} dict produced above and
# returns a plain string. Passing `base_rag_chain` itself as the answer step
# would re-run generation + retrieval and store a nested dict, not an answer.
# NOTE(review): `llm` (gpt-4o-mini) is used for generation here; swap in
# `local_llm` if answers should be produced locally — confirm intended model.
generate_answer = (
    {
        "context": lambda state: _format_context(state["context"]),
        "question": lambda state: state["question"],
    }
    | answer_prompt
    | llm
    | StrOutputParser()
)

# Output keys: "question", "context" (list of retrieved documents), "answer" (str).
hyde_rag = base_rag_chain.assign(answer=generate_answer)

# Invoke the system
result = hyde_rag.invoke("What is the company's goal in reducing gas flaring?")

In [6]:
def format_output(response_data, max_chars=500):
    """Pretty-print the question and the retrieved context documents of a response.

    Args:
        response_data: Mapping with an optional 'question' entry and an optional
            'context' entry holding an iterable of document objects that expose
            `.metadata` (a dict) and `.page_content` (a string).
        max_chars: Maximum number of content characters printed per document.
            Defaults to 500.

    Returns:
        None. All output is written to stdout.
    """
    question = response_data.get('question', 'No question provided')
    # `or []` also covers an explicit None stored under 'context'.
    context = response_data.get('context') or []

    print(f"Question: {question}\n")
    print("Context Documents:\n")

    for idx, doc in enumerate(context, 1):
        metadata = doc.metadata
        page_content = doc.page_content

        # Metadata fields fall back to a placeholder when absent.
        date = metadata.get('date', 'Unknown date')
        domain = metadata.get('domain', 'Unknown domain')
        title = metadata.get('title', 'No title')
        url = metadata.get('url', 'No URL provided')
        doc_id = metadata.get('origin_doc_id')  # named doc_id to avoid shadowing builtin id()

        # Only append an ellipsis when the content was actually cut off.
        excerpt = page_content[:max_chars]
        if len(page_content) > max_chars:
            excerpt += "..."

        print(f"Document {idx}:")
        print(f"ID: {doc_id}")
        print(f"Title: {title}")
        print(f"Date: {date}")
        print(f"Domain: {domain}")
        print(f"URL: {url}\n")
        print(f"Content: {excerpt}")
        print("-" * 80)  # Separator between documents

    print("\nEnd of context documents.")

In [7]:
# Pretty-print the question and retrieved documents held in `result`.
format_output(result)

Question: What is the company's goal in reducing gas flaring?

Context Documents:

Document 1:
ID: d686bcc24f6c65ff1bc50606790b15a8
Title: Road to Adipec: Pragmatic and progressive
Date: 2022-11-07
Domain: energyvoice
URL: https://www.energyvoice.com/podcast/455602/road-to-adipec-pragmatic-and-progressive/

Content: and gas companies have the most ability to make a change and tackle emissions at source. To turn the tap, to make sure the flare is lit or design the facility so it doesn t need to flare. There are steps the industry can take and should take....
--------------------------------------------------------------------------------
Document 2:
ID: 06d247cda1080b82221a5b49855aa9a2
Title: Tail Gas Treatment Using Fuel-Free Abatement Technologies for Compliance
Date: 2021-08-05
Domain: azocleantech
URL: https://www.azocleantech.com/article.aspx?ArticleID=1283

Content: By clicking Allow All you agree to the storing of cookies on your device to enhance site navigation, analyse site us

In [8]:
# Load the evaluation questions. The path is relative, so it is resolved
# against the kernel's working directory.
# NOTE(review): the recorded preview shows 'Unnamed: 0.1' and 'Unnamed: 0'
# columns — presumably index columns written by earlier to_csv calls; confirm
# and drop them if they carry no information.
eval_df = pd.read_csv('eval_test_df.csv')

eval_df.head()

Unnamed: 0.1,Unnamed: 0,example_id,question_id,question,relevant_text,answer,article_url,top_score,top_score_id
0,0,1,1,What is the innovation behind Leclanché's new ...,Leclanché said it has developed an environment...,Leclanché's innovation is using a water-based ...,https://www.sgvoice.net/strategy/technology/23...,96,d4e62c94318b4b2d09a1f9519b9ba65b
1,1,2,2,What is the EU’s Green Deal Industrial Plan?,The Green Deal Industrial Plan is a bid by the...,The EU’s Green Deal Industrial Plan aims to en...,https://www.sgvoice.net/policy/25396/eu-seeks-...,100,0ff0fa30dbc4a6cedb659fa0d221ec83
2,2,3,2,What is the EU’s Green Deal Industrial Plan?,The European counterpart to the US Inflation R...,The EU’s Green Deal Industrial Plan aims to en...,https://www.pv-magazine.com/2023/02/02/europea...,98,80181cd176814323bb67524614a2f11e
3,3,4,3,What are the four focus areas of the EU's Gree...,The new plan is fundamentally focused on four ...,The four focus areas of the EU's Green Deal In...,https://www.sgvoice.net/policy/25396/eu-seeks-...,100,0ff0fa30dbc4a6cedb659fa0d221ec83
4,4,5,4,When did the cooperation between GM and Honda ...,What caught our eye was a new hookup between G...,July 2013,https://cleantechnica.com/2023/05/08/general-m...,100,4411ffe0103fa4d5d9f00addb04806dd


In [9]:
def invoke_hyde_rag_and_collect_docs(question):
    """Run `hyde_rag` on one question and return the retrieved documents' origin ids.

    Args:
        question: Input passed unchanged to `hyde_rag.invoke`.

    Returns:
        list: `metadata['origin_doc_id']` of every document found under the
        response's 'context' key, in order; an empty list when the response
        has no 'context' key.
    """
    response = hyde_rag.invoke(question)
    # Guard clause: nothing was retrieved for this question.
    if 'context' not in response:
        return []
    return [retrieved.metadata['origin_doc_id'] for retrieved in response['context']]

# Run the chain once per evaluation question and store each question's list of
# retrieved origin_doc_ids in a new 'retrieved_doc_ids' column of eval_df.
eval_df['retrieved_doc_ids'] = eval_df['question'].apply(invoke_hyde_rag_and_collect_docs)

In [10]:
# Print the number of retrieved doc ids for the row labelled 0
print(len(eval_df['retrieved_doc_ids'][0]))

20


In [12]:
# Define RAGEvaluation object
# NOTE(review): RAGEvaluation is project code (src.evaluation); what each
# argument is used for is not visible here — confirm against its definition.
rag_eval = RAGEvaluation(
    name="Experiment HyDE",
    rag_chain=hyde_rag,
    llm_model=llm,
    local_llm=local_llm.model,  # passes the `.model` attribute, not the OllamaLLM object itself
    embeddings=bge_m3_embed
)

# Relative paths: resolved against the kernel's working directory.
clean_file_path = 'data/preprocessed/clean_cleantech.parquet'
eval_file_path = 'data/eval_dataset/cleantech_rag_evaluation_data_2024-09-20.csv'

# Perform evaluation
# NOTE(review): the recorded run of this cell ends in ArrowInvalid ("Could not
# convert Document(...)"), i.e. a Document object reached an Arrow conversion.
# Presumably the evaluator needs plain strings in the chain output — TODO
# confirm which field of the chain output carries Document objects.
results = rag_eval.evaluate(clean_file_path, eval_file_path, bge_m3_vectordb)


Loading cached data from: /Users/oliverpejic/Documents/Classes/npr/npr_mc1/npr-mc1-frfr/data/cache/preprocessed.pkl
Preparing evaluation dataset...


Preprocessing queries: 100%|██████████| 23/23 [25:28<00:00, 66.45s/it]


ArrowInvalid: Could not convert Document(metadata={'date': '2023-01-20', 'domain': 'energyvoice', 'origin_doc_id': 'd4e62c94318b4b2d09a1f9519b9ba65b', 'title': 'Leclanché’ s new disruptive battery boosts energy density', 'url': 'https://sgvoice.energyvoice.com/strategy/technology/23971/leclanches-new-disruptive-battery-boosts-energy-density/'}, page_content='process safer for employees. Leclanch claims to be a global pioneer in the field, having used aqueous binders in its for over a decade. , A further benefit of the water based process is a lower carbon footprint, as it uses 30 less energy, which would otherwise be needed to dry, or evaporate, and recycle the solvents. , With the water based production of the high capacity NMCA cathodes, we have reached a decisive milestone in lithium ion technology, said Dr Hilmi Buqa, vice president R D at Leclanch . Until now, producing them using environmentally friendly processes was considered impossible. But, now we have mastered the process. , In developing its new G NMCA battery, Leclanch has increased the nickel content of the cathode to around 90 , which has the effect of increasing the energy density. The G refers to the graphite anode. Increasing the nickel content allows for the cobalt content to be reduced from 20 to 5 . , Compared to conventional G NMC cells, these high capacity NMCA') with type Document: did not recognize Python value type when inferring an Arrow data type