In [None]:
from src.vectorstorage import EmbeddingVectorStorage
from src.custom_embeddings import CustomHuggingFaceEndpointEmbeddings
from langchain_core.embeddings import Embeddings
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEndpointEmbeddings
import pandas as pd


# Read the API token from the local secrets file.
# Expected line format: api_token=<value>
token = None
with open('secrets.txt', 'r') as f:
    for line in f:
        if line.startswith('api_token'):
            # Split on the first '=' only, so a token that itself contains '='
            # is kept whole. As before, the last matching line wins.
            _, separator, value = line.partition('=')
            if separator:
                token = value.strip()

# Fail here with a clear message instead of a NameError further down.
if token is None:
    raise ValueError("No 'api_token=<value>' entry found in secrets.txt")
                     
def _build_endpoint_embedder(hub_name, endpoint_url):
    """Create an embedding client for one endpoint URL.

    Every client receives the shared API token and has
    ``normalize_embeddings`` switched on via ``model_kwargs``.
    """
    return CustomHuggingFaceEndpointEmbeddings(
        model_name=hub_name,
        model=endpoint_url,
        huggingfacehub_api_token=token,
        model_kwargs={"normalize_embeddings": True},
    )


# One client per embedding model; each model is reached on its own port.
bge_m3_embed = _build_endpoint_embedder('BAAI/bge_m3', 'http://100.67.185.22:8080')
stella_embed = _build_endpoint_embedder('dunzhang/stella_en_1.5B_v5', 'http://100.67.185.22:8083')
nomic_embed = _build_endpoint_embedder('nomic-ai/nomic-embed-text-v1.5', 'http://100.67.185.22:8082')

In [1]:
# Persistence directory and collection name of the previously built store;
# both have to match the values used when the store was created.
path_persistent = "./data/chroma"
group_name = "bge_m3_embed_RecursiveCharacterTextSplitter"

# Re-open that store with the bge_m3 embedding client.
bge_m3_vectordb = EmbeddingVectorStorage(
    path_persistent=path_persistent,
    collection=group_name,
    method_of_embedding=bge_m3_embed,
)

print(bge_m3_vectordb)

VectorStorage(method_of_embedding=CustomHuggingFaceEndpointEmbeddings, group=bge_m3_embed_RecursiveCharacterTextSplitter)


In [None]:
# Sanity checks on the reloaded store before it is queried.
print(bge_m3_vectordb.test_heartbeat())  # Check if the Chroma client is active
print(bge_m3_vectordb.collection_is_empty())  # Check if the collection is empty

In [None]:
# Manual retrieval probe: run one similarity search and show the hits as the cell output.
# NOTE(review): presumably returns (document, score) pairs, judging by the method name — confirm in src.vectorstorage.
query = "The company is also aiming to reduce gas flaring?"
bge_m3_vectordb.search_similar_w_scores(query)

In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_ollama import OllamaLLM

# Ollama model tags for the candidate generator models:
# Qwen2.5 instruct at five sizes (all q4_0) and Llama 3.2 3B instruct at q6_K / q8_0.
qwen2_5_0_5b_model = 'qwen2.5:0.5b-instruct-q4_0'
qwen2_5_1_5b_model = 'qwen2.5:1.5b-instruct-q4_0'
qwen2_5_3b_model = 'qwen2.5:3b-instruct-q4_0'
qwen2_5_7b_model = 'qwen2.5:7b-instruct-q4_0'
qwen2_5_14b_model = 'qwen2.5:14b-instruct-q4_0'
llama3_2_3b = 'llama3.2:3b-instruct-q6_K'
llama3_2_3b_8bit = 'llama3.2:3b-instruct-q8_0'

# Retriever obtained from the bge_m3 vector store (no arguments passed, so its defaults apply).
basic_retriever = bge_m3_vectordb.obtain_retriever()
# Generator LLM: the q8_0 Llama 3.2 3B tag; swap in another tag above to compare models.
llm_model = OllamaLLM(model=llama3_2_3b_8bit)

In [None]:
# Define the basic prompt.
# The template takes two variables: {context} and {question}. It also tells
# the model to reply with a fixed sentence when the context is irrelevant or
# insufficient.
basic_prompt = PromptTemplate(
    template="""
    Answer the question to your best knowledge when looking at the following context:
    {context}
    
    If the context is irrelevant or does not provide enough information, respond with:
    "No relevant context found for this question."
                
    Question: {question}
    """,
    input_variables=["context", "question"],
)

# Print the template object for a visual check.
print(basic_prompt)

In [None]:
# Build the basic RAG pipeline: the retriever supplies the context and the LLM
# answers using basic_prompt. return_source_documents=True asks the chain to
# return the retrieved documents together with the answer.
basic_rag_chain = RetrievalQA.from_chain_type(
    llm=llm_model,
    retriever=basic_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": basic_prompt},
)

In [None]:
# Smoke test: send one question through the chain and show the raw response as the cell output.
basic_rag_chain.invoke("What is the impact of renewable energy on the environment?")

In [None]:
# Hand-picked questions for a quick qualitative check of a RAG chain.
# Every item needs its own trailing comma: without one, Python silently
# concatenates adjacent string literals into a single question.
# NOTE(review): "500m" in the second question may be meant as "500 nm" — confirm against the source material.
HANDPICKED_QUESTIONS = [
    "Who is the target user for Leapfrog Energy?",
    "Which has the higher absorption coefficient for wavelengths above 500m - amorphous germanium or amorphous silicon?",
    "What is the EU’s Green Deal Industrial Plan?",
    "Why do more and more students stop studying?",
]

def ask_handpicked_questions(rag_chain, questions=HANDPICKED_QUESTIONS):
    """Run every question through ``rag_chain`` and print each question/answer pair.

    All questions are invoked first; printing starts only after the last
    response has arrived. Responses are keyed by question text, and the
    ``'result'`` entry of each response is what gets printed.
    """
    responses = {}
    for prompt in questions:
        responses[prompt] = rag_chain.invoke(prompt)

    for prompt in responses:
        reply = responses[prompt]
        print(f"Question: {prompt}")
        print(f"Answer: {reply['result']}")
        print("\n")

# Qualitative check of the basic chain on the default hand-picked questions.
ask_handpicked_questions(basic_rag_chain)

In [None]:
# Load the preprocessed corpus; the 'id' and 'content' columns are read from it later in the notebook.
clean_dataset = pd.read_parquet('data/preprocessed/clean_cleantech.parquet')

# Show the available columns as the cell output.
clean_dataset.columns

In [None]:
# Load the RAG evaluation set; the file is semicolon-separated, hence sep=';'.
eval_test = pd.read_csv('data/eval_dataset/cleantech_rag_evaluation_data_2024-09-20.csv', sep=';')

# Show the available columns as the cell output.
eval_test.columns

In [None]:
from fuzzywuzzy import fuzz, process

def compute_similarity(text_1, text_2):
    """Return the ``fuzz.partial_ratio`` score of ``text_1`` against ``text_2``."""
    partial_score = fuzz.partial_ratio(text_1, text_2)
    return partial_score

def find_most_similar(clean_df, eval_input_chunk):
    """Find the document in ``clean_df`` that best matches ``eval_input_chunk``.

    Args:
        clean_df: DataFrame providing at least the columns ``'id'`` and
            ``'content'``.
        eval_input_chunk: Text that is compared against each document's
            ``'content'`` via ``compute_similarity``.

    Returns:
        A tuple ``(best_score, best_id)``: the highest score found and the
        ``'id'`` of the first document that reached it. ``(0, None)`` when
        the frame is empty or no document scores above 0.
    """
    best_score = 0
    best_id = None

    # Iterate over the two needed columns directly. DataFrame.iterrows()
    # builds a Series per row, which is slow and does not preserve the
    # dtype of 'id' across the row.
    for doc_id, content in zip(clean_df['id'], clean_df['content']):
        score = compute_similarity(eval_input_chunk, content)
        # Strict '>' keeps the earliest document when scores tie.
        if score > best_score:
            best_score = score
            best_id = doc_id
            # fuzz.partial_ratio tops out at 100, so no later document can
            # replace this one; stop scanning the rest of the corpus.
            # NOTE(review): revisit if compute_similarity ever changes scale.
            if best_score >= 100:
                break

    return best_score, best_id

In [None]:
from tqdm import tqdm
from pandarallel import pandarallel

# Register tqdm's pandas integration (adds the progress_* methods to pandas objects).
tqdm.pandas()
# Start pandarallel so DataFrame.parallel_apply is available, with a progress bar.
pandarallel.initialize(progress_bar=True)

def apply_find_most_similar(row, clean_df):
    """Row-wise adapter around ``find_most_similar`` for ``DataFrame.apply``.

    Looks up the best match for ``row['relevant_text']`` in ``clean_df`` and
    returns it as a two-element Series: score first, matching id second.
    """
    best_match = find_most_similar(clean_df, row['relevant_text'])
    return pd.Series(list(best_match))


# Make sure your DataFrames are named as 'clean_dataset' and 'eval_test'
# For every evaluation row, find the best-matching corpus document for its
# 'relevant_text'; store the score in 'top_score' and the document id in
# 'top_score_id'. Each row is compared against the whole corpus, so this is
# the expensive step of the notebook; parallel_apply spreads rows across workers.
eval_test[['top_score', 'top_score_id']] = eval_test.parallel_apply(apply_find_most_similar, args=(clean_dataset,), axis=1)


In [None]:
# Preview the first rows, including the 'top_score' / 'top_score_id' columns.
eval_test.head(5)

In [None]:
# Drop fully identical rows; the result replaces eval_test.
eval_test = eval_test.drop_duplicates().copy()

In [None]:
# Rename the reference-answer column to 'ground_truth'.
# NOTE(review): presumably the column name the ragas metrics expect — confirm against the installed ragas version.
eval_test = eval_test.rename(columns={'answer': 'ground_truth'})

In [None]:
# Preview again after de-duplication and the 'answer' -> 'ground_truth' rename.
eval_test.head(5)

In [None]:
from datasets import Dataset

from ragas.metrics import (
context_precision,
context_recall,
context_entity_recall,
faithfulness,
answer_correctness,
answer_relevancy,
answer_similarity
)

# Metrics passed to ragas.evaluate: three context-focused metrics first,
# then four answer-focused ones.
ragas_metrics = [
    context_precision,
    context_recall,
    context_entity_recall,
    faithfulness,
    answer_correctness,
    answer_relevancy,
    answer_similarity
]

from ragas import evaluate

# Run the ragas evaluation, using the local Ollama LLM as judge and the bge_m3
# client for the embedding-based metrics.
# NOTE(review): `dataset` is not defined in any cell visible in this notebook
# (the `Dataset` import is unused), so this cell raises NameError on a fresh
# kernel. Presumably it should be a datasets.Dataset built from eval_test plus
# the chain's answers and retrieved contexts — confirm and add the building cell.
test_eval = evaluate(
    dataset=dataset,
    metrics=ragas_metrics,
    llm=llm_model,
    embeddings=bge_m3_embed,
    raise_exceptions=True,  # surface metric failures as exceptions
    show_progress=True,
)