In [10]:
import pandas as pd
from tqdm import tqdm
from pandarallel import pandarallel
from langchain_huggingface import HuggingFaceEndpointEmbeddings
from langchain_experimental.text_splitter import SemanticChunker
from langchain.schema import Document
# Initialise pandarallel (progress bars enabled) so `parallel_apply` is available in later cells.
pandarallel.initialize(progress_bar=True, verbose=0)
# Register tqdm with pandas so `progress_apply` is available in later cells.
tqdm.pandas()
import os
from openai import OpenAI
import numpy as np
import chromadb
from chromadb.config import Settings
import random
# Project-local helpers.
# NOTE(review): the recorded output of this cell is an ImportError for `calculate_mrr`; confirm that
# src/utils.py defines it, otherwise this line and the one below never run on a fresh kernel.
from src.utils import prepare_embedding_for_chromadb, prepare_embedding_for_comparison, split_text, cosine_similarity_score, calculate_mrr
from src.model_m3 import EmbeddingModelM3

ImportError: cannot import name 'calculate_mrr' from 'data.src.utils' (/Users/arian/Documents/FHNW/npr/npr_hs_24/npr_mc1_new_hs24/npr-mc1-frfr/data/src/utils.py)

In [11]:
# Read the OpenAI API key from the local secrets file. The relevant line starts with
# 'openai' and has the form `<name>=<value>`; if several lines match, the last one wins.
secret = None
with open('secrets.txt', 'r') as f:
    for line in f:
        if line.startswith('openai'):
            # Split on the first '=' only, so a value that itself contains '=' is kept intact.
            _, _, value = line.partition('=')
            secret = value.strip()

# Fail early with a clear message instead of a NameError/IndexError further down.
if not secret:
    raise ValueError("No usable 'openai=<key>' entry found in secrets.txt")

os.environ["OPENAI_API_KEY"] = secret

# Directory of the persistent ChromaDB store used by the later cells.
storage_path = './data/chromadb'

ai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
)


In [12]:
# Read the Hugging Face API token from the local secrets file. The relevant line starts
# with 'api_token' and has the form `<name>=<value>`; if several lines match, the last one wins.
token = None
with open('secrets.txt', 'r') as f:
    for line in f:
        if line.startswith('api_token'):
            # Split on the first '=' only, so a value that itself contains '=' is kept intact.
            _, _, value = line.partition('=')
            token = value.strip()

# Fail early with a clear message instead of a NameError/IndexError further down.
if not token:
    raise ValueError("No usable 'api_token=<token>' entry found in secrets.txt")

# Remote embedding endpoint (the "remote" alternative to the local model used elsewhere).
embeddings = HuggingFaceEndpointEmbeddings(
    model='http://100.67.185.22:8080',
    huggingfacehub_api_token=token
)

# Semantic chunker driven by the remote embeddings, using the
# 'standard_deviation' breakpoint threshold type.
text_splitter = SemanticChunker(
    embeddings,
    breakpoint_threshold_type='standard_deviation'
)

In [13]:
# Instantiate the project's local embedding model wrapper (src.model_m3.EmbeddingModelM3).
embed_local = EmbeddingModelM3()

In [14]:
# Smoke test: embed a short string and show the first three values of the result.
query_result = embed_local.embed_query("Hello, world!") # local
#query_result = embeddings.embed_query("Hello, world!") # remote
query_result[:3]

[-0.026738807559013367, 0.42828133702278137, -0.6886834502220154]

# Chunking with Semantic Chunker from langchain
### Breakpoint: Standard Deviation

In [None]:
# Load the article data from data/clean_cleantech.parquet.
df = pd.read_parquet("data/clean_cleantech.parquet")

### generate "eval_dataset" from the "df" dataframe

In [None]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [None]:
# Split every article's `content` into chunks with the semantic text splitter.
# Each article is wrapped in a single langchain Document before being handed to the
# project helper `split_text`; the work is spread across processes via pandarallel.
df['chunks'] = df['content'].parallel_apply(
    lambda article_text: split_text([Document(article_text)], text_splitter)
)

In [None]:
# Preview the first three rows, now including the `chunks` column.
df.head(3)

In [None]:
# Store the length of each row's `chunks` value (i.e. how many chunks the article produced).
df['chunk_size'] = df['chunks'].progress_apply(len)

In [None]:
# Preview the first three rows, now including the `chunk_size` column.
df.head(3)

In [None]:
# Replace each chunk object by its text (`page_content`) so the column holds plain strings.
df['chunks'] = df['chunks'].progress_apply(
    lambda chunk_docs: [chunk_doc.page_content for chunk_doc in chunk_docs]
)

In [None]:
# Show five randomly drawn rows (unseeded, so the selection differs between runs).
df.sample(5)

In [None]:
# Persist the chunked frame so the chunking step does not have to be repeated.
# NOTE(review): this assumes the directory data/processed/ already exists.
df.to_parquet('data/processed/chunked_sd.parquet')

In [None]:
# Reload the chunked frame from disk under the name `df_chunked`.
df_chunked = pd.read_parquet('data/processed/chunked_sd.parquet')

In [None]:
# Build (once) a small retrieval evaluation set: for each sampled article, pick one
# non-empty chunk and let GPT-4o write a question about it. The result is cached as a
# parquet file because every row costs one OpenAI API call.
file_path = 'data/eval_dataset/eval_dataset.parquet'
n_eval_samples = 100  # upper bound on the number of sampled articles
eval_seed = 42        # drives both the row sample and the chunk choice

# Only generate the dataset when the cached file is missing.
if not os.path.isfile(file_path):
    # Load the chunked articles
    df_chunked = pd.read_parquet('data/processed/chunked_sd.parquet')

    # Never ask for more rows than exist; DataFrame.sample raises otherwise.
    sample_rows = df_chunked.sample(
        n=min(n_eval_samples, len(df_chunked)),
        random_state=eval_seed,
    )

    # Dedicated RNG so the chunk choice is as reproducible as the row sample.
    chunk_rng = random.Random(eval_seed)

    eval_data = []
    for _, row in tqdm(sample_rows.iterrows(), total=sample_rows.shape[0]):
        doc_id = row['doc_id']
        url = row['url']

        # List columns read back from parquet may arrive as NumPy arrays, whose truth
        # value is ambiguous; materialise a plain list and drop empty chunks up front.
        chunks = [chunk for chunk in row['chunks'] if len(chunk) > 0]
        if not chunks:
            continue

        used_chunk = chunk_rng.choice(chunks)

        # Generate a question for the selected chunk
        prompt = f"Generate a question about the following text:\n\n{used_chunk}"

        response = ai_client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )

        # `content` can be None for an empty completion; treat that as an empty question.
        generated_question = (response.choices[0].message.content or "").strip()

        eval_data.append({
            'doc_id_df': doc_id,
            'used_chunk': used_chunk,
            'generated_question': generated_question,
            'url': url
        })

    # Convert to DataFrame
    eval_dataset = pd.DataFrame(eval_data)

    # Make sure the target directory exists, then cache the result for later runs.
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    eval_dataset.to_parquet(file_path, index=False)
else:
    print(f"The file '{file_path}' already exists.")

In [None]:
# Filter zero-length chunks out of every row's chunk list.
df_chunked['chunks'] = df_chunked['chunks'].progress_apply(
    lambda chunk_list: [chunk for chunk in chunk_list if len(chunk) > 0]
)

# Embed the Chunks
### model: BAAI/bge-m3

In [None]:
# Embed the chunks: each row's chunk list is passed to `embeddings.embed_documents`
# (the remote Hugging Face endpoint), parallelised over rows with pandarallel.
df_chunked['embeddings'] = df_chunked['chunks'].parallel_apply(embeddings.embed_documents)

In [None]:
# Preview the first three rows, now including the `embeddings` column.
df_chunked.head(3)

In [None]:
# Save the chunked and embedded data so the embedding step does not have to be repeated.
# NOTE(review): this assumes the directory data/processed/ already exists.
df_chunked.to_parquet('data/processed/chunked_sd_embedded.parquet')

# Setting up the ChromaDB
Preparing the embedded parquet file for ChromaDB

In [None]:
# Load the chunked + embedded data. Note that this rebinds `df`, which earlier held the raw articles.
df = pd.read_parquet('data/processed/chunked_sd_embedded.parquet')

In [None]:
# Preview the first three rows of the reloaded frame.
df.head(3)

In [None]:
# Inspect the embeddings value stored under index label 0.
df.embeddings[0]

In [None]:
# Check the container type of that embeddings value and the type of its first element.
type(df.embeddings[0]), type(df.embeddings[0][0])

### preparing the data for ChromaDB

In [None]:
# Run every row's embeddings through the project helper that prepares them for ChromaDB.
tqdm.pandas()
df['embeddings'] = df['embeddings'].progress_apply(prepare_embedding_for_chromadb)

# Sanity check on the entry at index label 0: container type, shape and dtype.
print(
    "Sample embedding type and shape:",
    type(df['embeddings'][0]),
    df['embeddings'][0].shape,
    df['embeddings'][0].dtype,
)

In [None]:
# Inspect the prepared embedding stored under index label 0.
df.embeddings[0]

In [None]:
# Convert the 'date' column to strings; it is later passed to ChromaDB as a metadata field.
df['date'] = df['date'].astype(str)

In [None]:
# Make every doc_id unique by prefixing it with its occurrence counter within the
# group of identical ids ("0_<id>", "1_<id>", ...). The prefix is added to ALL rows,
# not only to duplicates.
# NOTE: this cell is not idempotent; re-running it prefixes the ids a second time.
df['doc_id'] = df['doc_id'].astype(str)  # Ensure IDs are strings
df['doc_id'] = df.groupby('doc_id').cumcount().astype(str) + '_' + df['doc_id']

#### saving

In [None]:
# Default ChromaDB settings; the on-disk location comes from `storage_path`.
settings = Settings()

# Initialize ChromaDB client with persistent settings
client = chromadb.PersistentClient(path=storage_path, settings=settings)
collection_name = "energy_articles"

# Delete and recreate the collection so the ingest always starts from an empty state.
# Depending on the ChromaDB release, list_collections() yields Collection objects or
# plain name strings; `getattr(..., 'name', col)` handles both.
existing_names = {getattr(col, 'name', col) for col in client.list_collections()}
if collection_name in existing_names:
    client.delete_collection(collection_name)
collection = client.get_or_create_collection(name=collection_name)

# Convert NumPy arrays to plain Python lists once, up front.
df['embeddings'] = df['embeddings'].progress_apply(lambda x: x.tolist() if isinstance(x, np.ndarray) else x)

# Insert data in batches. The client enforces an upper bound on records per call;
# where the client exposes it, cap the batch size accordingly.
batch_size = 10000
max_batch_size_getter = getattr(client, 'get_max_batch_size', None)
if callable(max_batch_size_getter):
    batch_size = min(batch_size, max_batch_size_getter())

for start in tqdm(range(0, len(df), batch_size)):
    batch = df.iloc[start:start + batch_size]

    ids = batch['doc_id'].astype(str).tolist()
    documents = batch['content'].tolist()
    # Already converted to lists above, so no second per-element conversion is needed.
    embeds = batch['embeddings'].tolist()
    metadatas = batch[['title', 'date', 'domain', 'url', 'language']].to_dict(orient='records')

    # Insert into ChromaDB collection
    collection.add(
        ids=ids,
        documents=documents,
        embeddings=embeds,
        metadatas=metadatas
    )

print("Data successfully added to ChromaDB.")

In [None]:
# Embed a hand-written test question with the remote endpoint (`embeddings`) and
# print the first three values of the resulting vector.
question = 'In 2021, what were the top 3 states in the US in terms of total solar power generating capacity?'
query_test = embeddings.embed_query(question)
print(query_test[:3])

In [None]:
# Prepare the test-question embedding with the same helper used for the stored
# embeddings, then query the collection for the closest entries.
query_embedding = prepare_embedding_for_chromadb(query_test)
top_k = 5  # number of similar entries to retrieve

results = collection.query(
    query_embeddings=[query_embedding.tolist()],
    n_results=top_k,
    include=['documents', 'metadatas']  # return document text and metadata only
)

# Dump the raw query result for inspection.
print(results)

# Retrieval

In [15]:
# Load the cached evaluation dataset (generated questions + the chunks they were built from).
df_eval = pd.read_parquet('data/eval_dataset/eval_dataset.parquet')

# Re-open the persistent ChromaDB store with default settings.
settings = Settings()

client = chromadb.PersistentClient(path=storage_path, settings=settings)

# Fetch the existing collection by name (it is not created here).
collection_name = "energy_articles"
collection = client.get_collection(collection_name)

In [16]:
"""test_db = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=1)
print(test_db)"""

"test_db = collection.get(include=['embeddings', 'documents', 'metadatas'], limit=1)\nprint(test_db)"

In [17]:
# Draw one random evaluation example (unseeded) and unpack the fields used below.
eval_data_index = df_eval.sample(n=1)
sampled_row = eval_data_index.iloc[0]
eval_question = sampled_row['generated_question']
eval_answer = sampled_row['used_chunk']
article_url = sampled_row['url']
document_id = sampled_row['doc_id_df']

# The generated question serves as the retrieval query.
query_text = eval_question

# Embed the query with the local model; the remote endpoint is the commented-out alternative.
#query_embedding = embeddings.embed_query(query_text) # remote
query_embedding = embed_local.embed_query(query_text) # local

prepared_embeddings = prepare_embedding_for_chromadb(query_embedding)

top_k = 20

# Retrieve the top_k entries for the query, returning metadata, stored embeddings
# and document text alongside the ids.
results = collection.query(
    query_embeddings=[prepared_embeddings.tolist()],
    n_results=top_k,
    include=['metadatas', 'embeddings', 'documents'],
)

In [18]:
# Compute the Mean Reciprocal Rank with the project helper `calculate_mrr`, passing the
# evaluation frame, the ChromaDB collection, the local embedding model and top_k=20.
# NOTE(review): what exactly is returned (and how a hit is matched) is defined in
# src/utils.py — confirm there.
mrr = calculate_mrr(df_eval, collection, embed_local, top_k=20)

Mean Reciprocal Rank (MRR): 0.6352619047619048


## Generate a response with GPT-3.5-turbo

In [None]:
# Build the context block handed to the chat model from the retrieval results.
# `retrieved_text` carries one labelled section per document (id, title, URL, content);
# `content_test` is the plain concatenation of all document texts.
retrieved_text = ""
content_test = ""
if 'documents' in results and results['documents']:
    retrieved_docs = results['documents'][0]
    doc_sections = []
    for idx, doc in enumerate(retrieved_docs):
        # Metadata may be missing for an entry; fall back to an empty dict so .get() works.
        metadata = results['metadatas'][0][idx] or {}
        doc_id = results['ids'][0][idx]  # ids are always part of the query result
        title = metadata.get("title", "Untitled Document")
        url = metadata.get("url", "URL not available")

        doc_sections.append(
            f"Document {idx + 1} - ID: {doc_id}\n"
            f"Title: {title}\n"
            f"URL: {url}\n"
            f"Content: {doc}\n\n"
        )

    # Join once instead of growing strings inside the loop.
    retrieved_text = "".join(doc_sections)
    content_test = "".join(retrieved_docs)
else:
    print("No documents found in query results.")

# Create a system message with instructions for the assistant
system_message = """
You are a knowledgeable assistant. Based on the information from the documents provided by the user, answer the question in a detailed and informative way. In your answer, refer to specific documents by mentioning their titles, URLs, and IDs when relevant.

At the end of your answer, please provide a separate "Sources" section, listing all document titles, IDs, and URLs you referenced, even if they were only indirectly useful.
"""

# Construct the prompt as the user's message. The "Sources" template lines use one
# consistent, correctly spelled format so the model is not given contradictory examples.
prompt = f"""
Question: {query_text}

Documents:
{retrieved_text}

Please structure your answer as follows:
Answer:
(Your detailed answer here, with references to specific documents as needed)

Sources:
- Document N: document_id, document_title, document_url
- Document N: document_id, document_title, document_url
- Document N: document_id, document_title, document_url
(Include every document you referred to in the answer)
"""

# Generate a response with GPT-3.5-turbo
response = ai_client.chat.completions.create(
    messages=[
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
    ],
    model="gpt-3.5-turbo",
)

generated_response = response.choices[0].message.content

# Print the sampled evaluation example followed by the generated response
print(f'Used question: {eval_question}\nURL: {article_url}\n Used chunk for question: {eval_answer}\n Document ID: {document_id}')
print('-'*40)
print(generated_response)
print('-'*40)

In [None]:
# Embed the ground-truth chunk of the sampled question with the local model and
# bring it into the form expected by the comparison helper.
relevant_embedding = embed_local.embed_query(eval_answer)
prepared_relevant_embedding = prepare_embedding_for_comparison(relevant_embedding)

# Turn every retrieved embedding into a NumPy array and run it through the same
# preparation step, so both sides of the comparison are treated alike.
retrieved_embeddings = [
    prepare_embedding_for_comparison(np.array(doc_embed))
    for doc_embed in results['embeddings'][0]
]

# Cosine similarity between each retrieved entry and the ground-truth chunk.
similarities = cosine_similarity_score(retrieved_embeddings, prepared_relevant_embedding)

# Report one score per retrieved document, numbered from 1 in retrieval order.
print("Top cosine similarity scores with the relevant chunk:")
for rank, score in enumerate(similarities.flatten(), start=1):
    print(f"Document {rank}: Cosine Similarity = {score}")

In [None]:
from ragas import EvaluationDataset, evaluate
from ragas.metrics import ContextPrecision, Faithfulness, AnswerRelevancy, ContextRecall
from ragas.llms import LangchainLLMWrapper
from langchain_openai import ChatOpenAI

# LLM used as the judge by the ragas metrics.
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-3.5-turbo"))

# Build the evaluation sample for the one question that was actually sent through
# retrieval and generation above. Only `query_text`, `results` and `generated_response`
# belong together; pairing that single response with every row of `df_eval` would score
# unrelated question/answer pairs (and `df_eval` has no 'question'/'relevant_chunk' columns).
# To evaluate the whole of `df_eval`, retrieval and generation must be run per row first.
data = [
    {
        "user_input": query_text,                                # the question asked
        "retrieved_contexts": list(results['documents'][0]),    # one entry per retrieved document
        "reference_contexts": [eval_answer],                     # chunk the question was generated from
        "response": generated_response,                          # model answer for this question
        "reference": eval_answer,                                # ground truth used by the reference-based metrics
    }
]

# Create the EvaluationDataset
eval_dataset = EvaluationDataset.from_list(data)

# Define metrics to use for evaluation; all LLM-based metrics share the same judge.
metrics = [
    ContextPrecision(llm=evaluator_llm),
    Faithfulness(llm=evaluator_llm),
    AnswerRelevancy(llm=evaluator_llm),
    ContextRecall(llm=evaluator_llm)
]

# Run the evaluation
results_eval = evaluate(eval_dataset, metrics=metrics)

In [None]:
# Tabulate the per-sample metric scores of the ragas run. The evaluation output lives in
# `results_eval`; `results` is the raw ChromaDB query result (a dict without `to_pandas`).
df_results = results_eval.to_pandas()
df_results