In [1]:
# Import required modules
import os
from dotenv import load_dotenv
from langchain.prompts import PromptTemplate
from langchain.chains import ConversationalRetrievalChain
from langchain_postgres.vectorstores import PGVector
from sentence_transformers import SentenceTransformer


# Custom imports
from prompts_constants import *  # Ensure constants like USERNAME, PASSWORD, PGVECTOR_CONNECTION_STRING are imported
from evaluate_context import TextProcessor, SemanticSimilarity
from embeddings import SentenceTransformerEmbeddingWrapper
from llm import LangchainDSXLLM  # Import the LLM setup from llm.py



# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Function to initialize the vector store
def initialize_vector_store():
    embedding_model = SentenceTransformerEmbeddingWrapper('sentence-transformers/all-MiniLM-L12-v2')
    store = PGVector(
        embeddings=embedding_model,
        collection_name=COLLECTION_NAME,
        connection=PGVECTOR_CONNECTION_STRING,
        use_jsonb=True
    )
    return store



def create_dynamic_rag_chain(k, llm, store, prompt_template):
    retriever = store.as_retriever(search_kwargs={"k": k})
    return ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        return_source_documents=True,
        combine_docs_chain_kwargs={
            "prompt": prompt_template,
            "document_variable_name": "s",
        }
    )
    

def fact_check_dataframe(df, context_col, claim_col, api_key):
    def check_factuality(context, claim):
        try:
            response = requests.post(
                "https://api.bespokelabs.ai/v0/argus/factcheck",
                json={
                    "contexts": [context],  # Ground truth from context column
                    "claim": claim          # Claim from claim column
                },
                headers={"api_key": api_key}
            )
            # Pause to avoid overloading the API
            time.sleep(1)

            # Parse and extract the 'claim_supported_by_contexts' value if it exists
            result = response.json()
            return result.get('claim_supported_by_contexts', [None])[0]  # Extracting first element or None if not found
        except Exception as e:
            return None  # Return None if there is an error

    # Apply the factuality check to each row in the DataFrame
    df['factuality_score'] = df.apply(
        lambda row: check_factuality(row[context_col], row[claim_col]), axis=1
    )
    return df



# Function to evaluate the final context
def evaluate_final_context(query, final_context, text_processor, semantic_similarity, api_key=None):
    """
    Evaluates the final context with the query using various metrics and performs factuality checks.
    """
    # Compute semantic similarity score
    semantic_scores = semantic_similarity.compute_similarity(query, [final_context])
    semantic_score = semantic_scores[0]

    # Compute keyword matching score
    query_keywords = text_processor.preprocess(query, 'lemma')
    keyword_score = text_processor.keyword_match_score(query_keywords, final_context, 'lemma')

    # Context length penalty
    context_length_penalty = 1.0 - min(len(final_context.split()) / 500, 1.0)

    # Aggregate confidence score
    confidence_score = (
        0.6 * semantic_score +
        0.3 * keyword_score -
        0.1 * context_length_penalty
    )

    # Fact-checking (optional, requires API key)
    factuality_score = None
    if api_key:
        try:
            response = requests.post(
                "https://api.bespokelabs.ai/v0/argus/factcheck",
                json={
                    "contexts": [final_context],  # Ground truth from context
                    "claim": query               # Claim from query
                },
                headers={"api_key": api_key}
            )
            # Parse response for factuality score
            result = response.json()
            factuality_score = result.get('claim_supported_by_contexts', [None])[0]
        except Exception as e:
            print(f"Fact-checking failed: {e}")
            factuality_score = None

    # Aggregate results
    evaluation_results = {
        "semantic_score": semantic_score,
        "keyword_score": keyword_score,
        "context_length_penalty": context_length_penalty,
        "confidence_score": confidence_score,
        "factuality_score": factuality_score  # Include factuality score if computed
    }

    return evaluation_results


# Function to generate a summary using Azure LLM
def generate_summary_with_llm(query, context_evaluation, best_chunks, azure_llm):
    top_chunks_summary = "\n".join(
        [
            f"Chunk {i+1}: Semantic Score = {chunk['semantic_score']:.4f}, "
            f"Keyword Score = {chunk['keyword_score']:.4f}\n"
            f"Content: {chunk['chunk'][:200]}..."
            for i, chunk in enumerate(best_chunks)
        ]
    )

    prompt = (
        f"You are a helpful assistant summarizing evaluation metrics for a chatbot query.\n\n"
        f"User Query: '{query}'\n\n"
        f"The chatbot selected the following top chunks based on their relevance:\n"
        f"{top_chunks_summary}\n\n"
        f"The evaluation metrics for the final context were as follows:\n"
        f"- Semantic Similarity Score: {context_evaluation['semantic_score']:.4f}\n"
        f"- Keyword Matching Score: {context_evaluation['keyword_score']:.4f}\n"
        f"- Confidence Score: {context_evaluation['confidence_score']:.4f}\n"
        f"- Factuality Score: {context_evaluation['factuality_score']}\n\n"
        f"Please generate a concise and user-friendly summary explaining these metrics and how they relate to the query and selected chunks."
    )

    try:
        summary = azure_llm._call(prompt)
        return summary
    except Exception as e:
        return f"Error during LLM call: {str(e)}"
# Function to handle queries dynamically with RAG

# Function to handle queries dynamically with RAG
def answer_query_with_dynamic_rag(query, chat_history, store, azure_llm, prompt, text_processor, semantic_similarity, api_key):
    initial_rag_chain = create_dynamic_rag_chain(3, azure_llm, store, prompt)
    initial_result = initial_rag_chain({"question": query, "chat_history": chat_history})
    retrieved_chunks = [doc.page_content for doc in initial_result["source_documents"]]

    print("Computing semantic similarity...")
    semantic_scores = semantic_similarity.compute_similarity(query, retrieved_chunks)

    print("Computing keyword matching...")
    query_keywords = text_processor.preprocess(query, 'lemma')
    keyword_scores = [
        text_processor.keyword_match_score(query_keywords, chunk, 'lemma') for chunk in retrieved_chunks
    ]

    chunk_scores = [
        {
            "chunk": chunk,
            "semantic_score": semantic_scores[i],
            "keyword_score": keyword_scores[i],
        }
        for i, chunk in enumerate(retrieved_chunks)
    ]

    for chunk_score in chunk_scores:
        chunk_score["combined_score"] = (
            0.5 * chunk_score["semantic_score"] + 0.5 * chunk_score["keyword_score"]
        )
    sorted_chunks = sorted(chunk_scores, key=lambda x: x["combined_score"], reverse=True)

    best_n = max(range(1, 4), key=lambda n: sum(
        sorted_chunks[i]["combined_score"] for i in range(n)
    ) / n)
    print(f"Decided to use top-{best_n} chunks based on scores.")

    dynamic_rag_chain = create_dynamic_rag_chain(best_n, azure_llm, store, prompt)
    final_result = dynamic_rag_chain({"question": query, "chat_history": chat_history})

    answer = final_result["answer"]
    sources = final_result["source_documents"]

    best_chunks = sorted_chunks[:best_n]
    final_context = " ".join(chunk["chunk"] for chunk in best_chunks)
    context_evaluation = evaluate_final_context(query, final_context, text_processor, semantic_similarity, api_key=api_key)

    summary = generate_summary_with_llm(query, context_evaluation, best_chunks, azure_llm)

    return {
        "answer": answer,
        "sources": [doc.page_content[:200] for doc in sources],
        "selected_k": best_n,
        "retrieved_chunks": retrieved_chunks,
        "context_evaluation": context_evaluation,
        "summary": summary,
    }



  from tqdm.autonotebook import tqdm, trange
2025-01-29 00:54:31.285801: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-29 00:54:31.291936: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-29 00:54:31.343287: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-29 00:54:31.343338: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-29 00:54:31.343388: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable t

In [2]:

# Main execution workflow
if __name__ == "__main__":
    text_processor = TextProcessor()
    semantic_similarity = SemanticSimilarity()

    store = initialize_vector_store()
    llm = LangchainDSXLLM()

    query = "What is the capital of France?"
    chat_history = []

    # Wrap FIRST_PROMPT in a PromptTemplate
    FIRST_PROMPT_TEMPLATE = PromptTemplate(
        template=FIRST_PROMPT,
        input_variables=["s", "question"]
    )
    
    # Define a query
    query = "What is the capital of France?"
    chat_history = []
    bespoke_api_key = ""  # Replace with your actual API key
    
    # Use RAG chain
    result = answer_query_with_dynamic_rag(
        query=query,
        chat_history=chat_history,
        store=store,
        azure_llm=llm,
        prompt=FIRST_PROMPT_TEMPLATE,
        text_processor=text_processor,
        semantic_similarity=semantic_similarity,
        api_key=bespoke_api_key
    )

    print("\nFinal Answer:")
    print(result["answer"])
    print("\nSummary:")
    print(result["summary"])


  warn_deprecated(


Computing semantic similarity...
Computing keyword matching...
Decided to use top-1 chunks based on scores.
Fact-checking failed: name 'requests' is not defined

Final Answer:
The capital of France is Paris.

Summary:
Based on the user query, which asked for the capital of France, the chatbot provided a chunk of text that was not relevant to the query. The metrics used to evaluate this response provide insights into its quality. The semantic similarity score indicates how closely the provided chunk matches the user's query, and in this case, it was very low at 0.1565. Additionally, the keyword matching score, which determines how well the chunk includes relevant keywords, was 0.5000, suggesting only partial match. The confidence score, indicating the chatbot's certainty in its response, was 0.1777, which is relatively low. Lastly, there was no factuality score provided, meaning the chunk did not contain accurate information related to the query. Overall, the metrics indicate that the c

In [None]:
# Main execution workflow
if __name__ == "__main__":
    text_processor = TextProcessor()
    semantic_similarity = SemanticSimilarity()

    store = initialize_vector_store()
    llm = LangchainDSXLLM()

    # Wrap FIRST_PROMPT in a PromptTemplate
    FIRST_PROMPT_TEMPLATE = PromptTemplate(
        template=FIRST_PROMPT,
        input_variables=["s", "question"]
    )

    bespoke_api_key = "bespoke-0e6e9818ac3f0cf5fcac9ebc910c52f470181cb885003a5444b4e9b330fc3e19"  # Replace with your actual API key

    # Start an interactive session for multiple queries
    print("Enter your queries below (type 'exit' to stop):")
    chat_history = []

    while True:
        # Get user input
        query = input("Your query: ")
        if query.lower() == "exit":
            print("Exiting the query session.")
            break

        # Process the query
        try:
            result = answer_query_with_dynamic_rag(
                query=query,
                chat_history=chat_history,
                store=store,
                azure_llm=llm,
                prompt=FIRST_PROMPT_TEMPLATE,
                text_processor=text_processor,
                semantic_similarity=semantic_similarity,
                api_key=bespoke_api_key
            )

            # Display results
            print("\nFinal Answer:")
            print(result["answer"])
            print("\nSummary:")
            print(result["summary"])
            print("\nSources:")
            for source in result["sources"]:
                print(source)
            print("\n" + "-" * 80)

            # Append the query to the chat history for context in the next iterations
            chat_history.append((query, result["answer"]))

        except Exception as e:
            print(f"Error processing the query: {str(e)}")


Fact-checking failed: name 'requests' is not defined

Final Answer:
The reason behind Justin Trudeau announcing his resignation as Canada's prime minister is due to internal battles, declining party support, growing political polarization, economic challenges, high inflation, and tensions with the incoming U.S. administration of President-elect Donald Trump, particularly over proposed tariffs.

Summary:
The evaluation metrics for the chatbot's response to the query "Why did Justin Trudeau announce his resignation as Canada's prime minister?" are as follows:

1. Semantic Similarity Score: This score indicates the degree of similarity between the selected chunk and the user's query. In this case, the score is 0.7416, suggesting that the chunk is semantically relevant to the query.

2. Keyword Matching Score: This score measures how well the keywords in the selected chunk match the keywords in the user's query. The score is 0.8571, indicating a high level of keyword similarity.

3. Confid