In [3]:
from langsmith import Client, wrappers
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT
from app.services.llm_service import HFChatModel
from dotenv import load_dotenv
load_dotenv()

True

In [130]:
# Define the input and reference output pairs that you'll use to evaluate your app
client = Client()

# Create the dataset
dataset = client.create_dataset(
    dataset_name="Vid Query dataset", description="A Vid Query dataset in LangSmith."
)

# Create examples in the dataset. Examples consist of inputs and reference outputs 
examples = [
    {
        "inputs": {"question": "What is Nihilism?"},
        "outputs": {"answer": "Nihilism is a philosophical belief that denies the existence of any objective meaning, purpose, or intrinsic value in life. It asserts that all values are human-created constructs and that there is no inherent good or evil in the world. Nihilism can take various forms, such as political nihilism, which advocates for the destruction of all political, social, and religious order, or ethical nihilism, which rejects the idea of absolute ethical or moral values."},
    },
    {
        "inputs": {"question": "What's the difference between cynicism and nihilism?"},
        "outputs": {"answer": "Cynicism and nihilism are distinct philosophical perspectives, although they share some similarities. Cynicism posits that people are primarily motivated by self-interest and do not have intrinsically good motives, while nihilism denies the existence of any inherent meaning, value, or purpose in the universe or human life. While cynics may be pessimistic about human nature, they do not reject the existence of good and evil, unlike nihilists who view these concepts as human constructs with no objective reality."},
    },
    {
        "inputs": {"question": "What did Friedrich Nietzsche say about Nihilism?"},
        "outputs": {"answer": "Friedrich Nietzsche argued for nihilism in the sense that he believed there is no objective structure or order in the world except the one we create for ourselves. He also stated that every belief, every considering something true is necessarily false, because there is simply no true world. However, he also expressed concerns about nihilism, stating that in the coming centuries, the advent of nihilism would drive civilization towards a catastrophe, a disaster waiting to implode."},
    }
]

# Add the examples to the dataset
client.create_examples(dataset_id=dataset.id, examples=examples)

{'example_ids': ['0a7b02cc-138a-41a3-bbb9-279396c20ff2',
  'b02d6815-e98e-4c7f-abfa-0f02b79efdbb',
  '40ec0d2a-9773-464e-8524-86c2c4a2d677'],
 'count': 3}

In [29]:
from app.services.transcript_fetcher import fetch_youtube_transcript
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

In [125]:
transcript = fetch_youtube_transcript(video_id="ZOvyn72x6kQ")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

Using transcript in preferred language: en


In [126]:
from langchain.vectorstores import FAISS
from app.services.embedding_service import HFInferenceEmbeddings
import os

chunks = splitter.split_text(transcript)
vectorstore = FAISS.from_texts(
    texts=chunks,
    embedding=HFInferenceEmbeddings(
    model="sentence-transformers/all-MiniLM-L6-v2",
    token=os.getenv("HF_TOKEN")
)
)

In [127]:
vectorstore.similarity_search(query="What is Nihilism?", k=3)

[Document(id='fd952deb-aa24-4c0d-bb54-d926aa4f2b3b', metadata={}, page_content='becomes something that you believe in. But since you now believe in something, then\nthere is no nihilism, because nihilism is the belief that there is nothing. Nihilism is quite different from other philosophical\nideas because it was first a literary invention before it ever became philosophical. As a result, it’s not as clearly defined\nas many of the other philosophies that exist. Many different people explained it in different'),
 Document(id='496dbc48-ab05-4ef7-8086-24d8136e448c', metadata={}, page_content='as many of the other philosophies that exist. Many different people explained it in different\nways, but eventually these different definitions got categorized, forming many different kinds\nof nihilism. There’s political nihilism. Political nihilists believe that for humanity\nto move forward as a species, all political, social, and religious order must be destroyed. Then there’s ethical nihilism.

In [128]:
llm = HFChatModel(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    provider="together",
    token=os.environ["HF_TOKEN"]
)


# Define your RAG prompt
rag_prompt = PromptTemplate.from_template("""
- You are a helpful assistant who is good at analyzing source information and answering questions.
- Use the following source documents to answer the user's questions.
- If you don't know the answer, just say that you don't know.
- Use three sentences maximum and keep the answer concise.
- Do not make up answers or provide information that is not in the source documents.
- If the question is not related to the source documents, say "No information available in the source documents."


Context:
{context}

Question:
{question}

Answer in a professional tone.
""")

# Chain that combines prompt + LLM
# qa_chain = LLMChain(llm=llm, prompt=rag_prompt)
qa_chain = rag_prompt | llm

def get_rag_response(question: str, vectorstore: FAISS = vectorstore, k: int = 3) -> str:
    
    if not vectorstore:
        # result = qa_chain.invoke({"context": "There's no video content available but answer any generic questions.", "question": question})   # Can later allow general chats with LLM
        return "No video content available. Please analyze a video first."
    
    # Retrieve top k relevant chunks
    docs = vectorstore.similarity_search(question, k=k)
    context = "\n\n".join(doc.page_content for doc in docs)
    

    # Ask LLM with context
    result = qa_chain.invoke(
        {"context": context, "question": question}
    )
    return {"answer": result.content.strip(), "documents": context}

In [64]:
get_rag_response("What is Nihilism?")

'Nihilism is a philosophical belief that denies the existence of any objective meaning, purpose, or intrinsic value in life or the universe. It asserts that nothing has any inherent significance or necessity, and that human-created concepts such as morality, good, and evil are subjective and arbitrary.'

In [102]:
import json

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """An evaluator for RAG answer accuracy"""
    instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

    answers = f"""\
QUESTION: {inputs['question']}
GROUND TRUTH ANSWER: {reference_outputs['answer']}
STUDENT ANSWER: {outputs['answer']}"""

    # Add explicit instruction for JSON output
    answers = (
        answers
        + "\n\nRespond ONLY in the following JSON format:\n"
        + '{ "explanation": "...", "correct": true/false }'
    )

    # Run evaluator
    response = llm.invoke([
        {"role": "system", "content": instructions},
        {"role": "user", "content": answers}
    ])

    # Parse JSON from model output
    try:
        # If response is an object with .content, extract it
        result = json.loads(response.content.strip())
        return result["correct"]
    except Exception as e:
        print("Failed to parse grader output:", e)
        print("Raw output:", response)
        return False

In [103]:
def relevance(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """An evaluator for RAG answer accuracy"""
    instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is concise and relevant to the QUESTION
(2) Ensure the STUDENT ANSWER helps to answer the QUESTION

Relevance:
A relevance value of True means that the student's answer meets all of the criteria.
A relevance value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

    answers = f"""\
QUESTION: {inputs['question']}
GROUND TRUTH ANSWER: {reference_outputs['answer']}
STUDENT ANSWER: {outputs['answer']}"""

    # Add explicit instruction for JSON output
    answers = (
        answers
        + "\n\nRespond ONLY in the following JSON format:\n"
        + '{ "explanation": "...", "relevant": true/false }'
    )

    # Run evaluator
    response = llm.invoke([
        {"role": "system", "content": instructions},
        {"role": "user", "content": answers}
    ])
    
    # Parse JSON from model output
    try:
        # If response is an object with .content, extract it
        result = json.loads(response.content.strip())
        return result["relevant"]
    except Exception as e:
        print("Failed to parse grader output:", e)
        print("Raw output:", response)
        return False

In [105]:
def groundedness(inputs: dict, outputs: dict) -> bool:
    """An evaluator for RAG answer accuracy"""
    instructions = """You are a teacher grading a quiz. 

You will be given FACTS and a STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Ensure the STUDENT ANSWER is grounded in the FACTS. 
(2) Ensure the STUDENT ANSWER does not contain "hallucinated" information outside the scope of the FACTS.

Grounded:
A grounded value of True means that the student's answer meets all of the criteria.
A grounded value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

    answer = f"FACTS: {outputs["documents"]}\nQUESTION: {inputs['question']}"

    # Add explicit instruction for JSON output
    answer = (
        answer
        + "\n\nRespond ONLY in the following JSON format:\n"
        + '{ "explanation": "...", "grounded": true/false }'
    )

    # Run evaluator
    response = llm.invoke([
        {"role": "system", "content": instructions},
        {"role": "user", "content": answer}
    ])
    
    # Parse JSON from model output
    try:
        # If response is an object with .content, extract it
        result = json.loads(response.content.strip())
        return result["grounded"]
    except Exception as e:
        print("Failed to parse grader output:", e)
        print("Raw output:", response)
        return False

In [111]:
def retrieval_relevance(inputs: dict, outputs: dict) -> bool:
    """An evaluator for document relevance"""
    instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION and a set of FACTS provided by the student. 

Here is the grade criteria to follow:
(1) You goal is to identify FACTS that are completely unrelated to the QUESTION
(2) If the facts contain ANY keywords or semantic meaning related to the question, consider them relevant
(3) It is OK if the facts have SOME information that is unrelated to the question as long as (2) is met

Relevance:
A relevance value of True means that the FACTS contain ANY keywords or semantic meaning related to the QUESTION and are therefore relevant.
A relevance value of False means that the FACTS are completely unrelated to the QUESTION.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

    answer = f"FACTS: {outputs["documents"]}\nQUESTION: {inputs['question']}"

    # Add explicit instruction for JSON output
    answer = (
        answer
        + "\n\nRespond ONLY in the following JSON format:\n"
        + '{ "explanation": "...", "relevant": true/false }'
    )

    # Run evaluator
    response = llm.invoke([
        {"role": "system", "content": instructions},
        {"role": "user", "content": answer}
    ])
    
    # Parse JSON from model output
    try:
        # If response is an object with .content, extract it
        result = json.loads(response.content.strip())
        return result["relevant"]
    except Exception as e:
        print("Failed to parse grader output:", e)
        print("Raw output:", response)
        return False

In [137]:
def target(inputs: dict) -> dict:
    return get_rag_response(inputs["question"])

experiment_results = client.evaluate(
    target,
    data='Vid Query dataset',
    evaluators=[correctness, groundedness, relevance, retrieval_relevance],
    experiment_prefix="rag-doc-relevance",
    metadata={"model": "Mistral-7B-Instruct-v0.3"},
)
# Explore results locally as a dataframe if you have pandas installed
# experiment_results.to_pandas()

View the evaluation results for experiment: 'rag-doc-relevance-2df622df' at:
https://smith.langchain.com/o/016d1287-5053-4244-8348-7d1be6ad628a/datasets/a9aa81a5-2dc4-486c-b92f-bc54f07fb16e/compare?selectedSessions=99b9fecb-116e-4296-b437-9f72ceb72a72




1it [00:05,  5.90s/it]Error running evaluator <DynamicRunEvaluator relevance> on run 7e1f6fbb-7e26-472a-a13a-93fb3af37736: HfHubHTTPError('402 Client Error: Payment Required for url: https://router.huggingface.co/together/v1/chat/completions (Request ID: Root=1-6881f5e6-236ffb1b714cb15959f753c8;3f5b98c7-eaba-4af1-9053-1d88713f3cba)\n\nYou have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.')
Traceback (most recent call last):
  File "c:\Users\Sam\Desktop\Test\vid-query\backend\.venv\Lib\site-packages\huggingface_hub\utils\_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "c:\Users\Sam\Desktop\Test\vid-query\backend\.venv\Lib\site-packages\requests\models.py", line 1026, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/together/v1/chat/completions

The ab