In [None]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
import os
from ast import literal_eval
import json
from tqdm import tqdm
import pickle
import random

from yt_rag.agent import llm, build_prompt
from yt_rag.build_index import build_index, create_embeddings, embed_title
from yt_info.yt_video_data import Video, get_video_transcript


# Load environment variables (python-dotenv) before any of the os.getenv calls below.
load_dotenv()

# Connection settings and credentials, all read from environment variables.
OLLAMA_URL = os.getenv("LOCAL_OLLAMA_URL")
# YT_CHANNELS is parsed as a Python literal. os.getenv returns None when the variable
# is unset, and literal_eval raises in that case.
CHANNELS = literal_eval(os.getenv("YT_CHANNELS"))
ES_URL = os.getenv("LOCAL_ES_URL")
ES_INDEX_NAME = os.getenv("ES_INDEX_NAME")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Two OpenAI-SDK clients: one pointed at OLLAMA_URL (with the literal key "ollama"),
# one using OPENAI_API_KEY against the SDK's default endpoint.
ollama_client = OpenAI(base_url=OLLAMA_URL, api_key="ollama")
openai_client = OpenAI(api_key=OPENAI_API_KEY)

# Sentence-embedding model; elastic_search_knn uses it to encode queries.
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

# Elasticsearch client shared by the search helpers and the index-building cells.
es_client = Elasticsearch(ES_URL)

In [None]:
# Load the pickled video details.
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project.
with open('../data/yt_videos_details.pkl', 'rb') as f:
    videos = pickle.load(f)


# Load previously generated questions (file name suggests the temperature-0 run).
# Change to results_tempDefault.pkl to use the LLM-generated questions with the default temperature.
with open('../data/results_temp0.pkl', 'rb') as f:
    results = pickle.load(f)

In [None]:
# Prompt template for question generation. `{title}` and `{description}` are filled
# with str.format in generate_questions; the doubled braces render as literal JSON braces.
prompt_llm = """
You are a user of a cooking and recipe assistant app.
Generate 5 general questions related to the cooking techniques, ingredients, or recipes discussed in the provided video.
The questions should be clear, concise, and relevant to the video's content.
Avoid being overly specific about particular ingredients or using too many details from the title and description.
Do not mention or reference the video title, description, or any URLs.
Focus on broader cooking themes that would be of interest to a home cook.

The video information:

title: {title}
description: {description}

Output the result as a JSON object without using code blocks:

{{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
""".strip()

## Generate questions

In [None]:
def generate_questions(video):
    """Ask the LLM behind `ollama_client` for questions about one video.

    The instance attributes of `video` are used as the format arguments of
    `prompt_llm`; the reply from `llm` is returned unchanged.
    """
    video_fields = video.__dict__
    filled_prompt = prompt_llm.format(**video_fields)
    return llm(filled_prompt, client=ollama_client)

In [None]:
# Keep questions that are already in the session (for example the ones loaded from the
# pickle cache) so the generation loop resumes instead of regenerating everything.
# Start from an empty dict only when nothing has been loaded yet.
# To force a full regeneration, set `results = {}` by hand before running the loop.
try:
    results
except NameError:
    results = {}

# Maps video_id -> raw LLM reply for videos whose reply could not be parsed.
failed_video_ids = {}

In [None]:
for video in tqdm(videos):
    # Videos that already have questions are skipped, so re-running the cell resumes the work.
    if video.video_id in results:
        continue

    try:
        questions_raw = generate_questions(video)
    except Exception as err:
        # Best effort: report the failure and move on. The video has no entry in
        # `results`, so it is attempted again the next time this cell runs.
        print(f"Failed {video.video_id}", err)
        continue

    try:
        # Remove code-fence residue (backticks and the "json" language tag) before parsing.
        questions = json.loads(questions_raw.replace("json", "").replace("`", "").strip())
    except Exception:
        # Covers invalid JSON as well as a reply that is not a string at all.
        failed_video_ids[video.video_id] = questions_raw
        continue

    try:
        results[video.video_id] = questions['questions']
    except (KeyError, TypeError):
        # KeyError: a JSON object without a "questions" key.
        # TypeError: valid JSON that is not an object (list, string, number). This case
        # used to escape the handler and abort the whole loop.
        failed_video_ids[video.video_id] = questions_raw
        

## Retrieval Evaluation

In [None]:
def elastic_search_text(query):
    """Keyword search over the `title` and `description` fields.

    Sends a `multi_match` query of type `best_fields` to the index named by
    ES_INDEX_NAME, requesting five hits, and returns the `_source` dict of
    each hit in the order Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["title", "description"],
        "type": "best_fields",
    }
    body = {
        "_source": ["title", "is_short", "description", "video_id"],
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }

    hits = es_client.search(index=ES_INDEX_NAME, body=body)["hits"]["hits"]

    sources = []
    for hit in hits:
        sources.append(hit["_source"])
    return sources


def elastic_search_knn(query, index=ES_INDEX_NAME, field="title_description_vector"):
    """Vector (kNN) search for `query`.

    The query is encoded with `embedding_model` and sent as a `knn` search
    (k=5, num_candidates=10000) against `field` of `index`. Returns the
    `_source` dict of every hit, in the order Elasticsearch returned them.
    """
    search_query = {
        "knn": {
            "field": field,
            "query_vector": embedding_model.encode(query),
            "k": 5,
            "num_candidates": 10000,
        },
        "_source": ["title", "is_short", "description", "course", "video_id"],
    }

    response = es_client.search(index=index, body=search_query)
    return [hit["_source"] for hit in response["hits"]["hits"]]

In [None]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains at least one relevant hit.

    Args:
        relevance_total: one list of booleans per query; an entry is True when the
            result at that rank is the expected document.

    Returns:
        A float between 0 and 1. Returns 0.0 for an empty input instead of
        raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    queries_with_hit = sum(1 for line in relevance_total if True in line)
    return queries_with_hit / total

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant hit per query.

    Args:
        relevance_total: one list of booleans per query; an entry is True when the
            result at that rank is the expected document.

    Returns:
        The average over all queries of 1 / (rank of the first relevant result),
        counting 0 for queries with no relevant result. Returns 0.0 for an empty
        input instead of raising ZeroDivisionError.
    """
    total = len(relevance_total)
    if total == 0:
        return 0.0

    score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                score += 1 / rank
                # Only the first relevant hit counts. Without this break a query with
                # several relevant hits would contribute more than 1 to the sum.
                break

    return score / total

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    For every ground-truth entry, `search_function(entry)` is called and each
    returned document is marked relevant when its `video_id` equals the entry's
    `id`. Returns a dict with the `hit_rate` and `mrr` computed over all entries.
    """
    relevance_total = []

    for entry in tqdm(ground_truth):
        expected_id = entry['id']
        hits = search_function(entry)
        relevance_total.append([hit['video_id'] == expected_id for hit in hits])

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics


In [None]:
# Flatten {video_id: [question, ...]} into one record per question.
ground_truth = [
    {"id": video_id, "question": question}
    for video_id, questions in results.items()
    for question in questions
]

### Vector+KNN search on CONCAT(title + description) with Phi3-Mini

In [None]:
# Embed every video with create_embeddings' default embedding function (presumably
# title + description, per the section heading), then build the ES_INDEX_NAME index
# from the videos and those embeddings.
embeddings = create_embeddings(videos)
build_index(videos, embeddings, ES_INDEX_NAME, es_client=es_client)

In [None]:
# kNN search with elastic_search_knn's default index and vector field.
evaluate(
    ground_truth,
    lambda entry: elastic_search_knn(query=entry['question']),
)

### Text search on Title and description with Phi3-Mini

In [None]:
# Keyword (multi_match) search over title and description.
evaluate(
    ground_truth,
    lambda entry: elastic_search_text(query=entry['question']),
)

### Vector+KNN search on Title with Phi3-Mini

In [None]:
# Build a second index whose vectors come from embed_title and are stored in the
# "title_vector" field.
# NOTE: this rebinds `embeddings`, replacing the embeddings created for the default index.
title_only_index_name = ES_INDEX_NAME+"-title-only"
embeddings = create_embeddings(videos, embedding_function=embed_title)
build_index(videos, embeddings, title_only_index_name, es_client=es_client, field="title_vector")


In [None]:
# kNN search against the title-only index and its "title_vector" field.
evaluate(
    ground_truth,
    lambda entry: elastic_search_knn(
        query=entry['question'],
        index=title_only_index_name,
        field="title_vector",
    ),
)

## RAG Evaluation

In [None]:
# Prompt template for the LLM judge. `{question}` and `{answer_llm}` are filled with
# str.format in the RAG evaluation loop; the doubled braces render as literal JSON braces.
prompt_rag_eval = """
You are an expert evaluator for a RAG system.
Your task is to analyze the relevance of the generated answer to the given question.
Based on the relevance of the generated answer, you will classify it
as "NON_RELEVANT", "PARTLY_RELEVANT", or "RELEVANT".

Here is the data for evaluation:

Question: {question}
Generated Answer: {answer_llm}

Please analyze the content and context of the generated answer in relation to the question
and provide your evaluation in parsable JSON without using code blocks:

{{
  "Relevance": "NON_RELEVANT" | "PARTLY_RELEVANT" | "RELEVANT",
  "Explanation": "[Provide a brief explanation for your evaluation]"
}}
""".strip()

In [None]:
# Answer-generation template, variant 1. It carries `{question}` and `{context}`
# placeholders and is handed to build_prompt through get_answer's `prompt_template` argument.
prompt_template1 = """
You are a professional cook and recipe developer. Answer the QUESTION using only the information provided in the CONTEXT from the video transcript.
Do not include any information, assumptions, or details not present in the CONTEXT. If the CONTEXT does not provide enough information to answer the QUESTION, acknowledge the limitation.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Answer-generation template, variant 2 (explicit guidelines). It carries `{question}`
# and `{context}` placeholders and is handed to build_prompt through get_answer's
# `prompt_template` argument.
# It is stripped like the other templates in this notebook so that the two variants
# differ only in their wording, not in a stray leading/trailing newline.
prompt_template2 = """
You are a professional cook and recipe developer. Your task is to answer the QUESTION based only on the information provided in the CONTEXT from the video transcript.

Please follow these guidelines:
- Provide a clear and concise answer directly related to the QUESTION.
- Avoid any conversational elements or references to the video format (such as "welcome back" or "subscribe").
- Do not introduce information, assumptions, or details that are not present in the CONTEXT.
- If the CONTEXT does not provide enough information to fully answer the QUESTION, acknowledge this and explain the limitation.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

def get_answer(query, prompt_template):
    """Answer `query` with the RAG pipeline and return the LLM's reply.

    Runs `elastic_search_knn` for the query and wraps every hit in a `Video`.
    Only the first hit is used as context: its transcript is fetched and the
    `text` of each transcript line is joined with newlines. The prompt is built
    by `build_prompt` from `prompt_template` and sent to `llm` with the
    "ollama/phi3:mini" model choice.
    """
    hits = [Video(**source) for source in elastic_search_knn(query)]
    top_hits = hits[:1]

    transcripts = [
        "\n".join([segment["text"] for segment in get_video_transcript(hit)])
        for hit in top_hits
    ]

    prompt = build_prompt(query, top_hits, transcripts, prompt_template=prompt_template)
    return llm(prompt, model_choice="ollama/phi3:mini")



In [None]:
# Fixed seed so the evaluated subset, and therefore the reported scores, can be reproduced.
SAMPLE_SEED = 42
SAMPLE_SIZE = 100

# random.sample raises ValueError when asked for more items than exist, so cap the size.
# A dedicated Random instance leaves the global random state untouched.
sample = random.Random(SAMPLE_SEED).sample(ground_truth, min(SAMPLE_SIZE, len(ground_truth)))

# One (ground-truth entry, raw judge reply) tuple per evaluated question, per template.
rag_evals_template_1 = []
rag_evals_template_2 = []


In [None]:
# Resume after the entries that were already judged instead of using a hard-coded
# offset. On a fresh kernel this starts at 0 and covers the whole sample; after an
# interruption, re-running the cell continues where it stopped. Both result lists are
# appended together at the end of an iteration, so their lengths stay in step.
already_evaluated = len(rag_evals_template_1)

for entry in tqdm(sample[already_evaluated:]):
    question = entry["question"]

    # Answer the same question once per prompt template.
    llm_answer_template1 = get_answer(question, prompt_template1)
    llm_answer_template2 = get_answer(question, prompt_template2)

    rag_eval_prompt_template1 = prompt_rag_eval.format(question=question, answer_llm=llm_answer_template1)
    rag_eval_prompt_template2 = prompt_rag_eval.format(question=question, answer_llm=llm_answer_template2)

    # Ask the judge model to rate each answer; the raw replies are stored unparsed.
    rag_llm_eval_template1 = llm(rag_eval_prompt_template1, model_choice="openai/gpt-4o-mini")
    rag_llm_eval_template2 = llm(rag_eval_prompt_template2, model_choice="openai/gpt-4o-mini")

    rag_evals_template_1.append((entry, rag_llm_eval_template1))
    rag_evals_template_2.append((entry, rag_llm_eval_template2))
    
    