In [2]:
from elasticsearch import Elasticsearch
from sentence_transformers import SentenceTransformer
from openai import OpenAI
from dotenv import load_dotenv
import os
from ast import literal_eval
import json
from tqdm import tqdm
import pickle

from yt_rag.agent import llm
from yt_rag.build_index import build_index, create_embeddings

# Load environment variables (python-dotenv) before any of the lookups below.
load_dotenv()

# Connection settings and credentials. `os.getenv` returns None for a variable
# that is not set, so a missing entry only surfaces later when the value is used.
OLLAMA_URL = os.getenv("LOCAL_OLLAMA_URL")
# YT_CHANNELS must hold a Python literal; `literal_eval` raises if the variable
# is unset (it receives None) or is not a valid literal.
CHANNELS = literal_eval(os.getenv("YT_CHANNELS"))
YT_API_KEY = os.getenv("YT_API_KEY")
LOCAL_YT_API_KEY = os.getenv("LOCAL_YT_API_KEY")
LOCAL_ES_URL = os.getenv("LOCAL_ES_URL")
# NOTE(review): unlike the other LOCAL_* settings this reads "ES_INDEX_NAME",
# not "LOCAL_ES_INDEX_NAME" — confirm that is intentional.
LOCAL_ES_INDEX_NAME = os.getenv("ES_INDEX_NAME")

# OpenAI-compatible client pointed at the Ollama endpoint; the api_key value is
# presumably a placeholder the local server ignores — TODO confirm.
ollama_client = OpenAI(base_url=OLLAMA_URL, api_key="ollama")
# Sentence-embedding model used to encode search queries in `elastic_search_knn`.
embedding_model = SentenceTransformer("multi-qa-distilbert-cos-v1")

# Elasticsearch client used for index building and kNN search.
es_client = Elasticsearch(LOCAL_ES_URL)

  from tqdm.autonotebook import tqdm, trange


In [3]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    Only use this on files produced by this project: unpickling can execute
    arbitrary code, so it must never be pointed at untrusted data.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Cached inputs: the video records and the previously generated questions.
videos = _load_pickle('../data/videos.pkl')
results = _load_pickle('../data/results.pkl')

In [7]:
# Prompt used to generate synthetic user questions for one video.
# `{title}` and `{description}` are filled in with `str.format` (see
# `generate_questions`); the doubled braces around the JSON example are
# escapes that render as literal `{` / `}` in the final prompt.
prompt_template = """
You emulate a user of our cooking and recipe assistant application.
Formulate 5 questions this user might ask that would be answered by the provided video.
Make the questions specific to this video, without mentioning the video title or description.
Avoid references similar to "to this video", "shown here", "the shared". 
Ignore product links and other urls.
Focus on ingredients, techniques or other kitchen related jargon.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

title: {title}
description: {description}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}
""".strip()

## Generate questions

In [8]:
def generate_questions(video):
    """Ask the LLM for evaluation questions about a single video.

    The video's instance attributes fill the placeholders of
    ``prompt_template``; the rendered prompt is then sent through ``llm``
    using ``ollama_client``.

    Args:
        video: Object whose instance attributes include every placeholder
            named in ``prompt_template``.

    Returns:
        Whatever ``llm`` returns, unmodified.
    """
    rendered_prompt = prompt_template.format(**vars(video))
    return llm(rendered_prompt, client=ollama_client)

In [None]:
# Fresh accumulators for the generation loop:
#   results          -> video_id mapped to its generated questions
#   failed_video_ids -> video_id mapped to the raw LLM reply that could not be parsed
# NOTE: rebinding `results` here discards the mapping unpickled from
# ../data/results.pkl in the earlier cell.
results, failed_video_ids = {}, {}

In [None]:
def _parse_questions(raw_response):
    """Extract the list of questions from a raw LLM reply.

    Only a wrapping Markdown code fence (and an optional leading ``json``
    language tag) is removed. The payload itself is left untouched, so
    backticks or the word "json" inside a question are preserved.

    Raises:
        json.JSONDecodeError: the reply is not valid JSON.
        KeyError / TypeError: the JSON has no top-level "questions" entry.
    """
    text = raw_response.strip().strip("`").strip()
    if text.startswith("json"):
        text = text[len("json"):]
    return json.loads(text)["questions"]


# Retry only the videos whose earlier reply could not be parsed. The list is
# materialised up front so `failed_video_ids` can be updated inside the loop.
retry_queue = [v for v in videos if v.video_id in failed_video_ids]

for video in tqdm(retry_queue):
    if video.video_id in results:
        continue

    try:
        questions_raw = generate_questions(video)
    except Exception as err:
        # Best effort: report the failure and move on to the next video.
        print(f"Failed {video.video_id}", err)
        continue

    try:
        questions = _parse_questions(questions_raw)
    except Exception:
        # Keep the raw reply so it can be inspected or retried later.
        failed_video_ids[video.video_id] = questions_raw
        continue

    results[video.video_id] = questions
    # The retry succeeded, so the video is no longer a failure.
    failed_video_ids.pop(video.video_id, None)
        

## Retrieval Evaluation

In [66]:
def elastic_search_knn(query, field="title_description_vector", k=20, num_candidates=10000):
    """Run a kNN vector search for ``query`` and return the matching documents.

    The query text is encoded with ``embedding_model`` and searched against
    the index named by ``LOCAL_ES_INDEX_NAME`` through ``es_client``.

    Args:
        query: Text to encode and search for.
        field: Name of the vector field to search.
        k: Number of nearest neighbours requested.
        num_candidates: Value passed as the kNN ``num_candidates`` option.

    Returns:
        A list with the ``_source`` dict of every hit, in the order returned
        by Elasticsearch.
    """
    knn = {
        "field": field,
        "query_vector": embedding_model.encode(query),
        "k": k,
        "num_candidates": num_candidates,
    }

    search_query = {
        "knn": knn,
        # Only these stored fields are returned for each hit.
        "_source": ["title", "is_short", "description", "course", "video_id"],
    }

    es_results = es_client.search(index=LOCAL_ES_INDEX_NAME, body=search_query)

    return [hit["_source"] for hit in es_results["hits"]["hits"]]

In [65]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One sequence of booleans per query, where ``True``
            marks a result that matches the expected document.

    Returns:
        Number of queries with at least one ``True`` divided by the number of
        queries, or ``0.0`` when there are no queries at all.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence of booleans per query, ordered by rank,
            where a true value marks a result matching the expected document.

    Returns:
        The average over all queries of ``1 / rank`` of the first relevant
        result (``0`` for a query with none), or ``0.0`` when there are no
        queries at all.
    """
    total = len(relevance_total)
    if total == 0:
        # Nothing was evaluated; avoid dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant position counts. Adding every match
                # would let one query contribute more than 1 when the expected
                # document shows up more than once in the results.
                break

    return total_score / total

def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records, each with an ``'id'`` entry naming
            the document expected for that record.
        search_function: Callable that receives one ground-truth record and
            returns the retrieved documents, each with a ``'video_id'`` entry.

    Returns:
        A dict with the ``'hit_rate'`` and ``'mrr'`` computed over all records.
    """
    per_query_relevance = []

    for entry in tqdm(ground_truth):
        expected_id = entry['id']
        retrieved = search_function(entry)
        # One boolean per retrieved document, in ranking order.
        per_query_relevance.append(
            [hit['video_id'] == expected_id for hit in retrieved]
        )

    metrics = {
        'hit_rate': hit_rate(per_query_relevance),
        'mrr': mrr(per_query_relevance),
    }
    return metrics


In [61]:
# Flatten the video_id -> questions mapping into one record per question,
# tagged with the id of the video that should be retrieved for it.
ground_truth = [
    {"id": video_id, "question": question}
    for video_id, questions in results.items()
    for question in questions
]

## Title + description and Phi3-Mini

In [63]:
# Embed the loaded videos, then hand them to the index builder together with
# the target index name and the Elasticsearch client. What the two helpers do
# is inferred from their names and logged output — see yt_rag.build_index.
embeddings = create_embeddings(videos)
build_index(
    videos,
    embeddings,
    LOCAL_ES_INDEX_NAME,
    es_client=es_client,
)

Starting embedding...


  0%|          | 0/475 [00:00<?, ?it/s]

...embedding done.
Started indexing...


  0%|          | 0/475 [00:00<?, ?it/s]

...indexing done.


In [67]:
# Score kNN retrieval: each ground-truth record is searched by its question text.
evaluate(
    ground_truth,
    lambda record: elastic_search_knn(record['question']),
)

100%|██████████| 2041/2041 [02:12<00:00, 15.46it/s]


{'hit_rate': 0.6947574718275356, 'mrr': 0.5127479915696481}