In [1]:
import json

# Load the document collection. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default locale encoding.
dataset_path = "../app/data/dataset.json"
with open(dataset_path, 'rt', encoding='utf-8') as file:
    documents = json.load(file)

In [2]:
# Peek at the first loaded record to see which fields a document carries.
documents[0]

{'id': 'd8917aa5',
 'text': "Hi, everybody and welcome to a new exciting video in the audio signal processing for machine learning series. This time, we'll look into a very important audio feature. In other words, Mal frequency seal coefficient or if we use their acronym MF CCS. But before we get started with this super cool topic, I want to remind you about the sound of the Ice L community. So if you sign up there, you can get feedback, share projects and share ideas with a community of people who are interested in A I audio A I music and audio signal processing. So I really invite you to check this community out and I'll leave you the link and the sign up link to the Slack workspace in the description box below. Now let's move on to the cool stuff. But before we get to M I want just like to remind you about what we did in the previous couple of videos and we focused on male spectrograms. Now male spectrograms are going to be like an important building block to understanding MF CCS. S

In [3]:
from sentence_transformers import SentenceTransformer

In [4]:
# Sentence-embedding checkpoint used to encode every query in this notebook.
# Alternative checkpoint, currently disabled: 'multi-qa-MiniLM-L6-cos-v1'
model_name = "paraphrase-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.73k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance expected at localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 
# Name of the index that the search functions in this notebook query.
ES_INDEX = "youtube-questions"

In [19]:
from tqdm.auto import tqdm

In [20]:
# Ad-hoc sample query and its embedding, handy for manual experiments.
# NOTE(review): the search wrappers defined later build their own local
# question / playlist / v_q values, so these globals appear to be unused by them.
query = "How to extract frequency from audio file?"
playlist = "Audio Signal Processing for ML"

v_q = model.encode(query)

In [21]:
def elastic_search_knn(field, query_vector, playlist, num_results=5):
    """Run a playlist-filtered kNN search against ``ES_INDEX``.

    Args:
        field: Name of the vector field to search. The notebook uses
            "text_vector", "video_vector" and "text_video_vector".
        query_vector: Embedding of the query (the callers pass the output of
            ``model.encode``).
        playlist: Playlist value used in a ``term`` filter on the kNN search.
        num_results: Number of nearest neighbours to return.

    Returns:
        list[dict]: The ``_source`` of each hit, in the order Elasticsearch
        returned them. Empty when the response carries no hits.
    """
    knn = {
        "field": field,
        "query_vector": query_vector,
        "k": num_results,
        # Large candidate pool: favours recall over query speed.
        "num_candidates": 10000,
        "filter": {
            "term": {
                "playlist": playlist
            }
        }
    }

    search_query = {
        "knn": knn,
        # Without an explicit size the search API returns at most its default
        # of 10 hits, which would silently truncate any num_results > 10.
        "size": num_results,
        "_source": ["id", "text", "video", "playlist", "youtube_link"]
    }

    response = es_client.search(index=ES_INDEX, body=search_query)

    # Stay tolerant of a response that lacks the usual hits envelope.
    if 'hits' not in response or 'hits' not in response['hits']:
        return []
    return [hit['_source'] for hit in response['hits']['hits']]

In [22]:
def video_vector_knn(q):
    """Search the ``video_vector`` field for one ground-truth record.

    Args:
        q: Mapping with a 'questions' entry (text to embed) and a 'playlist'
            entry (value the search is filtered by).

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    question_text, playlist_name = q['questions'], q['playlist']
    return elastic_search_knn(
        'video_vector',
        model.encode(question_text),
        playlist_name,
    )

In [23]:
# Load the ground-truth questions. The file is decoded as UTF-8 explicitly so
# the result does not depend on the platform's default locale encoding.
ground_truth_dataset_path = "../app/data/ground_truth_dataset.json"
with open(ground_truth_dataset_path, 'rt', encoding='utf-8') as file:
    ground_truth = json.load(file)

In [24]:
# Show one ground-truth record: a document id, its playlist and a question text.
ground_truth[0]

{'id': 'd8917aa5',
 'playlist': 'Audio Signal Processing for ML',
 'questions': 'What is the main focus of the video in the audio signal processing for machine learning series?'}

In [25]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: One sequence per query; each element equals True when
            the document at that rank is the expected one.

    Returns:
        float: Number of queries with at least one relevant result divided by
        the number of queries, or 0.0 when there are no queries at all.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / total

In [26]:
def mrr(relevance_total):
    """Mean reciprocal rank of the first relevant result per query.

    Args:
        relevance_total: One sequence per query; each element equals True when
            the document at that rank is the expected one.

    Returns:
        float: Average over all queries of ``1 / rank`` of the first relevant
        result (rank starts at 1). A query with no relevant result adds 0.
        Returns 0.0 when there are no queries at all.
    """
    total = len(relevance_total)
    if total == 0:
        # No queries evaluated: report 0.0 instead of dividing by zero.
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            # Equality (not identity) mirrors the `True in line` check in hit_rate.
            if is_relevant == True:  # noqa: E712
                total_score += 1 / rank
                # Only the first relevant hit counts; otherwise a list with
                # several matches could push a query's score above 1.
                break

    return total_score / total

In [28]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground-truth records.

    Args:
        ground_truth: Iterable of records; each needs an 'id' entry plus
            whatever ``search_function`` reads from it.
        search_function: Callable that takes one record and returns a list of
            result documents, each with an 'id' entry.

    Returns:
        dict: ``{'hit_rate': ..., 'mrr': ...}`` computed over all records.
    """
    def relevance_for(record):
        # Read the expected id before searching, then flag every result
        # whose id matches it.
        expected_id = record['id']
        return [doc['id'] == expected_id for doc in search_function(record)]

    relevance_total = [relevance_for(record) for record in tqdm(ground_truth)]

    metrics = {}
    metrics['hit_rate'] = hit_rate(relevance_total)
    metrics['mrr'] = mrr(relevance_total)
    return metrics

In [29]:
# Score retrieval when only the video_vector field is searched.
evaluate(ground_truth, video_vector_knn)

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.01833379007947383, 'mrr': 0.008511921074266897}

In [30]:
def text_vector_knn(q):
    """Search the ``text_vector`` field for one ground-truth record.

    Args:
        q: Mapping with a 'questions' entry (text to embed) and a 'playlist'
            entry (value the search is filtered by).

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    question_text, playlist_name = q['questions'], q['playlist']
    question_embedding = model.encode(question_text)
    return elastic_search_knn('text_vector', question_embedding, playlist_name)

In [31]:
# Score retrieval when only the text_vector field is searched.
evaluate(ground_truth, text_vector_knn)

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.4104960263085777, 'mrr': 0.2556312231661629}

In [32]:
def text_video_vector_knn(q):
    """Search the ``text_video_vector`` field for one ground-truth record.

    Args:
        q: Mapping with a 'questions' entry (text to embed) and a 'playlist'
            entry (value the search is filtered by).

    Returns:
        Whatever ``elastic_search_knn`` returns for the encoded question.
    """
    question_text, playlist_name = q['questions'], q['playlist']
    return elastic_search_knn(
        'text_video_vector',
        model.encode(question_text),
        playlist_name,
    )

# Score retrieval when only the text_video_vector field is searched.
evaluate(ground_truth, text_video_vector_knn)

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.37155385036996436, 'mrr': 0.2239919612679154}

In [33]:
def elastic_search_knn_combined(vector, playlist, num_results=5):
    """Rank a playlist's documents by summed cosine similarity over three fields.

    Every document matching the playlist is scored with a ``script_score``
    query that adds the cosine similarity between ``vector`` and each of
    'video_vector', 'text_vector' and 'text_video_vector'.

    Args:
        vector: Embedding of the query (the caller passes the output of
            ``model.encode``).
        playlist: Playlist value used in the ``term`` query and filter.
        num_results: Maximum number of hits to return. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        list[dict]: The ``_source`` of each hit, in the order Elasticsearch
        returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": [
                    {
                        "script_score": {
                            "query": {
                                "term": {
                                    "playlist": playlist
                                }
                            },
                            "script": {
                                # Each cosine lies in [-1, 1] and script_score
                                # rejects negative scores, so the offset has to
                                # cover all three terms. A constant offset does
                                # not change the ranking.
                                "source": """
                                    cosineSimilarity(params.query_vector, 'video_vector') +
                                    cosineSimilarity(params.query_vector, 'text_vector') +
                                    cosineSimilarity(params.query_vector, 'text_video_vector') +
                                    3
                                """,
                                "params": {
                                    "query_vector": vector
                                }
                            }
                        }
                    }
                ],
                # Repeats the playlist restriction already applied inside
                # script_score; kept so the request shape stays as it was.
                "filter": {
                    "term": {
                        "playlist": playlist
                    }
                }
            }
        },
        "_source": ["id", "text", "video", "playlist", "youtube_link"]
    }

    es_results = es_client.search(
        index=ES_INDEX,
        body=search_query
    )

    return [hit['_source'] for hit in es_results['hits']['hits']]

In [34]:
def vector_combined_knn(q):
    """Run the combined-similarity search for one ground-truth record.

    Args:
        q: Mapping with a 'questions' entry (text to embed) and a 'playlist'
            entry (value the search is restricted to).

    Returns:
        Whatever ``elastic_search_knn_combined`` returns for the encoded question.
    """
    question_text, playlist_name = q['questions'], q['playlist']
    question_embedding = model.encode(question_text)
    return elastic_search_knn_combined(question_embedding, playlist_name)

# Score the variant that sums cosine similarity across all three vector fields.
evaluate(ground_truth, vector_combined_knn)

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.3520416552480132, 'mrr': 0.21325888371242185}

## Conclusion: of the vector-search variants evaluated above, kNN search on `text_vector` performs best (hit rate ≈ 0.41, MRR ≈ 0.26 in the recorded run).
