In [1]:
import json

dataset_path = "../app/data/dataset.json"
# Decode explicitly as UTF-8 so loading the transcripts does not depend on
# the platform's default locale encoding.
with open(dataset_path, 'rt', encoding='utf-8') as file:
    documents = json.load(file)

In [2]:
# Peek at the first loaded record to check the document schema.
documents[0]

{'id': 'd8917aa5',
 'text': "Hi, everybody and welcome to a new exciting video in the audio signal processing for machine learning series. This time, we'll look into a very important audio feature. In other words, Mal frequency seal coefficient or if we use their acronym MF CCS. But before we get started with this super cool topic, I want to remind you about the sound of the Ice L community. So if you sign up there, you can get feedback, share projects and share ideas with a community of people who are interested in A I audio A I music and audio signal processing. So I really invite you to check this community out and I'll leave you the link and the sign up link to the Slack workspace in the description box below. Now let's move on to the cool stuff. But before we get to M I want just like to remind you about what we did in the previous couple of videos and we focused on male spectrograms. Now male spectrograms are going to be like an important building block to understanding MF CCS. S

In [8]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance expected at http://localhost:9200.
es_client = Elasticsearch('http://localhost:9200') 
# Name of the index that elastic_search() queries.
ES_INDEX = "youtube-questions"

In [9]:
def elastic_search(query, playlist, num_results=5):
    """Run a playlist-restricted keyword search against Elasticsearch.

    Args:
        query: Free-text question matched with a ``multi_match`` query over
            the ``text`` (boost 4), ``video`` (boost 2), ``playlist`` and
            ``youtube_link`` fields.
        playlist: Value used in a ``term`` filter on the ``playlist`` field.
        num_results: Maximum number of hits requested (the ``size`` of the
            search request).

    Returns:
        A list with the ``_source`` of every hit, in the order returned by
        Elasticsearch.
    """
    text_match = {
        "multi_match": {
            "query": query,
            "fields": ["text^4", "video^2", "playlist", "youtube_link"],
            "type": "best_fields"
        }
    }
    playlist_filter = {"term": {"playlist": playlist}}

    request_body = {
        "size": num_results,
        "query": {"bool": {"must": text_match, "filter": playlist_filter}}
    }

    response = es_client.search(index=ES_INDEX, body=request_body)

    # Keep only the stored documents; scores and hit metadata are dropped.
    return [hit['_source'] for hit in response['hits']['hits']]

In [10]:
# Try the Elasticsearch retriever on one sample question before evaluating it.
elastic_search(
    query = "How to extract frequency from audio file?",
    playlist = "Audio Signal Processing for ML"
)

[{'id': '99b3dbde',
  'text': "Hi, everybody and welcome to a new exciting video in the audio signal processing for machine learning series. Last time we looked at mad frequency CEPT coefficients from a theoretical standpoint. In this video, I want to show you how you can extract MF CCS using Python and Libros. So let's get started with this Jupiter notebook. So the first thing I want to do is just import a bunch of libraries that we'll be using. So I'll import Li Brosa Libros dot display, ipython dot display, uh Pylot and then pie. So let's do that. Next thing we want to load uh an audio file. So which file are we gonna load? So it's a file that we, you should be familiar with by. Now it's a, a short passage of a piece from clothes, the Bey. So we're talking about classical music here.",
  'video': 'Extracting Mel-Frequency Cepstral Coefficients with Python',
  'playlist': 'Audio Signal Processing for ML',
  'youtube_video_id': 'WJI-17MNpdE',
  'youtube_link': 'https://www.youtube.com

In [11]:
ground_truth_dataset_path = "../app/data/ground_truth_dataset.json"
# Decode explicitly as UTF-8 so loading the questions does not depend on
# the platform's default locale encoding.
with open(ground_truth_dataset_path, 'rt', encoding='utf-8') as file:
    ground_truth = json.load(file)

In [12]:
# Peek at the first ground-truth record (keys: id, playlist, questions).
ground_truth[0]

{'id': 'd8917aa5',
 'playlist': 'Audio Signal Processing for ML',
 'questions': 'What is the main focus of the video in the audio signal processing for machine learning series?'}

In [17]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: One sequence of relevance flags per query, where
            position ``i`` tells whether the result at rank ``i + 1`` is the
            expected document. Flags are interpreted by truthiness.

    Returns:
        A float in ``[0, 1]``. ``0.0`` is returned for an empty
        ``relevance_total`` instead of raising ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if any(line))

    return hits / total_queries

In [18]:
def mrr(relevance_total):
    """Return the Mean Reciprocal Rank over all queries.

    For each query only the first relevant result contributes, with score
    ``1 / rank`` (rank is 1-based). A query with no relevant result
    contributes 0.

    Args:
        relevance_total: One sequence of relevance flags per query, where
            position ``i`` tells whether the result at rank ``i + 1`` is
            relevant. Flags are interpreted by truthiness.

    Returns:
        A float in ``[0, 1]``. ``0.0`` is returned for an empty
        ``relevance_total`` instead of raising ``ZeroDivisionError``.
    """
    total_queries = len(relevance_total)
    if total_queries == 0:
        return 0.0

    total_score = 0.0

    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several relevant results could score above 1.
                break

    return total_score / total_queries

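- Hit rate (recall@k): the fraction of queries for which the source document appears among the returned results
- Mean Reciprocal Rank (MRR): the average over all queries of 1/rank of the source document (0 when it is not returned)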

In [21]:
import minsearch

# Build a minsearch index over the loaded documents.
# NOTE(review): presumably text_fields are the free-text fields scored against
# the query and keyword_fields are exact-match fields usable in filters
# (minsearch_search filters on "playlist") — confirm against the minsearch docs.
index = minsearch.Index(
    text_fields=["text", "video"],
    keyword_fields=["playlist", "id"]
)

# Last expression of the cell, so whatever fit() returns is displayed below.
index.fit(documents)

<minsearch.minsearch.Index at 0x75919a49c6e0>

In [22]:
def minsearch_search(query, playlist, num_results=5):
    """Run a playlist-restricted search against the minsearch index.

    Args:
        query: Free-text question to search for.
        playlist: Value passed in ``filter_dict`` for the ``playlist`` field.
        num_results: Maximum number of results requested. Defaults to 5,
            the value that used to be hard-coded, and mirrors the
            ``num_results`` parameter of ``elastic_search``.

    Returns:
        Whatever ``index.search`` returns for the request; ``evaluate``
        iterates over it and reads ``'id'`` from each item.
    """
    # Weight matches in the transcript text above matches in the video title.
    boost = {'text': 3.0, 'video': 0.5}

    results = index.search(
        query=query,
        filter_dict={'playlist': playlist},
        boost_dict=boost,
        num_results=num_results
    )

    return results

In [25]:
from tqdm.auto import tqdm

def evaluate(ground_truth, search_function):
    """Score a retrieval function against the ground-truth questions.

    Args:
        ground_truth: Iterable of records; each must have an ``'id'`` key
            holding the id of the document the question was derived from.
        search_function: Callable that takes one ground-truth record and
            returns an iterable of result dicts, each with an ``'id'`` key.

    Returns:
        A dict with the ``'hit_rate'`` and ``'mrr'`` computed over the
        per-query relevance flags.
    """
    relevance_total = []

    for record in tqdm(ground_truth):
        expected_id = record['id']
        retrieved = search_function(record)
        # One flag per returned result: is it the document we expected?
        relevance_total.append([doc['id'] == expected_id for doc in retrieved])

    return dict(
        hit_rate=hit_rate(relevance_total),
        mrr=mrr(relevance_total),
    )

In [26]:
# Score the Elasticsearch retriever on every ground-truth question.
evaluate(ground_truth, lambda q: elastic_search(q['questions'], q['playlist']))

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.5807618525623458, 'mrr': 0.3475504704485478}

In [27]:
# Score the minsearch retriever on the same ground truth for comparison.
evaluate(ground_truth, lambda q: minsearch_search(q['questions'], q['playlist']))

  0%|          | 0/36490 [00:00<?, ?it/s]

{'hit_rate': 0.4871197588380378, 'mrr': 0.29026034530009026}

## Conclusion: Elasticsearch is the better keyword-search backend here (hit rate 0.58 vs 0.49, MRR 0.35 vs 0.29 for minsearch)