In [3]:
import json

with open('../data_gloss/docs-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [5]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
  "settings": {
    "analysis": {
      "analyzer": {
        "english_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [
            "lowercase",
            "stop",
            "porter_stem"
          ]
        }
      }
    }
  },
  "mappings": {
    "properties": {
      "title": { "type": "text", "analyzer": "english_analyzer" },
      "source_file": { "type": "keyword" },
      "text": { "type": "text", "analyzer": "english_analyzer" }
    }
  }
}

index_name = "k8s_search"

es_client.indices.create(index=index_name, body=index_settings)

  es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'k8s_search'})

In [6]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 574.35it/s]


In [7]:
def elastic_search(query):
    search_query = {
        "bool": {
            "must": [
                {
                    "multi_match": {
                        "query": query,
                        "fields": ["text^3", "title", "source_file"],
                        "type": "best_fields"
                    }
                }
            ]
        }
    }
    
    response = es_client.search(index=index_name, query=search_query, size=5)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [8]:
elastic_search(
    query="How do I get logs for kubernetes pod?",
)

[{'title': 'kubectl exec',
  'text': 'Execute a command in a container.\n\n```\nkubectl exec (POD | TYPE/NAME) [-c CONTAINER] [flags] -- COMMAND [args...]\n```\n\n```\n  # Get output from running the \'date\' command from pod mypod, using the first container by default\n  kubectl exec mypod -- date\n  \n  # Get output from running the \'date\' command in ruby-container from pod mypod\n  kubectl exec mypod -c ruby-container -- date\n  \n  # Switch to raw terminal mode; sends stdin to \'bash\' in ruby-container from pod mypod\n  # and sends stdout/stderr from \'bash\' back to the client\n  kubectl exec mypod -c ruby-container -i -t -- bash -il\n  \n  # List contents of /usr from the first container of pod mypod and sort by modification time\n  # If the command you want to execute in the pod has any flags in common (e.g. -i),\n  # you must use two dashes (--) to separate your command\'s flags/arguments\n  # Also note, do not surround your command and its flags/arguments with quotes\n  # u

In [9]:
import pandas as pd

In [12]:
df_ground_truth = pd.read_csv('../data_gloss/ground-truth-data.csv')

In [13]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [14]:
ground_truth

[{'question': 'What are labels used for in Kubernetes?',
  'title': 'Label',
  'document': 'b021df6a'},
 {'question': 'How are labels structured in Kubernetes?',
  'title': 'Label',
  'document': 'b021df6a'},
 {'question': 'What is the role of key/value pairs in labeling objects?',
  'title': 'Label',
  'document': 'b021df6a'},
 {'question': 'Why might users find labels meaningful and relevant?',
  'title': 'Label',
  'document': 'b021df6a'},
 {'question': 'Which Kubernetes objects can have labels attached to them?',
  'title': 'Label',
  'document': 'b021df6a'},
 {'question': 'What is the purpose of a check that runs periodically on a container in a pod?',
  'title': 'Probe',
  'document': '76d16796'},
 {'question': 'How does the check influence the lifecycle of a container?',
  'title': 'Probe',
  'document': '76d16796'},
 {'question': 'Where can you find more detailed information about container probes?',
  'title': 'Probe',
  'document': '76d16796'},
 {'question': 'What is the prim

In [15]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 944/944 [00:02<00:00, 377.82it/s]


In [16]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [17]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

### hit-rate (recall)

In [19]:
hit_rate(relevance_total)

0.9375

### Mean Reciprocal Rank (mrr)

In [20]:
mrr(relevance_total)

0.8642655367231642

In [21]:
import minsearch

index = minsearch.Index(
    text_fields=["title", "text"],
    keyword_fields=["source_file"]
)

In [22]:
index.fit(documents)

<minsearch.minsearch.Index at 0x12c1e41f0>

In [26]:
def minsearch_search(query):
    boost = {'title': 3.0, 'source_file': 0.5}

    results = index.search(
        query=query,
        boost_dict=boost,
        num_results=5
    )

    return results

In [27]:
relevance_total_ms = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total_ms.append(relevance)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 944/944 [00:01<00:00, 921.43it/s]


In [29]:
hit_rate(relevance_total_ms), mrr(relevance_total_ms)

(0.7764830508474576, 0.590925141242937)

In [30]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }


In [31]:
evaluate(ground_truth, lambda q: elastic_search(q['question']))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 944/944 [00:02<00:00, 445.67it/s]


{'hit_rate': 0.9375, 'mrr': 0.8642655367231642}

In [32]:
evaluate(ground_truth, lambda q: minsearch_search(q['question']))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 944/944 [00:01<00:00, 924.85it/s]


{'hit_rate': 0.7764830508474576, 'mrr': 0.590925141242937}