## Before starting - setup index

```
cd notebooks/elasticsearch/vmware
python extract_use_es.py 0    # base reindex step
python extract_use_es.py 1    # enrich with USE
```

In [1]:
from ltr.client import ElasticClient
client=ElasticClient()

In [3]:
def exact_phrase_title(client, query):
    es = client.es
    body = {
        'size': 5,
        'query': {
            'multi_match': {
                'fields': ['title'],
                'type': 'phrase',
                'query': query
            }
        }
    }
    return es.search(index='vmware', body=body)['hits']['hits']


def exact_phrase_anywhere(client, query):
    es = client.es
    body = {
        'size': 5,
        'query': {
            'multi_match': {
                'fields': ['title^100', 'description^10', 'raw_text'],
                'type': 'phrase',
                'query': query
            }
        }
    }
    return es.search(index='vmware', body=body)['hits']['hits']


def search_bm25(client, query):
    es = client.es
    body = {
        'size': 5,
        'query': {
            'multi_match': {
                'fields': ['title', 'description', 'raw_text'],
                'type': 'cross_fields',
                'query': query,
                'minimum_should_match': '100%'
            }
        }
    }
    return es.search(index='vmware', body=body)['hits']['hits']


## Use exact phrase matches for USE expansion

Here we search Elasticsearch for exact phrase matches. We'll use the USE vectors of these matches later to expand these queries

In [56]:
import pandas as pd

def search_all(client, queries, how=search_bm25):
    """Create a submittable set of search results"""
    all_results = []
    for query in queries.to_dict(orient='records'):
        results = how(client, query['Query'])
        for rank, result in enumerate(results):
            source = result['_source']
            source['rank'] = rank
            source['score'] = result['_score']
            source['DocumentId'] = source['id']
            source['QueryId'] = query['QueryId']
            all_results.append(source)
    all_results = pd.DataFrame(all_results)
    return queries.merge(all_results, how='left', on='QueryId').sort_values(['QueryId', 'rank'])


queries = pd.read_csv("data/test.csv")
full_matches = search_all(client, queries, how=exact_phrase_anywhere)
full_matches

Unnamed: 0,QueryId,Query,viewport,microsites-at-utag,onetrust-data-domain,titleTag,title,twitter:card,description,datePublished,...,DC.Publisher.Address,DC.Contributor,DC.Type,DC.Rights,DC.Date.X-MetadataLastModified,twitter:partner,uberflip:hub_id,application-name,msapplication-starturl,msapplication-TileColor
0,0,what is hypervisor,,,,,,,,,...,,,,,,,,,,
1,1,what is a hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Hypervisor? | VMware Glossary,What is a Hypervisor? | VMware Glossary,summary,"Hypervisor, also known as a virtual machine mo...",11-09-2020,...,,,,,,,,,,
2,2,what is vmware,,,,,What is VMware Cloud,,VMware Cloud™ enables you to manage your entir...,,...,,,,,,,,,,
3,2,what is vmware,"width=device-width, initial-scale=1, minimum-...",,,What is VMware Workstation | FAQ,What is VMware Workstation | FAQ,summary,Get answers and information for questions rela...,,...,,,,,,,,,,
4,2,what is vmware,"width=device-width, initial-scale=1, minimum-...",,,What is VMware Fusion | FAQs,What is VMware Fusion | FAQs,summary,VMware Fusion Support and VMware Fusion Pro FA...,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2819,2346,vsphere cluster members do not match vsan clus...,,,,,vSAN Alert Definitions,,vRealize Operations Manager generates an alert...,,...,,,,,,,,,,
2820,2347,how to create a content library,width=device-width,,b9242434-9feb-47be-a894-3a9e658fdd50,Managing Horizon Gold Images Across Multi-Site...,,summary,,,...,,,,,,,,,,
2821,2348,how to restore vmdk file in esxi,,,,,,,,,...,,,,,,,,,,
2822,2349,how to open vmware workstation,,,,,,,,,...,,,,,,,,,,


In [57]:
first_phrase_match_for_query = full_matches.groupby('QueryId').first()
first_phrase_match_for_query = first_phrase_match_for_query[~first_phrase_match_for_query['title'].isna()]
first_phrase_match_for_query

Unnamed: 0_level_0,Query,viewport,microsites-at-utag,onetrust-data-domain,titleTag,title,twitter:card,description,datePublished,id,...,DC.Publisher.Address,DC.Contributor,DC.Type,DC.Rights,DC.Date.X-MetadataLastModified,twitter:partner,uberflip:hub_id,application-name,msapplication-starturl,msapplication-TileColor
QueryId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,what is a hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Hypervisor? | VMware Glossary,What is a Hypervisor? | VMware Glossary,summary,"Hypervisor, also known as a virtual machine mo...",11-09-2020,https---www.vmware.com-topics-glossary-content...,...,,,,,,,,,,
2,what is vmware,"width=device-width, initial-scale=1, minimum-...",,,What is VMware Workstation | FAQ,What is VMware Cloud,summary,VMware Cloud™ enables you to manage your entir...,,en-2021-VMware-Cloud-services-what-s-new-on-vm...,...,,,,,,,,,,
4,what is virtualization,"width=device-width, initial-scale=1, minimum-...",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is virtualization technology& virtual mac...,What is virtualization technology & virtual ma...,summary,Virtualization is the single most effective wa...,05-11-2021,https---www.vmware.com-solutions-virtualizatio...,...,,,,,,,,,,
5,what is a virtual machine,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Virtual Machine? | VMware Glossary,What Is a Virtual Machine?,summary,"A virtual machine is a software computer that,...",09-18-2020,en-2021-VMware-Fusion-11-com-vmware-fusion-usi...,...,,,,,,,,,,
7,how to use vmware,"width=device-width, initial-scale=1, shrink-to...",,,Become a Threat Hunter | VMware Carbon Black,Using VMware Workstation Pro,summary_large_image,Learn more about threat hunting and how to use...,,https---www.carbonblack.com-resources-become-a...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2248,how to configure syslog,"width=device-width, initial-scale=1",,b9242434-9feb-47be-a894-3a9e658fdd50,How to configure syslog for NSX-V components u...,Configure Firewall for Profiles,summary_large_image,"As an enterprise administrator, you can config...",,https---blogs.vmware.com-customer-experience-a...,...,,,,,,,,,,
2257,what is virtualization security,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is Virtualized Security? | VMware Glossary,What is Virtualized Security? | VMware Glossary,summary,"Virtualized security, or security virtualizati...",03-19-2021,https---www.vmware.com-topics-glossary-content...,...,,,,,,,,,,
2263,what is application modernization,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is Application Modernization? | VMware Gl...,What is Application Modernization? | VMware Gl...,summary,Application modernization is the practice of u...,09-29-2020,https---www.vmware.com-topics-glossary-content...,...,tanzu@vmware.com,"VMware, Inc. or its affiliates",text/html,"Copyright 2020 VMware, Inc. or its affiliates....",2021-01-21,,,,,
2310,what is data center virtualization,"width=device-width, initial-scale=1, minimum-...",,,What is Data Center Virtualization? | Virtual ...,What is Data Center Virtualization? | Virtual ...,summary,Data center virtualization is the process of c...,,https---www.vmware.com-products-datacenter-vir...,...,,,,,,,,,,


In [58]:
def use_phrase_feedback_search(client, query):
    if query in first_phrase_match_for_query['Query'].tolist():
        query_vector = first_phrase_match_for_query[first_for_query['Query'] == query]['raw_text_use'].iloc[0]
        es = client.es
        body = {
          "query": {
              "script_score": {
                "query": {"match_all": {}},
                "script": {
                  "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
                  "params": {"query_vector": query_vector}
                }
              }
          }
        }
        return es.search(index='vmware', body=body)['hits']['hits']
    return []

results = use_phrase_feedback_search(client=client, query='what is application modernization')
for result in results:
    print(result['_source']['titleTag'])

What is Application Modernization? | VMware Glossary
What is App Modernization | VMware Tanzu
What are Cloud Native Applications? | VMware Tanzu
Modernize Existing Apps With Software Supply Chain Solutions | VMware
Why you should consider adding serverless compute capability to existing applications - VMware Customer Experience and Success
Replatforming | VMware Tanzu
Digital Transformation Impact on Enterprise Architecture | Network and Security Virtualization | VMware
Strategic Approaches for a Successful Digital Transformation - Cloud Native Apps Blog
Modernize Applications: The Application Platform - VMware Customer Experience and Success
Multi-Cloud Connectivity and Security Needs of Kubernetes Applications


## Use just BM25 matches for USE expansion

Here we search Elasticsearch for less restrictive BM25 matches. We'll use the USE vectors of these matches later to expand these queries

In [59]:
bm25_results = search_all(client, queries, how=search_bm25)
bm25_results

Unnamed: 0,QueryId,Query,viewport,microsites-at-utag,onetrust-data-domain,titleTag,title,twitter:card,description,datePublished,...,DC.Date.X-MetadataLastModified,msapplication-TileImage,twitter:partner,uberflip:hub_id,application-name,msapplication-starturl,msapplication-TileColor,url,lastModified,md5
0,0,what is hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Bare Metal Hypervisor? | VMware Glos...,What is a Bare Metal Hypervisor? | VMware Glos...,summary,The bare metal hypervisor is the most commonly...,12-21-2020,...,,,,,,,,,,
1,0,what is hypervisor,"width=device-width, initial-scale=1, minimum-...",,,What is a vSphere Hypervisor? | Free Hyperviso...,What is a vSphere Hypervisor? | Free Hyperviso...,summary_large_image,vSphere Hypervisor is a bare-metal hypervisor ...,,...,,,,,,,,,,
2,0,what is hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Hypervisor? | VMware Glossary,What is a Hypervisor? | VMware Glossary,summary,"Hypervisor, also known as a virtual machine mo...",11-09-2020,...,,,,,,,,,,
3,0,what is hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Cloud Hypervisor? | VMware Glossary,What is a Cloud Hypervisor? | VMware Glossary,summary,A Cloud Hypervisor is software that enables th...,02-19-2021,...,,,,,,,,,,
4,0,what is hypervisor,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is Network Functions Virtualization (NFV)...,What is Network Functions Virtualization (NFV)...,summary,Network functions virtualization (NFV) is the ...,09-18-2020,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11415,2350,kubernetes what is a namespace,"width=device-width, initial-scale=1.0",https://tags.tiqcdn.com/utag/vmware/microsites...,b9242434-9feb-47be-a894-3a9e658fdd50,What is a Kubernetes Namespace? | VMware Glossary,What is a Kubernetes Namespace? | VMware Glossary,summary,Namespaces are a way to organize clusters into...,11-18-2020,...,,,,,,,,,,
11416,2350,kubernetes what is a namespace,,,,,Configure a Kubernetes Zone in vRealize Automa...,,Kubernetes zones enable cloud administrators t...,,...,,,,,,,,,,
11417,2350,kubernetes what is a namespace,,,,,Configure a Kubernetes Zone in Cloud Assembly,,Kubernetes zones enable cloud administrators t...,,...,,,,,,,,,,
11418,2350,kubernetes what is a namespace,,,,,Configure a Kubernetes Zone in vRealize Automa...,,Kubernetes zones enable cloud administrators t...,,...,,,,,,,,,,


In [117]:
bm25_results.groupby('QueryId')['raw_text_use'].first()

QueryId
0       [-0.05170373246073723, 0.05167187750339508, 0....
1       [-0.05170373246073723, 0.05167187750339508, 0....
2       [-0.02821541205048561, 0.007544746622443199, 0...
3       [-0.051722653210163116, -0.04737328365445137, ...
4       [-0.05178454890847206, 0.04079513996839523, -0...
                              ...                        
2346    [-0.05430607870221138, -0.05002814531326294, 0...
2347    [-0.04561568796634674, -0.055037450045347214, ...
2348    [-0.06008719280362129, -0.05313912034034729, 0...
2349    [-0.04363827779889107, -0.028596023097634315, ...
2350    [-0.051316555589437485, -0.05038265511393547, ...
Name: raw_text_use, Length: 2351, dtype: object

In [119]:
bm25_results.groupby('QueryId')['raw_text_use'].sum()

QueryId
0       [-0.05170373246073723, 0.05167187750339508, 0....
1       [-0.05170373246073723, 0.05167187750339508, 0....
2       [-0.02821541205048561, 0.007544746622443199, 0...
3       [-0.051722653210163116, -0.04737328365445137, ...
4       [-0.05178454890847206, 0.04079513996839523, -0...
                              ...                        
2346    [-0.05430607870221138, -0.05002814531326294, 0...
2347    [-0.04561568796634674, -0.055037450045347214, ...
2348    [-0.06008719280362129, -0.05313912034034729, 0...
2349    [-0.04363827779889107, -0.028596023097634315, ...
2350    [-0.051316555589437485, -0.05038265511393547, ...
Name: raw_text_use, Length: 2351, dtype: object

In [143]:
import numpy as np

bm25_results = bm25_results.loc[~bm25_results['title'].isna(), :]
bm25_results['raw_text_use_np'] = bm25_results['raw_text_use'].apply(np.array)
bm25_use_expansion = bm25_results.groupby(['Query', 'QueryId'])['raw_text_use_np'].sum().to_frame().reset_index()

In [144]:
bm25_use_expansion

Unnamed: 0,Query,QueryId,raw_text_use_np
0,an error occurred on the server when processin...,360,"[-0.04687787592411041, -0.041850946843624115, ..."
1,an error occurred when vcenter server attempte...,520,"[-0.25700870528817177, 0.12822924554347992, 0...."
2,cdo mode,1314,"[-0.29352957010269165, -0.1958983251824975, 0...."
3,check if a port is in use,942,"[-0.11049962043762207, -0.027149999514222145, ..."
4,check if elasticsearch is installed,550,"[-0.214654590934515, -0.03759882634039968, 0.2..."
...,...,...,...
1539,windows check what ports are in use,1383,"[-0.1717958301305771, 0.12960114888846874, 0.1..."
1540,with server virtualization what happens when a...,563,"[-0.05817187950015068, -0.05154603347182274, 0..."
1541,"you do not have ""read customization specificat...",872,"[-0.2785758636891842, -0.27418680116534233, 0...."
1542,"you do not have the privilege ""resource > assi...",479,"[-0.1518806405365467, 0.058443326503038406, 0...."


In [146]:
def use_bm25_feedback_search(client, query):
    if query in bm25_use_expansion['Query'].tolist():
        query_vector = bm25_use_expansion[bm25_use_expansion['Query'] == query]['raw_text_use_np'].iloc[0].tolist()
        es = client.es
        body = {
          "query": {
              "script_score": {
                "query": {"match_all": {}},
                "script": {
                  "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
                  "params": {"query_vector": query_vector}
                }
              }
          }
        }
        return es.search(index='vmware', body=body)['hits']['hits']
    return []

results = use_bm25_feedback_search(client=client, query='what is application modernization')
for result in results:
    print(result['_source']['titleTag'])

What is Application Modernization? | VMware Glossary
Modernize Existing Apps With Software Supply Chain Solutions | VMware
What is App Modernization | VMware Tanzu
What are Cloud Native Applications? | VMware Tanzu
Application Modernization Solutions | VMware
Build Cloud Native Apps | Understanding Cloud Native Apps | VMware
VMware Tanzu Labs: Improve Agility with App Modernization | VMware Tanzu
Replatforming | VMware Tanzu
Modernize Applications: The Application Platform - VMware Customer Experience and Success
Microservices Meets Micro-segmentation: Delivering Developer-Ready Infrastructure for Modern Application Development - Cloud Native Apps Blog


In [228]:
if 'use' not in queries.columns:

    import tensorflow_text
    import tensorflow_hub as hub
    import tensorflow as tf
    use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    queries['use'] = queries['Query'].apply(use)

def use_direct_search(client, query):
    query_vector = queries.loc[queries['Query'] == query, 'use']
    query_vector = use(query).numpy().tolist()[0]
    es = client.es
    body = {
      "query": {
          "script_score": {
            "query": {"match_all": {}},
            "script": {
              "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
              "params": {"query_vector": query_vector}
            }
          }
      }
    }
    return es.search(index='vmware', body=body)['hits']['hits']

results = use_direct_search(client=client, query='how to uninstall vmware fusion')
#for result in results:
#    print(result['_source']['titleTag'])

In [267]:
from time import perf_counter

def recip_rank_fusion(client, queries, how=[search_bm25, use_bm25_feedback_search, use_direct_search]):
    """Create a submittable set of search results w/ multiple strategies using RRF"""
    all_results = []
    start_time = perf_counter()
    for idx, query in enumerate(queries.to_dict(orient='records')):
        query_results = {}
        for searcher in how:
            results = searcher(client, query['Query'])
            for rank, result in enumerate(results):
                doc_id = result['_id']
                if doc_id not in query_results:
                    source = result['_source']
                    source['score'] = {}
                    source['recip_rank'] = {}
                    source['rrf_score'] = 0.0
                    source['rank'] = {}
                    source['DocumentId'] = doc_id
                    source['QueryId'] = query['QueryId']
                    if 'titleTag' in source:
                        source['title'] = source['titleTag']
                    else:
                        source['title'] = source['raw_text'].split("\n")[0]
                    
                    query_results[doc_id] = source
                searcher_name = searcher.__name__
                query_results[doc_id]['score'][searcher_name] = result['_score']
                query_results[doc_id]['recip_rank'][searcher_name] = 1 / (rank + 1)
                query_results[doc_id]['rrf_score'] += 1 / (rank + 1)
                query_results[doc_id]['rank'][searcher_name] = rank
        
        # sorted_by_rrf = sorted(query_results.values(),  key=lambda val: val['rrf_score'], reverse=True)
        if (idx % 100) == 0:
            print(idx, query['Query'], len(query_results), perf_counter() - start_time)
        all_results.extend(query_results.values())
        
    all_results = pd.DataFrame(all_results)
    return queries.merge(all_results, how='left', on='QueryId').sort_values(['QueryId', 'rrf_score'], ascending=[True, False])

In [268]:
rrf_results = recip_rank_fusion(client, queries)

0 what is hypervisor 21 0.6760410879996925
100 how to create virtual machine in vmware 20 58.8630855399997
200 how to migrate vm from one cluster to another cluster in vmware 15 115.21639400599997
300 how virtualization works 23 174.16022110699942
400 pci passthrough devices cannot be added when nested hardware-assisted virtualization is enabled. 12 235.7956680469997
500 alarma del estado del servicio vmware vapi endpoint 10 292.4947534189996
600 how to create distributed switch in vmware 6.7 23 348.9513741289993
700 what is the role of virtualization technology? 15 405.620856558
800 how to enable evc mode in vmware 6.5 15 463.89047832100005
900 how to migrate hyper v virtual machine to vmware 15 525.1157584969997
1000 what is intent-based networking 20 587.0289793129996
1100 how to download vmware certificate 24 648.9708017590001
1200 what can intelligent hub see 17 710.6756454289998
1300 what is cloud native architecture 15 765.7563220719994
1400 how to upgrade vmware tools on multip

In [269]:
for result in rrf_results[rrf_results['Query'] == 'how to use vmware workstation'].to_dict(orient='record'):
    print(result['title'], result['rrf_score'], result['recip_rank'])

Using VMware Workstation Pro describes how to use VMware Workstation Pro to create, configure, and manage virtual machines. 2.0 {'search_bm25': 1.0, 'use_bm25_feedback_search': 1.0}
Using VMware Workstation Pro describes how to use VMware Workstation Pro to create, configure, and manage virtual machines. 1.0 {'search_bm25': 0.5, 'use_bm25_feedback_search': 0.5}
The most convenient way to use vmware command options is to incorporate them into the command that a Windows shortcut generates. 1.0 {'use_direct_search': 1.0}
Using VMware Workstation Pro describes how to use VMware Workstation Pro to create, configure, and manage virtual machines. 0.6666666666666666 {'search_bm25': 0.3333333333333333, 'use_bm25_feedback_search': 0.3333333333333333}
Using VMware Workstation Pro describes how to use VMware Workstation Pro to create, configure, and manage virtual machines. 0.5 {'search_bm25': 0.25, 'use_bm25_feedback_search': 0.25}
The most convenient way to use vmware command options is to incor





In [207]:
results = search_bm25(client, 'what is vmware')
for result in results:
    print(result['_source']['titleTag'], result['_score'])

What’s New in Horizon 8 (2012) | VMware 12.005382
Feature Friday episode 8 – What is the Tenant App? - VMware Cloud Provider Blog 11.770632
KBTV Webinars - SSL certificate handling in VMware vSphere 6 - VMware Support Insider 11.372724


KeyError: 'titleTag'

In [270]:
def write_submission():
    from time import time
    timestamp = str(time()).replace('.', '')
    fname = f'data/use_feedback_rrf_turnbull_submission_{timestamp}.csv'
    print("Writing To: ", fname)
    rrf_results[['QueryId', 'DocumentId']].to_csv(fname, index=False)
    
write_submission()

Writing To:  data/use_feedback_rrf_turnbull_submission_1653226391886872.csv
