## Before starting - setup index

```
cd notebooks/elasticsearch/vmware
python extract_use_es.py 0    # base reindex step
python extract_use_es.py 1    # enrich with USE
```

In [None]:
from ltr.client import ElasticClient
# Shared client for the whole notebook; the search helpers below reach the
# underlying Elasticsearch connection through `client.es`.
client=ElasticClient()

In [None]:
def exact_phrase_title(client, query):
    """Return up to 5 hits whose `title` field matches `query` as a phrase.

    Issues a `multi_match` query of type `phrase` against the `vmware`
    index through `client.es` and returns the raw hit dicts of the response.
    """
    phrase_clause = {
        'fields': ['title'],
        'type': 'phrase',
        'query': query,
    }
    request = {'size': 5, 'query': {'multi_match': phrase_clause}}
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']


def exact_phrase_anywhere(client, query):
    """Return up to 5 hits matching `query` as a phrase in any text field.

    The phrase is searched in `title`, `description` and `raw_text`, with
    boosts of 100, 10 and 1 respectively, against the `vmware` index.
    """
    boosted_fields = ['title^100', 'description^10', 'raw_text']
    request = {
        'size': 5,
        'query': {
            'multi_match': {
                'fields': boosted_fields,
                'type': 'phrase',
                'query': query,
            },
        },
    }
    hits_envelope = client.es.search(index='vmware', body=request)['hits']
    return hits_envelope['hits']


def search_bm25(client, query):
    """Return up to 5 hits for `query` using a cross-field text match.

    All query terms must match (`minimum_should_match` of 100%), but they
    may be spread over `title`, `description` and `raw_text` because the
    `multi_match` type is `cross_fields`.
    """
    cross_field_clause = {
        'fields': ['title', 'description', 'raw_text'],
        'type': 'cross_fields',
        'query': query,
        'minimum_should_match': '100%',
    }
    request = {'size': 5, 'query': {'multi_match': cross_field_clause}}
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']


## Use exact phrase matches for USE expansion

Here we search Elasticsearch for exact phrase matches. Later, we'll use the USE (Universal Sentence Encoder) vectors of these matches to expand the queries.

In [None]:
import pandas as pd

def search_all(client, queries, how=search_bm25):
    """Run every query through one searcher and return a flat result frame.

    `queries` is a DataFrame with `Query` and `QueryId` columns. Each hit's
    `_source` is annotated in place with its 0-based `rank`, its `score`,
    a `DocumentId` (copied from the source's `id`) and the `QueryId`.
    The hits are left-merged onto `queries`, so queries without hits remain
    in the output, and the result is sorted by `QueryId` then `rank`.
    """
    rows = []
    for record in queries.to_dict(orient='records'):
        hits = how(client, record['Query'])
        for position, hit in enumerate(hits):
            doc = hit['_source']
            doc['rank'] = position
            doc['score'] = hit['_score']
            doc['DocumentId'] = doc['id']
            doc['QueryId'] = record['QueryId']
            rows.append(doc)
    hits_frame = pd.DataFrame(rows)
    merged = queries.merge(hits_frame, how='left', on='QueryId')
    return merged.sort_values(['QueryId', 'rank'])


# Load the test queries; search_all reads their `Query` and `QueryId` columns.
queries = pd.read_csv("data/test.csv")
# Up to 5 exact-phrase hits per query. Queries without any hit still appear
# (with NaN document fields) because search_all left-merges onto `queries`.
full_matches = search_all(client, queries, how=exact_phrase_anywhere)
full_matches

In [None]:
# Collapse to one row per query. search_all returns rows sorted by
# (QueryId, rank), so this is normally the rank-0 hit.
# NOTE(review): GroupBy.first() takes the first non-null value per column, so
# a null field in the top hit would be filled from a lower-ranked hit —
# confirm that is acceptable.
first_phrase_match_for_query = full_matches.groupby('QueryId').first()
# Drop queries that had no phrase match at all (title is NaN after the left merge).
first_phrase_match_for_query = first_phrase_match_for_query[~first_phrase_match_for_query['title'].isna()]
first_phrase_match_for_query

In [None]:
def use_phrase_feedback_search(client, query):
    """Search by similarity to the USE vector of the query's top phrase match.

    Looks `query` up in the module-level `first_phrase_match_for_query`
    frame and, when a row exists, scores every document in the `vmware`
    index by `cosineSimilarity(...) + 1.0` against that row's
    `raw_text_use` vector.

    Args:
        client: object exposing the Elasticsearch connection as `client.es`.
        query: query text, compared for equality with the `Query` column.

    Returns:
        The list of raw hit dicts, or an empty list when the query has no
        phrase match to take a vector from.
    """
    # Filter once and reuse the result for both the existence check and the
    # vector lookup.
    matches = first_phrase_match_for_query[first_phrase_match_for_query['Query'] == query]
    if matches.empty:
        return []

    query_vector = matches['raw_text_use'].iloc[0]
    body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    # + 1.0 shifts the cosine range so the score is never negative.
                    "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            },
        },
    }
    return client.es.search(index='vmware', body=body)['hits']['hits']

# Smoke test: print the page titles retrieved for one query.
# NOTE(review): `titleTag` is indexed directly, while recip_rank_fusion guards
# against hits without that field — this loop may raise KeyError; confirm.
results = use_phrase_feedback_search(client=client, query='what is application modernization')
for result in results:
    print(result['_source']['titleTag'])

## Use just BM25 matches for USE expansion

Here we search Elasticsearch for less restrictive BM25 matches. Later, we'll use the USE (Universal Sentence Encoder) vectors of these matches to expand the queries.

In [None]:
# Same collection step as for the phrase matches, but using the BM25 searcher.
bm25_results = search_all(client, queries, how=search_bm25)
bm25_results

In [None]:
# Peek at one stored USE vector per query (first non-null value in each group).
bm25_results.groupby('QueryId')['raw_text_use'].first()

In [None]:
# NOTE(review): presumably `raw_text_use` holds plain Python lists here, in
# which case this sum would concatenate them rather than add element-wise;
# the next cell converts to NumPy arrays before summing — TODO confirm.
bm25_results.groupby('QueryId')['raw_text_use'].sum()

In [None]:
import numpy as np

# Keep only rows that matched a document (search_all's left merge leaves NaN
# titles for queries without hits). Take an explicit copy so the column
# assignment below writes to an independent frame instead of a slice of the
# original, which would trigger SettingWithCopyWarning.
bm25_results = bm25_results.loc[~bm25_results['title'].isna(), :].copy()
# Convert each stored vector to a NumPy array before aggregating.
bm25_results['raw_text_use_np'] = bm25_results['raw_text_use'].apply(np.array)
# One aggregated vector per query: the sum of the vectors of its BM25 hits.
bm25_use_expansion = bm25_results.groupby(['Query', 'QueryId'])['raw_text_use_np'].sum().to_frame().reset_index()

In [None]:
# Display the per-query expansion vectors (columns Query, QueryId, raw_text_use_np).
bm25_use_expansion

In [None]:
def use_bm25_feedback_search(client, query):
    """Search by similarity to the summed USE vector of the query's BM25 hits.

    The vector is read from the module-level `bm25_use_expansion` frame
    (column `raw_text_use_np`). Every document in the `vmware` index is
    scored with `cosineSimilarity(...) + 1.0` against it; no `size` is set
    in the request. Returns the raw hit dicts, or an empty list when the
    query has no entry in the frame.
    """
    known_queries = bm25_use_expansion['Query'].tolist()
    if query not in known_queries:
        return []

    row_mask = bm25_use_expansion['Query'] == query
    summed_vector = bm25_use_expansion[row_mask]['raw_text_use_np'].iloc[0]
    script = {
        "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
        # Converted with .tolist() before being placed in the request body.
        "params": {"query_vector": summed_vector.tolist()},
    }
    request = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": script,
            },
        },
    }
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']

# Smoke test: print the page titles retrieved via BM25-feedback expansion.
# NOTE(review): `titleTag` is indexed directly, while recip_rank_fusion guards
# against hits without that field — this loop may raise KeyError; confirm.
results = use_bm25_feedback_search(client=client, query='what is application modernization')
for result in results:
    print(result['_source']['titleTag'])

In [None]:
# Only load the encoder and encode the queries when the `use` column is not
# there yet, so re-running the cell is cheap. Note that the name `use` is bound
# only inside this branch, and use_direct_search depends on it.
if 'use' not in queries.columns:

    # NOTE(review): tensorflow_text is never referenced by name; presumably it
    # is imported for its side effects, which the multilingual model needs —
    # confirm.
    import tensorflow_text
    import tensorflow_hub as hub
    import tensorflow as tf
    use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

    # Encodes one query string per call and stores the encoder output per row.
    queries['use'] = queries['Query'].apply(use)

def use_direct_search(client, query):
    """Search by similarity to the USE encoding of the query text itself.

    Encodes `query` with the module-level `use` model and scores every
    document in the `vmware` index by `cosineSimilarity(...) + 1.0` against
    that vector.

    Args:
        client: object exposing the Elasticsearch connection as `client.es`.
        query: query text to encode.

    Returns:
        The list of raw hit dicts from the response.
    """
    # The encoder output is converted to nested lists and its first row is
    # used as the query vector. (The previous lookup of a precomputed vector
    # in `queries` was immediately overwritten and has been dropped.)
    query_vector = use(query).numpy().tolist()[0]
    body = {
        "query": {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'raw_text_use') + 1.0",
                    "params": {"query_vector": query_vector},
                },
            },
        },
    }
    return client.es.search(index='vmware', body=body)['hits']['hits']

# Run one direct-USE search; printing of the titles is left disabled.
results = use_direct_search(client=client, query='how to uninstall vmware fusion')
#for result in results:
#    print(result['_source']['titleTag'])

In [None]:
from time import perf_counter

def recip_rank_fusion(client, queries, how=None):
    """Create a submittable set of search results w/ multiple strategies using RRF.

    Every query is run through each searcher in `how`. A document's fused
    score is the sum, over the searchers that returned it, of
    `1 / (rank + 1)` with a 0-based rank (no additional smoothing constant).

    Args:
        client: passed through unchanged to every searcher.
        queries: DataFrame with `Query` and `QueryId` columns.
        how: iterable of `searcher(client, query_text)` callables returning
            Elasticsearch hit dicts. Defaults to `search_bm25`,
            `use_bm25_feedback_search` and `use_direct_search`; the default
            is resolved at call time rather than stored as a shared list.

    Returns:
        `queries` left-merged with one row per (query, document), sorted by
        `QueryId` ascending and `rrf_score` descending. The `score`,
        `recip_rank` and `rank` columns hold dicts keyed by searcher name.
    """
    if how is None:
        how = (search_bm25, use_bm25_feedback_search, use_direct_search)
    # Materialise once so a one-shot iterable is still usable for every query.
    searchers = tuple(how)

    all_results = []
    start_time = perf_counter()
    for idx, query in enumerate(queries.to_dict(orient='records')):
        query_results = {}
        for searcher in searchers:
            searcher_name = searcher.__name__
            for rank, result in enumerate(searcher(client, query['Query'])):
                doc_id = result['_id']
                if doc_id not in query_results:
                    # First time this document shows up for the query: start
                    # its fused record from the hit's own source.
                    source = result['_source']
                    source['score'] = {}
                    source['recip_rank'] = {}
                    source['rrf_score'] = 0.0
                    source['rank'] = {}
                    source['DocumentId'] = doc_id
                    source['QueryId'] = query['QueryId']
                    if 'titleTag' in source:
                        source['title'] = source['titleTag']
                    else:
                        # No title tag: fall back to the first line of the text.
                        source['title'] = source['raw_text'].split("\n")[0]
                    query_results[doc_id] = source

                fused = query_results[doc_id]
                recip_rank = 1 / (rank + 1)
                fused['score'][searcher_name] = result['_score']
                fused['recip_rank'][searcher_name] = recip_rank
                fused['rrf_score'] += recip_rank
                fused['rank'][searcher_name] = rank

        # Progress report every 100 queries.
        if (idx % 100) == 0:
            print(idx, query['Query'], len(query_results), perf_counter() - start_time)
        all_results.extend(query_results.values())

    all_results = pd.DataFrame(all_results)
    return queries.merge(all_results, how='left', on='QueryId').sort_values(['QueryId', 'rrf_score'], ascending=[True, False])

In [None]:
# Fuse the default searchers for every test query; recip_rank_fusion prints a
# progress line every 100 queries.
rrf_results = recip_rank_fusion(client, queries)

In [None]:
# Inspect the fused ranking for a single query. `orient` is spelled out as
# 'records' (as in the rest of this notebook); the abbreviated 'record' is not
# accepted by current pandas releases.
for result in rrf_results[rrf_results['Query'] == 'how to use vmware workstation'].to_dict(orient='records'):
    print(result['title'], result['rrf_score'], result['recip_rank'])

In [None]:
# Smoke test: print title and score of the BM25 hits for one query.
# NOTE(review): `titleTag` is indexed directly, while recip_rank_fusion guards
# against hits without that field — this loop may raise KeyError; confirm.
results = search_bm25(client, 'what is vmware')
for result in results:
    print(result['_source']['titleTag'], result['_score'])

In [None]:
def write_submission():
    """Dump the `QueryId`/`DocumentId` pairs of `rrf_results` to a CSV file.

    The file is written under `data/`, with the current epoch time (decimal
    point removed) embedded in its name, and the path is printed first.
    Reads the module-level `rrf_results` frame.
    """
    from time import time

    # Dropping the '.' turns the float timestamp into a plain digit string.
    timestamp = ''.join(str(time()).split('.'))
    fname = f'data/use_feedback_rrf_turnbull_submission_{timestamp}.csv'
    print("Writing To: ", fname)
    submission = rrf_results[['QueryId', 'DocumentId']]
    submission.to_csv(fname, index=False)
    
# Write the submission file for the current `rrf_results`.
write_submission()