## Download the data

In [None]:
import os.path
# Fetch the competition data only when the corpus CSV is not already on disk.
if not os.path.isfile('data/vmware_ir_content.csv'):
    # The `!` lines are IPython shell escapes, so this cell only runs inside a notebook.
    # NOTE(review): the kaggle CLI presumably needs API credentials configured — confirm.
    !pip install kaggle
    !kaggle competitions download -c vmware-zero-shot-information-retrieval
    !mkdir -p data/
    # Unpack the archive (overwriting existing files), then move the CSV files
    # in the working directory into data/.
    !unzip -o vmware-zero-shot-information-retrieval.zip
    !mv *.csv data/

## Parse the VMware corpus

- Load the VMware text
- Read metadata into fields

In [None]:
import pandas as pd
# Load the document corpus and the competition queries from the downloaded CSVs.
corpus = pd.read_csv('data/vmware_ir_content.csv')
queries = pd.read_csv('data/test.csv')
# Replace missing values (NaN) in the corpus with empty strings.
corpus = corpus.fillna('')

import json
import ast

# Flatten each corpus row into one document dict: the parsed `raw_meta`
# entries plus the id, file name, text and document group of the row.
parsed_rows = []

# fields that don't index for some reason
bad_fields = ["DC.Date", "DC.Publisher"]

for row in corpus.to_dict(orient='records'):
    # raw_meta is parsed as a Python literal (presumably a dict of metadata
    # fields). The earlier fillna('') turns a missing value into '', which
    # literal_eval rejects with a SyntaxError, so treat that as "no metadata".
    raw_meta = row['raw_meta']
    row_dict = ast.literal_eval(raw_meta) if raw_meta else {}
    row_dict['id'] = row['f_name']
    row_dict['f_name'] = row['f_name']
    row_dict['raw_text'] = row['raw_text']
    row_dict['document_group'] = row['document_group']

    # Drop the problematic metadata fields when present.
    for bad_field in bad_fields:
        row_dict.pop(bad_field, None)

    parsed_rows.append(row_dict)

# Rebuild the corpus frame from the flattened documents and display it.
corpus = pd.DataFrame(parsed_rows)
corpus

## Inspect the parsed corpus

In [None]:
# Debugging aid, left disabled: print the ids of documents whose id contains 'GUID'.
#for row in corpus[corpus['id'].str.contains('GUID')]['id']:
#    print(row)

# Display the parsed corpus DataFrame.
corpus

In [None]:
from ltr.client import ElasticClient
# Client object from the project's ltr package; the search helpers in this
# notebook talk to Elasticsearch through its `.es` attribute.
client=ElasticClient()

from ltr.index import rebuild
# Hand the parsed documents to ltr's rebuild for the 'vmware' index.
# NOTE(review): force=True presumably drops and recreates an existing index —
# confirm against ltr.index.rebuild.
rebuild(client, index='vmware', doc_src=parsed_rows, force=True)

In [None]:
def exact_phrase_title(client, query):
    """Return the top 5 hits whose ``title`` matches ``query`` as a phrase.

    Args:
        client: object exposing an Elasticsearch client as ``client.es``.
        query: the query text to match as a phrase.

    Returns:
        The ``['hits']['hits']`` list of the search response from the
        'vmware' index.
    """
    phrase_clause = {
        'fields': ['title'],
        'type': 'phrase',
        'query': query,
    }
    request = {'size': 5, 'query': {'multi_match': phrase_clause}}
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']


def exact_phrase(client, query):
    """Return the top 5 phrase matches for ``query`` across three fields.

    The phrase is matched against ``title`` (boosted via ``title^100``),
    ``description`` and ``raw_text`` in the 'vmware' index.

    Args:
        client: object exposing an Elasticsearch client as ``client.es``.
        query: the query text to match as a phrase.

    Returns:
        The ``['hits']['hits']`` list of the search response.
    """
    weighted_fields = ['title^100', 'description', 'raw_text']
    request = {
        'size': 5,
        'query': {
            'multi_match': {
                'fields': weighted_fields,
                'type': 'phrase',
                'query': query,
            },
        },
    }
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']

In [None]:
def search(client, query):
    """Return the top 5 ``cross_fields`` matches for ``query``.

    The query is run as a ``multi_match`` over ``title``, ``description``
    and ``raw_text`` in the 'vmware' index.

    Args:
        client: object exposing an Elasticsearch client as ``client.es``.
        query: the query text.

    Returns:
        The ``['hits']['hits']`` list of the search response.
    """
    match_clause = {
        'fields': ['title', 'description', 'raw_text'],
        'type': 'cross_fields',
        'query': query,
    }
    request = {'size': 5, 'query': {'multi_match': match_clause}}
    response = client.es.search(index='vmware', body=request)
    return response['hits']['hits']


def search_all(client, queries, how=search):
    """Run every query through ``how`` and join the hits back onto ``queries``.

    Args:
        client: search client passed through unchanged to ``how``.
        queries: DataFrame with at least ``QueryId`` and ``Query`` columns.
        how: callable ``how(client, query_text)`` returning a list of hits,
            each a dict with ``_source`` (containing ``id``) and ``_score``.

    Returns:
        ``queries`` left-merged with one row per hit (the hit's ``_source``
        fields plus ``rank``, ``score``, ``DocumentId`` and ``QueryId``),
        sorted by ``QueryId`` then ``rank``. Queries without hits keep a
        single row with missing hit columns.
    """
    hit_rows = []
    for query in queries.to_dict(orient='records'):
        print(f"Searching for {query['QueryId']},{query['Query']}")
        results = how(client, query['Query'])
        for rank, result in enumerate(results):
            source = result['_source']
            if rank == 0 and 'titleTag' in source:
                print(source['titleTag'])
            # Build a new dict rather than writing into the hit's _source, so
            # the caller's result objects are left untouched.
            hit_rows.append({
                **source,
                'rank': rank,
                'score': result['_score'],
                'DocumentId': source['id'],
                'QueryId': query['QueryId'],
            })
        print()
        print()

    if hit_rows:
        hits = pd.DataFrame(hit_rows)
    else:
        # No query returned a hit: an empty DataFrame would lack the QueryId
        # and rank columns and the merge/sort below would raise KeyError.
        # Build an empty frame with the join key in the same dtype as queries.
        hits = queries[['QueryId']].iloc[:0].assign(
            rank=pd.Series(dtype='float64'),
            score=pd.Series(dtype='float64'),
            DocumentId=pd.Series(dtype='object'),
        )
    return queries.merge(hits, how='left', on='QueryId').sort_values(['QueryId', 'rank'])
    
#submission = search_all(client, queries)

In [None]:
# Run every competition query through the boosted exact-phrase search.
exact_phrase_matches = search_all(client, queries, how=exact_phrase)

In [None]:
# Show query, titleTag and score for phrase matches that score above 20 and
# have a titleTag value.
high_scoring = exact_phrase_matches['score'] > 20
has_title_tag = exact_phrase_matches['titleTag'].notna()
exact_phrase_matches.loc[high_scoring & has_title_tag, ['Query', 'titleTag', 'score']]

In [None]:
def write_submission(results=None):
    """Write the ``QueryId``/``DocumentId`` columns to a timestamped CSV.

    Args:
        results: DataFrame holding ``QueryId`` and ``DocumentId`` columns.
            When omitted, the notebook-level ``submission`` frame is used,
            which keeps the original zero-argument call working.

    Returns:
        The path of the CSV file that was written, of the form
        ``data/turnbull_submission_<microseconds since epoch>.csv``.
    """
    import os
    from time import time

    if results is None:
        results = submission

    # Integer microseconds give a fixed-width stamp. str(time()) drops
    # trailing zeros, so stripping the '.' from it produced names of varying
    # length that could collide or sort out of order.
    timestamp = str(int(time() * 1_000_000))
    fname = f'data/turnbull_submission_{timestamp}.csv'
    # Make sure the output directory exists before writing into it.
    os.makedirs('data', exist_ok=True)
    print("Writing To: ", fname)
    results[['QueryId', 'DocumentId']].to_csv(fname, index=False)
    return fname

In [None]:
# Display the two columns that go into the submission file.
# NOTE(review): in the visible cells `submission` is only assigned in a
# commented-out line (`#submission = search_all(client, queries)`), so this
# raises NameError unless it is defined elsewhere — confirm.
submission[['QueryId', 'DocumentId']]

In [None]:
# Write the current `submission` frame to a new timestamped CSV under data/.
write_submission()

In [None]:
# Look up one document by its id. regex=False makes this a literal substring
# match; by default str.contains treats the pattern as a regular expression,
# so each '.' in the id would match any character.
corpus[corpus['id'].str.contains('https---blogs.vmware.com-cloudprovider-2015-11-simplifying-cloud-spending-with-vmware-subscription-purchase-program.txt', regex=False)]

## What's different

In [None]:
# Load and display a previously written submission file.
pd.read_csv('data/turnbull_submission_1652544680901428.csv')