## Download the data

In [None]:
#!pip install tensorflow
#!pip install tensorflow_hub
#!pip install tensorflow_text
#!pip install keras


import os.path
if not os.path.isfile('data/vmware_ir_content.csv'):
    !pip install kaggle
    !kaggle competitions download -c vmware-zero-shot-information-retrieval
    !mkdir -p data/
    !unzip -o vmware-zero-shot-information-retrieval.zip
    !mv *.csv data/
    
import tensorflow_text
import tensorflow_hub as hub
import tensorflow as tf
# Load the multilingual Universal Sentence Encoder (version 3) from TF Hub.
# Later cells call `use(text)` on strings and read the result through `.numpy()`.
# NOTE(review): `tensorflow_text` is imported above but never referenced by name; presumably
# it is imported for a side effect this model needs - confirm before removing it.
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual/3")

import numpy as np

import pandas as pd
# Queries to rank documents for (read once; the CSV was previously loaded twice).
queries = pd.read_csv('data/test.csv')

# Parsed corpus; this pickle is created with the 'extract_use.py' script, so that script
# must have been run before this cell.
# NOTE(review): unpickling executes arbitrary code - only load a pickle you produced yourself.
corpus = pd.read_pickle('data/vmware_ir_content_parsed.pkl')

## Title - body pair

Intuition:

* titles are like queries
* a (title, body) pair from the same document is a positive example
* the body of the least similar title in an embedding space is a negative example
* Task: use (title_vect, body_vect) pairs to separate positive from negative examples in a supervised setting
* During inference, use the query in place of a title to score each body as going with or not going with the query

In [None]:
def identify_question_in_title(row):
    """Extract a question from the row's 'titleTag', or return None.

    Only the part of the title before the first '|' is considered. When that
    part contains '-', the first '-'-separated segment holding a '?' is the
    candidate; otherwise the whole part is. The stripped candidate is returned
    only if it is longer than 17 characters and ends with '?'.
    """
    title = row['titleTag']
    # Non-string titles and titles without a question mark never yield a question.
    if not isinstance(title, str) or '?' not in title:
        return None

    base_title = title.split('|')[0]
    if '-' in base_title:
        # Falls back to '' when the '?' only appears after the '|'.
        candidate = next((segment for segment in base_title.split('-') if '?' in segment), '')
    else:
        candidate = base_title

    candidate = candidate.strip()
    if len(candidate) > 17 and candidate.endswith('?'):
        return candidate
    return None
    
# Add a 'question' column holding the question found in each row's title (None when there is none).
corpus['question'] = corpus.apply(identify_question_in_title, axis=1)
# Display the corpus with the new column.
corpus

In [None]:
# Keep only the documents whose title yielded a question; each one is a positive
# (question, text) pair. The mask is computed once and reused for every column.
has_question = corpus['question'].notna()
question_docs = corpus[has_question]

# Embed every question with the sentence encoder.
question_use = question_docs['question'].apply(use)

positives = pd.DataFrame()
positives['question_use'] = question_use
# The text embedding comes straight from the corpus; an earlier assignment from an
# undefined `raw_text_use` variable was removed (it raised NameError on a fresh kernel).
positives['raw_text_use'] = question_docs['raw_text_use']
positives['question'] = question_docs['question']
positives['raw_text'] = question_docs['raw_text']


positives['label'] = 1.0

positives

In [None]:
def remove_question_from_text(row):
    """Return the row's 'raw_text' with every occurrence of its 'question' removed."""
    title_question = row['question']
    body = row['raw_text']
    return body.replace(title_question, '')

# Strip the question out of each document's text, then embed the cleaned text.
positives['raw_text_cleaned'] = positives.apply(remove_question_from_text, axis=1)
positives['raw_text_cleaned_use'] = positives['raw_text_cleaned'].apply(use)
positives

In [None]:
negative_rounds = 1
# Base seed for the sampling below, so the same negatives are drawn on every run.
negative_seed = 42

all_negs = []

# Negatives from the positives themselves: each question keeps its own row but receives
# the text embedding of a randomly chosen question-bearing document.
# NOTE(review): a shuffle can map a row onto itself, which would label a true pair as
# negative - consider filtering those rows out.
for round_idx in range(0, negative_rounds):
    negatives = positives.copy()
    negatives['raw_text_cleaned_use'] = (
        negatives.sample(frac=1.0, random_state=negative_seed + round_idx)['raw_text_use'].values
    )
    negatives['label'] = -1.0
    all_negs.append(negatives)

# Negatives from the whole corpus: each question receives the text embedding of a random
# document. The seeds are offset so they never coincide with the first loop's seeds.
for round_idx in range(0, negative_rounds):
    negatives = positives.copy()
    negatives['raw_text_cleaned_use'] = (
        corpus.sample(len(positives),
                      random_state=negative_seed + negative_rounds + round_idx)['raw_text_use'].values
    )
    negatives['label'] = -1.0
    all_negs.append(negatives)

In [None]:
# Stack the negative frames and the positives into one labelled frame
# (negatives first, positives last).
# NOTE(review): every frame is a copy of `positives`, so index labels repeat in the result;
# consider `ignore_index=True` if label-based lookups are ever needed.
training_set = pd.concat(all_negs + [positives])
training_set

## Confirm training set behaves as expected

- All question embeddings should be identical per question
- Raw text embeddings should NOT be identical (the negatives pair the question with other documents' text)

In [None]:
dev_sec_ops = training_set[training_set['question'] == 'What is DevSecOps?']
assert len(dev_sec_ops) > 0, "Expected at least one row for 'What is DevSecOps?'"

# Compare the embeddings value by value as arrays, rather than relying on how pandas
# compares an object column against a tensor.
reference_question_use = np.asarray(dev_sec_ops.iloc[0]['question_use'])
assert all(
    np.array_equal(np.asarray(vector), reference_question_use)
    for vector in dev_sec_ops['question_use']
)
dev_sec_ops

In [None]:
def format_X(training_set):
    """Build one classifier input: the question vector followed by the text vector.

    `training_set` is a single row (the function is applied with axis=1). Its
    'question_use' and 'raw_text_cleaned_use' values must expose `.numpy()`;
    the first element of each resulting array is concatenated, question first.
    """
    text_vector = training_set['raw_text_cleaned_use'].numpy()[0]
    question_vector = training_set['question_use'].numpy()[0]
    return np.concatenate([question_vector, text_vector])

# Build the classifier input for every row: question vector followed by text vector.
training_set['X'] = training_set.apply(format_X, axis=1)
training_set

## Confirm embeddings work as expected

For a given question/title, the question embedding should still occupy the first 512 entries of `X` and be identical across all of that question's rows.

In [None]:
# Re-select the rows now that the 'X' column exists.
dev_sec_ops = training_set[training_set['question'] == 'What is DevSecOps?']
assert len(dev_sec_ops) > 0, "Expected at least one row for 'What is DevSecOps?'"

# Compare the embeddings value by value as arrays, rather than relying on how pandas
# compares an object column against a tensor.
reference_question_use = np.asarray(dev_sec_ops.iloc[0]['question_use'])
assert all(
    np.array_equal(np.asarray(vector), reference_question_use)
    for vector in dev_sec_ops['question_use']
)

def first_of_X(X):
    """Return the leading 512 entries of X (the part that should match across rows).

    Raises AssertionError when X has fewer than 512 entries.
    """
    head = X[:512]
    head_size = head.size
    assert head_size == 512, f"Sliced size is {head_size}"
    return head

should_be_same = dev_sec_ops['X'].apply(first_of_X)
question_slices = should_be_same.values
first_value = question_slices[0]
# Each cell holds an array, so every slice is compared element-wise against the first one.
assert all((first_value == other).all() for other in question_slices)

## Evaluate a model with cross validation

Train a binary classifier with Keras, and use cross validation to evaluate it and find the best model.

In [None]:
!pip install tqdm

In [None]:
import tqdm
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

def build_model():
    """Build and compile the question/text pair classifier.

    The network takes 1024 inputs, has one hidden Dense layer of 64 sigmoid
    units and a single sigmoid output unit. It is compiled with binary
    cross-entropy loss, the Adam optimizer and accuracy as the metric.
    """
    layers = [
        Dense(64, input_dim=1024, activation='sigmoid'),
        Dense(1, activation='sigmoid'),
    ]
    classifier = Sequential(layers)
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier

# A fixed random_state keeps the shuffled fold assignment identical between runs, so
# cross-validation scores can be compared.
kfold = StratifiedKFold(n_splits=8, shuffle=True, random_state=42)
# NOTE(review): `keras.wrappers.scikit_learn` is, as far as I know, absent from recent
# Keras releases - confirm the pinned Keras version still provides it.
estimator = KerasClassifier(build_fn=build_model, epochs=500, batch_size=100, verbose=1)

# Stack the per-row 'X' arrays into a single 2-D feature matrix.
features = np.stack(training_set['X'].to_numpy())

results = cross_val_score(estimator,
                          features,
                          training_set['label'],
                          cv=kfold)

# Per-fold scores, followed by their mean and standard deviation.
print(results, results.mean(), results.std())

In [None]:
# Scratch check: one tenth of the training-set size (integer division).
# NOTE(review): the purpose is not stated - presumably sizing a batch or fold; confirm or delete.
len(training_set)//10

In [None]:
# Fit the classifier on the full training set (all rows, no hold-out).
estimator.fit( np.stack(training_set['X'].to_numpy()),
               training_set['label'])

In [None]:
# Spot check on row 0, which comes from the first negatives frame (label -1.0).
estimator.predict(np.stack([training_set['X'].iloc[0]]))

In [None]:
# Spot check on the fifth row from the end. Positives are concatenated last, so this is a
# positive (label 1.0) - assuming there are at least five positives.
estimator.predict(np.stack([training_set['X'].iloc[-5]]))

In [None]:
# queries['query_use'] = queries['Query'].apply(use)

In [None]:
# Embed one example query; `[0]` takes the first element of the encoder's output array.
query_arr = use("what is a hypervisor?").numpy()[0]

# Work on a copy so the scoring columns added below do not modify `corpus`.
corpus_for_query = corpus.copy()

def format_to_rank(training_set):
    """Concatenate the current query vector with one document's text vector.

    `training_set` is a single row (the function is applied with axis=1) whose
    'raw_text_use' value must expose `.numpy()`. The first element of that
    array is appended to the module-level `query_arr`.
    """
    doc_vector = training_set['raw_text_use'].numpy()[0]
    return np.concatenate([query_arr, doc_vector])


# Build the (query, document) classifier input for every document.
corpus_for_query['use_with_query'] = corpus_for_query.apply(format_to_rank, axis=1)
# Flatten the predictions so the column holds one value per row whatever shape
# `predict` returns.
corpus_for_query['is_match'] = np.ravel(
    estimator.predict(np.stack(corpus_for_query['use_with_query']))
)

# Show up to ten documents the classifier accepted for the query. Only those ten rows
# are converted to dicts, instead of converting every matching row and then slicing.
matches = corpus_for_query[corpus_for_query['is_match'] == 1.0].head(10)
for row in matches.to_dict(orient='records'):
    print('----------')
    # The 'titleTag' key exists on every row, so test for a usable string value and
    # fall back to the start of the text otherwise.
    title = row.get('titleTag')
    if isinstance(title, str):
        print(title)
    else:
        print(row['raw_text'][:100])
    print()
    print()
    print()
    print()

In [None]:
def write_submission():
    """Write the 'QueryId' and 'DocumentId' columns of `submission` to a timestamped CSV.

    The file is written to data/turnbull_submission_<timestamp>.csv without the index,
    where <timestamp> is the current `time()` value with the '.' removed.
    NOTE(review): `submission` is read from module scope and is not defined in the visible
    part of the notebook - confirm it is created before this is called.
    """
    from time import time

    stamp = f"{time()}".replace('.', '')
    fname = f'data/turnbull_submission_{stamp}.csv'
    print("Writing To: ", fname)
    columns = ['QueryId', 'DocumentId']
    submission[columns].to_csv(fname, index=False)

In [None]:
# Preview the two columns that will be written to the submission file.
# NOTE(review): `submission` is not defined in the visible part of the notebook - this cell
# fails on a fresh kernel unless it is created elsewhere; confirm.
submission[['QueryId', 'DocumentId']]

In [None]:
# Write the submission CSV (the file name includes the current timestamp).
write_submission()

In [None]:
# Look up one specific document by its id.
# regex=False matches the id literally (otherwise each '.' matches any character);
# na=False treats missing ids as non-matching so the boolean mask stays valid.
corpus[corpus['id'].str.contains('https---blogs.vmware.com-cloudprovider-2015-11-simplifying-cloud-spending-with-vmware-subscription-purchase-program.txt', regex=False, na=False)]