In [1]:
import os

# Permit more than one OpenMP runtime to be loaded into this process.
# The flag is set BEFORE `import dsp` so that it is already in the
# environment if importing dsp (or anything it pulls in) initialises the
# conflicting runtimes; setting it afterwards can be too late.
# NOTE(review): presumably needed because dsp's native dependencies each
# bundle their own OpenMP copy — confirm on the target platform.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'

import dsp

In [2]:
# Raw training data: one (question, list of answer strings) tuple per entry.
raw_train_pairs = [
    ('Who produced the album that included a re-recording of "Lithium"?',
     ['Butch Vig']),
    ('Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?',
     ['Kevin Greutert']),
    ('The heir to the Du Pont family fortune sponsored what wrestling team?',
     ['Foxcatcher', 'Team Foxcatcher', 'Foxcatcher Team']),
    ('In what year was the star of To Hell and Back born?',
     ['1925']),
    ('Which award did the first book of Gary Zukav receive?',
     ['U.S. National Book Award', 'National Book Award']),
    ('What city was the victim of Joseph Druces working in?',
     ['Boston, Massachusetts', 'Boston']),
]

# Wrap each tuple in a dsp.Example carrying `question` and `answer` fields.
train = [dsp.Example(question=q, answer=a) for q, a in raw_train_pairs]

In [3]:
import numpy as np

EMB_DIM = 768
np.random.seed(42)  # fixed seed so the neighbour order shown below is reproducible

# Three random vectors act as the "near" group.
three_closest_embeddings = np.random.randn(3, EMB_DIM)
# The "far" group is the same three vectors with +10 added to every component,
# which pushes them well away from the near group.
three_furthest_embeddings = three_closest_embeddings + 10
# Rows 0-2 are the near group, rows 3-5 the far group.
train_embeddings = np.vstack((three_closest_embeddings, three_furthest_embeddings))

# Attach one embedding row to each training example, in order.
for example, embedding in zip(train, train_embeddings):
    example.vectorized = embedding

In [4]:
# A single dev question, in the same (question, answers) layout as the train data.
dev_pairs = [
    ('Who has a broader scope of profession: E. L. Doctorow or Julia Peterkin?',
     ['E. L. Doctorow', 'E.L. Doctorow', 'Doctorow']),
]
dev = [dsp.Example(question=q, answer=a) for q, a in dev_pairs]

# Give the dev example exactly the embedding of train[1], so train[1] is
# guaranteed to be its closest neighbour.
dev[0].vectorized = train_embeddings[1]

In [5]:
# Build the nearest-neighbour lookup over `train`; the result is called below
# as knn_func(example, k).
# NOTE(review): no `vectorize` callable is passed here, so this presumably
# relies on the `.vectorized` attribute assigned to each example above —
# confirm against dsp.knn.
knn_func = dsp.knn(train)

In [6]:
# Ask for the 3 nearest training examples to the dev query.
knn_res_dev = knn_func(dev[0], 3)
# Show each neighbour's position in `train`: the three "near" examples are
# expected, with index 1 first because the dev query reuses its embedding.
list(map(train.index, knn_res_dev))

[1, 0, 2]

In [7]:
# Query with the second-to-last training example, which belongs to the "far" group.
knn_res_train = knn_func(train[-2], 3)
# Positions in `train` of the neighbours; the far group (3, 4, 5) is expected,
# starting with the query itself.
list(map(train.index, knn_res_train))

[4, 3, 5]

### SentenceTransformer-based knn

In [8]:
# To double-check the retrieval logic, the base questions are extended with
# extra ones: some related to a base question and therefore unrelated to the rest.

# The same six questions used in the first part of the notebook.
base_questions = [
    ('Who produced the album that included a re-recording of "Lithium"?', ['Butch Vig']),
    ('Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?', ['Kevin Greutert']),
    ('The heir to the Du Pont family fortune sponsored what wrestling team?', ['Foxcatcher', 'Team Foxcatcher', 'Foxcatcher Team']),
    ('In what year was the star of To Hell and Back born?', ['1925']),
    ('Which award did the first book of Gary Zukav receive?', ['U.S. National Book Award', 'National Book Award']),
    ('What city was the victim of Joseph Druces working in?', ['Boston, Massachusetts', 'Boston']),
]

# Two questions about Butch Vig / Smart Studios and two about Gary Zukav.
extra_questions = [
    ('Who founded Smart Studios in Madison, while still performing drums in Spooner at night and driving a taxi cab during the day?', ['Butch Vig']),
    ('There were two founders of recording studio called "Smart Studios". The first one was Butch Vig. Who was the second founder?', ['Steve Marker']),
    ('What is the first book of Gary Zukav?', ['"The Dancing Wu Li Masters"', "'The Dancing Wu Li Masters'", "The Dancing Wu Li Masters"]),
    ('When did Zukav returned to the United States?', ['1970']),
]

# Rebuild `train` as dsp.Example objects: base questions first, extras after.
train = [
    dsp.Example(question=q, answer=a)
    for q, a in base_questions + extra_questions
]

In [9]:
# Obtain dsp's sentence-transformers vectorize function and build a new
# nearest-neighbour lookup over the extended `train` list with it.
# This rebinds `knn_func`, replacing the lookup created earlier without a
# `vectorize` argument; every cell below uses this version.
# NOTE(review): presumably the examples are embedded from their text here,
# since no `.vectorized` attribute is set on the new `train` — confirm.
transformer_vectorize = dsp.get_sentence_transformers_vectorize_func()
knn_func = dsp.knn(train, vectorize=transformer_vectorize)

In [10]:
QUERY_IDX = 0

# Fetch the 3 nearest training examples for the selected training question.
knn_res_train_vec = knn_func(train[QUERY_IDX], 3)
print(train[QUERY_IDX].question)  # the query itself
# similar questions
print('-' * 80)
# Read `.question` straight from each returned neighbour. Looking the
# neighbour up again via train[train.index(...)] only yields an equal example,
# at the cost of a linear scan per neighbour.
for neighbor in knn_res_train_vec:
    print(neighbor.question)

Who produced the album that included a re-recording of "Lithium"?
--------------------------------------------------------------------------------
Who produced the album that included a re-recording of "Lithium"?
Who founded Smart Studios in Madison, while still performing drums in Spooner at night and driving a taxi cab during the day?
There were two founders of recording studio called "Smart Studios". The first one was Butch Vig. Who was the second founder?


In [11]:
QUERY_IDX = 4

# Fetch the 3 nearest training examples for the selected training question.
knn_res_train_vec = knn_func(train[QUERY_IDX], 3)
print(train[QUERY_IDX].question)  # the query itself
# similar questions
print('-' * 80)
# Read `.question` straight from each returned neighbour. Looking the
# neighbour up again via train[train.index(...)] only yields an equal example,
# at the cost of a linear scan per neighbour.
for neighbor in knn_res_train_vec:
    print(neighbor.question)

Which award did the first book of Gary Zukav receive?
--------------------------------------------------------------------------------
Which award did the first book of Gary Zukav receive?
What is the first book of Gary Zukav?
When did Zukav returned to the United States?
