# Prerequisites

In [None]:
!pip3 install -U sentence-transformers

In [None]:
!pip3 install pinecone-client pandas

## Query Generation

In [1]:
import pandas as pd

# Load the (query, passage) pairs; later cells read the 'query' and 'passage' columns.
query_passage_df = pd.read_csv('datasets/query_passage.csv')
# Preview the first rows as the cell output.
query_passage_df.head()

Unnamed: 0.1,Unnamed: 0,query,passage
0,0,what does ocd mean,One of the most helpful things in my own recov...
1,1,is it good to talk about an ocd,One of the most helpful things in my own recov...
2,2,does ocd really help recovery,One of the most helpful things in my own recov...
3,3,what is being a positive person,There is something powerful about knowing that...
4,4,who is an inspirational person,There is something powerful about knowing that...


## Negative Mining

In [2]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model; it is used below to size the
# Pinecone index and to encode passages.
model = SentenceTransformer('msmarco-distilbert-base-tas-b')
# Set the model's maximum input sequence length to 256.
model.max_seq_length = 256

# Display the model architecture as the cell output.
model


  from .autonotebook import tqdm as notebook_tqdm


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [39]:
import pinecone

# Read the Pinecone API key (obtained from app.pinecone.io) from the local `secrets` file.
with open('secrets', 'r') as fp:
    # Strip surrounding whitespace: a trailing newline in the file would
    # otherwise become part of the key and break authentication.
    API_KEY = fp.read().strip()

pinecone.init(
    api_key=API_KEY,
    environment='us-west1-gcp'
)
# create the negative-mine index if it does not already exist;
# its dimension is taken from the embedding model so the two always agree
if 'negative-mine' not in pinecone.list_indexes():
    pinecone.create_index(
        'negative-mine',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('negative-mine')

In [15]:
def get_pairs(df):
    """Return a list of (query, passage) tuples, one per row of `df`.

    `df` must have 'query' and 'passage' columns; rows are visited in
    DataFrame order.
    """
    return [(record['query'], record['passage']) for _, record in df.iterrows()]

In [43]:
from tqdm import tqdm
import numpy as np

pairs_gen = get_pairs(query_passage_df)  # list of (query, passage) pairs
blogs = pd.read_csv('datasets/blogs.csv')
blogs = blogs.replace({np.nan: None})

batch_size = 64  # encode and upload size


def _upsert_batch(passages, ids, metadatas):
    """Encode `passages` and upsert (id, embedding, metadata) records; no-op when empty."""
    if not passages:
        return
    embeds = model.encode(passages).tolist()
    # upload to index
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))


pairs = []
to_upsert = []
passage_batch = []
id_batch = []
metadata_batch = []
# Every passage already queued or uploaded. Checking only the current batch
# would re-insert a passage (under a new id) right after the batch is reset.
seen_passages = set()

for i, (query, passage) in enumerate(tqdm(pairs_gen)):
    pairs.append((query, passage))
    # we do this to avoid passage duplication in the vector DB
    if passage in seen_passages:
        continue
    seen_passages.add(passage)

    passage_batch.append(passage)
    # the vector id is the index of the first pair that contains this passage
    id_batch.append(str(i))

    # look up the blog row whose text equals the passage to attach its metadata
    matches = blogs[blogs['text'] == passage]
    if matches.empty:
        # passage not present in blogs.csv: upload it with blank metadata
        par, article = '', ''
    else:
        blog_row = matches.iloc[0]
        # missing values were replaced with None above; store '' instead
        par = blog_row['paragraph'] if blog_row['paragraph'] else ''
        article = blog_row['article'] if blog_row['article'] else ''
    metadata_batch.append({'paragraph': par, 'article': article})

    # on reaching batch_size, we encode and upsert, then start fresh batches
    if len(passage_batch) >= batch_size:
        _upsert_batch(passage_batch, id_batch, metadata_batch)
        passage_batch = []
        id_batch = []
        metadata_batch = []

# upload whatever is left over: the last batch is usually smaller than
# batch_size and would otherwise never reach the index
_upsert_batch(passage_batch, id_batch, metadata_batch)
passage_batch = []
id_batch = []
metadata_batch = []

# check number of vectors in the index
index.describe_index_stats()

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10752/10752 [00:51<00:00, 209.34it/s]


{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 3584}},
 'total_vector_count': 3584}