# Pre-Requisites

In [None]:
!pip3 install -U sentence-transformers

In [None]:
!pip3 install pinecone-client

## Query Generation

In [1]:
import pandas as pd

# Read the query/passage pairs CSV into a DataFrame.
query_passage_df = pd.read_csv(filepath_or_buffer='query_passage.csv')
# Show the first five rows as the cell output.
query_passage_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,query,passage
0,0,what does ocd mean,One of the most helpful things in my own recov...
1,1,is it good to talk about an ocd,One of the most helpful things in my own recov...
2,2,does ocd really help recovery,One of the most helpful things in my own recov...
3,3,what is being a positive person,There is something powerful about knowing that...
4,4,who is an inspirational person,There is something powerful about knowing that...


## Negative Mining

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; later cells call model.encode() on passages.
model = SentenceTransformer(model_name_or_path='msmarco-distilbert-base-tas-b')
# Set the model's maximum input sequence length.
model.max_seq_length = 256

# Bare expression: display the model's repr as the cell output.
model


In [None]:
import os

import pinecone

# Read the secret from the environment so it is never stored in the notebook.
API_KEY = os.environ.get("PINECONE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "Set the PINECONE_API_KEY environment variable before running this cell."
    )

# Name of the index that holds the passage embeddings.
INDEX_NAME = 'negative-mine'

pinecone.init(
    api_key=API_KEY,
    # Override with PINECONE_ENVIRONMENT if the project lives elsewhere.
    environment=os.environ.get("PINECONE_ENVIRONMENT", 'us-west1-gcp')
)
# create the index if it does not already exist
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        # vector size must match what the embedding model produces
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index(INDEX_NAME)

In [None]:
def get_text(df):
    """Yield one ``(query, passage)`` tuple per row of ``df``, in row order.

    Args:
        df: DataFrame with ``'query'`` and ``'passage'`` columns.

    Yields:
        tuple: ``(query, passage)`` taken from the same row.
    """
    # Zipping the two columns avoids building a Series per row as
    # ``iterrows`` does, and nothing is materialised up front.
    yield from zip(df['query'], df['passage'])

In [None]:
pair_gen = get_text(query_passage_df)  # iterable of (query, passage) pairs

pairs = []            # every (query, passage) pair seen, in order
to_upsert = []        # not used in this cell; kept in case later cells reference it
passage_batch = []    # unique passages waiting to be encoded
id_batch = []         # vector ids aligned with passage_batch
seen_passages = set() # all passages already queued or upserted
batch_size = 64  # encode and upload size


def upsert_passages(ids, passages):
    """Encode ``passages`` with ``model`` and upsert them into ``index`` under ``ids``."""
    embeds = model.encode(passages).tolist()
    index.upsert(vectors=list(zip(ids, embeds)))


for i, (query, passage) in enumerate(pair_gen):
    pairs.append((query, passage))
    # De-duplicate across the whole run, not just within the current batch,
    # so a passage repeated after a flush is not upserted under a second id.
    if passage not in seen_passages:
        seen_passages.add(passage)
        passage_batch.append(passage)
        id_batch.append(str(i))  # id = row position of the passage's first occurrence
    # on reaching batch_size, we encode and upsert
    if len(passage_batch) == batch_size:
        upsert_passages(id_batch, passage_batch)
        # refresh batches
        passage_batch = []
        id_batch = []

# Flush the final partial batch; without this the last
# (number of unique passages % batch_size) passages never reach the index.
if passage_batch:
    upsert_passages(id_batch, passage_batch)
    passage_batch = []
    id_batch = []

# check number of vectors in the index
index.describe_index_stats()