# Pre-Requisites

In [4]:
!pip3 install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 5.7 MB/s  eta 0:00:01
Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 42.9 MB/s eta 0:00:01
Collecting scikit-learn
  Downloading scikit_learn-1.1.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.2 MB)
[K     |████████████████████████████████| 31.2 MB 57.2 MB/s eta 0:00:01
[?25hCollecting scipy
  Downloading scipy-1.9.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.4 MB)
[K     |████████████████████████████████| 43.4 MB 67.0 MB/s eta 0:00:01
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 75.6 MB/s eta 0:00:01
Collecting torchvision
  Downloading torchvision-0.13.1-cp38-cp38-manylinux1_x86_64.whl (19.1 MB)
[K     |█████

In [7]:
!pip3 install pinecone-client

Collecting pinecone-client
  Downloading pinecone_client-2.0.13-py3-none-any.whl (175 kB)
[K     |████████████████████████████████| 175 kB 25.7 MB/s eta 0:00:01
[?25hCollecting loguru>=0.5.0
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[K     |████████████████████████████████| 58 kB 8.0 MB/s  eta 0:00:01
Collecting dnspython>=2.0.0
  Downloading dnspython-2.2.1-py3-none-any.whl (269 kB)
[K     |████████████████████████████████| 269 kB 62.0 MB/s eta 0:00:01
Installing collected packages: loguru, dnspython, pinecone-client
Successfully installed dnspython-2.2.1 loguru-0.6.0 pinecone-client-2.0.13


## Query Generation

In [9]:
import pandas as pd

# Read the (query, passage) pairs from disk into a DataFrame.
query_passage_df = pd.read_csv(filepath_or_buffer='query_passage.csv')
# Show the first five rows as a quick sanity check of the loaded columns.
query_passage_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,query,passage
0,0,what does ocd mean,One of the most helpful things in my own recov...
1,1,is it good to talk about an ocd,One of the most helpful things in my own recov...
2,2,does ocd really help recovery,One of the most helpful things in my own recov...
3,3,what is being a positive person,There is something powerful about knowing that...
4,4,who is an inspirational person,There is something powerful about knowing that...


## Negative Mining

In [5]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model used to encode passages.
model = SentenceTransformer(model_name_or_path='msmarco-distilbert-base-tas-b')
# Cap the input length the model will process per text at 256.
model.max_seq_length = 256

# Display the model's module layout.
model


  from .autonotebook import tqdm as notebook_tqdm
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 690/690 [00:00<00:00, 964kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 190/190 [00:00<00:00, 113kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3.99k/3.99k [00:00<00:00, 3.85MB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 548/548 [00:00<00:00, 427kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 122/122 [00:00<00:00, 100kB/s]
Downloading: 100%|████████████████████████████████████████████████████

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [8]:
import os
from getpass import getpass

import pinecone

# Credentials must never be stored in the notebook: read the key from the
# PINECONE_API_KEY environment variable, or prompt for it when it is not set.
# NOTE(review): the key that was previously hardcoded in this cell has been
# exposed and should be revoked/rotated.
API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

# Single definition of the index name used for creation and connection.
INDEX_NAME = 'negative-mine'

pinecone.init(
    api_key=API_KEY,
    environment='us-west1-gcp'
)
# create the negative-mining index if it does not already exist
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        # the index dimension is taken from the embedding model so that the
        # encoded passages can be stored in it
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index(INDEX_NAME)

In [None]:
def get_text(df):
    """Yield (query, passage) pairs from a DataFrame, one per row.

    Args:
        df: DataFrame with a 'query' column and a 'passage' column.

    Yields:
        Tuples ``(query, passage)`` in row order.
    """
    # Iterate the two columns in lockstep; this avoids building a Series per
    # row (as iterrows does) and produces the pairs lazily.
    yield from zip(df['query'], df['passage'])

In [None]:
# Lazy iterator of (query, passage) pairs, read directly from the columns of
# the DataFrame loaded earlier in the notebook.
pair_gen = zip(query_passage_df['query'], query_passage_df['passage'])

pairs = []             # every (query, passage) pair seen, in row order
to_upsert = []         # not used in this cell; kept in case other cells reference it
passage_batch = []     # passages waiting to be encoded
id_batch = []          # vector ids aligned one-to-one with passage_batch
seen_passages = set()  # every passage already queued or uploaded
batch_size = 64  # encode and upload size


def _upsert_batch(ids, passages):
    """Encode `passages` with the model and upsert them under `ids`; no-op when empty."""
    if not passages:
        return
    embeds = model.encode(passages).tolist()
    # upload to index
    index.upsert(vectors=list(zip(ids, embeds)))


for i, (query, passage) in enumerate(pair_gen):
    pairs.append((query, passage))
    # Avoid passage duplication in the vector DB. The check runs against all
    # passages seen so far (not only the current batch), so a passage whose
    # queries straddle a batch boundary is still uploaded only once.
    if passage in seen_passages:
        continue
    seen_passages.add(passage)
    passage_batch.append(passage)
    # the id is the row number of the passage's first occurrence
    id_batch.append(str(i))
    # on reaching batch_size, we encode and upsert
    if len(passage_batch) == batch_size:
        _upsert_batch(id_batch, passage_batch)
        # refresh batches
        passage_batch = []
        id_batch = []

# upload whatever is left over in the final, partially filled batch
_upsert_batch(id_batch, passage_batch)

# check number of vectors in the index
index.describe_index_stats()