# Prerequisites

In [None]:
!pip3 install -U sentence-transformers

In [3]:
!pip3 install pinecone-client pandas

Collecting pinecone-client
  Downloading pinecone_client-2.0.13-py3-none-any.whl (175 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.6/175.6 KB[0m [31m573.1 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pandas
  Using cached pandas-1.4.4-cp38-cp38-macosx_11_0_arm64.whl (10.3 MB)
Collecting typing-extensions>=3.7.4
  Using cached typing_extensions-4.3.0-py3-none-any.whl (25 kB)
Collecting dnspython>=2.0.0
  Downloading dnspython-2.2.1-py3-none-any.whl (269 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m269.1/269.1 KB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting pyyaml>=5.4
  Using cached PyYAML-6.0-cp38-cp38-macosx_12_0_arm64.whl
Collecting urllib3>=1.21.1
  Using cached urllib3-1.26.12-py2.py3-none-any.whl (140 kB)
Collecting requests>=2.19.0
  Using cached requests-2.28.1-py3-none-any.whl (62 kB)
Collecting loguru>=0.5.0
  Downloading loguru-0.6.0-py3-none-any.whl (58 kB)
[2K     [90m━━━━

## Query Generation

In [5]:
import pandas as pd

# Location of the CSV holding the query/passage pairs (relative to the notebook).
QUERY_PASSAGE_CSV = 'datasets/query_passage.csv'

# Read the pairs into a DataFrame and preview the first rows.
query_passage_df = pd.read_csv(QUERY_PASSAGE_CSV)
query_passage_df.head()

Unnamed: 0.1,Unnamed: 0,query,passage
0,0,what does ocd mean,One of the most helpful things in my own recov...
1,1,is it good to talk about an ocd,One of the most helpful things in my own recov...
2,2,does ocd really help recovery,One of the most helpful things in my own recov...
3,3,what is being a positive person,There is something powerful about knowing that...
4,4,who is an inspirational person,There is something powerful about knowing that...


## Negative Mining

In [None]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed passages for negative mining, and the sequence
# length assigned to the model after loading.
MODEL_NAME = 'msmarco-distilbert-base-tas-b'
MAX_SEQ_LENGTH = 256

model = SentenceTransformer(MODEL_NAME)
model.max_seq_length = MAX_SEQ_LENGTH

# Display the loaded model as the cell output.
model


In [None]:
import os
from getpass import getpass

import pinecone

# Never commit credentials to a notebook: read the key from the environment
# and fall back to an interactive prompt when it is not set.
# NOTE(review): a literal API key was previously committed in this cell; it
# should be treated as compromised and rotated.
API_KEY = os.environ.get('PINECONE_API_KEY') or getpass('Pinecone API key: ')

pinecone.init(
    api_key=API_KEY,
    # Same default region as before; can be overridden without editing the notebook.
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west1-gcp')
)

INDEX_NAME = 'negative-mine'

# Create the negative-mining index if it does not already exist. Its dimension
# is taken from the embedding model so the stored vectors always match.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(
        INDEX_NAME,
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )

# Connect to the index; later cells upsert into and query this handle.
index = pinecone.Index(INDEX_NAME)

In [8]:
def get_pairs(df):
    """Return the (query, passage) pairs of ``df`` as a list of tuples.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'query'`` and a ``'passage'`` column; any other
        columns are ignored.

    Returns
    -------
    list of tuple
        One ``(query, passage)`` tuple per row, in row order. An empty
        frame yields an empty list.

    Raises
    ------
    KeyError
        If either required column is missing.
    """
    # Zipping the two columns avoids building a Series per row (iterrows) and
    # avoids a loop variable named `index`, which reads like the notebook-level
    # Pinecone handle.
    return list(zip(df['query'], df['passage']))

In [None]:
# get_pairs requires the DataFrame of pairs; calling it with no argument
# raises TypeError.
pairs_gen = get_pairs(query_passage_df)  # list of (query, passage) pairs
blogs = pd.read_csv('datasets/blogs.csv')

# Build the passage-text -> metadata lookup once instead of scanning `blogs`
# for every passage. The first row wins when a text occurs more than once, and
# each value is a scalar (the previous code put whole pandas Series into the
# metadata dict). Missing values are dropped.
# NOTE(review): assumes Pinecone rejects null/NaN metadata values - confirm.
blog_metadata = {}
for text, paragraph, article in blogs[['text', 'paragraph', 'article']].itertuples(index=False, name=None):
    if text in blog_metadata:
        continue
    record = {'paragraph': paragraph, 'article': article}
    blog_metadata[text] = {key: value for key, value in record.items() if pd.notna(value)}


def _upsert_batch(ids, passages, metadatas):
    """Encode ``passages`` and upsert them as (id, vector, metadata) tuples."""
    embeds = model.encode(passages).tolist()
    index.upsert(vectors=list(zip(ids, embeds, metadatas)))


pairs = []
to_upsert = []  # not used in this cell; kept in case a later cell references it
passage_batch = []
id_batch = []
metadata_batch = []
batch_size = 64  # encode and upload size

# Passages already queued or uploaded. Tracking them across batches (not only
# within the current batch) is what actually prevents duplicates in the index.
seen_passages = set()

for i, (query, passage) in enumerate(pairs_gen):
    pairs.append((query, passage))
    # we do this to avoid passage duplication in the vector DB
    if passage not in seen_passages:
        seen_passages.add(passage)
        passage_batch.append(passage)
        id_batch.append(str(i))
        # Passages with no matching blog row get empty metadata.
        metadata_batch.append(blog_metadata.get(passage, {}))
    # on reaching batch_size, we encode and upsert
    if len(passage_batch) == batch_size:
        _upsert_batch(id_batch, passage_batch, metadata_batch)
        # refresh batches
        passage_batch = []
        id_batch = []
        metadata_batch = []

# Flush the final partial batch; without this the last (< batch_size) passages
# never reach the index.
if passage_batch:
    _upsert_batch(id_batch, passage_batch, metadata_batch)
    passage_batch = []
    id_batch = []
    metadata_batch = []

# check number of vectors in the index
index.describe_index_stats()