# Prerequisites

In [None]:
!pip3 install -U sentence-transformers

In [None]:
!pip3 install torch torchvision torchaudio pinecone-client

In [4]:
import pandas as pd

blogs = pd.read_csv('blogs.csv')


def get_text():
    """Yield the `text` of every row in `blogs` whose `tag` is 'p', in row order."""
    # Select matching rows with a single boolean mask instead of building a
    # Series per row via iterrows(), which is much slower on large frames.
    paragraphs = blogs.loc[blogs['tag'] == 'p', 'text']
    yield from paragraphs

In [14]:
# Whitespace-delimited word count of each row's text.
blogs['num_words'] = blogs['text'].str.split().str.len()

# NOTE(review): despite its name, `low_words` holds the paragraph rows with
# MORE than 256 words — confirm whether the name or the comparison is intended.
is_paragraph = blogs['tag'] == 'p'
over_256_words = blogs['num_words'] > 256
low_words = blogs[over_256_words & is_paragraph]

# Show the frame with the new column.
blogs

## Query Generation

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# doc2query T5 checkpoint used to generate synthetic queries from passages.
model_name = 'doc2query/msmarco-t5-base-v1'

# Use the GPU when one is present; otherwise fall back to CPU instead of
# crashing on the unconditional .cuda() call.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Preview: generate three sampled queries for each of the first five paragraphs.
passages = get_text()
# Putting range(5) first in zip() ends the loop after five paragraphs, so the
# rest of the generator is never pulled just to be skipped.
for index, passage in zip(range(5), passages):
    # tokenize the passage
    inputs = tokenizer(passage, return_tensors='pt')
    # generate three queries; tensors go to whatever device the model is on
    outputs = model.generate(
        input_ids=inputs['input_ids'].to(model.device),
        attention_mask=inputs['attention_mask'].to(model.device),
        max_length=64,
        do_sample=True,
        top_p=0.95,
        num_return_sequences=3)
    print("Paragraph:")
    print(passage)

    print("\nGenerated Queries:")
    for i, output in enumerate(outputs, start=1):
        query = tokenizer.decode(output, skip_special_tokens=True)
        print(f'{i}: {query}')

    print()

Paragraph:
One of the most helpful things in my own recovery journey has been hearing about other people’s experiences with OCD. This might be because we relish in stories of triumph and are drawn toward people with shared experiences. I think this is true for many things in life. Personally, I learn and grow from these stories.

Generated Queries:
1: what helps with ocd recovery
2: what is an ocd support group
3: what helps ocd

Paragraph:
There is something powerful about knowing that someone else has walked the same path as you and that they have not only survived it, but possibly even thrived. I love hearing about the determination and grit of others who have faced similar obstacles. These stories of hope often inspired me to keep going, even when I didn’t think I could. 

Generated Queries:
1: the effect of networking on personal success
2: what is the meaning of being the victim of others
3: what is the purpose of seeing another person walk the same path

Paragraph:
Support is a 

## Negative Mining

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used to encode passages for negative mining.
# NOTE: this rebinds the name `model`, which the query-generation cells above
# use for the T5 generator.
model = SentenceTransformer(
    'msmarco-distilbert-base-tas-b'
)
# Cap the model's input sequence length at 256.
model.max_seq_length = 256

# Display the model summary.
model


In [None]:
import pinecone

# Read the Pinecone API key (from app.pinecone.io) out of the local `secret` file.
with open('secret', 'r') as fp:
    # strip() drops the trailing newline that editors usually append; left in
    # place it becomes part of the key.
    API_KEY = fp.read().strip()

pinecone.init(
    api_key=API_KEY,
    environment='us-west1-gcp'
)
# create the 'negative-mine' index if it does not already exist
if 'negative-mine' not in pinecone.list_indexes():
    pinecone.create_index(
        'negative-mine',
        # index dimensionality must match the embedding model's output size
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('negative-mine')

In [None]:
# NOTE(review): the loop below unpacks (query, passage) tuples, but get_text()
# as defined earlier in this notebook yields bare passage strings — confirm the
# generator in scope really produces pairs before running this cell.
pair_gen = get_text()  # generator that loads (query, passage) pairs

pairs = []
to_upsert = []
passage_batch = []
id_batch = []
seen_passages = set()  # every passage already queued or uploaded
batch_size = 64  # encode and upload size


def _upsert_batch(ids, passages):
    """Encode `passages` and upsert the vectors into `index` under `ids`."""
    embeds = model.encode(passages).tolist()
    index.upsert(vectors=list(zip(ids, embeds)))


for i, (query, passage) in enumerate(pair_gen):
    pairs.append((query, passage))
    # Avoid passage duplication in the vector DB. The check spans all batches,
    # not only the current one, so a passage straddling a batch boundary is not
    # uploaded twice.
    if passage not in seen_passages:
        seen_passages.add(passage)
        passage_batch.append(passage)
        id_batch.append(str(i))
    # on reaching batch_size, we encode and upsert
    if len(passage_batch) == batch_size:
        _upsert_batch(id_batch, passage_batch)
        # refresh batches
        passage_batch = []
        id_batch = []

# Flush the final partial batch; without this the last (< batch_size) passages
# would never reach the index.
if passage_batch:
    _upsert_batch(id_batch, passage_batch)
    passage_batch = []
    id_batch = []

# check number of vectors in the index
index.describe_index_stats()