## Query Generation

In [1]:
import pandas as pd

# Load the blog passages (first CSV column is the index) and keep only the
# rows whose `num_words` is greater than 7.
blogs_df = (
    pd.read_csv('../datasets/blogs.csv', index_col=0)
    .loc[lambda frame: frame['num_words'] > 7]
)
# Bare expression so the notebook renders the first rows.
blogs_df.head()

Unnamed: 0,text,tag,paragraph,article,num_words,num_sentences
0,NOCD Support Groups: Finding Help and Hope in ...,h1,title,/blog/nocd-support-groups-finding-help-and-hop...,11,1
1,One of the most helpful things in my own recov...,p,,/blog/nocd-support-groups-finding-help-and-hop...,56,5
2,There is something powerful about knowing that...,p,,/blog/nocd-support-groups-finding-help-and-hop...,60,4
3,Support is a key piece of your recovery journe...,p,,/blog/nocd-support-groups-finding-help-and-hop...,78,4
5,Support groups may help you realize that you a...,p,You are not alone,/blog/nocd-support-groups-finding-help-and-hop...,85,5


In [19]:
from sentence_transformers import SentenceTransformer

# Load the 'msmarco-distilbert-base-tas-b' sentence-transformers checkpoint on the CPU.
model = SentenceTransformer('msmarco-distilbert-base-tas-b', device='cpu')
# Set the model's maximum sequence length to 256
# (presumably longer inputs get truncated — TODO confirm).
model.max_seq_length = 256

# Bare expression so the notebook displays the model's module layout.
model


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: DistilBertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

In [16]:
import pinecone

# Read the Pinecone API key (available at app.pinecone.io) from a local file.
# Strip surrounding whitespace: a trailing newline left by an editor would
# otherwise become part of the key that is sent to the service.
with open('../secrets', 'r') as fp:
    API_KEY = fp.read().strip()

pinecone.init(
    api_key=API_KEY,
    environment='us-west1-gcp'
)

# Create the index on first run only. Its dimension is taken from the
# embedding model so stored vectors and query vectors always agree.
if 'nocd-search' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )

# Handle used by the later cells to upsert into the index.
index = pinecone.Index('nocd-search')

In [3]:
def get_text(df):
    """Return the passages stored in the ``text`` column of ``df``.

    Args:
        df: DataFrame with a ``text`` column.

    Returns:
        list: the ``text`` values in row order (empty list for an empty frame).
    """
    # Direct column access replaces the row-by-row ``iterrows()`` loop, which
    # built a full Series per row just to read one field.
    return df['text'].tolist()

In [20]:
from tqdm import tqdm
import numpy as np

# Replace NaN cells (e.g. rows without a paragraph header) with '' so every
# metadata value written below is a plain string.
blogs_df = blogs_df.replace({np.nan: ''})

passages = get_text(blogs_df)  # list of passage strings, in row order
# Per-row metadata columns, aligned positionally with `passages`.
# Looking rows up by position (instead of re-filtering the whole frame on text
# equality for every passage) keeps the loop linear, and gives duplicate
# passages their own paragraph/article rather than the first match's.
paragraphs = blogs_df['paragraph'].tolist()
articles = blogs_df['article'].tolist()

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # end position of this batch (the last batch may be shorter)
    i_end = min(i + batch_size, len(passages))
    passage_batch = passages[i:i_end]
    # vector IDs are the passages' positions in `passages`, as strings
    ids_batch = [str(n) for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata: passage text plus its paragraph header and article
    metadata = [
        {'text': passage, 'paragraph_name': paragraph, 'article_name': article}
        for passage, paragraph, article in zip(
            passage_batch, paragraphs[i:i_end], articles[i:i_end]
        )
    ]
    # upsert (id, vector, metadata) triples to Pinecone
    index.upsert(vectors=list(zip(ids_batch, embeds, metadata)))

100%|██████████| 137/137 [15:22<00:00,  6.73s/it]


### Test Base Model

In [None]:
import pinecone

# Read the Pinecone API key (available at app.pinecone.io) from a local file.
# Strip surrounding whitespace: a trailing newline left by an editor would
# otherwise become part of the key that is sent to the service.
# NOTE(review): the indexing cell earlier in this notebook reads '../secrets'
# while this cell reads 'secrets' — confirm which path is right for the
# notebook's working directory.
with open('secrets', 'r') as fp:
    API_KEY = fp.read().strip()

pinecone.init(
    api_key=API_KEY,
    environment='us-west1-gcp'
)
# NOTE(review): this connects to 'negative-mine', whereas the indexing cell
# populates 'nocd-search' — confirm this is the index meant to be tested here.
index = pinecone.Index('negative-mine')

In [None]:
def convert_url(passage):
    """Build a ``start,end`` string from the edges of ``passage``.

    ``start`` is up to the first four space-separated words of the text before
    the first comma; ``end`` is up to the last four space-separated words of
    the text after the last comma (the whole passage when it has no comma).
    Words inside each part are joined with ``%20``.

    Args:
        passage: passage text.

    Returns:
        str: ``"<start>,<end>"``.
    """
    head = passage.partition(',')[0]    # text before the first comma
    tail = passage.rpartition(',')[2]   # text after the last comma
    leading = head.split(' ')[:4]
    trailing = tail.split(' ')[-4:]
    return ','.join(('%20'.join(leading), '%20'.join(trailing)))

def convert_url_v2(passage):
    """Build a ``start,end`` string from the edges of ``passage``.

    Same as ``convert_url``, except that when the text before the first comma
    yields fewer than two words, ``end`` takes up to the last six words
    instead of four.

    Args:
        passage: passage text.

    Returns:
        str: ``"<start>,<end>"`` with the words of each part joined by ``%20``.
    """
    head = passage.partition(',')[0]                    # text before the first comma
    tail_words = passage.rpartition(',')[2].split(' ')  # words after the last comma
    leading = head.split(' ')[:4]
    # A very short start gets a longer end.
    tail_len = 6 if len(leading) < 2 else 4
    trailing = tail_words[-tail_len:]
    return ','.join(('%20'.join(leading), '%20'.join(trailing)))

def query_db(query, model, index, passages):
    """Embed ``query``, fetch the 10 best matches from ``index`` and print them.

    Args:
        query: search string.
        model: encoder whose ``encode(...)`` result provides ``.tolist()``.
        index: vector index exposing
            ``query(vector, top_k=..., include_metadata=...)``; the response
            must expose ``.matches``.
        passages: mapping from integer passage id to passage text.

    Returns:
        None. For every match it prints the article link (with a text-fragment
        suffix built by ``convert_url``), the paragraph header, and the score
        followed by the passage text.

    Raises:
        KeyError: if a match id is missing from ``passages``.
    """
    query_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    res = index.query(query_emb.tolist(), top_k=10, include_metadata=True)

    nocd = 'https://www.treatmyocd.com'

    print(f'Search Query: {query}\n')
    print('---------------------------------------------------------------------------------------------------------------------')
    print('Results\n')
    for item in res.matches:
        # Look the text up in the `passages` argument rather than in a
        # notebook-level global, so the function works with whatever mapping
        # the caller passes in.
        passage = passages[int(item['id'])]
        meta = item['metadata']
        # Accept both metadata schemas seen in this notebook: the indexing
        # cell stores 'article_name' / 'paragraph_name', while this function
        # originally read 'article' / 'paragraph'.
        article = meta.get('article', meta.get('article_name', ''))
        paragraph = meta.get('paragraph', meta.get('paragraph_name', ''))
        print(f"Article: {nocd}{article}#:~:text={convert_url(passage)}")
        print(f"Paragraph Header: {paragraph}")
        print(f"{item['score']} {passage}...\n")

In [None]:
from IPython.display import clear_output

# NOTE(review): `get_pairs` and `query_passage_df` are not defined in the cells
# visible here — confirm they exist in the kernel before running this cell.
pairs_gen = get_pairs(query_passage_df)
# Map each pair's position in the stream to its passage (second element);
# query_db looks passages up by integer id.
passage_dict = dict(enumerate(passage for _, passage in pairs_gen))

# Prompt repeatedly until the user types 'quit'. The previous results are
# cleared (deferred until new output arrives) before each prompt.
while True:
    clear_output(wait=True)
    query = input("Search NOCD: ")
    if query == 'quit':
        break
    query_db(query=query, model=model, index=index, passages=passage_dict)