## Query Generation

In [20]:
import pandas as pd
from dotenv import load_dotenv
import os

load_dotenv()

blogs_df = pd.read_csv('../datasets/blogs.csv', index_col=0)
blogs_df = blogs_df[blogs_df['num_words'] > 7]
blogs_df.head()


Unnamed: 0,text,tag,paragraph,article,num_words,num_sentences
0,NOCD Support Groups: Finding Help and Hope in ...,h1,title,/blog/nocd-support-groups-finding-help-and-hop...,11,1
1,One of the most helpful things in my own recov...,p,,/blog/nocd-support-groups-finding-help-and-hop...,56,5
2,There is something powerful about knowing that...,p,,/blog/nocd-support-groups-finding-help-and-hop...,60,4
3,Support is a key piece of your recovery journe...,p,,/blog/nocd-support-groups-finding-help-and-hop...,78,4
5,Support groups may help you realize that you a...,p,You are not alone,/blog/nocd-support-groups-finding-help-and-hop...,85,5


In [3]:
def get_text(df):
    passages = []
    for index, row in df.iterrows():
        passages.append(row['text'])

    return passages

In [6]:
import pinecone
index = pinecone.Index('nocd-search-huggingface')
index.delete(deleteAll=True)
print(index.describe_index_stats())

index = pinecone.Index('nocd-search-openai')
index.delete(deleteAll=True)
print(index.describe_index_stats())

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}
{'dimension': 2048,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


## HuggingFace Models

In [7]:
# upsert using sentence transformer model

from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'msmarco-distilbert-base-tas-b'
model = SentenceTransformer(model_name, device=device)
model.max_seq_length = 256

import pinecone

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)
# create a new index if does not already exist
print('Creating index...')
if 'nocd-search-huggingface' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-huggingface',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-huggingface')

passages = get_text(blogs_df)  # generator that loads (query, passage) pairs
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{model_name}" for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': model_name
        } 
        for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

print(index.describe_index_stats())

Creating index...


100%|██████████| 137/137 [00:41<00:00,  3.33it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4373}},
 'total_vector_count': 4373}





In [8]:
# upsert using sentence transformer model

from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'all-distilroberta-v1'
model = SentenceTransformer(model_name, device=device)
model.max_seq_length = 256

import pinecone

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)
# create a new index if does not already exist
print('Creating index...')
if 'nocd-search-huggingface' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-huggingface',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-huggingface')

passages = get_text(blogs_df)  # generator that loads (query, passage) pairs
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{model_name}" for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': model_name
        } 
        for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

print(index.describe_index_stats())

Creating index...


100%|██████████| 137/137 [00:42<00:00,  3.22it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8746}},
 'total_vector_count': 8746}





In [21]:
# upsert using sentence transformer model

from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name, device=device)
model.max_seq_length = 256

import pinecone

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)
# create a new index if does not already exist
print('Creating index...')
if 'nocd-search-huggingface' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-huggingface',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-huggingface')

passages = get_text(blogs_df)  # generator that loads (query, passage) pairs
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{model_name}" for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': model_name
        } 
        for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

print(index.describe_index_stats())

Creating index...


100%|██████████| 137/137 [00:54<00:00,  2.52it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 13119}},
 'total_vector_count': 13119}





In [22]:
# upsert using sentence transformer model

from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'multi-qa-distilbert-cos-v1'
model = SentenceTransformer(model_name, device=device)
model.max_seq_length = 256

import pinecone

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)
# create a new index if does not already exist
print('Creating index...')
if 'nocd-search-huggingface' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-huggingface',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-huggingface')

passages = get_text(blogs_df)  # generator that loads (query, passage) pairs
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{model_name}" for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': model_name
        } 
        for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

print(index.describe_index_stats())

Downloading: 100%|██████████| 737/737 [00:00<00:00, 301kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 136kB/s]
Downloading: 100%|██████████| 9.46k/9.46k [00:00<00:00, 6.75MB/s]
Downloading: 100%|██████████| 523/523 [00:00<00:00, 318kB/s]
Downloading: 100%|██████████| 116/116 [00:00<00:00, 60.5kB/s]
Downloading: 100%|██████████| 25.5k/25.5k [00:00<00:00, 802kB/s]
Downloading: 100%|██████████| 265M/265M [00:06<00:00, 39.1MB/s] 
Downloading: 100%|██████████| 53.0/53.0 [00:00<00:00, 31.8kB/s]
Downloading: 100%|██████████| 112/112 [00:00<00:00, 73.7kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 2.80MB/s]
Downloading: 100%|██████████| 333/333 [00:00<00:00, 294kB/s]
Downloading: 100%|██████████| 13.8k/13.8k [00:00<00:00, 416kB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 1.71MB/s]
Downloading: 100%|██████████| 349/349 [00:00<00:00, 334kB/s]


Creating index...


100%|██████████| 137/137 [00:44<00:00,  3.09it/s]

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 17492}},
 'total_vector_count': 17492}





In [26]:
# upsert using sentence transformer model

from tqdm import tqdm
import numpy as np
from sentence_transformers import SentenceTransformer
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name, device=device)
model.max_seq_length = 256

import pinecone

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)
# create a new index if does not already exist
print('Creating index...')
if 'nocd-search-huggingface-mini-lm' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-huggingface-mini-lm',
        dimension=model.get_sentence_embedding_dimension(),
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-huggingface-mini-lm')

passages = get_text(blogs_df)  # generator that loads (query, passage) pairs
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{model_name}" for n in range(i, i_end)]
    # create embeddings
    embeds = model.encode(passage_batch).tolist()
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': model_name
        } 
        for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))

print(index.describe_index_stats())

Creating index...


100%|██████████| 137/137 [00:34<00:00,  3.94it/s]


{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4373}},
 'total_vector_count': 4373}


In [9]:
# upsert using OPENAI model

from tqdm import tqdm
import numpy as np
import openai

import os
import pinecone
import time

openai.api_key = os.getenv('OPENAI_API_KEY')
openai_model_doc = 'text-search-babbage-doc-001'

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)

test_embedding = openai.Embedding.create(input='test', engine=openai_model_doc)
shape = [len(a['embedding']) for a in test_embedding['data']]

if 'nocd-search-openai' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-openai',
        dimension=shape[0],
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-openai')

passages = get_text(blogs_df)  
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_{openai_model_doc}" for n in range(i, i_end)]
    # create embeddings
    res = openai.Embedding.create(input=passage_batch, engine=openai_model_doc)
    embeds = [record['embedding'] for record in res['data']]
    # prep metadata and upsert batch
    metadata = [{'text': passage, 'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 'model': openai_model_doc} for passage in passage_batch]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))
    time.sleep(10)

print(index.describe_index_stats())

100%|██████████| 137/137 [24:55<00:00, 10.91s/it]

{'dimension': 2048,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 4373}},
 'total_vector_count': 4373}





In [16]:
# upsert using COHERE model

from tqdm import tqdm
import numpy as np
import cohere

import os
import pinecone
import time

co = cohere.Client(os.getenv('COHERE_API_KEY'))

pinecone.init(
    api_key=os.getenv('PINECONE_API_KEY'),
    environment='us-west1-gcp'
)

test_embedding = co.embed(texts=['hello', 'goodbye'], model='medium').embeddings
shape = np.array(test_embedding).shape

if 'nocd-search-openai' not in pinecone.list_indexes():
    pinecone.create_index(
        'nocd-search-openai',
        dimension=shape[1],
        metric='dotproduct',
        pods=1  # increase for faster mining
    )
# connect
index = pinecone.Index('nocd-search-openai')

passages = get_text(blogs_df)  
blogs_df = blogs_df.replace({np.nan: ''})

batch_size = 32  # process everything in batches of 32
for i in tqdm(range(0, len(passages), batch_size)):
    # set end position of batch
    i_end = min(i+batch_size, len(passages))
    # get batch of lines and IDs
    passage_batch = passages[i: i+batch_size]
    ids_batch = [f"{str(n)}_cohere-medium" for n in range(i, i_end)]
    # create embeddings
    embeds = co.embed(texts=passage_batch, model='medium').embeddings
    # prep metadata and upsert batch
    metadata = [
        {
            'text': passage, 
            'paragraph_name': blogs_df[blogs_df['text'] == passage].iloc[0]['paragraph'], 
            'article_name': blogs_df[blogs_df['text'] == passage].iloc[0]['article'], 
            'model': 'cohere-medium'
        } 
            for passage in passage_batch
    ]
    to_upsert = zip(ids_batch, embeds, metadata)
    # upsert to Pinecone
    index.upsert(vectors=list(to_upsert))
    time.sleep(5)

print(index.describe_index_stats())

100%|██████████| 137/137 [13:18<00:00,  5.83s/it]

{'dimension': 2048,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 8746}},
 'total_vector_count': 8746}





### Test Base Model

In [None]:
def convert_url(passage):
    sep = passage.split(',')
    start = sep[0].split(' ')[:4]
    end = sep[-1].split(' ')[-4:]

    start = '%20'.join(start)
    end = '%20'.join(end)

    return start + ',' + end

def convert_url_v2(passage):
    sep = passage.split(',')
    start = sep[0].split(' ')[:4]
    end = sep[-1].split(' ')[-4:]
    if len(start) < 2:
        end = sep[-1].split(' ')[-6:]

    start = '%20'.join(start)
    end = '%20'.join(end)

    return start + ',' + end

def query_db(query, model, index, passages):
    query_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False)
    res = index.query(query_emb.tolist(), top_k=10, include_metadata=True)

    nocd = 'https://www.treatmyocd.com'

    print(f'Search Query: {query}\n')
    print('---------------------------------------------------------------------------------------------------------------------')
    print('Results\n')
    for item in res.matches:
        print(f"Article: {nocd}{item['metadata']['article']}#:~:text={convert_url(passage_dict[int(item['id'])])}")
        print(f"Paragraph Header: {item['metadata']['paragraph']}")
        print(f"{item['score']} {passage_dict[int(item['id'])]}...\n")

In [None]:
pairs_gen = get_pairs(query_passage_df)
passage_dict = {i: p for i, (q, p) in enumerate(pairs_gen)}

from IPython.display import clear_output

while True:
    clear_output(wait=True)
    query = input("Search NOCD: ")
    if query == 'quit': break
    query_db(query=query, model=model, index=index, passages=passage_dict)

In [1]:
import torch

torch.cuda.is_available()
torch.cuda.get_device_name(0)
torch.__version__

  from .autonotebook import tqdm as notebook_tqdm


'1.12.1+cu116'