In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec

import os
import time
import torch

In [3]:
from tqdm.auto import tqdm

In [4]:
from dotenv import load_dotenv

load_dotenv()

True

In [5]:
dataset = load_dataset('quora', split='train[240000:290000]', trust_remote_code=True)

In [6]:
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [7]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))
print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda.')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


In [9]:
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [10]:
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Set index name
INDEX_NAME = os.getenv("PINECONE_INDEX_NAME")

# Check if index exists and delete it
if pc.has_index(INDEX_NAME):
    pc.delete_index(INDEX_NAME)

# List of indexes
print("List of indexes:")
print(pc.list_indexes())


if not pc.has_index(INDEX_NAME):
    # Create index
    pc.create_index(
        name=INDEX_NAME, 
        dimension=model.get_sentence_embedding_dimension(), 
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )
    
    # Access index
    index = pc.Index(INDEX_NAME)
    print(index)

List of indexes:
[]
<pinecone.db_data.index.Index object at 0x30e5f9910>


In [11]:
# List of indexes
print("List of indexes:")
print(pc.list_indexes())

List of indexes:
[{
    "name": "dl-ai-index",
    "metric": "cosine",
    "host": "dl-ai-index-szkg0wr.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}]


In [12]:
batch_size=200
vector_limit=10000

questions = question[:vector_limit]

import json

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

100%|██████████| 50/50 [00:57<00:00,  1.14s/it]


In [13]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000,
 'vector_type': 'dense'}

In [14]:
# small helper function so we can repeat queries later
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
  for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [15]:
run_query('which city has the highest population in the world?')

0.58: Which country has the highest per capita income?
0.54: Which city is the best in India? Why?
0.53: Which is the coldest country in the world?
0.5: Where is the most beautiful beach in the world?
0.49: How do I Construct a multiple bar chart to show population in 10000 of the given cities?
0.49: Which are the most dangerous places on the earth to live? Why?
0.49: Which city in China do you prefer to live in? Why?
0.49: What city have you visited that had the most romantic people?
0.49: What is the most racist country?
0.49: Which country is known for beautiful people?


In [16]:
query = 'how do i make chocolate cake?'
run_query(query)

0.51: Where can I found adorable baked cupcakes in Gold Coast?
0.51: How long does cake last in the fridge?
0.49: Are You Looking For Tasty Chocolates in Bangalore?
0.48: Where can I found different flavours for cupcakes at Gold Coast?
0.47: Where can I find delicious cupcakes at Gold Coast?
0.45: How do you make cotton candy flavoring? How is cotton candy made?
0.43: Where can I get best flavors, designs and decorations for cupcakes at Gold Coast?
0.43: How is dark chocolate good for one's health?
0.43: How do I make green tea?
0.43: Why is there no chocolate-flavored chewing gum?
