## Setup and Import Libraries

In [2]:
import os
import json
import time
import utils
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from utils import Utils

import warnings
# Suppress every warning for the rest of the session (presumably to keep cell
# output clean). NOTE(review): this also hides deprecation warnings raised by
# the libraries imported above.
warnings.filterwarnings('ignore')

## Load the Dataset

In [3]:
# Load rows 240000-289999 of the Quora train split (a 50,000-row slice).
dataset = load_dataset(
    'quora',
    split='train[240000:290000]',
)

In [4]:
# Preview the first five records to see how each entry is structured.
dataset[:5]

{'questions': [{'id': [207550, 351729],
   'text': ['What is the truth of life?', "What's the evil truth of life?"]},
  {'id': [33183, 351730],
   'text': ['Which is the best smartphone under 20K in India?',
    'Which is the best smartphone with in 20k in India?']},
  {'id': [351731, 351732],
   'text': ['Steps taken by Canadian government to improve literacy rate?',
    'Can I send homemade herbal hair oil from India to US via postal or private courier services?']},
  {'id': [37799, 94186],
   'text': ['What is a good way to lose 30 pounds in 2 months?',
    'What can I do to lose 30 pounds in 2 months?']},
  {'id': [351733, 351734],
   'text': ['Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?',
    'How do you graph x + 2y = -2?']}],
 'is_duplicate': [False, True, False, True, False]}

In [5]:
# Flatten every question pair into a single list of question texts.
questions = [text for record in dataset['questions'] for text in record['text']]

# De-duplicate while keeping first-seen order. dict.fromkeys is used instead of
# set() because the iteration order of a set of strings changes between
# interpreter sessions (hash randomization); any later slice of `question`
# would then pick a different subset on every kernel restart.
question = list(dict.fromkeys(questions))

# The preview and the count below describe the raw list (duplicates included);
# the de-duplicated list is `question`.
print('\n'.join(questions[:10]))
print('-' * 50)
print(f'Number of questions: {len(questions)}')

What is the truth of life?
What's the evil truth of life?
Which is the best smartphone under 20K in India?
Which is the best smartphone with in 20k in India?
Steps taken by Canadian government to improve literacy rate?
Can I send homemade herbal hair oil from India to US via postal or private courier services?
What is a good way to lose 30 pounds in 2 months?
What can I do to lose 30 pounds in 2 months?
Which of the following most accurately describes the translation of the graph y = (x+3)^2 -2 to the graph of y = (x -2)^2 +2?
How do you graph x + 2y = -2?
--------------------------------------------------
Number of questions: 100000


## Check CUDA and Set Up the Model

**Note**: "Checking cuda" refers to checking if you have access to GPUs (faster compute). If using CPUs, some code cells will take a little longer to run.

We are using the *all-MiniLM-L6-v2* sentence-transformers model, which maps sentences to a 384-dimensional dense vector space.

In [6]:
# Prefer the GPU when one is available; otherwise fall back to the CPU and say so.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print('Sorry no cuda.')

# Load the sentence encoder onto the selected device.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


In [7]:
# Sanity check: encode one sample query and inspect the shape of its embedding.
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

## Setup Pinecone

In [8]:
# NOTE: this rebinds the name `utils` from the imported module to a `Utils`
# instance; the index-setup cell below calls methods on this instance.
utils = Utils()
# Read the Pinecone API key through the helper rather than hard-coding it here.
PINECONE_API_KEY = utils.get_pinecone_api_key()

In [11]:
# Connect to Pinecone and derive the index name used by this notebook.
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_index_name('t-ind')

# Start from a clean slate: drop any existing index that has the same name.
existing_names = [existing.name for existing in pinecone.list_indexes()]
if INDEX_NAME in existing_names:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)

# Create a serverless cosine index whose dimension matches the encoder output.
serverless_spec = ServerlessSpec(cloud='aws', region='us-east-1')
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=serverless_spec,
)

# Handle used by the later cells for upserts and queries.
index = pinecone.Index(INDEX_NAME)
print(index)

t-ind-thpugi5b8qynkab8tfzq0ae9gcrijw7ctqaa
<pinecone.data.index.Index object at 0x000001E8B5DEF7C0>


## Create Embeddings and Upsert to Pinecone

In [12]:
# Upsert settings: questions per batch and the cap on how many are indexed.
batch_size = 200
vector_limit = 10_000

# Keep only the first `vector_limit` entries of the de-duplicated list.
# Note that this rebinds `questions`, which previously held the raw list.
questions = question[:vector_limit]

In [13]:
# Embed the questions batch by batch and upsert each batch into the index.
for start in tqdm(range(0, len(questions), batch_size)):
    # Slicing clamps at the end of the list, so the final batch may be shorter.
    batch = questions[start:start + batch_size]

    # IDs are the string form of each question's position in `questions`.
    ids = [str(position) for position in range(start, start + len(batch))]

    # Store the original text as metadata so query results can display it.
    metadatas = [{'text': text} for text in batch]

    # Encode the whole batch in one call.
    xc = model.encode(batch)

    # Pair each ID with its embedding and metadata, then send them to Pinecone.
    records = zip(ids, xc, metadatas)
    index.upsert(vectors=records)

100%|██████████| 50/50 [01:25<00:00,  1.70s/it]


In [14]:
# Confirm the upsert by inspecting the index statistics.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10000}},
 'total_vector_count': 10000}

## Run Query

In [15]:
def run_query(query, top_k=10):
    """Print the stored questions most similar to `query`.

    Encodes `query` with the notebook-level `model`, queries the notebook-level
    Pinecone `index`, and prints one "<score>: <text>" line per match, with the
    score rounded to two decimals.

    Args:
        query: Text passed to `model.encode`.
        top_k: Number of matches to request from the index. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        None. Results are printed, not returned.
    """
    embedding = model.encode(query).tolist()
    results = index.query(
        top_k=top_k,
        vector=embedding,
        include_metadata=True,   # needed to read each match's stored text
        include_values=False,    # the raw vectors are not used when printing
    )
    for result in results['matches']:
        print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [16]:
# Paraphrase of the sample query encoded earlier, now run as a semantic search.
query = 'which city has the highest population in the world?'
run_query(query)

0.7: Which is the most beautiful city in world?
0.66: What is the greatest, most beautiful city in the world?
0.59: Which is the highest peak of the world?
0.58: Which country has the highest per capita income?
0.56: Why does China and several other Asian countries have such a high population?
0.55: What is the recent population of India?
0.55: Which is best city in India?
0.54: What are the largest slums in the world?
0.53: Which is the coldest country in the world?
0.52: What is the highest mountain in Europe?


In [17]:
# A second query on a different topic.
query = 'how do i make chocolate cake?'
run_query(query)

0.77: How do I make a cake from scratch?
0.61: What is a cake mix?
0.57: How do you make candles?
0.55: What is the difference between chocolate and truffles and how are they made?
0.52: How do I make my chocolate last longer (preservation)?
0.51: Where can I found adorable baked cupcakes in Gold Coast?
0.49: How is pumpkin pie made?
0.49: How do you make whipped cream without heavy cream?
0.48: Where can I get great range of flavours for cupcakes at Gold Coast?
0.45: Why is banana bread considered a bread and not a cake?
