# Semantic Search


### Import the Needed Packages

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import sys

In [3]:
sys.path.append('../../../../DLAIUtils')

In [4]:
#!pip install python-dotenv
#!pip install datasets==2.15.0

In [5]:
#!pip install datasets==2.15.0

In [6]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils
import os
import time
import torch
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [7]:
from tqdm.auto import tqdm

### Load the Dataset

In [8]:
dataset = load_dataset('quora', split='train[240000:290000]')

In [9]:
dataset[5:10]

{'questions': [{'id': [351735, 91369],
   'text': ['What is the purpose of yawning?',
    'Why do mammals yawn and stretch?']},
  {'id': [351736, 339786],
   'text': ['Can anyone help me solve this math riddle?',
    'Mathematics and Physics: Can anyone help me solve this?']},
  {'id': [351737, 351738],
   'text': ['What SAT/ACT scores are average for the University of Michigan?',
    'What is considered a low SAT/ACT score to get into Stanford?']},
  {'id': [351739, 351740],
   'text': ['What is the business can I do now that Modi is trying to change India to a cashless economy?',
    'What is the business I can do now that Modi is trying to change India to a cashless economy?']},
  {'id': [351741, 199625],
   'text': ["Do Christians really still believe in Noah's Ark, Santa Claus, and Jonah living inside the whale?",
    "Why do rational people believe in Noah's Ark?"]}],
 'is_duplicate': [False, False, False, True, False]}

In [20]:
questions = []
for record in dataset['questions']:
    questions.extend(record['text'])
question = list(set(questions))

# Clean text: remove problematic Unicode characters
def clean_text(text):
    # Replace problematic characters with ASCII equivalents
    text = text.replace('–', '-')  # en dash
    text = text.replace('—', '-')  # em dash
    text = text.replace(''', "'")  # left single quotation mark
    text = text.replace(''', "'")  # right single quotation mark
    text = text.replace('"', '"')  # left double quotation mark
    text = text.replace('"', '"')  # right double quotation mark
    # Remove any remaining non-ASCII characters
    text = text.encode('ascii', errors='ignore').decode('ascii')
    return text

question = [clean_text(q) for q in question]

print('\n'.join(question[:10]))
print('-' * 50)
print(f'Number of questions: {len(question)}')

Which one Angula 2 or Angular Js required to work with ionic 2 Apps?
How do I prevent iMovie for iPad from cropping the top and bottom off my video that was originally taken in portrait orientation?
What in your opinion is the best thing Jackie Peyton did in Showtime's nurse Jackie, and the worst?
How do I get a girlfriend in India?
What are some tips on finding your passion for work or a career?
Is there any legit trial codes for Xbox live gold w/o a generator?
How do I get into the ISB?
What are some funny video ideas?
How many dimensions are possible?
Which programming language is used for creating a Viber-like app?
--------------------------------------------------
Number of questions: 88919


### Check cuda and Setup the model

**Note**: "Checking cuda" refers to checking if you have access to GPUs (faster compute). In this course, we are using CPUs. So, you might notice some code cells taking a little longer to run.

We are using *all-MiniLM-L6-v2* sentence-transformers model that maps sentences to a 384 dimensional dense vector space.

In [11]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device != 'cuda':
    print('Sorry no cuda.')
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

Sorry no cuda.


In [12]:
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [13]:
utils = Utils()

### Setup Pinecone

In [14]:
# Load API keys from .env file
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
ANTHROPIC_API_KEY = os.getenv('ANTHROPIC_API_KEY')

# Verify Pinecone API key is loaded
if not PINECONE_API_KEY or PINECONE_API_KEY == 'your_pinecone_api_key_here':
    raise ValueError("Please set PINECONE_API_KEY in .env file")

print("✓ API keys loaded from .env file")

✓ API keys loaded from .env file


In [17]:
pinecone = Pinecone(api_key=PINECONE_API_KEY)
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)
pinecone.create_index(name=INDEX_NAME, 
    dimension=model.get_sentence_embedding_dimension(), 
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='us-east-1'))

index = pinecone.Index(INDEX_NAME)
print(index)

dl-ai-86hbriq-qif5lhm6p-ygourxdueztik7oaqa
<pinecone.db_data.index.Index object at 0x16cf376a0>
<pinecone.db_data.index.Index object at 0x16cf376a0>


### Create Embeddings and Upsert to Pinecone

In [21]:
batch_size=200
vector_limit=10000

questions = question[:vector_limit]

for i in tqdm(range(0, len(questions), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(questions))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [{'text': text} for text in questions[i:i_end]]
    # create embeddings
    xc = model.encode(questions[i:i_end])
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

  0%|          | 0/50 [00:00<?, ?it/s]

In [None]:
index.describe_index_stats()

### Run Your Query

In [22]:
# small helper function so we can repeat queries later
def run_query(query):
  embedding = model.encode(query).tolist()
  results = index.query(top_k=10, vector=embedding, include_metadata=True, include_values=False)
  for result in results['matches']:
    print(f"{round(result['score'], 2)}: {result['metadata']['text']}")

In [23]:
run_query('which city has the highest population in the world?')

0.59: Which are the top 10 largest cities of India by area?
0.57: Why does America have the most unsafe cities in the world?
0.55: Which is the highest mountain in the world?
0.55: Which country is the largest democracy in the world?
0.55: Which is best city in India?
0.54: What are the 20 most richest countries in the world?
0.53: Which is the coldest country in the world?
0.5: What's the most religious city in California?
0.48: What are the most peaceful country in the world without Islam?
0.48: Which country has most beautiful girls?


In [24]:
query = 'how do i make chocolate cake?'
run_query(query)

0.81: How can I make a delicious cake?
0.53: What should you do if your dog eats chocolate?
0.49: What is a red velvet cake?
0.49: How do you make whipped cream without heavy cream?
0.49: How can I make a banana pudding without bananas?
0.45: How do I make chili?
0.44: How do you make risotto?
0.42: Where can I buy very incredible and most amazing cupcakes in Gold Coast?
0.42: How do you make black food coloring?
0.42: How do you make Tim Horton's iced cappuccino recipe?


In [25]:
query = 'What is the capital of France?'
run_query(query)

0.58: What are the popular websites in France?
0.5: How do I immigrate to France?
0.49: Can I get a French working visa if I'm already in France?
0.48: Why does France have a permanent seat in the UN Security Council?
0.47: How much will it take to travel from south Africa to Paris?
0.45: Is French hard to learn?
0.44: Should I learn the French language?
0.43: Is the proper spelling of this word capiche, capisce, capishe, or capise?
0.43: What does it feel like to go through the educational system in France?
0.42: Where r all the teenage French speaking people on Quora?


In [26]:
query = 'how can I search for information effectively?'
run_query(query)

0.53: How do you search on Quora?
0.47: How can I learn advanced seo?
0.46: What are APIs for providing relevant articles by keyword?
0.45: Why do people ask question on Quora that can be easily and definitively answered by Googling?
0.45: What is the search algorithm used by the Google search engine? What is its complexity?
0.43: How can I get someone's search and browsing history through IP address?
0.43: Why are there so many people using Quora to answer questions that can easily be found with a simple Google search?
0.42: What's the single best source of information for medium to long term US stock investments?
0.42: How can I learn seo online?
0.41: Why do people write questions on Quora that could be answered with a quick web search?
