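Based on the DeepLearning.AI course at https://learn.deeplearning.ai/courses/building-applications-vector-databases: semantic search over Quora questions using Sentence Transformers embeddings and a Pinecone index.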

In [2]:
# Load IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Record the installed versions of the conda packages whose names match "sentence" (sentence-transformers, sentencepiece).
!conda list sentence

# packages in environment at C:\Users\Asus\.conda\envs\llms:
#
# Name                    Version                   Build  Channel
sentence-transformers     2.7.0              pyhd8ed1ab_0    conda-forge
sentencepiece             0.1.96          py310h476a331_1    conda-forge


In [4]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from pinecone import Pinecone, ServerlessSpec
from tqdm.auto import tqdm
from DLAIUtils import Utils
import DLAIUtils

import os
import time
import torch
import json

In [5]:
# Pick the device used for embedding: the GPU when CUDA is usable,
# otherwise fall back to the CPU and say so.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
    print('Sorry no cuda.')

### Load dataset

In [6]:
# Load a 50,000-row slice (rows 240000 up to, but not including, 290000) of the 'quora' training split.
dataset = load_dataset('quora', split='train[240000:290000]')

In [7]:
# Sanity-check the loaded slice: (rows, columns).
dataset.shape

(50000, 2)

In [10]:
# Peek at a few rows: each has a 'questions' entry with paired 'id' and 'text' lists, plus an 'is_duplicate' flag.
dataset[5:10]

{'questions': [{'id': [351735, 91369],
   'text': ['What is the purpose of yawning?',
    'Why do mammals yawn and stretch?']},
  {'id': [351736, 339786],
   'text': ['Can anyone help me solve this math riddle?',
    'Mathematics and Physics: Can anyone help me solve this?']},
  {'id': [351737, 351738],
   'text': ['What SAT/ACT scores are average for the University of Michigan?',
    'What is considered a low SAT/ACT score to get into Stanford?']},
  {'id': [351739, 351740],
   'text': ['What is the business can I do now that Modi is trying to change India to a cashless economy?',
    'What is the business I can do now that Modi is trying to change India to a cashless economy?']},
  {'id': [351741, 199625],
   'text': ["Do Christians really still believe in Noah's Ark, Santa Claus, and Jonah living inside the whale?",
    "Why do rational people believe in Noah's Ark?"]}],
 'is_duplicate': [False, False, False, True, False]}

In [11]:
# Flatten the question pairs into a single list of question strings,
# keeping dataset order (both texts of a row stay adjacent).
questions = [
    text
    for pair in dataset['questions']
    for text in pair['text']
]

In [12]:
# Spot-check the first few flattened question strings.
questions[:5]

['What is the truth of life?',
 "What's the evil truth of life?",
 'Which is the best smartphone under 20K in India?',
 'Which is the best smartphone with in 20k in India?',
 'Steps taken by Canadian government to improve literacy rate?']

In [13]:
# Total number of flattened questions (two texts per dataset row).
len(questions)

100000

### Load model

In [14]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model on the device selected earlier.
model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

In [15]:
# Encode one sample query to see what the model produces: a single 1-D embedding vector.
query = 'which city is the most populated in the world?'
xq = model.encode(query)
xq.shape

(384,)

In [19]:
# Embedding width reported by the model; the Pinecone index is created with this same dimension.
model.get_sentence_embedding_dimension()

384

### Setup Pinecone

In [16]:
# Course helper from DLAIUtils; used below to obtain the Pinecone API key and to build the index name.
utils = Utils()

In [17]:
# Fetch the API key through the helper (it is never hardcoded in the notebook) and build the Pinecone client.
PINECONE_API_KEY = utils.get_pinecone_api_key()
pinecone = Pinecone(api_key=PINECONE_API_KEY)

In [18]:
# Build the index name for this notebook via the course helper.
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Start from a clean slate: if an index with this name already exists, delete it.
if any(existing.name == INDEX_NAME for existing in pinecone.list_indexes()):
    pinecone.delete_index(INDEX_NAME)
print(INDEX_NAME)

dl-ai-j532omwwt3blbkfjv9hy1uwtzo8q3errx4vh


In [19]:
# Create a serverless index whose dimension matches the embedding model
# and which ranks matches by cosine similarity.
pinecone.create_index(
    name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region="us-east-1"),
)

# Handle used for all later upserts and queries against that index.
index = pinecone.Index(INDEX_NAME)
print(index)

<pinecone.data.index.Index object at 0x0000023D69ECEAA0>


### Embeddings

In [20]:
# Upload settings: how many vectors go into each upsert call, and the cap
# on how many questions are indexed in total.
batch_size = 200
vector_limit = 10_000

# Keep only the first `vector_limit` questions; re-running this cell is
# harmless because slicing an already-truncated list changes nothing.
questions = questions[:vector_limit]

In [21]:
# Embed the questions batch by batch and upsert each batch into the Pinecone index.
for i in tqdm(range(0, len(questions), batch_size)):
    # the last batch may be shorter, so clamp its end to the list length
    i_end = min(i + batch_size, len(questions))
    batch = questions[i:i_end]
    # string IDs are simply the questions' positions in the list
    ids = list(map(str, range(i, i_end)))
    # store the raw text next to each vector so query results can display it
    metadatas = [{'text': question} for question in batch]
    # one embedding per question in the batch
    xc = model.encode(batch)
    # pair up (id, embedding, metadata) records and send them to Pinecone
    records = zip(ids, xc, metadatas)
    index.upsert(vectors=records)

100%|███████████████████████████████████████████████████████████████████████████| 50/50 [00:54<00:00,  1.09s/it]


In [22]:
# Check how many vectors the index reports after the upload.
# NOTE(review): the recorded output shows 10200 vectors, yet the upsert loop only writes IDs "0".."9999";
# presumably left over from an earlier run with different settings — confirm on a fresh Restart & Run All.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10200}},
 'total_vector_count': 10200}

### Query

In [23]:
# small helper function so we can repeat queries later
def run_query(query, top_k=10):
    """Embed `query`, search the Pinecone index, and print the matches.

    Relies on the notebook-level `model` (used to encode the query) and
    `index` (the Pinecone index handle) defined in earlier cells.

    Args:
        query: Text to search for.
        top_k: Number of matches to request from the index. Defaults to 10,
            the value that was previously hard-coded.

    Returns:
        None. One line per match is printed as "<score>: <text>", with the
        score rounded to two decimals and the text taken from the match's
        'text' metadata field.
    """
    # the query embedding is converted to a plain list before being sent
    embedding = model.encode(query).tolist()
    results = index.query(
        top_k=top_k,
        vector=embedding,
        include_metadata=True,   # needed to print the stored question text
        include_values=False,    # the raw vectors are not used here
    )
    for match in results['matches']:
        print(f"{round(match['score'], 2)}: {match['metadata']['text']}")

In [24]:
# Example search: print the stored questions most similar to this query, with their similarity scores.
query = 'what is zionism?'
run_query(query)

0.57: What is the conflict between Israel and Palestine?
0.53: Is Judaism polytheistic?
0.49: When did the conflict between Israel and Palestine start?
0.49: Was early Judaism polytheist?
0.47: What individuals and events in history are a source of pride for Israel?
0.46: What are some common misconceptions about Israel?
0.46: What is the definition of "Nation"?
0.46: What is demonitization?
0.45: What is the definition of a nation?
0.43: What are the contributions of Islamism to the world?
