In [19]:
from datasets import load_dataset
import json

In [5]:
dataset = load_dataset('quora', split='train[100000:150000]')
dataset

Dataset({
    features: ['questions', 'is_duplicate'],
    num_rows: 50000
})

In [7]:
# View first 2 records
dataset[:2]

{'questions': [{'id': [165932, 165933],
   'text': ['What should I ask my friend to get from UK to India?',
    'What is the process of getting a surgical residency in UK after completing MBBS from India?']},
  {'id': [123111, 39307],
   'text': ['How can I learn hacking for free?',
    'How can I learn to hack seriously?']}],
 'is_duplicate': [False, True]}

In [11]:
text = []

for ques in dataset['questions']:
    text.extend(ques['text'])

In [14]:
text = list(set(text))

In [15]:
text

['How do I compare the similarity of a string to an item within a list in Python?',
 '',
 'How is sensation different from perception?',
 'Why do body tissues other than the brain have low T2 values for MRI?',
 'What could be a synonym of "necessary evil"?',
 'How can I hack the others Facebook account?',
 'What causes temperature to change daily during each season?',
 'How did you feel about the ending of Rogue One?',
 'What is the best craft beer in the USA?',
 'What is the difference between sex and porn?',
 'Will Russia and U.S. go to war?',
 'How should I concentrate on my study?',
 'Do I have to worry about AI as a computer engineer?',
 'How do I deal with someone close to me blocking me on WhatsApp and Facebook?',
 'Why did JFK invite Nazi pilot Hanna Reitsch to the White House?',
 'How to learn MATLAB?',
 'I am an MBA graduate. I am looking for specialization courses in marketing. Can you suggest any such courses abroad?',
 "What effects will Andrew Cuomo's visit to Cuba in 201

In [16]:
len(text)

88720

In [17]:
from sentence_transformers import SentenceTransformer
import torch

# device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'mps'

In [18]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device = device)
model

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [21]:
pinecone_creds = json.load(open('PineCone.json'))

In [28]:
import pinecone
from pinecone import ServerlessSpec


pc = pinecone.Pinecone(api_key = pinecone_creds['PINECONE_API_KEY'])

cloud = 'aws'
region = 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [32]:
import time
index_name = 'semantic-search'

try:
    
    if index_name not in pc.list_indexes():
        pc.create_index(
            name=index_name,
            dimension=model.get_sentence_embedding_dimension(),
            metric='cosine',
            spec=spec
        )
except Exception as e:
    print(f'Index {index_name} already exists')
index = pc.Index(index_name)
time.sleep(1)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [34]:
from tqdm.auto import tqdm

batch_size = 128

for i in tqdm(range(0, len(text), batch_size)):

    i_end = min(i+batch_size, len(text))
    ids = [str(x) for x in range(i, i_end)]
    metadata = [{'text':question} for question in text[i:i_end]]
    xc = model.encode(text[i:i_end])
    records = zip(ids, xc, metadata)
    index.upsert(vectors=records)

index.describe_index_stats()

  0%|          | 0/694 [00:00<?, ?it/s]

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 96384}},
 'total_vector_count': 96384}

In [39]:
query = 'Which city has the largest population in the world?'

query_encodings = model.encode(query).tolist()

In [41]:
index.query(vector=query_encodings, top_k=5, include_metadata=True)

{'matches': [{'id': '52232',
              'metadata': {'text': 'What is the most populated city in the '
                                   'world?'},
              'score': 0.874452,
              'values': []},
             {'id': '15396',
              'metadata': {'text': 'Which is the most populated city in the '
                                   'world.?'},
              'score': 0.873866737,
              'values': []},
             {'id': '11783',
              'metadata': {'text': 'Which is biggest city in the world?'},
              'score': 0.857589185,
              'values': []},
             {'id': '49790',
              'metadata': {'text': 'What are the most populated cities in the '
                                   'world?'},
              'score': 0.842684507,
              'values': []},
             {'id': '56406',
              'metadata': {'text': 'What is the biggest city?'},
              'score': 0.799513638,
              'values': []}],
 'namespace': '',


In [43]:
query = 'Which is the top in demand job?'

query_encodings = model.encode(query).tolist()
index.query(vector=query_encodings, top_k=5, include_metadata=True)

{'matches': [{'id': '9307',
              'metadata': {'text': 'Which is the best job in the market?'},
              'score': 0.791630566,
              'values': []},
             {'id': '46299',
              'metadata': {'text': 'Which is Highest paid job in India?'},
              'score': 0.739718139,
              'values': []},
             {'id': '37499',
              'metadata': {'text': 'Which job has the highest salary?'},
              'score': 0.73333174,
              'values': []},
             {'id': '51465',
              'metadata': {'text': 'Which are the highest paid jobs in India?'},
              'score': 0.719765246,
              'values': []},
             {'id': '83606',
              'metadata': {'text': 'What job has the highest salary?'},
              'score': 0.716074824,
              'values': []}],
 'namespace': '',
 'usage': {'read_units': 6}}