
## For demonstrating the unstructured semantic similarity search using different indexing libraries such as FAISS-CPU, NMSLIB and ANNOY

#For loading and reading the unstructured data

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("/content/train.csv")

In [3]:
data.head()

Unnamed: 0,questions,group
0,How did serfdom develop in and then leave Russ...,DESCRIPTION
1,What films featured the character Popeye Doyle ?,ENTITY
2,How can I find a list of celebrities ' real na...,DESCRIPTION
3,What fowl grabs the spotlight after the Chines...,ENTITY
4,What is the full form of .com ?,ABBREVIATION


In [4]:
data["questions"]

0       How did serfdom develop in and then leave Russ...
1        What films featured the character Popeye Doyle ?
2       How can I find a list of celebrities ' real na...
3       What fowl grabs the spotlight after the Chines...
4                         What is the full form of .com ?
                              ...                        
5449             What type of currency is used in China ?
5450                      What is the temperature today ?
5451                What is the temperature for cooking ?
5452                 What currency is used in Australia ?
5453         what is the sales of boost in february 2016?
Name: questions, Length: 5454, dtype: object

In [5]:
data["questions"].to_list()

['How did serfdom develop in and then leave Russia ?',
 'What films featured the character Popeye Doyle ?',
 "How can I find a list of celebrities ' real names ?",
 'What fowl grabs the spotlight after the Chinese Year of the Monkey ?',
 'What is the full form of .com ?',
 'What contemptible scoundrel stole the cork from my lunch ?',
 "What team did baseball 's St. Louis Browns become ?",
 'What is the oldest profession ?',
 'What are liver enzymes ?',
 'Name the scar-faced bounty hunter of The Old West .',
 'When was Ozzy Osbourne born ?',
 'Why do heavier objects travel downhill faster ?',
 'Who was The Pride of the Yankees ?',
 'Who killed Gandhi ?',
 'What is considered the costliest disaster the insurance industry has ever faced ?',
 'What sprawling U.S. state boasts the most airports ?',
 'What did the only repealed amendment to the U.S. Constitution deal with ?',
 'How many Jews were executed in concentration camps during WWII ?',
 "What is `` Nine Inch Nails '' ?",
 'What is an

# Vector embedding of textual sentences are creating using sentence transformers library


In [6]:
!pip install sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
sentence_embeddings = model.encode(data["questions"].to_list())
sentence_embeddings

array([[ 0.06284279,  0.00273532, -0.03178175, ..., -0.03608458,
         0.0421262 , -0.00992043],
       [-0.11667443, -0.0652365 , -0.08709509, ...,  0.02314842,
         0.09870717,  0.00510823],
       [-0.05369086, -0.10398319, -0.07077931, ..., -0.00346679,
        -0.0031372 ,  0.06796701],
       ...,
       [ 0.01460203, -0.01387092, -0.11486384, ...,  0.06358419,
        -0.07308903, -0.05935076],
       [ 0.03359708,  0.05150928, -0.10736816, ..., -0.10486734,
         0.09732234,  0.01799935],
       [-0.0965664 ,  0.02972623, -0.05166465, ..., -0.16755626,
         0.00207879,  0.0560682 ]], dtype=float32)

# If required we can store these embeddings in .npy format

In [10]:
sentence_embeddings.shape

(5454, 384)

In [11]:
import numpy as np
data_encoding = np.array(sentence_embeddings)
np.save("embeddings.npy",data_encoding)

## Indexing and performing semantic search using faiss-cpu

In [12]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [13]:
import faiss
vector_dimension = sentence_embeddings.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
faiss.normalize_L2(sentence_embeddings)
index.add(sentence_embeddings)

Searching


In [14]:
search_text = 'I like to play cricket'
search_vector = model.encode(search_text)
_vector = np.array([search_vector])
faiss.normalize_L2(_vector)

In [15]:
k = 5
distances, ann = index.search(_vector, k=k)
results_faiss = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
results_faiss

Unnamed: 0,distances,ann
0,0.884469,2455
1,1.102721,2046
2,1.130592,4654
3,1.144442,641
4,1.209917,2815


In [18]:
merge_faiss = pd.merge(results_faiss,data,left_on='ann',right_index=True)
merge_faiss

Unnamed: 0,distances,ann,questions,group
0,0.884469,2455,What Asian country once thrilled to the sport ...,LOCATION
1,1.102721,2046,Who invented the game bowling ?,HUMAN
2,1.130592,4654,How many bails are there in a cricket wicket ?,NUMERIC
3,1.144442,641,What sport do you shag flies in ?,ENTITY
4,1.209917,2815,How many seats does the Batmobile sport ?,NUMERIC


##Indexing and performing semantic search using NMSLIB

In [19]:
!pip install nmslib

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import nmslib
vectors = sentence_embeddings.astype('float32')
index = nmslib.init(method='hnsw',space='cosinesimil')
index.addDataPointBatch(vectors)
index.createIndex({'post':2})

In [21]:
search_text = 'I like to play cricket'
search_vector = model.encode(search_text)
ann,distances = index.knnQuery(search_vector,5)
results_hnsw = pd.DataFrame({'distances': distances, 'ann': ann})
results_hnsw

Unnamed: 0,distances,ann
0,0.442234,2455
1,0.551361,2046
2,0.565296,4654
3,0.572221,641
4,0.604958,2815


In [22]:
merge_hnsw = pd.merge(results_hnsw,data,left_on='ann',right_index=True)
merge_hnsw

Unnamed: 0,distances,ann,questions,group
0,0.442234,2455,What Asian country once thrilled to the sport ...,LOCATION
1,0.551361,2046,Who invented the game bowling ?,HUMAN
2,0.565296,4654,How many bails are there in a cricket wicket ?,NUMERIC
3,0.572221,641,What sport do you shag flies in ?,ENTITY
4,0.604958,2815,How many seats does the Batmobile sport ?,NUMERIC


## Indexing and performing semantic search using Annoy

In [23]:
!pip install annoy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [24]:
from annoy import AnnoyIndex
euclideanIndex = AnnoyIndex(sentence_embeddings.shape[1], 'euclidean')
angularIndex = AnnoyIndex(sentence_embeddings.shape[1],'angular')

In [25]:
for index,embed in enumerate(sentence_embeddings):
  euclideanIndex.add_item(index,embed)
  angularIndex.add_item(index,embed)
euclideanIndex.build(1)
angularIndex.build(1)
euclideanIndex.save('euclideanIndex.ann')
angularIndex.save('angularIndex.ann')

True

In [26]:
search_text = 'I like to play cricket'
search_vector = model.encode(search_text)
vectors = euclideanIndex.get_nns_by_vector(search_vector, 5, search_k=-1, include_distances=True)
results_euclidean_annoy = pd.DataFrame({'distances': vectors[1], 'ann': vectors[0]})
results_euclidean_annoy

Unnamed: 0,distances,ann
0,1.132006,3103
1,1.177047,4458
2,1.238077,2767
3,1.239318,3421
4,1.249836,2099


In [27]:
merge_annoy_euclidean = pd.merge(results_euclidean_annoy,data,left_on='ann',right_index=True)
merge_annoy_euclidean

Unnamed: 0,distances,ann,questions,group
0,1.132006,3103,In which sport is there a `` scrum '' ?,ENTITY
1,1.177047,4458,How many innings constitute an official baseba...,NUMERIC
2,1.238077,2767,How many warmup pitches does a reliever get co...,NUMERIC
3,1.239318,3421,How can I get my product licensed by the NBA ?,DESCRIPTION
4,1.249836,2099,What 's the International Lawn Tennis Challeng...,ENTITY


In [28]:
vectors = angularIndex.get_nns_by_vector(search_vector, 5, search_k=-1, include_distances=True)
results_angular_annoy = pd.DataFrame({'distances': vectors[1], 'ann': vectors[0]})
results_angular_annoy

Unnamed: 0,distances,ann
0,0.940462,2455
1,1.151145,1477
2,1.17127,1044
3,1.183545,1828
4,1.210067,1795


In [29]:
merge_annoy_angular = pd.merge(results_angular_annoy,data,left_on='ann',right_index=True)
merge_annoy_angular

Unnamed: 0,distances,ann,questions,group
0,0.940462,2455,What Asian country once thrilled to the sport ...,LOCATION
1,1.151145,1477,What sport is played on the largest field ?,ENTITY
2,1.17127,1044,What net game sees its women 's world amateur ...,ENTITY
3,1.183545,1828,What Asian city boasts the world 's biggest bo...,LOCATION
4,1.210067,1795,Name a South African diamond producer ?,HUMAN
