<a href="https://colab.research.google.com/github/rahiakela/audio-processing-research-and-practice/blob/main/01_keyword_extraction_using_faiss.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Introduction to Faiss

**Reference**

[Introduction to Facebook AI Similarity Search (Faiss)](https://www.pinecone.io/learn/faiss-tutorial/)

https://stackoverflow.com/questions/70707551/saving-bert-sentence-embedding

##Setup

In [None]:
!pip install faiss-cpu
!pip install -U sentence-transformers

In [2]:
import requests
from io import StringIO
import pandas as pd
import numpy as np
import pickle

from sentence_transformers import SentenceTransformer

import faiss

##Building Vectors

In [3]:
data_df = pd.read_csv("icd_10_code_and_keywords_v2.csv")
data_df.head()

Unnamed: 0,Code,Keyword
0,A00,Cholera
1,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
2,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
3,A00.9,"Cholera, unspecified"
4,A01,Typhoid and paratyphoid fevers


In [5]:
sentences = data_df["Keyword"]
len(sentences)

96745

In [None]:
sentences.head()

0                                              Cholera
1    Cholera due to Vibrio cholerae 01, biovar chol...
2      Cholera due to Vibrio cholerae 01, biovar eltor
3                                 Cholera, unspecified
4                       Typhoid and paratyphoid fevers
Name: Keyword, dtype: object

In [6]:
# remove duplicates and NaN
sentences = [word for word in list(set(sentences)) if type(word) is str]

In [7]:
sentences[:5]

['Benign neoplasm of left breast',
 'Corros unsp deg mult left fingers (nail), inc thumb, init',
 'Spontaneous rupture of flexor tendons, unspecified shoulder',
 'Erythema nodosum',
 'Laceration without foreign body of left shoulder, sequela']

In [8]:
len(sentences)

95970

##Building Dense Vectors

In [None]:
# initialize sentence transformer model
model = SentenceTransformer("bert-base-nli-mean-tokens")

In [20]:
# create sentence embeddings
sentence_embeddings = model.encode(sentences)
sentence_embeddings.shape

(95970, 768)

In [21]:
# save model
model.save("icd_10_keyword_embeddings.h5")

In [30]:
model.load("icd_10_keyword_embeddings.h5")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)

##Loading embedding

In [38]:
with open('sentence_embeddings.pickle', 'rb') as pkl:
  sentence_embeddings = pickle.load(pkl)

##Euclidean(L2) distance 

In [22]:
# vector dimensionality
d = sentence_embeddings.shape[1]
d

768

In [39]:
# initialize IndexFlatL2 index
index = faiss.IndexFlatL2(768)
print(index.is_trained)

True


In [40]:
# load our embeddings
index.add(sentence_embeddings)
index.ntotal

95970

##Query Sentence

In [41]:
# Then search given a query xq and number of nearest neigbors to return k
k = 4
xq = model.encode(["Contact with and (suspected) exposure to COVID-19"])

In [42]:
%%time

# search
D, I = index.search(xq, k)
print(I)

[[69401 30684 30569 94300]]
CPU times: user 51.5 ms, sys: 0 ns, total: 51.5 ms
Wall time: 56.9 ms


In [43]:
[f'{i}: {sentences[i]}' for i in I[0]]

['69401: Poikiloderma vasculare atrophicans',
 '30684: Laceration w/o foreign body of r rng fngr w damage to nail',
 '30569: Hydroxyapatite deposition disease, unspecified elbow',
 '94300: Infct of amniotic sac and membrns, unsp, second tri, oth']

```log
['69401: Contact with and (suspected) exposure to COVID-19',
 '30684: Encounter for screening for COVID-19',
 '30569: Unvaccinated for COVID-19',
 '94300: Post COVID-19 condition']
```

In [44]:
%%time

xq = model.encode(["Hypertriglyceridemia, sporadic"])

# search
D, I = index.search(xq, k)
print(I)

[[85952   467  1869 16091]]
CPU times: user 208 ms, sys: 988 µs, total: 209 ms
Wall time: 215 ms


In [45]:
[f'{i}: {sentences[i]}' for i in I[0]]

['85952: Aneurysmal bone cyst, unspecified shoulder',
 '467: Corros 20-29% of body surface w 20-29% third degree corros',
 '1869: Degenerative myopia, left eye',
 '16091: Traumatic amputation of nose']

```log
['85952: Hyperglycemia, unspecified',
 '467: Hyperlipidemia, unspecified',
 '1869: Generalized hyperhidrosis',
 '16091: Hypercalcemia']
 ```

In [None]:
%%time

xq = model.encode(["Diagnosis Cough"])

# search
D, I = index.search(xq, k)
print(I)

[[52012 50084 38774 56988]]
CPU times: user 112 ms, sys: 0 ns, total: 112 ms
Wall time: 117 ms


In [None]:
[f'{i}: {sentences[i]}' for i in I[0]]

['52012: Cough',
 '50084: Cough syncope',
 '38774: Subacute cough',
 '56988: Chronic cough']

In [None]:
%%time

xq = model.encode(["Testing done at Silver Pine Medical Group unless otherwise specified."])

# search
D, I = index.search(xq, k)
print(I)

[[61295 13794 69141  8205]]
CPU times: user 160 ms, sys: 999 µs, total: 161 ms
Wall time: 162 ms


In [None]:
[f'{i}: {sentences[i]}' for i in I[0]]

['61295: Encounter for screening for infec/parastc diseases, unsp',
 '13794: Obs & eval of NB for suspected genetic condition ruled out',
 '69141: Abn lev drug/meds/biol subst in specimens from oth org/tiss',
 '8205: Encntr for medical obs for susp diseases and cond ruled out']

##Saving and loading `sentence_embeddings`

In [None]:
import pickle

with open('sentence_embeddings.pickle', 'wb') as pkl:
  pickle.dump(sentence_embeddings, pkl)

In [None]:
with open('sentence_embeddings.pickle', 'rb') as pkl:
  sentence_embeddings2 = pickle.load(pkl)

# load our embeddings
index.add(sentence_embeddings2)
index.ntotal

191940

In [None]:
xq = model.encode(["Contact with and (suspected) exposure to COVID-19"])
# search
D, I = index.search(xq, k)
print(I)

[[ 69401 165371  30684 126654]]


In [None]:
sentences[69401]

'Contact with and (suspected) exposure to COVID-19'

In [None]:
sentences[30684]

'Encounter for screening for COVID-19'

In [None]:
[f'{i}: {sentences[i]}' for i in I[0]]

IndexError: ignored

##Partitioning The Index

In [None]:
# how many cells
n_list = 50
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, n_list)

In [None]:
# train index on data before adding any data to the index
index.is_trained

False

In [None]:
index.train(sentence_embeddings)
# check if index is now trained
index.is_trained

True

In [None]:
index.add(sentence_embeddings)
# number of embeddings indexed
index.ntotal

14504

In [None]:
# Let’s search again using the same indexed sentence embeddings and the same query vector
%%time

# search
D, I = index.search(xq, k)
print(I)

[[ 3853  3412 10113  4266]]
CPU times: user 589 µs, sys: 16 µs, total: 605 µs
Wall time: 833 µs


In [None]:
[f'{i}: {sentences[i]}' for i in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

In [None]:
# We can increase the number of nearby cells to search too with nprobe.
index.nprobe = 10

In [None]:
%%time

# search
D, I = index.search(xq, k)
print(I)

[[ 3853  3412 10113  4266]]
CPU times: user 1.08 ms, sys: 3 µs, total: 1.08 ms
Wall time: 1.09 ms


In [None]:
[f'{i}: {sentences[i]}' for i in I[0]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field',
 '10113: Two groups of people are playing football',
 '4266: A person playing football is running past an official carrying a football']

##Vector Reconstruction

In [None]:
# now, to reconstruct the vectors, we must first create these direct mappings
index.make_direct_map()

In [None]:
index.reconstruct(11122)[:100]

array([ 1.2838421 ,  0.21651636,  0.9712451 ,  0.31156752,  0.28217337,
        0.91760635,  1.5379874 ,  0.18022525, -0.87787133, -0.02692633,
       -1.3939755 , -0.02931822, -0.0910926 ,  0.8252405 ,  1.1967825 ,
        0.46976212, -0.4116232 , -0.44104576,  0.00642029,  0.14851105,
       -0.43181732,  0.53935724,  0.6806572 , -1.2024508 , -0.5175477 ,
       -0.87696105, -0.14817108, -0.24578056, -1.1521579 ,  0.2925653 ,
        0.1566306 ,  0.9666366 ,  0.48437908, -0.14077239,  0.6266316 ,
       -0.28244448, -0.12196147, -1.0980004 ,  0.39501837, -0.4538445 ,
        0.03117619,  0.350108  ,  0.088641  , -0.04181439,  0.49622074,
       -0.02361662,  2.3334348 , -0.49966168, -0.1949345 , -0.37520537,
        0.02379034, -0.2796866 ,  1.5051224 ,  0.75192493, -0.23107927,
       -0.4343509 ,  0.41142032, -0.58381236,  0.6106839 ,  0.5567988 ,
        0.39692488, -0.09096396,  0.01875396,  0.06579515, -0.54263747,
       -0.29689386, -0.2615427 , -0.36894163,  0.02824443, -0.37

##Quantization

In [None]:
# number of centroid IDs in final compressed vectors
m = 8

# number of bits in each centroid
bits = 8

# we keep the same L2 distance flat index
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, n_list, m, bits)

In [None]:
index.is_trained

True

In [None]:
index.train(sentence_embeddings)

In [None]:
index.add(sentence_embeddings)

In [None]:
# now we’re ready to begin searching using our new index
index.nprobe = 10

In [None]:
%%time

D, I = index.search(xq, k)
print(I)

[[ 3853 18357  3412 17916]]
CPU times: user 3.63 ms, sys: 46 µs, total: 3.68 ms
Wall time: 2.66 ms


In [None]:
[f'{i}: {sentences[i]}' for i in [3853, 3412]]

['3853: A group of football players is running in the field',
 '3412: A group of people playing football is running in the field']