In [3]:
pip install -U sentence-transformers

# For Apple Silicon Macs, install from conda-forge instead:
# conda install -c conda-forge sentence-transformers

In [1]:
import scipy
from sentence_transformers import SentenceTransformer
# Load the pretrained sentence encoder, then embed a few sample sentences as a
# smoke test and print the shape of the resulting embedding matrix.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]
sentence_embeddings = model.encode(sentences)

print("Sentence embeddings:", sentence_embeddings.shape, sep="\n")

  0%|          | 0.00/245M [00:00<?, ?B/s]

Sentence embeddings:
(3, 768)


In [2]:
import pandas as pd
import numpy as np

In [27]:
# Load the tab-separated review dataset from the working directory.
data=pd.read_table('./test.tsv')

In [28]:
# Remove rows that are exact duplicates of an earlier row (all columns compared).
data=data.drop_duplicates()

In [29]:
# Keep only the first row for each distinct review text.  De-duplicating by
# subset avoids feeding index *labels* to the positional `.iloc` indexer, which
# selects the wrong rows (or raises IndexError) once earlier filtering has left
# gaps in the index.
data = data.drop_duplicates(subset='review')
# Renumber rows 0..n-1; the previous labels are preserved in an 'index' column.
data = data.reset_index()

In [30]:
# Preview the first rows of the de-duplicated frame.
data.head()

Unnamed: 0.1,index,Unnamed: 0,drugName,condition,review,rating,date,usefulCount
0,0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9.0,"May 20, 2012",27
1,1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8.0,"April 27, 2010",192
2,2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5.0,"December 14, 2009",17
3,3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8.0,"November 3, 2015",10
4,4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9.0,"November 27, 2016",37


In [31]:
# Drop rows containing any missing value, then renumber rows 0..n-1 so that
# row labels coincide with row positions.  The embeddings computed from this
# frame are identified by position, so gaps in the labels would misalign later
# look-ups.  Reassigning (rather than inplace=True) keeps the step explicit.
data = data.dropna(axis=0).reset_index(drop=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   index        5 non-null      int64  
 1   Unnamed: 0   5 non-null      int64  
 2   drugName     5 non-null      object 
 3   condition    5 non-null      object 
 4   review       5 non-null      object 
 5   rating       5 non-null      float64
 6   date         5 non-null      object 
 7   usefulCount  5 non-null      int64  
dtypes: float64(1), int64(3), object(4)
memory usage: 448.0+ bytes


In [32]:
#GPU
# Embed every review and display the elapsed wall-clock time in seconds.
import time
s = time.time()
complaint_embeddings = model.encode(data['review'].tolist())
time.time() - s

0.7194428443908691

In [33]:
# (number of embedded reviews, embedding dimension)
complaint_embeddings.shape

(5, 768)

In [34]:
from sys import getsizeof  # retained in case other cells rely on it
# Size of the embedding matrix in MiB.  `ndarray.nbytes` counts the element
# buffer directly, whereas `sys.getsizeof` leaves the buffer out when the array
# does not own its data (e.g. a view), which would under-report the size.
round(complaint_embeddings.nbytes / 1024 / 1024,2)

0.01

In [35]:
# Dimensions of the search space: number of vectors and length of each vector.
search_space_len=complaint_embeddings.shape[0]
embedding_dims=complaint_embeddings.shape[1]

# Bucket count that would put 16 vectors in each bucket on average.
# NOTE(review): true division yields a float (fractional for small inputs);
# presumably this is only a guide for choosing n_planes — confirm.
n_buckets=search_space_len/16

n_buckets

0.3125

In [36]:
# Number of distinct hash values (buckets) produced by 11 binary plane tests.
2**11

2048

In [37]:
n_planes=11   # hyperplanes per hash table -> 2**n_planes buckets per table
n_repeats=25  # number of independent plane sets, i.e. hash tables

In [39]:
# Draw `n_repeats` independent sets of random hyperplanes, one set per hash
# table.  Each set is an (embedding_dims, n_planes) matrix whose columns are
# the plane normals.  A seeded generator makes the bucket assignment
# reproducible across kernel restarts.
planes_rng = np.random.default_rng(42)
planes_l = [planes_rng.normal(size=(embedding_dims, n_planes)) for _ in range(n_repeats)]
print(len(planes_l))
planes_l[0].shape

25


(768, 11)

In [41]:
def hash_vector(v, planes):
    """Map a vector to an integer bucket id using a set of hyperplanes.

    Each column of ``planes`` is the normal of one hyperplane.  Plane ``i``
    contributes bit ``i`` of the result: the bit is 1 when the dot product of
    ``v`` with that normal is non-negative, otherwise 0.

    Parameters
    ----------
    v : array-like of shape (d,) or (d, 1)
        Vector to hash.
    planes : numpy.ndarray of shape (d, n_planes)
        Hyperplane normals, one per column.

    Returns
    -------
    int
        Value in ``[0, 2**n_planes)``.
    """
    # Project the vector onto every plane normal at once.
    projections = np.dot(np.asarray(v).T, planes)

    # One boolean per plane.  `ravel` (rather than `squeeze`) always gives a
    # 1-D array, so a single-plane configuration is handled as well.
    bits = np.ravel(projections >= 0)

    # Combine the bits with Python integers so the result is a plain int and
    # cannot overflow a fixed-width numpy integer.
    return sum(1 << i for i, bit in enumerate(bits) if bit)

In [42]:
def make_hash_tables(search_space, planes):
    """Bucket every vector of ``search_space`` by its hash under ``planes``.

    Parameters
    ----------
    search_space : iterable of vectors
        Vectors to index; a vector's id is its position in the iteration.
    planes : numpy.ndarray of shape (d, n_planes)
        Hyperplane set passed to ``hash_vector``.

    Returns
    -------
    (dict, dict)
        ``hash_table`` maps each bucket id to the list of vectors in it and
        ``id_table`` maps each bucket id to the list of their ids.  Both
        contain a key for every bucket ``0 .. 2**n_planes - 1``, and their
        lists are index-aligned.
    """
    # One bucket per possible hash value, created up front so that looking up
    # an empty bucket never raises KeyError.
    bucket_count = 2 ** planes.shape[1]
    vectors_by_bucket = {}
    ids_by_bucket = {}
    for bucket in range(bucket_count):
        vectors_by_bucket[bucket] = []
        ids_by_bucket[bucket] = []

    # File each vector and its position under the bucket it hashes to.
    for position, vector in enumerate(search_space):
        bucket = hash_vector(vector, planes)
        vectors_by_bucket[bucket].append(vector)
        ids_by_bucket[bucket].append(position)

    return vectors_by_bucket, ids_by_bucket

In [43]:
# Build one (hash_table, id_table) pair for each set of planes in planes_l.
hash_tables, id_tables = [], []
for i in range(n_repeats):  # one table per plane set
    print('Creating  hash table :', i)
    planes = planes_l[i]
    hash_table, id_table = make_hash_tables(complaint_embeddings, planes)
    hash_tables += [hash_table]
    id_tables += [id_table]

Creating  hash table : 0
Creating  hash table : 1
Creating  hash table : 2
Creating  hash table : 3
Creating  hash table : 4
Creating  hash table : 5
Creating  hash table : 6
Creating  hash table : 7
Creating  hash table : 8
Creating  hash table : 9
Creating  hash table : 10
Creating  hash table : 11
Creating  hash table : 12
Creating  hash table : 13
Creating  hash table : 14
Creating  hash table : 15
Creating  hash table : 16
Creating  hash table : 17
Creating  hash table : 18
Creating  hash table : 19
Creating  hash table : 20
Creating  hash table : 21
Creating  hash table : 22
Creating  hash table : 23
Creating  hash table : 24


In [44]:
def reduce_search_space(v, planes_l, k=3, num_tables=n_repeats):
    """Collect the indexed vectors that share a bucket with ``v``.

    For each of the first ``num_tables`` hash tables, ``v`` is hashed with the
    matching plane set and every vector stored in that bucket becomes a
    candidate.  A vector found in several tables is returned only once.

    Reads the notebook-level ``hash_tables`` and ``id_tables`` lists, which
    must have been built from the same ``planes_l``.

    Parameters
    ----------
    v : array-like
        Query vector.
    planes_l : sequence of numpy.ndarray
        Plane sets, one per hash table.
    k : int, optional
        Not used by this function; kept for interface compatibility.
    num_tables : int, optional
        Number of hash tables to consult.

    Returns
    -------
    numpy.ndarray
        The candidate vectors, in order of first discovery (empty when no
        bucket contains any vector).
    """
    candidate_vectors = []
    seen_ids = set()

    for table in range(num_tables):
        # Bucket of the query under this table's planes.
        bucket = hash_vector(v, planes_l[table])

        # The two lists are index-aligned: the i-th vector has the i-th id.
        bucket_vectors = hash_tables[table][bucket]
        bucket_ids = id_tables[table][bucket]

        for vector, doc_id in zip(bucket_vectors, bucket_ids):
            if doc_id not in seen_ids:
                seen_ids.add(doc_id)
                candidate_vectors.append(vector)

    # Every document appears exactly once in each id table, so the bucket sizes
    # of any one table add up to the size of the indexed corpus.
    total_documents = sum(len(ids) for ids in id_tables[0].values()) if id_tables else 0
    print("Reduced space from %d documents to %d documents"
          % (total_documents, len(candidate_vectors)))

    return np.array(candidate_vectors)

In [24]:
# Free-text symptom description used as the search query.
problem='Chest feels heavy and difficulty in breathing, I keep coughing. Sweating alot'

In [25]:
# Embed the query with the same model that produced the review embeddings.
problem_embedding=model.encode(problem)



In [34]:
# Gather the review vectors that share a bucket with the query in any hash table.
vecs_to_consider=reduce_search_space(problem_embedding,planes_l)

Reduced space from 25971 documents to 2042 documents


In [35]:
from numpy import dot
from numpy.linalg import norm
def cosine_similarity(a,b):
  """Return the cosine of the angle between vectors ``a`` and ``b``.

  Computed as the dot product divided by the product of the two Euclidean
  norms.
  """
  numerator = dot(a, b)
  denominator = norm(a) * norm(b)
  return numerator / denominator

In [36]:
def nearest_k(v,search_embeddings,top_n=3):
  """Return the ``top_n`` rows of ``search_embeddings`` most similar to ``v``.

  Similarity is cosine similarity; rows come back ordered from most to least
  similar.

  Parameters
  ----------
  v : array-like of shape (d,)
      Query vector.
  search_embeddings : array-like of shape (n, d)
      Candidate vectors, one per row.
  top_n : int, optional
      Number of rows to return (fewer if there are fewer candidates).

  Returns
  -------
  numpy.ndarray
      The selected rows; empty when there are no candidates.
  """
  candidates = np.asarray(search_embeddings)
  if candidates.size == 0:
    # Nothing to rank (e.g. the query landed only in empty buckets).
    return candidates[:0]

  query = np.asarray(v)
  # Cosine similarity of the query against every row in one vectorised step.
  similarities = (candidates @ query) / (
      np.linalg.norm(candidates, axis=1) * np.linalg.norm(query))

  # argsort is ascending, so reverse it to put the best match first, and
  # honour `top_n` instead of a fixed cut-off of three.
  ranked = np.argsort(similarities)[::-1]
  return candidates[ranked[:top_n]]
  


In [37]:
%%time
# Brute-force baseline: rank every review embedding against the query.
x=nearest_k(problem_embedding,complaint_embeddings)

CPU times: user 587 ms, sys: 44.3 ms, total: 631 ms
Wall time: 576 ms


In [38]:
%%time
# Same ranking, restricted to the candidates selected by the hash tables.
y=nearest_k(problem_embedding,vecs_to_consider)

CPU times: user 43.9 ms, sys: 2.04 ms, total: 45.9 ms
Wall time: 53.1 ms


In [39]:
# Map each embedding's row position to the embedding itself, so result vectors
# can be traced back to the review they came from.
id_to_vec = dict(enumerate(complaint_embeddings))



In [40]:
#Results using entire search space
# Trace each returned vector back to its row position and print that row's
# review and condition.  Embedding k was computed from the k-th row of `data`,
# so the look-up has to be positional (.iloc) rather than by index label.
for k,v in id_to_vec.items():
  for t in x:
    if (v==t).all():
      print(data['review'].iloc[k],data['condition'].iloc[k])
      print()

"First it helped me breath I was wheezing and coughing terribly. The side effects suck. I get aggressive and irritable. Insomnia is terrible. I can feel swelling in my hands. I can feel my heart beat in my hands when I have fingers interlocked. 

I try to avoid all medications but this did help." Asthma

"Severe head feeling hot. Felt like head was swelling. Severe swelling of the face. Extreme itching in hands and feet. Confusion. Only took one tablet." Bacterial Infection

"Chest pain ,Trembling inside Body ,Dizziness," Seizures



In [41]:
#Results using reduced search space
# Trace each returned vector back to its row position and print that row's
# review and condition.  Embedding k was computed from the k-th row of `data`,
# so the look-up has to be positional (.iloc) rather than by index label.
for k,v in id_to_vec.items():
  for t in y:
    if (v==t).all():
      print(data['review'].iloc[k],data['condition'].iloc[k])
      print()

"First it helped me breath I was wheezing and coughing terribly. The side effects suck. I get aggressive and irritable. Insomnia is terrible. I can feel swelling in my hands. I can feel my heart beat in my hands when I have fingers interlocked. 

I try to avoid all medications but this did help." Asthma

"Severe head feeling hot. Felt like head was swelling. Severe swelling of the face. Extreme itching in hands and feet. Confusion. Only took one tablet." Bacterial Infection

"Chest pain ,Trembling inside Body ,Dizziness," Seizures

