In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from scipy.spatial import distance as ssd
import numpy as np

#### Data dictionary

The voice dataset has many different fields. This notebook introduces a semantic search of those fields.

Let's first read the data dictionary into a pandas dataframe.

In [None]:
# Read the data dictionary CSV into a DataFrame and preview the first rows.
# NOTE(review): the doubled '/' in the path is kept as-is; it looks like a typo — confirm.
dictionary_csv = 'bridge2ai-Voice/bridge2ai-voice-corpus-1//b2ai-voice-corpus-1-dictionary.csv'
rcdict = pd.read_csv(dictionary_csv)
rcdict.head()

In [None]:
# Collect the field labels as a plain list, in row order, so that list positions
# match row positions in rcdict.
# NOTE(review): assumes every row has a non-missing 'Field Label' — TODO confirm.
field_labels = rcdict['Field Label']
sentences = field_labels.tolist()
print(len(sentences))
sentences[:10]

#### Turning sentences into numbers

We will use the MiniLM model (`all-MiniLM-L6-v2`) from HuggingFace to generate sentence embeddings.

In [None]:
# Mean pooling: average the token embeddings, counting only tokens the attention mask keeps
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over dim 1, weighted by the attention mask.

    Args:
        model_output: Indexable whose first element holds the token embeddings
            (assumed shape (batch, seq_len, hidden) — TODO confirm against the model).
        attention_mask: Mask with one entry per token (assumed (batch, seq_len));
            positions holding 0 contribute nothing to the average.

    Returns:
        Tensor of masked averages with dim 1 reduced away.
    """
    hidden_states = model_output[0]
    # Stretch the mask across the embedding axis so it can weight every feature
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(1)
    # Clamp guards against division by zero when a row's mask is all zeros
    token_counts = weights.sum(1).clamp(min=1e-9)
    return masked_total / token_counts

The model has been downloaded from HuggingFace. If you are running this on your own setup, replace:

`'models/sentence-transformers/all-MiniLM-L6-v2' --> 'sentence-transformers/all-MiniLM-L6-v2'`

wherever it appears in the code below.

In [None]:
# Loaded (tokenizer, model) pairs keyed by model name, so repeated calls reuse them
_ENCODER_CACHE = {}


def _load_encoder(model_name):
    """Return the (tokenizer, model) pair for `model_name`, loading it on first use only."""
    if model_name not in _ENCODER_CACHE:
        _ENCODER_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModel.from_pretrained(model_name),
        )
    return _ENCODER_CACHE[model_name]


def embed_sentences(text_list, model_name='models/sentence-transformers/all-MiniLM-L6-v2'):
    """Embed sentences as L2-normalized, mean-pooled transformer vectors.

    Args:
        text_list: Sentences to embed, passed to the tokenizer as a single batch.
        model_name: Path or identifier handed to `from_pretrained` for both the
            tokenizer and the model. Defaults to the local copy used by this notebook.

    Returns:
        Tensor with one unit-length (L2 norm along dim 1) row per input sentence.
    """
    # Reuse the already-loaded tokenizer/model instead of reloading them on every call
    tokenizer, model = _load_encoder(model_name)

    # Tokenize sentences; padding/truncation let them form one rectangular batch
    encoded_input = tokenizer(text_list, padding=True, truncation=True, return_tensors='pt')

    # Compute token embeddings without tracking gradients (inference only)
    with torch.no_grad():
        model_output = model(**encoded_input)

    # Pool token embeddings into one vector per sentence, ignoring masked positions
    sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize each row to unit L2 length
    return F.normalize(sentence_embeddings, p=2, dim=1)

Let's turn all the sentences into embeddings: vectors that represent aspects of the semantic meaning of each sentence.

In [None]:
# Embed every field label in one call; row i of the result corresponds to sentences[i]
embeddings = embed_sentences(text_list=sentences)

In [None]:
# Sanity check on the embedding matrix dimensions (one row per sentence)
np.asarray(embeddings).shape

This is where we get to define the search query. Feel free to replace the string with your own.

In [None]:
# Free-text query; edit this string to search for something else
search_string = 'terms related to smoking and respiration'
# Wrap the query in a list so it is embedded as a batch of one
search_embedding = embed_sentences([search_string])

In [None]:
# Compute cosine similarity scores for the search string to all other sentences.
# cdist evaluates the whole (1 x n_sentences) cosine-distance matrix in a single
# vectorized call instead of one Python-level ssd.cosine call per sentence.
query_vector = np.asarray(search_embedding)[:1]
cosine_distances = ssd.cdist(query_vector, np.asarray(embeddings), metric='cosine')[0]
# Similarity = 1 - distance; keep `sims` a plain list of floats as before
sims = (1 - cosine_distances).tolist()

In [None]:
# Rank sentences from most to least similar to the query.
# NOTE: this cell rebinds `sims` to the *sorted* scores, so re-run the cell that
# computes `sims` before re-running this one; otherwise the ranking is misaligned.
ascending_order = np.argsort(sims)
sorted_index = np.flip(ascending_order)
sentences_sorted = np.asarray(sentences)[sorted_index]
sims = np.asarray(sims)[sorted_index]

In [None]:
# Plot the ranked similarity scores; the labelled axes make the "elbow" used to
# choose a cutoff readable without referring back to the code.
fig, ax = plt.subplots()
ax.plot(sims)
ax.set(
    title="Cosine similarity",
    xlabel="Sentence rank (most similar first)",
    ylabel="Cosine similarity to search string",
);

Select a cutoff value from the figure at the point where the curve bends (the "elbow").

In [None]:
# Similarity threshold read off the plot; sentences scoring above it count as matches
cutoff = 0.3
above_cutoff = sims > cutoff
sentences_sorted[above_cutoff].tolist()

Reorder the dataframe according to the sorted index, keeping only the rows above the cutoff, to retrieve the variable names.

In [None]:
# sorted_index holds row *positions* (it comes from np.argsort), so select with
# position-based .iloc; label-based .loc only matches while rcdict keeps its
# default 0..n-1 index.
rcdict.iloc[sorted_index[sims > cutoff], :]