<a href="https://colab.research.google.com/github/rahiakela/machine-learning-research-and-practice/blob/main/ai-powered-search/13-semantic-search/02_autocomplete.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Natural Language Autocomplete

In [None]:
# Install the third-party packages this notebook imports: sentence_transformers and nmslib.
# NOTE(review): versions are not pinned, so a fresh run may pick up different releases.
!pip install sentence_transformers
!pip install nmslib

In [None]:
# NOTE(review): redundant — nmslib is already installed by the preceding install cell.
!pip install nmslib

In [None]:
# Download the outdoors.py helper module that the import cell star-imports.
!wget https://github.com/treygrainger/ai-powered-search/raw/main/docker/data-science/notebooks/densevectors/outdoors.py

In [5]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import util as STutil

import spacy
from spacy.matcher import Matcher

import nmslib

from outdoors import *
from plotnine import *

import pandas as pd
import pickle
import json
import tqdm
from IPython.display import display,HTML
# Let pandas display up to 1000 rows before truncating a frame's output.
pd.set_option('display.max_rows', 1000)

## Load Dataset

In [10]:
# Remove any existing outdoors/ directory before the dataset is downloaded again.
!rm -rf outdoors

In [None]:
# Fetch the outdoors dataset.
# Make sure the target directory exists (-p: no error if it already does).
!mkdir -p outdoors

# Clone the dataset repository, concatenate the split archive parts into a single
# outdoors.tgz, then extract it under outdoors/data/outdoors/.
!git clone https://github.com/ai-powered-search/outdoors.git
!cd outdoors && cat outdoors.tgz.part_* > outdoors.tgz
!cd outdoors && mkdir -p 'data/outdoors/' && tar -xvf outdoors.tgz -C 'data/outdoors/'

In [19]:
# Transform the extracted posts.csv file into a usable dataframe.
# cleanDataset is not defined in this notebook; presumably it comes from
# `from outdoors import *` — confirm against outdoors.py.
outdoors_dataframe = cleanDataset('outdoors/data/outdoors/posts.csv')

In [20]:
# Number of rows in the cleaned dataframe.
print(len(outdoors_dataframe))

19585


## Build Vocabulary

In [24]:
# Shared spaCy pipeline; get_concepts uses its vocab for the Matcher and nlp.pipe to parse the text.
nlp = spacy.load('en_core_web_sm')

def normalize(span):
    """Normalize a noun or verb phrase.

    Returns the lemma of every token in `span`, lowercased and joined with
    single spaces.
    """
    lowered_lemmas = (token.lemma_.lower() for token in span)
    return ' '.join(lowered_lemmas)

def yield_tuple(df, column, total=100):
    """Yield ``(text, index_label)`` tuples suitable for ``nlp.pipe(..., as_tuples=True)``.

    Args:
        df: DataFrame to read from.
        column: Name of the column holding the text values.
        total: Maximum number of rows to yield, counted by row position.
            ``None`` yields every row.

    Yields:
        ``(row[column], idx)`` where ``idx`` is the row's index label, passed
        along as the context value.
    """
    for position, (idx, row) in enumerate(df.iterrows()):
        # Count rows by position, not by index label: a filtered or re-indexed
        # frame has labels that do not start at 0 or are not contiguous.
        if total is not None and position >= total:
            # Stop as soon as the limit is reached instead of scanning the rest.
            break
        yield (row[column], idx)

def get_concepts(df, total=None, load_from_cache=True):
    """Build (or load from cache) the vocabulary of normalized noun/verb phrases.

    Args:
        df: DataFrame whose "body" column holds the text to mine.
        total: Maximum number of rows to process. ``None`` (or 0) means
            ``len(df)``. Ignored when the result is served from the cache.
        load_from_cache: When True, return the previously pickled results if
            the cache files exist; if they are missing, fall through to a
            fresh extraction instead of raising.

    Returns:
        A ``(concepts, labels)`` tuple of dicts. ``concepts`` maps each
        normalized phrase to its occurrence count, ordered from most to least
        frequent. ``labels`` maps each normalized phrase to the first original
        text span that was normalized to it.

    Side effects:
        After a fresh extraction, both dicts are pickled to the cache paths
        below, overwriting any previous cache.
    """
    concepts_path = 'outdoors/data/outdoors/outdoors_concepts.pickle'
    labels_path = 'outdoors/data/outdoors/outdoors_labels.pickle'

    if load_from_cache:
        try:
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # cache files that come from a trusted source.
            with open(concepts_path, 'rb') as fd:
                concepts = pickle.load(fd)
            with open(labels_path, 'rb') as fd:
                labels = pickle.load(fd)
            return concepts, labels
        except FileNotFoundError:
            # No cache yet (e.g. first run): build it below rather than crash.
            print('Concept cache not found; extracting concepts from the dataframe instead.')

    #Setting load_from_cache to False (or a missing cache) will bring you here
    print('Extracting concepts. This could take a while. Take a break and stretch :)')

    #You can limit the number of rows processed by passing in total (an integer)
    if not total:
        total = len(df)

    #Use the spacy matcher to chunk patterns into concept labels
    #We don't need a full taxonomy graph extraction, just a vocabulary with term frequencies
    matcher = Matcher(nlp.vocab)
    nountags = ['NN','NNP','NNS','NOUN'] #Nouns
    verbtags = ['VB','VBD','VBG','VBN','VBP','VBZ','VERB'] #Verbs
    matcher.add("noun_phrases", [[{"TAG":{"IN": nountags}, "IS_ALPHA": True,"OP":"+"}]])
    matcher.add("verb_phrases", [[{"TAG":{"IN": verbtags}, "IS_ALPHA": True,"OP":"+", "LEMMA":{"NOT_IN":["be"]}}]])

    concepts = {} #Normalized noun/verb phrase ("concept") -> term frequency
    labels = {}   #Normalized concept -> first original text that was normalized to it

    # n_threads is intentionally not passed: spaCy v2 ignored it as deprecated
    # and v3 no longer accepts it.
    docs = nlp.pipe(yield_tuple(df, "body", total=total), batch_size=40, as_tuples=True)
    for doc, idx in tqdm.tqdm(docs, total=total):
        for matchid, start, end in matcher(doc):
            span = doc[start:end]
            phrase = normalize(span)
            if phrase not in concepts:
                concepts[phrase] = 0
                labels[phrase] = span.text
            concepts[phrase] += 1

    #Order the concepts by term frequency, most frequent first
    sorted_concepts = dict(sorted(concepts.items(), key=lambda item: item[1], reverse=True))

    with open(concepts_path, 'wb') as fd:
        pickle.dump(sorted_concepts, fd)
    with open(labels_path, 'wb') as fd:
        pickle.dump(labels, fd)

    return sorted_concepts, labels

What are the concepts with the highest frequency?

In [28]:
# Examine the vocabulary: load it from the cache and keep only the concepts
# whose term frequency is greater than 5.
concepts, labels = get_concepts(outdoors_dataframe, load_from_cache=True)
top_cons = {concept: frequency for concept, frequency in concepts.items() if frequency > 5}

print(f"Total number of labels: {len(labels)}")
print(f"Total number of concepts: {len(concepts)}")
print(f"Concepts with greater than 5 term frequency: {len(top_cons)}")

Total number of labels: 124366
Total number of concepts: 124366
Concepts with greater than 5 term frequency: 12375


In [None]:
# Print (rather than echo the string repr, which shows escaped "\n" and defeats
# indent=2) a bounded preview of the filtered vocabulary. Only the first 20
# entries are shown so the cell does not dump thousands of lines; get_concepts
# stores concepts most-frequent-first, so these should be the top concepts.
print(json.dumps(dict(list(top_cons.items())[:20]), indent=2))

## Loading model

In [None]:
# Load the sentence-embedding model by name and print its module layout.
stsb = SentenceTransformer("roberta-base-nli-stsb-mean-tokens")
print(stsb)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 128, 'do_lower_case': True}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
)


## Encoding phrases

In [None]:
# Four sample phrases: two about the weather and two about vehicles.
phrases = ["it's raining hard", "it is wet outside", "cars drive fast", "motorcycles are loud"]

# Encode all phrases in one call; convert_to_tensor=True requests a tensor result.
embeddings = stsb.encode(phrases, convert_to_tensor=True)

print(f"Number of embeddings: {embeddings.shape[0]}")
print(f"Dimensions per embedding: {embeddings.shape[1]}")
print(f"The embedding feature values of 'it\'s raining hard': \n{embeddings[0]}")

## Calculate similarity

In [None]:
# Cosine similarity of every embedding against every other embedding
# (the same tensor is passed as both arguments), giving a square matrix.
similarities = STutil.pytorch_cos_sim(embeddings, embeddings)
print(f"The shape of the resulting similarities: {similarities.shape}")

The shape of the resulting similarities: torch.Size([4, 4])


In [None]:
# Walk the part of the similarity matrix above the diagonal (a < b) so each
# unordered pair of phrases is recorded once and self-comparisons are skipped.
a_phrases, b_phrases, scores = [], [], []

for a in range(len(similarities) - 1):
    for b in range(a + 1, len(similarities)):
        a_phrases.append(phrases[a])
        b_phrases.append(phrases[b])
        scores.append(float(similarities[a, b]))

# Tabulate the pairs and show them from most to least similar.
df = pd.DataFrame({"phrase a": a_phrases, "phrase b": b_phrases, "score": scores})
df.sort_values(by="score", ascending=False, ignore_index=True)

Unnamed: 0,phrase a,phrase b,score
0,it's raining hard,it is wet outside,0.66906
1,cars drive fast,motorcycles are loud,0.590783
2,it's raining hard,cars drive fast,0.281166
3,it's raining hard,motorcycles are loud,0.2808
4,it is wet outside,motorcycles are loud,0.204867
5,it is wet outside,cars drive fast,0.138172
