In [None]:
import numpy as np
import pandas as pd

# import plotly
# import colorlover as cl
# import plotly.offline as py
# import plotly.graph_objs as go

# plotly.tools.set_credentials_file(username='nholloway', api_key=os.environ['PLOTLY_API_KEY'])  # key redacted: never commit credentials, read them from the environment
# py.init_notebook_mode(connected=True)

import os
import urllib
import zipfile
import nltk
import numpy as np
import tensorflow as tf

In [None]:
# Locations of the pretrained word-vector text files, relative to the
# notebook's working directory. Each path is passed to load_embeddings().
# NOTE(review): the file names suggest 300-dimensional vectors — confirm.
FASTTEXT_PATH = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
GLOVE_PATH = '../input/glove840b300dtxt/glove.840B.300d.txt'
NUMBERBATCH_PATH = '../input/conceptnet-numberbatch-vectors/numberbatch-en-17.06.txt/numberbatch-en-17.06.txt'

First we define our procedure for loading the embeddings and vocabulary into an embedding matrix.

In [None]:
def get_coefs(word, *arr):
    """Pair a token with its coefficients converted to a float32 array.

    Parameters
    ----------
    word : str
        The token (first field of an embedding line).
    *arr
        The remaining fields; each must be convertible to float32.

    Returns
    -------
    tuple
        ``(word, vector)`` where ``vector`` is a 1-D ``float32`` numpy array.
    """
    vector = np.array(arr, dtype=np.float32)
    return (word, vector)

def load_embeddings(path):
    """Read a space-separated word-vector text file into a dict.

    Each data line is expected to look like ``<word> <v1> <v2> ... <vN>``.
    Parsing matches ``get_coefs``: the first field is the key and the
    remaining fields become a ``float32`` numpy array.

    Parameters
    ----------
    path : str
        Path to the embedding text file.

    Returns
    -------
    dict
        Mapping ``word -> 1-D float32 numpy array``.

    Raises
    ------
    ValueError
        If a coefficient field cannot be converted to float32.
    """
    embeddings = {}
    # Explicit UTF-8: without it the read depends on the platform's default
    # encoding and can fail on non-ASCII tokens.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f):
            fields = line.strip().split(' ')
            # Blank lines would otherwise create an '' key with an empty vector.
            if fields == ['']:
                continue
            # word2vec/fastText-style files start with a "<count> <dim>" header.
            # Stored as data it becomes a bogus 1-element vector that numpy
            # would silently broadcast across a whole embedding-matrix row.
            if line_no == 0 and len(fields) == 2 and all(x.isdigit() for x in fields):
                continue
            embeddings[fields[0]] = np.asarray(fields[1:], dtype='float32')
    return embeddings

def build_matrix(word_index, path):
    """Build an embedding matrix for a vocabulary from a vector file.

    Parameters
    ----------
    word_index : dict
        Mapping ``word -> row index``; indices are used directly as row
        numbers of the returned matrix.
    path : str
        Embedding file, loaded with ``load_embeddings``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(word_index) + 1, 300)``. Rows for words that are
        missing from the embedding file are left as zeros.
    """
    vectors = load_embeddings(path)
    n_rows = len(word_index) + 1
    matrix = np.zeros((n_rows, 300))
    for token, row in word_index.items():
        # Out-of-vocabulary tokens keep their all-zero row.
        if token in vectors:
            matrix[row] = vectors[token]
    return matrix

Next we define helpers for comparing embeddings: a cosine-similarity function and a routine that prints the similarity of two words (or phrases) under a given embedding.

In [None]:
# ANSI escape sequences used by similarity_text() to wrap the embedding name:
# '\033[1m' switches bold on, '\033[0m' resets text attributes.
bold_start = '\033[1m'
bold_end = '\033[0m'
    
def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Computed as ``dot(a, b) / (||a|| * ||b||)`` using Euclidean norms.
    There is no guard for zero-length vectors: a zero norm makes the
    denominator zero.

    Parameters
    ----------
    a, b : numpy.ndarray
        Vectors of equal length.

    Returns
    -------
    The similarity score as produced by the numpy division.
    """
    magnitude_a = np.sqrt(np.sum(a**2))
    magnitude_b = np.sqrt(np.sum(b**2))
    return np.dot(a, b) / (magnitude_a * magnitude_b)

def similarity_text(embed_name,embed_dict,word1,word2):
    """Print the cosine similarity of two words/phrases under one embedding.

    Parameters
    ----------
    embed_name : str
        Label shown in the printed line. The names 'bert', 'roberta' and
        'distilbert' always select the encoder code path.
    embed_dict : mapping or encoder
        Either a mapping indexed as ``embed_dict[word]`` that yields a vector,
        or an object exposing ``encode(text, show_progress_bar=False)``.
        Any object with a callable ``encode`` is treated as an encoder, so the
        label no longer has to be one of the three hard-coded names.
    word1, word2 : str
        The two words (or phrases, for encoders) to compare.

    Returns
    -------
    None
        The result is only printed. If a word is missing from a mapping-style
        embedding, a notice is printed instead of raising ``KeyError``, so one
        out-of-vocabulary word does not abort a whole comparison loop.
    """
    uses_encoder = (
        embed_name in ['bert','roberta','distilbert']
        or callable(getattr(embed_dict, 'encode', None))
    )

    if uses_encoder:
        w1 = embed_dict.encode(word1,show_progress_bar=False)
        w2 = embed_dict.encode(word2,show_progress_bar=False)
    else:
        try:
            w1 = embed_dict[word1]
            w2 = embed_dict[word2]
        except KeyError as missing:
            # Static embeddings have a fixed vocabulary; report and move on.
            print(f"Cosine similarity using '{embed_name}' skipped: {missing} is not in the vocabulary")
            return

    print(f"Cosine similarity using {bold_start}'{embed_name}'{bold_end} for pair \t\t\t\t\t ({word1,word2}) = {cosine_similarity(w1,w2)}")
    return

# similarity_text("insurance","policy")

<a id='fasttext'></a>
## FastText 
---

In [None]:
%%time
# Load the fastText vectors into a {word: vector} dict; %%time reports how long the load takes.
fasttext_dict = load_embeddings(FASTTEXT_PATH)

In [None]:
# Print the cosine similarity of "insurance" and "policy" using the fastText vectors.
similarity_text(embed_name="Fasttext Embedding",embed_dict=fasttext_dict,word1="insurance",word2="policy")

<a id='glove'></a>
## GloVe
---

In [None]:
%%time
# Load the GloVe vectors into a {word: vector} dict; %%time reports how long the load takes.
glove_dict = load_embeddings(GLOVE_PATH)

In [None]:
# Print the cosine similarity of "insurance" and "policy" using the GloVe vectors.
similarity_text(embed_name="Glove Embedding",embed_dict=glove_dict,word1="insurance",word2="policy")

Other popular models seem to use GloVe and fastText embeddings, but I have yet to see the use of [ConceptNet Numberbatch](https://github.com/commonsense/conceptnet-numberbatch) embeddings, which according to the README were specifically created for dealing with bias in text.

<a id='numberbatch'></a>
## Conceptnet Numberbatch
---

In [None]:
%%time
# Load the ConceptNet Numberbatch vectors into a {word: vector} dict; %%time reports how long the load takes.
conceptnet_numberbatch_dict = load_embeddings(NUMBERBATCH_PATH)

In [None]:
# Print the cosine similarity of "insurance" and "policy" using the ConceptNet Numberbatch vectors.
similarity_text(embed_name="Conceptnet Numberbatch Embedding",embed_dict=conceptnet_numberbatch_dict,word1="insurance",word2="policy")

<a id='bert'></a>
## BERT Embeddings
---

In this section I'll show how to create custom BERT embeddings from a pretrained BERT model. Unfortunately, the BERT embeddings have 768 dimensions when trained on the small pre-trained model (and 1024 for the larger one), and I wasn't able to train a model to benchmark because there isn't a lot of text pre-processing in this kernel and the vocabulary is really large. If you run into memory issues with the larger BERT embeddings, consider decreasing the vocabulary.

In [None]:
!pip install -q sentence-transformers
!pip install --upgrade numpy

In [None]:
# SentenceTransformer comes from the sentence-transformers package installed in the cell above.
from sentence_transformers import SentenceTransformer
# Sentence encoder used for the "bert" comparisons below.
# NOTE(review): presumably fetches the pretrained weights by name on first use — confirm network access.
sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [None]:
# Print the cosine similarity of each phrase pair using the BERT sentence encoder.
for phrase_a, phrase_b in [
    ("health insurance", "health policy"),
    ("life insurance", "car insurance"),
    ("home loan", "credit loan"),
    ("car insurance", "personal policy"),
    ("health life insurance", "life insurance policy"),
    ("investment insurance plan", "claim insurance policy"),
]:
    similarity_text(embed_name="bert", embed_dict=sbert_model, word1=phrase_a, word2=phrase_b)

## RoBERTa

In [None]:
# Sentence encoder used for the "roberta" comparisons below.
# Relies on SentenceTransformer having been imported in an earlier cell.
roberta_model = SentenceTransformer('stsb-roberta-base-v2')

In [None]:
# Print the cosine similarity of each phrase pair using the RoBERTa sentence encoder.
for phrase_a, phrase_b in [
    ("health insurance", "health policy"),
    ("life insurance", "car insurance"),
    ("home loan", "credit loan"),
    ("car insurance", "personal policy"),
    ("health life insurance", "life insurance policy"),
    ("investment insurance plan", "claim insurance policy"),
]:
    similarity_text(embed_name="roberta", embed_dict=roberta_model, word1=phrase_a, word2=phrase_b)

## msmarco-distilbert-base-v3: a model fine-tuned for use with cosine-similarity scores

In [None]:
# Sentence encoder used for the "distilbert" comparisons below.
# Relies on SentenceTransformer having been imported in an earlier cell.
distilbert_model = SentenceTransformer('msmarco-distilbert-base-v3')

In [None]:
# Print the cosine similarity of each phrase pair using the DistilBERT sentence encoder.
for phrase_a, phrase_b in [
    ("health insurance", "health policy"),
    ("life insurance", "car insurance"),
    ("home loan", "credit loan"),
    ("car insurance", "personal policy"),
    ("health life insurance", "life insurance policy"),
    ("investment insurance plan", "claim insurance policy"),
]:
    similarity_text(embed_name="distilbert", embed_dict=distilbert_model, word1=phrase_a, word2=phrase_b)

## Scope
https://www.sbert.net/docs/pretrained_models.html

# Test

In [None]:
# General-vocabulary word pairs to compare across every embedding.
# Each pair is listed once: the original list repeated the first three pairs,
# which made every model score (and print) them twice.
my_list = [("table","desk"),
("football","baseball"),
('water','fire'),
('computer','calculator'),
('number','math'),
('boy','girl'),
('sad','happy'),
('good','bad'),
('turkey','television'),
('awesome','great'),
('coffee','giraffe'),
('cat','barcelona'),
('school','disaster')]

# (label, embedding) in the order the results are printed for each pair.
embedders = [
    ("Glove", glove_dict),
    ("Fasttext", fasttext_dict),
    ("Conceptnet", conceptnet_numberbatch_dict),
    ("bert", sbert_model),
    ("roberta", roberta_model),
    ("distilbert", distilbert_model),
]

for a, b in my_list:
    for name, embedding in embedders:
        similarity_text(embed_name=name, embed_dict=embedding, word1=a, word2=b)
    # Separator between pairs.
    print("-----------------------------------------------------------------------------------------------------------")

In [None]:
# Domain-flavoured word pairs to compare across every embedding.
my_list = [
    ("health", "policy"),
    ("insurance", "health"),
    ('customer', 'bot'),
    ('bajaj', 'health'),
    ('costly', 'expensive'),
    ('cheap', 'costly'),
    ('king', 'queen'),
]

# (label, embedding) in the order the results are printed for each pair.
embedders = [
    ("Glove", glove_dict),
    ("Fasttext", fasttext_dict),
    ("Conceptnet", conceptnet_numberbatch_dict),
    ("bert", sbert_model),
    ("roberta", roberta_model),
    ("distilbert", distilbert_model),
]

for a, b in my_list:
    for name, embedding in embedders:
        similarity_text(embed_name=name, embed_dict=embedding, word1=a, word2=b)
    # Separator between pairs.
    print("-----------------------------------------------------------------------------------------------------------")