In [3]:
from math import sqrt, pow, exp

import numpy as np
import spacy
from sentence_transformers import SentenceTransformer, util
from simple_elmo import ElmoModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Alias that stays bound to scikit-learn's implementation even after the
# notebook defines its own `cosine_similarity(x, y)` further down.
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity

# Load the spaCy 'en_core_web_md' pipeline once at module level; the functions
# below call `nlp(text)` to obtain document vectors and document similarities.
nlp = spacy.load('en_core_web_md')

In [None]:
def jaccard_similarity(x,y):
    """Return the Jaccard similarity |X & Y| / |X | Y| of two documents.

    Only the set of unique words matters: repeating a term such as "HD"
    several times in one text does not change the score as long as the set
    of distinct words stays the same.

    Args:
        x, y: Either a string (split into words on whitespace) or an
            iterable of hashable tokens (used as-is).

    Returns:
        A float in [0, 1]. Two empty documents are treated as identical and
        score 1.0 instead of raising ZeroDivisionError.
    """
    # A bare string must be tokenised first; set("some text") would otherwise
    # compare individual characters rather than words.
    tokens_x = set(x.split()) if isinstance(x, str) else set(x)
    tokens_y = set(y.split()) if isinstance(y, str) else set(y)

    union = tokens_x | tokens_y
    if not union:
        return 1.0
    return len(tokens_x & tokens_y) / len(union)


In [2]:
def euclidean_distance(x,y):
    """Score two texts by the Euclidean distance between their spaCy vectors.

    Both texts are run through the module-level `nlp` pipeline and their
    document vectors are compared component by component. The raw distance
    is then mapped through 1/e**distance, so identical vectors score 1 and
    the score shrinks towards 0 as the vectors move apart.
    """
    vector_x = nlp(x).vector
    vector_y = nlp(y).vector

    # Squared per-component gaps, summed and square-rooted: the L2 distance.
    squared_gaps = [pow(left - right, 2) for left, right in zip(vector_x, vector_y)]
    distance = sqrt(sum(squared_gaps))

    # Exponential decay squashes the unbounded distance into a bounded score.
    return 1/exp(distance)

In [4]:
def levenshtein_distance(str1, str2):
    '''Return the Levenshtein (edit) distance between two strings.

    Builds a (len(str1)+1) x (len(str2)+1) matrix in which cell [row][col]
    holds the minimum number of single-character removals, additions and
    substitutions needed to turn str1[:row] into str2[:col].

    Empty strings are supported: the distance to an empty string is the
    length of the other string.

    Returns the bottom-right cell of the matrix (a NumPy integer).
    '''
    row_length = len(str1)+1
    col_length = len(str2)+1
    distance = np.zeros((row_length,col_length),dtype = int)

    # Base cases: turning a prefix into the empty string (or the reverse)
    # costs one edit per character. Filled independently of each other so
    # they are correct even when one of the strings is empty.
    distance[:, 0] = np.arange(row_length)
    distance[0, :] = np.arange(col_length)

    # Fill the rest of the matrix from the three neighbouring sub-problems.
    for row in range(1, row_length):
        for col in range(1, col_length):
            # Matching characters at this position cost nothing to keep.
            cost = 0 if str1[row-1] == str2[col-1] else 1

            distance[row][col] = min(distance[row-1][col] + 1,       # Cost of removal
                                     distance[row][col-1] + 1,       # Cost of addition
                                     distance[row-1][col-1] + cost)  # Cost of substitution

    # Index from the end instead of reusing loop variables, which are
    # undefined when either loop body never runs.
    return distance[-1][-1]


In [5]:
def cosine_similarity(x,y):
    """Return the cosine similarity of two texts, rounded to 3 decimals.

    Each text is passed through the module-level `nlp` pipeline and the
    resulting document vectors are compared. Cosine similarity measures the
    angle between the vectors: vectors pointing in the same direction score
    1, and the score does not depend on the vectors' magnitudes.

    Args:
        x, y: Texts accepted by `nlp(...)`.

    Returns:
        A Python float rounded to 3 decimals, or 0.0 when either vector has
        zero length (the angle is undefined in that case).
    """
    embeddings_x = nlp(x).vector
    embeddings_y = nlp(y).vector

    def vector_norm(embeddings):
        # Euclidean length of one vector. Left unrounded so that only the
        # final result is rounded and no precision is lost in the division.
        return sqrt(sum(a*a for a in embeddings))

    numerator = sum(a*b for a,b in zip(embeddings_x,embeddings_y))
    denominator = vector_norm(embeddings_x)*vector_norm(embeddings_y)

    # A zero-length vector would otherwise cause a division by zero.
    if denominator == 0:
        return 0.0
    return round(float(numerator)/denominator,3)


In [10]:
def bag_of_words_embeddings_similarity(list_of_sentences):
    """Pairwise cosine similarity of sentences using bag-of-words counts.

    Each sentence is turned into a count vector with `CountVectorizer`
    (one dimension per vocabulary word, valued by how often it occurs).
    Such vectors are large and sparse and carry no relational information
    between words.

    Args:
        list_of_sentences: Iterable of raw text strings.

    Returns:
        The square similarity matrix produced by scikit-learn, where entry
        [i][j] compares sentence i with sentence j.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(list_of_sentences)
    # Use the scikit-learn alias: the bare name `cosine_similarity` is
    # rebound by this notebook to a two-argument text function. The sparse
    # matrix is passed directly, so no dense copy is needed.
    return sklearn_cosine_similarity(X)


In [14]:
def tfidf_embeddings_similarity(list_of_sentences):
    """Pairwise cosine similarity of sentences using TF-IDF vectors.

    Instead of raw counts, `TfidfVectorizer` weights each word by how
    frequent it is across the whole corpus, so words that occur everywhere
    contribute little. The vectors are still high-dimensional and do not
    capture semantic relationships between words.

    Args:
        list_of_sentences: Iterable of raw text strings.

    Returns:
        The square similarity matrix produced by scikit-learn, where entry
        [i][j] compares sentence i with sentence j.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(list_of_sentences)
    # Use the scikit-learn alias: the bare name `cosine_similarity` is
    # rebound by this notebook to a two-argument text function. The sparse
    # matrix is passed directly, so no dense copy is needed.
    return sklearn_cosine_similarity(X)


In [17]:
def word2vec_cosine_similarity(list_of_sentences):
    """Build an all-pairs similarity table from spaCy document vectors.

    Every sentence is parsed once with the module-level `nlp` pipeline;
    each parsed document is then compared with every document (itself
    included) through spaCy's `Doc.similarity`.

    Returns a list of rows, where row i holds the similarity of sentence i
    to each sentence in input order.
    """
    parsed = [nlp(text) for text in list_of_sentences]
    return [[left.similarity(right) for right in parsed] for left in parsed]

In [18]:
def elmo_embeddings(sentence, model_path="/content/209.zip"):
    """Return ELMo vectors for the given input, averaged over layers.

    ELMo (Embeddings from Language Models) derives embeddings from the
    internal states of a bidirectional language model.

    Args:
        sentence: Passed straight to `ElmoModel.get_elmo_vectors`.
            NOTE(review): that method presumably expects a list of
            sentences rather than one bare string - confirm against the
            installed simple_elmo version.
        model_path: Location of the ELMo model archive to load. Defaults to
            the path previously hard-coded here, so existing calls behave
            the same.

    Returns:
        Whatever `get_elmo_vectors(..., layers="average")` returns.

    Note:
        The model is loaded from disk on every call.
    """
    model = ElmoModel()
    model.load(model_path)
    return model.get_elmo_vectors(sentence, layers="average")


In [19]:
def sbert(sentences, model_name='stsb-roberta-large'):
    """Pairwise cosine similarity of sentences using Sentence-BERT embeddings.

    Sentence-BERT (SBERT) is a modified BERT network that produces sentence
    embeddings which can be compared directly with cosine similarity.

    Args:
        sentences: List of raw text strings.
        model_name: SentenceTransformer model to load. Defaults to the model
            previously hard-coded here, so existing calls behave the same.

    Returns:
        A list of rows of Python floats, where entry [i][j] is the cosine
        similarity between sentence i and sentence j. An empty input yields
        an empty list without loading the model.

    Note:
        The model is instantiated on every call.
    """
    if len(sentences) == 0:
        return []

    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    # One call computes the whole similarity matrix, replacing one Python-level
    # call per pair. Values may differ from the per-pair loop in the last
    # float32 digits.
    return util.pytorch_cos_sim(embeddings, embeddings).tolist()