In [None]:
import numpy as np
import pyspark.sql.functions as F
from typing import Callable
from ngram import NGram
from textdistance import damerau_levenshtein, jaro_winkler, sorensen_dice, jaccard, overlap, ratcliff_obershelp

In [None]:
%run ./function_tools.ipynb

In [None]:
# NOTE(review): these decorators already bind `compound_similarity` to a pandas
# UDF, yet the file later builds `compound_similarity_sdf` by passing this same
# name through `function_vectorizer` and `F.pandas_udf` again -- confirm which
# of the two wrappings is the intended one.
@F.pandas_udf("double")
@function_vectorizer
def compound_similarity(s1: str, s2: str) -> float:
  """Compute a compound score measuring the similarity between two strings.

  The score is the arithmetic mean of the following 7 metrics:
    - Damerau-Levenshtein - edit distance that also takes transpositions into account.
    - Jaro-Winkler - similarity based on common letters, adjusted for the higher
        likelihood of the spelling being correct at the beginning of a string.
    - n-gram - similarity based on the counts of matching n-grams (substrings of
        length n). The n-gram length was selected empirically and is set to N=2.
    - Jaccard - like n-grams without taking into account the cardinality (length) of the
        n-grams. Effectively, this gives an n-gram similarity score for N=1.
    - Sorensen-Dice - similar logic to Jaccard but with slight adjustments.
    - Overlap - measures the 'overlap' between two strings based on the number of common
        characters in them.
    - Ratcliff-Obershelp - takes into account the length of the fully matching substrings
        but also the number of matching characters from substrings that do not match completely.

  Arguments:
    s1 {str} -- The first string. None is treated as an empty string.
    s2 {str} -- The second string. None is treated as an empty string.

  Returns:
    float -- The mean of the similarity scores coming from the 7 algorithms. 0 means not similar
      at all and 1 means that the two strings match perfectly. If either of the two strings is
      empty (or None), the similarity is treated as 0.
  """
  # Normalise missing values so that the emptiness check below covers them too.
  s1 = "" if s1 is None else s1
  s2 = "" if s2 is None else s2

  # Documented contract: an empty operand on either side means "no similarity".
  # Checking with `or` (instead of requiring both to be empty) keeps the code in
  # line with that contract and skips the seven metric computations, which are
  # expected to yield 0 for such pairs anyway (not verified here).
  if s1 == "" or s2 == "":
    return 0.

  scores = [
    damerau_levenshtein.normalized_similarity(s1, s2),
    jaro_winkler.normalized_similarity(s1, s2),
    sorensen_dice.normalized_similarity(s1, s2),
    jaccard.normalized_similarity(s1, s2),
    overlap.normalized_similarity(s1, s2),
    ratcliff_obershelp.normalized_similarity(s1, s2),
    NGram.compare(s1, s2, N=2),  # bigram-based similarity
  ]
  return np.mean(scores)

In [None]:
# Spark-facing handle: vectorise `compound_similarity` and register the result
# as a pandas UDF returning a double column.
# NOTE(review): `compound_similarity` is itself decorated with
# `@F.pandas_udf("double")` and `@function_vectorizer` where it is defined, so
# this applies both wrappers a second time -- confirm this is intended.
compound_similarity_sdf = F.pandas_udf(
  f=function_vectorizer(compound_similarity),
  returnType="double",
)