In [108]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import string

In [116]:
## clean the text and convert to lower string and replace all of punctuation
def clean_text(text):
    text = text.strip().lower()
    punctuation = string.punctuation

    for p in punctuation:
        text = text.replace(p, " ")

    return text


## caculate the consine similarity score based on TfidfVectorizer
def string_similarity(string1, string2, method="TF-IDF"):
    if string1 is None or string2 is None:
        return -1

    string1 = clean_text(string1)
    string2 = clean_text(string2)

    if len(string1) == 0 or len(string2) == 0:
        return -1
    
    vectorizer = TfidfVectorizer(analyzer='char')
    if method == "Count":
        vectorizer = CountVectorizer(analyzer='char')
    vectors = vectorizer.fit_transform([string1, string2])
    similarity = cosine_similarity(vectors)
    return similarity[0][1]

In [117]:
string1 = "Lucy Anderson"
string2 = "Lucia Ivenson"

print("string1: {}, string2: {}".format(string1, string2))
print("string1 and string2 matching score: {}".format(string_similarity(string1, string2,)))

string1: Lucy Anderson, string2: Lucia Ivenson
string1 and string2 matching score: 0.6059671415042287


In [118]:
string1 = "Lucy Anderson"
string2 = "  "

print("string1: {}, string2: {}".format(string1, string2))
print("string1 and string2 matching score: {}".format(string_similarity(string1, string2,)))

string1: Lucy Anderson, string2:   
string1 and string2 matching score: -1


In [119]:
def jaccard_similarity(set1, set2):
    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))
    return intersection / union if union != 0 else 0  # To avoid division by zero

def string_jaccard_similarity(string1, string2):
    if string1 is None or string2 is None:
        return -1

    string1 = clean_text(string1)
    string2 = clean_text(string2)

    if len(string1) == 0 or len(string2) == 0:
        return -1

    string1_set = set(string1)
    string2_set = set(string2)
    return jaccard_similarity(string1_set, string2_set)

In [120]:
string1 = "Tom Jeff"
string2 = "Jeff Tom"

similarity_score = string_jaccard_similarity(string1, string2)
print("Jaccard Similarity:", similarity_score)

Jaccard Similarity: 1.0


In [121]:
import textdistance

string1 = "Tom Jeff"
string2 = "Jeff Tom"

jaro_similarity_score = textdistance.jaro(string1, string2)
jaro_winkler_similarity_score = textdistance.jaro_winkler(string1, string2)
jaccard_similarity_score = textdistance.jaccard(string1, string2)
print("Jaro Similarity:", jaro_similarity_score)
print("Jaro-Winkler Similarity:", jaro_winkler_similarity_score)
print("Jaccard Similarity:", jaccard_similarity_score)

Jaro Similarity: 0.3333333333333333
Jaro-Winkler Similarity: 0.3333333333333333
Jaccard Similarity: 1.0


In [122]:
import torch
import torch.nn.functional as F

In [123]:
def text_to_char_tensor(text):
    # Convert text to lowercase and split into characters
    text = clean_text(text)
    chars = list(text)
    # Convert characters to ASCII values
    char_values = [ord(char) for char in chars]
    # Convert ASCII values into a tensor
    return torch.tensor(char_values, dtype=torch.float)

In [124]:
def string_consine_similarity(string1, string2):
    
    if string1 is None or string2 is None:
        return -1

    string1 = clean_text(string1)
    string2 = clean_text(string2)

    if len(string1) == 0 or len(string2) == 0:
        return -1
    # Convert text strings to tensors
    tensor1 = text_to_char_tensor(string1)
    tensor2 = text_to_char_tensor(string2)
    similarity = F.cosine_similarity(tensor1, tensor2, dim=0)
    return similarity.item()

In [125]:
string1 = "Tom Jeff"
string2 = "Jeff Tom"

print("String Consince Similarity:", string_consine_similarity(string1, string2))

String Consince Similarity: 0.9321610927581787


In [126]:
string1 = "Tom Jeff"
string2 = ""

print("String Consince Similarity:", string_consine_similarity(string1, string2))

String Consince Similarity: -1


In [127]:
string1 = "Lucy Anderson"
string2 = "Lucia Ivenson"

print("String Consince Similarity:", string_consine_similarity(string1, string2))

String Consince Similarity: 0.9681525230407715


In [128]:
string1 = "Tom Jeff"
string2 = "Jeff Tom"

string_similarity(string1, string2)

1.0000000000000007