### Jaccard Similarity

In [2]:
def jaccard_similarity(sentence1, sentence2):
    """Word-level Jaccard similarity of two sentences.

    Each sentence is lower-cased and split on whitespace; the resulting
    words are de-duplicated into sets. The sizes of the intersection and
    of the union are printed, and their ratio is returned.

    Args:
        sentence1: First sentence (string).
        sentence2: Second sentence (string).

    Returns:
        ``len(intersection) / len(union)`` as a float, or the integer ``0``
        when both sentences contain no words (empty union).
    """
    words_a = set(sentence1.lower().split())
    words_b = set(sentence2.lower().split())

    # Only the sizes are needed, not the sets themselves.
    shared_count = len(words_a & words_b)
    total_count = len(words_a | words_b)

    print("Intersection: ", shared_count)
    print("Union: ", total_count)

    # An empty union means neither sentence had any words; avoid 0 / 0.
    if total_count == 0:
        return 0
    return shared_count / total_count

# Example usage: two sentences built from almost the same words in a
# different order.
sentence1, sentence2 = (
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy fox",
)
print("Jaccard similarity:", jaccard_similarity(sentence1, sentence2))

Intersection:  8
Union:  9
Jaccard similarity: 0.8888888888888888


### W-Shingling

In [3]:
def generate_shingles(text, w):
    """Return the set of w-word shingles (contiguous word windows) in ``text``.

    The text is split on whitespace without any case folding, and every
    run of ``w`` consecutive words is joined with a single space.

    Args:
        text: Input document (string).
        w: Number of words per shingle; must be at least 1.

    Returns:
        A set of shingle strings. The set is empty when ``text`` has fewer
        than ``w`` words.

    Raises:
        ValueError: If ``w`` is less than 1. Without this check a zero
            length would yield the single empty shingle ``''`` and a
            negative length would yield arbitrary slices, making unrelated
            documents look similar.
    """
    if w < 1:
        raise ValueError(
            "shingle length w must be a positive integer, got {!r}".format(w)
        )

    words = text.split()
    # len(words) - w + 1 window start positions; none if the text is too short.
    return {' '.join(words[i:i + w]) for i in range(len(words) - w + 1)}

def jaccard_similarity(set1, set2):
    """Jaccard similarity of two sets: ``|set1 ∩ set2| / |set1 ∪ set2|``.

    NOTE: this definition reuses the name of the sentence-based
    ``jaccard_similarity`` defined earlier in the notebook and replaces it
    once this cell runs; this version expects sets, not sentences.

    Args:
        set1: First set (must provide ``intersection`` and ``union``).
        set2: Second collection of items.

    Returns:
        The ratio as a float, or the integer ``0`` when the union is empty.
    """
    shared = set1.intersection(set2)
    combined = set1.union(set2)
    # Guard against 0 / 0 when both inputs are empty.
    if len(combined) == 0:
        return 0
    return len(shared) / len(combined)

# Example usage: compare two documents through their word shingles.
document1 = "The quick brown fox jumps over the lazy dog"
document2 = "A quick brown dog jumps over the lazy fox"
w = 3  # Number of words per shingle

# Shingle both documents with the same window length.
shingles1, shingles2 = (
    generate_shingles(doc, w) for doc in (document1, document2)
)

print("Shingles of Document 1:", shingles1)
print("Shingles of Document 2:", shingles2)

similarity = jaccard_similarity(shingles1, shingles2)
print("Jaccard similarity:", similarity)

Shingles of Document 1: {'brown fox jumps', 'jumps over the', 'the lazy dog', 'fox jumps over', 'over the lazy', 'The quick brown', 'quick brown fox'}
Shingles of Document 2: {'quick brown dog', 'brown dog jumps', 'jumps over the', 'the lazy fox', 'A quick brown', 'dog jumps over', 'over the lazy'}
Jaccard similarity: 0.16666666666666666


### Levenshtein

In [8]:
def levenshtein_distance(str1, str2):
    """Edit distance between ``str1`` and ``str2``.

    Counts the minimum number of single-element deletions, insertions and
    substitutions needed to turn ``str1`` into ``str2``, using the
    standard dynamic-programming recurrence evaluated row by row.

    Args:
        str1: Source sequence (indexed and measured with ``len``).
        str2: Target sequence (indexed and measured with ``len``).

    Returns:
        The edit distance as a non-negative integer.
    """
    rows, cols = len(str1), len(str2)

    # Row 0: turning an empty prefix of str1 into str2[:j] takes j insertions.
    previous = list(range(cols + 1))

    for i in range(1, rows + 1):
        # Column 0: turning str1[:i] into an empty string takes i deletions.
        current = [i] + [0] * cols
        for j in range(1, cols + 1):
            if str1[i - 1] == str2[j - 1]:
                # Matching elements cost nothing extra.
                current[j] = previous[j - 1]
            else:
                cheapest = min(
                    previous[j],       # deletion
                    current[j - 1],    # insertion
                    previous[j - 1],   # substitution
                )
                current[j] = 1 + cheapest
        previous = current

    # After the last row, the final cell holds the full-string distance.
    return previous[cols]

# Example usage: the classic "kitten" -> "sitting" pair.
str1, str2 = "kitten", "sitting"
distance = levenshtein_distance(str1, str2)
print(
    "Levenshtein distance between '{}' and '{}': {}".format(
        str1, str2, distance
    )
)


Levenshtein distance between 'kitten' and 'sitting': 3


### TF-IDF Cosine Similarity

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def sentence_similarity(sentence1, sentence2):
    """Cosine similarity between the TF-IDF vectors of two sentences.

    A ``TfidfVectorizer`` with default settings is fitted on just these
    two sentences, so the vocabulary and weights come from this pair only.

    Args:
        sentence1: First sentence (string).
        sentence2: Second sentence (string).

    Returns:
        The single entry of the pairwise cosine-similarity result for the
        two sentence vectors.
    """
    corpus = [sentence1, sentence2]

    # One row per sentence, in the same order as ``corpus``.
    tfidf_rows = TfidfVectorizer().fit_transform(corpus)
    first_vec, second_vec = tfidf_rows[0], tfidf_rows[1]

    # The result is a 1x1 matrix; unwrap its only element.
    pairwise = cosine_similarity(first_vec, second_vec)
    return pairwise[0][0]

# Example usage: score the same sentence pair with TF-IDF cosine similarity.
sentence1, sentence2 = (
    "The quick brown fox jumps over the lazy dog",
    "A quick brown dog jumps over the lazy fox",
)

similarity = sentence_similarity(sentence1, sentence2)
print("TF-IDF Cosine Similarity:", similarity)


TF-IDF Cosine Similarity: 0.959403223600247
