In [1]:
import numpy as np

In [2]:
h1 = np.array([0.24, 0.2, 0.16, 0.12, 0.08, 0.04, 0.12, 0.04])
h2 = np.array([0.22, 0.23, 0.16, 0.13, 0.11, 0.08, 0.05, 0.02])

In [4]:
#KL Distance

def kl_distance(P, Q):
    sum_p = np.where(P == 0, 1e-10, P)
    sum_q = np.where(Q == 0, 1e-10, Q)

    return np.sum(sum_p * np.log(sum_p / sum_q))

def bhattacharya_distance(P, Q):
    return -np.log(np.sum(np.sqrt(P * Q)))
    
kld = kl_distance(h1, h2)
bd = bhattacharya_distance(h1, h2)

print(f'KL distance: {kld}, Bhattacharya distance: {bd}')

KL distance: 0.06290516707464022, Bhattacharya distance: 0.014830056106628986


In [5]:
file1 = 'doc1.txt'

with open(file1, 'w') as file:
    file.write('MATLAB is a program for solving engineering and mathematical problems. The basic MATLAB objects are vectors and matrices, so you must be familiar with these before making extensive use of this program.')

file2 = 'doc2.txt'

with open(file2, 'w') as file:
    file.write('MATLAB works with essentially one kind of object, a rectangular numerical matrix. Here is some basic information on using MATLAB matrix commands.')
    

In [7]:
doc1 = open('doc1.txt', 'r').read()
doc2 = open('doc2.txt', 'r').read()

import math

# Step 1: Tokenize the documents and convert to lower case
def tokenize(doc):
    return doc.lower().split()

# Step 2: Create a vocabulary (unique words)
def build_vocabulary(doc1_tokens, doc2_tokens):
    return list(set(doc1_tokens) | set(doc2_tokens))

# Step 3: Vectorize the documents
def vectorize(doc_tokens, vocabulary):
    vector = [0] * len(vocabulary)
    for word in doc_tokens:
        if word in vocabulary:
            vector[vocabulary.index(word)] += 1
    return vector

# Step 4: Compute the dot product
def dot_product(vector1, vector2):
    return sum([v1 * v2 for v1, v2 in zip(vector1, vector2)])

# Step 5: Compute the magnitude (Euclidean norm)
def magnitude(vector):
    return math.sqrt(sum([v**2 for v in vector]))

# Step 6: Compute Cosine Similarity
def cosine_similarity(vector1, vector2):
    dot_prod = dot_product(vector1, vector2)
    magnitude1 = magnitude(vector1)
    magnitude2 = magnitude(vector2)
    return dot_prod / (magnitude1 * magnitude2)

# Process the documents
doc1_tokens = tokenize(doc1)
doc2_tokens = tokenize(doc2)

# Build the vocabulary from both documents
vocabulary = build_vocabulary(doc1_tokens, doc2_tokens)

# Vectorize both documents
vector1 = vectorize(doc1_tokens, vocabulary)
vector2 = vectorize(doc2_tokens, vocabulary)

# Calculate cosine similarity
cosine_sim = cosine_similarity(vector1, vector2)

print(f"Cosine Similarity: {cosine_sim}")


Cosine Similarity: 0.3061862178478973
