In [6]:
def tokenize(sentence, strip_punctuation=False):
    """Split a sentence into lowercase, whitespace-separated tokens.

    Args:
        sentence: The text to tokenize.
        strip_punctuation: When True, leading/trailing ASCII punctuation is
            removed from every token and tokens that become empty are
            dropped, so "sentence." and "sentence" map to the same term.
            Defaults to False, which keeps punctuation attached to the
            neighbouring word exactly as before.

    Returns:
        A list of token strings in their original order.
    """
    import string  # local import: this cell has no import block of its own

    tokens = sentence.lower().split()
    if not strip_punctuation:
        return tokens
    stripped = (token.strip(string.punctuation) for token in tokens)
    # A token made only of punctuation (e.g. "-") strips down to "" — skip it.
    return [token for token in stripped if token]

def term_frequency(sentence):
    """Count how many times each item occurs in ``sentence``.

    Args:
        sentence: An iterable of hashable items; the demo code in this cell
            passes the token list produced by ``tokenize``.

    Returns:
        A dict mapping each distinct item to its occurrence count, with keys
        in order of first appearance.
    """
    counts = {}
    for token in sentence:
        # Unseen tokens start from 0, then every occurrence adds one.
        counts[token] = counts.get(token, 0) + 1
    return counts

def dot_product(vec1, vec2):
    """Return the dot product of two sparse vectors stored as dicts.

    Args:
        vec1: Mapping of feature -> numeric weight.
        vec2: Mapping of feature -> numeric weight.

    Returns:
        The sum of ``vec1[f] * vec2[f]`` over the features present in both
        mappings; ``0`` when they share no feature.
    """
    # A feature missing from either side contributes nothing, so only shared
    # keys matter. Walk the smaller mapping to do the least work.
    if len(vec1) > len(vec2):
        vec1, vec2 = vec2, vec1
    return sum(weight * vec2[feature] for feature, weight in vec1.items() if feature in vec2)

def _squared_magnitude(vec):
    """Return the sum of the squared weights of a sparse vector."""
    return sum(weight ** 2 for weight in vec.values())

def magnitude(vec):
    """Return the Euclidean norm (length) of a sparse vector.

    Args:
        vec: Mapping of feature -> numeric weight.

    Returns:
        The square root of the sum of squared weights; ``0.0`` for an empty
        mapping.
    """
    return _squared_magnitude(vec) ** 0.5

def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two sparse vectors.

    Args:
        vec1: Mapping of feature -> numeric weight.
        vec2: Mapping of feature -> numeric weight.

    Returns:
        ``dot_product(vec1, vec2)`` divided by the product of the two norms,
        or ``0.0`` when either vector is empty or all-zero (the ratio is
        undefined there, and 0.0 avoids a ZeroDivisionError).
    """
    # One square root over the product of the squared norms, rather than two
    # separate roots multiplied together.
    norm_product = (_squared_magnitude(vec1) * _squared_magnitude(vec2)) ** 0.5
    if norm_product == 0:
        return 0.0
    return dot_product(vec1, vec2) / norm_product

# The two example sentences to compare
sentence1 = "This is a foo bar sentence."
sentence2 = "This sentence is similar to a foo bar sentence."

# Tokenize both sentences (sentence1 first, then sentence2)
tokens1, tokens2 = map(tokenize, (sentence1, sentence2))

print(tokens1, tokens2)

# Build one term-count dict per token list
tf1, tf2 = map(term_frequency, (tokens1, tokens2))

print(tf1, tf2)

# Cosine similarity between the two term-count vectors
cosine_sim = cosine_similarity(tf1, tf2)

print(f"Cosine similarity: {cosine_sim}")

['this', 'is', 'a', 'foo', 'bar', 'sentence.'] ['this', 'sentence', 'is', 'similar', 'to', 'a', 'foo', 'bar', 'sentence.']
{'this': 1, 'is': 1, 'a': 1, 'foo': 1, 'bar': 1, 'sentence.': 1} {'this': 1, 'sentence': 1, 'is': 1, 'similar': 1, 'to': 1, 'a': 1, 'foo': 1, 'bar': 1, 'sentence.': 1}
Cosine similarity: 0.816496580927726


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Define the sentences
# (the same two strings used by the hand-written term-frequency cell)
sentence1 = "This is a foo bar sentence."
sentence2 = "This sentence is similar to a foo bar sentence."

# Create the Document Term Matrix
# fit_transform receives the two sentences as a list, so row 0 of the result
# corresponds to sentence1 and row 1 to sentence2.
# stop_words='english' selects scikit-learn's built-in English stop-word
# handling — presumably this is why common words such as "this"/"is" are
# absent from the features; confirm against the TfidfVectorizer docs.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform([sentence1, sentence2])

# Compute Cosine Similarity
# NOTE: `cosine_similarity` here is the scikit-learn function imported at the
# top of this cell; that import rebinds the name, replacing any same-named
# function defined earlier in the kernel session.
# The 0:1 and 1:2 slices each select a single row while keeping it as a
# one-row matrix — presumably because the function expects 2-D inputs
# (TODO confirm against the sklearn docs).
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

# The result is indexed with [0][0] to pull out the single pairwise score.
print(f"Cosine similarity: {cosine_sim[0][0]}")

  (0, 1)	0.5773502691896258
  (0, 0)	0.5773502691896258
  (0, 2)	0.5773502691896258
  (1, 1)	0.3540997415957358
  (1, 0)	0.3540997415957358
  (1, 2)	0.7081994831914716
  (1, 3)	0.4976748316029239
Cosine similarity: 0.8177583245211001
