In [41]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine
import gensim.downloader as api
import spacy

# Load SpaCy model
nlp = spacy.load('en_core_web_sm')  # Medium-sized model with word vectors

# Sample texts
texts = [
    "I love programming in Python",
    "Python is a great programming language"
]

# Function to compute TF-IDF and cosine similarity
def tfidf_cosine_similarity(texts):
    # Vectorize the texts using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    return cosine_sim

# Function to compute Word2Vec embeddings and cosine similarity
def word2vec_cosine_similarity(texts):
    # Preprocess texts
    processed_texts = [simple_preprocess(text) for text in texts]
    
    # Train a Word2Vec model
    model = Word2Vec(processed_texts, vector_size=100, window=5, min_count=1, workers=4)
    
    # Function to get the average vector for a text
    def get_average_vector(text):
        words = simple_preprocess(text)
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if len(word_vectors) == 0:
            return np.zeros(model.vector_size)
        return np.mean(word_vectors, axis=0)
    
    # Compute average vectors for all texts
    text_vectors = np.array([get_average_vector(text) for text in texts])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(text_vectors)
    return cosine_sim

# Function to compute SpaCy word embeddings and cosine similarity
def spacy_cosine_similarity(texts):
    # Compute SpaCy embeddings for the texts
    text_vectors = np.array([nlp(text).vector for text in texts])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(text_vectors)
    return cosine_sim

# Function to compute GloVe embeddings and cosine similarity
def glove_cosine_similarity(texts):
    # Load pretrained GloVe model
    glove_model = api.load("glove-wiki-gigaword-50")
    
    # Function to get the embedding for a text
    def get_embedding(text, model):
        words = text.lower().split()
        word_vectors = [model[word] for word in words if word in model]
        if not word_vectors:
            return np.zeros(model.vector_size)
        return np.mean(word_vectors, axis=0)
    
    # Compute embeddings for the texts
    embeddings = np.array([get_embedding(text, glove_model) for text in texts])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(embeddings)
    return cosine_sim

# Compute similarities
tfidf_sim = tfidf_cosine_similarity(texts)
word2vec_sim = word2vec_cosine_similarity(texts)
spacy_sim = spacy_cosine_similarity(texts)
glove_sim = glove_cosine_similarity(texts)

# Print results
print("TF-IDF Cosine Similarity:")
print(tfidf_sim[0][1])

print("\nWord2Vec Cosine Similarity:")
print(word2vec_sim[0][1])

print("\nSpaCy Cosine Similarity:")
print(spacy_sim[0][1])

print("\nGloVe Cosine Similarity:")
print(glove_sim[0][1])

# Compare effectiveness
def compare_similarity_measures(*similarity_matrices):
    # Flatten the matrices for comparison
    flattened_matrices = [matrix.flatten() for matrix in similarity_matrices]
    
    # Compute correlation between all pairs of measures
    for i, matrix1 in enumerate(flattened_matrices):
        for j, matrix2 in enumerate(flattened_matrices):
            if i < j:
                correlation = np.corrcoef(matrix1, matrix2)[0, 1]
                print(f"Correlation between measure {i+1} and measure {j+1}: {correlation:.2f}")

# Compare all similarity measures
compare_similarity_measures(tfidf_sim, word2vec_sim, spacy_sim, glove_sim)

TF-IDF Cosine Similarity:
0.2912194185636897

Word2Vec Cosine Similarity:
0.5193556

SpaCy Cosine Similarity:
0.2715796

GloVe Cosine Similarity:
0.9222071
Correlation between measure 1 and measure 2: 1.00
Correlation between measure 1 and measure 3: 1.00
Correlation between measure 1 and measure 4: 1.00
Correlation between measure 2 and measure 3: 1.00
Correlation between measure 2 and measure 4: 1.00
Correlation between measure 3 and measure 4: 1.00


In [28]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from scipy.spatial.distance import cosine

# Sample texts
texts = [
    "I love programming in Python",
    "Python is a great programming language"
]

# Function to compute TF-IDF and cosine similarity
def tfidf_cosine_similarity(texts):
    # Vectorize the texts using TF-IDF
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts)
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(tfidf_matrix)
    return cosine_sim

# Function to compute Word2Vec embeddings and" cosine similarity
def word2vec_cosine_similarity(texts):
    # Preprocess texts
    processed_texts = [simple_preprocess(text) for text in texts]
    
    # Train a Word2Vec model
    model = Word2Vec(processed_texts, vector_size=100, window=5, min_count=1, workers=4)
    
    # Function to get the average vector for a text
    def get_average_vector(text):
        words = simple_preprocess(text)
        word_vectors = [model.wv[word] for word in words if word in model.wv]
        if len(word_vectors) == 0:
            return np.zeros(model.vector_size)
        return np.mean(word_vectors, axis=0)
    
    # Compute average vectors for all texts
    text_vectors = np.array([get_average_vector(text) for text in texts])
    
    # Compute cosine similarity
    cosine_sim = cosine_similarity(text_vectors)
    return cosine_sim

# Compute similarities
tfidf_sim = tfidf_cosine_similarity(texts)
word2vec_sim = word2vec_cosine_similarity(texts)
print(f"TF-IDF Cosine Similarity: {tfidf_sim[0][1]}")
print(f"Word2Vec Cosine Similarity: {word2vec_sim[0][1]}")


TF-IDF Cosine Similarity: 0.2912194185636897
Word2Vec Cosine Similarity: 0.5193555951118469


In [24]:
import numpy as np
import gensim.downloader as api
from sklearn.metrics.pairwise import cosine_similarity
# Load pretrained GloVe model
glove_model = api.load("glove-wiki-gigaword-50")
def get_embedding(text, model):
    words = text.lower().split()
    word_vectors = [model[word] for word in words if word in model]
    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)
# Compute embeddings
embedding1 = get_embedding(texts[0], glove_model)
embedding2 = get_embedding(texts[1], glove_model)
# Compute cosine similarity
cosine_sim = cosine_similarity([embedding1], [embedding2])
print(f"Cosine Similarity (Word Embeddings - GloVe): {cosine_sim[0][0]:.4f}")

Cosine Similarity (Word Embeddings - GloVe): 0.9222


In [38]:
# Compare effectiveness
def compare_similarity_measures(tfidf_sim, word2vec_sim):
    # Flatten the matrices for comparison
    tfidf_values = tfidf_sim.flatten()
    word2vec_values = word2vec_sim.flatten()
    
    # Compute correlation between the two measures
    correlation = np.corrcoef(tfidf_values, word2vec_values)[0, 1]
    print(f"Correlation between TF-IDF and Word2Vec cosine similarity: {correlation:.2f}")

compare_similarity_measures(tfidf_sim, word2vec_sim)

Correlation between TF-IDF and Word2Vec cosine similarity: 1.00
