In [9]:
# Download the NLTK resources this notebook relies on: the English stopword
# list and the Punkt sentence/word tokenizer models used by word_tokenize.
# Recent NLTK releases (3.9+) load the tokenizer from the 'punkt_tab' package
# instead of the pickled 'punkt' one, so fetch both to work across versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sahyadri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sahyadri\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import euclidean_distances, cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize



# English stopwords from the NLTK corpus, stored as a set so the membership
# test inside preprocess() is a constant-time lookup.
stop_words = set(stopwords.words('english'))

def preprocess(doc):
    """Normalize a document into a space-separated string of content tokens.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphanumeric (e.g. punctuation) or that appear in the
    module-level ``stop_words`` set are discarded.

    Args:
        doc: Raw document text.

    Returns:
        The surviving tokens, in their original order, joined by single spaces.
    """
    kept = []
    for token in word_tokenize(doc.lower()):
        if not token.isalnum():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Toy corpus: four short sentences that will be ranked against a single query.
documents = [
    "It is going to rain today",
    "Today Rama is not going outside to watch rain",
    "I am going to watch the movie tomorrow with Rama",
    "Tomorrow Rama is going to watch the rain at sea shore"
]

query = "Rama watching the rain"

# Run the corpus and the query through the same normalization step so that
# their tokens are directly comparable.
preprocessed_docs = list(map(preprocess, documents))
preprocessed_query = preprocess(query)

# Build the TF-IDF document-term matrix: one row per document, with the query
# appended as the final row so that it shares the same vocabulary.
# NOTE(review): because the query is part of the fit, the IDF weights and the
# latent space below depend on the query itself. For retrieval over many
# queries, fit on the documents only and transform() each query instead.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_docs + [preprocessed_query])

# Apply LSA: project every TF-IDF row onto 2 latent dimensions.
# TruncatedSVD defaults to a randomized solver, so the seed is pinned to keep
# the embedding (and therefore the rankings) reproducible across runs.
lsa = TruncatedSVD(n_components=2, random_state=42)
X_lsa = lsa.fit_transform(X)

# Separate the query vector (last row) from the document vectors
query_vec_lsa = X_lsa[-1]
doc_vecs_lsa = X_lsa[:-1]

# Distance measures in LSA space (smaller value = closer to the query).
# The query is reshaped to a single-row 2-D array because the pairwise helpers
# expect 2-D inputs; flatten() turns the (n_docs, 1) result into a 1-D array.
euclidean = euclidean_distances(doc_vecs_lsa, query_vec_lsa.reshape(1, -1)).flatten()
cosine = cosine_similarity(doc_vecs_lsa, query_vec_lsa.reshape(1, -1)).flatten()
cosine = 1 - cosine  # Invert cosine similarity to treat it as a distance

def jaccard_similarity(doc1, doc2):
    """Return the Jaccard *distance* between the token sets of two strings.

    Despite the name, the value returned is ``1 - |A & B| / |A | B|`` so that
    it can be ranked in ascending order alongside the other distance measures:
    0.0 means identical token sets, 1.0 means no token in common.

    Args:
        doc1: Whitespace-separated tokens of the first document.
        doc2: Whitespace-separated tokens of the second document.

    Returns:
        A float in [0.0, 1.0]. Two strings with no tokens at all are treated
        as identical and yield 0.0 instead of raising ZeroDivisionError.
    """
    set1, set2 = set(doc1.split()), set(doc2.split())
    union = set1 | set2
    if not union:
        # Both inputs are empty (e.g. text made up solely of stopwords).
        return 0.0
    return 1 - len(set1 & set2) / len(union)

def dice_similarity(doc1, doc2):
    """Return the Dice *distance* between the token sets of two strings.

    Despite the name, the value returned is
    ``1 - 2 * |A & B| / (|A| + |B|)`` so that it can be ranked in ascending
    order alongside the other distance measures: 0.0 means identical token
    sets, 1.0 means no token in common.

    Args:
        doc1: Whitespace-separated tokens of the first document.
        doc2: Whitespace-separated tokens of the second document.

    Returns:
        A float in [0.0, 1.0]. Two strings with no tokens at all are treated
        as identical and yield 0.0 instead of raising ZeroDivisionError.
    """
    set1, set2 = set(doc1.split()), set(doc2.split())
    total = len(set1) + len(set2)
    if total == 0:
        # Both inputs are empty (e.g. text made up solely of stopwords).
        return 0.0
    return 1 - (2 * len(set1 & set2)) / total

# Token-set distances between the preprocessed query and each preprocessed
# document, collected in corpus order.
jaccard_scores, dice_scores = [], []
for doc_text in preprocessed_docs:
    jaccard_scores.append(jaccard_similarity(preprocessed_query, doc_text))
    dice_scores.append(dice_similarity(preprocessed_query, doc_text))
jaccard = np.array(jaccard_scores)
dice = np.array(dice_scores)

# Find the top 2 relevant documents
def top_k_documents(similarity, k=2):
    """Return the indices of the ``k`` smallest scores, best match first.

    The scores are sorted in ascending order, so the input is expected to hold
    distances (smaller = more relevant). A stable sort is used so that tied
    scores keep their original order, which makes the ranking deterministic.

    Args:
        similarity: 1-D array-like of per-document distance scores.
        k: Number of indices to return; fewer are returned if the input is
            shorter than ``k``.

    Returns:
        A NumPy integer array with up to ``k`` indices into ``similarity``.
    """
    return np.argsort(similarity, kind="stable")[:k]

# Rank the documents under each measure. Every array holds distances
# (smaller = closer to the query), so the indices returned first are the
# best matches.
top_k_euclidean = top_k_documents(euclidean)
top_k_cosine = top_k_documents(cosine)
top_k_jaccard = top_k_documents(jaccard)
top_k_dice = top_k_documents(dice)

# Output the most relevant documents, keyed by the measure's display name
results = {
    "Euclidean Distance": top_k_euclidean,
    "Cosine Similarity": top_k_cosine,
    "Jaccard Similarity": top_k_jaccard,
    "Dice Similarity Coefficient": top_k_dice
}
# Interpolate the actual query so the header cannot drift out of sync with
# the `query` variable when it is edited.
print(f"Top two relevant documents for the query document with the content '{query}': \n")
for measure, indices in results.items():
    # Map the ranked indices back to the original (unprocessed) sentences.
    print(f"{measure}: {[documents[i] for i in indices]}\n")

Top two relevant documents for the query document with the content 'Rama watching the rain': 

Euclidean Distance: ['Today Rama is not going outside to watch rain', 'It is going to rain today']

Cosine Similarity: ['Today Rama is not going outside to watch rain', 'It is going to rain today']

Jaccard Similarity: ['Today Rama is not going outside to watch rain', 'Tomorrow Rama is going to watch the rain at sea shore']

Dice Similarity Coefficient: ['Today Rama is not going outside to watch rain', 'Tomorrow Rama is going to watch the rain at sea shore']

