In [6]:
#Question-1. Consider the following documents and compute TFIDF values

from sklearn.feature_extraction.text import TfidfVectorizer

# List of documents
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story"
]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform your documents to obtain TF-IDF feature vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

# Get the feature names (words)

try:
    feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vectorizer.get_feature_names()


# Create a DataFrame to display the TF-IDF values
import pandas as pd

tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=feature_names)
tfidf_df.index = ["Document {}".format(i + 1) for i in range(len(docs))]

# Display the TF-IDF values
print("TF-IDF Values:")
print(tfidf_df)


TF-IDF Values:
                 ate      away       cat       end   finally     house  \
Document 1  0.000000  0.000000  0.000000  0.000000  0.000000  0.475575   
Document 2  0.000000  0.000000  0.588732  0.000000  0.000000  0.000000   
Document 3  0.000000  0.589463  0.000000  0.000000  0.000000  0.475575   
Document 4  0.589463  0.000000  0.475575  0.000000  0.589463  0.000000   
Document 5  0.000000  0.000000  0.000000  0.670092  0.000000  0.000000   

              little     mouse       ran       saw     story      tiny  
Document 1  0.589463  0.280882  0.000000  0.000000  0.000000  0.589463  
Document 2  0.000000  0.347715  0.000000  0.729718  0.000000  0.000000  
Document 3  0.000000  0.280882  0.589463  0.000000  0.000000  0.000000  
Document 4  0.000000  0.280882  0.000000  0.000000  0.000000  0.000000  
Document 5  0.000000  0.319302  0.000000  0.000000  0.670092  0.000000  


In [1]:
#Question-2. Compute cosine similarity between 3rd document (“the mouse ran away from the house”) with all other documents. Which is the most similar document?.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# List of documents
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story"
]

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform your documents to obtain feature vectors
features = tfidf_vectorizer.fit_transform(docs)

# Cosine similarity between the 3rd document and all other documents
query = "the mouse ran away from the house"
qfeature = tfidf_vectorizer.transform([query])
scores = linear_kernel(qfeature, features).flatten()

# Print the cosine similarity scores
print("Cosine Similarity Scores:")
for i, score in enumerate(scores):
    print(f"Document {i + 1}: {score}")

# Find the most similar document
most_similar_doc_index = scores.argmax()
print(f"\nThe most similar document is Document {most_similar_doc_index + 1} with a cosine similarity score of {scores[most_similar_doc_index]}")




Cosine Similarity Scores:
Document 1: 0.3050665531435638
Document 2: 0.09766691363237201
Document 3: 0.9999999999999999
Document 4: 0.07889487560344138
Document 5: 0.08968637798033531

The most similar document is Document 3 with a cosine similarity score of 0.9999999999999999


In [7]:
#Question-3. Find Top-2 similar documents for the 3rd document based on Cosine similarity values.

from sklearn.metrics.pairwise import linear_kernel

# Cosine similarity between the 3rd document and all other documents
query_index = 2  # Index of the 3rd document
query_vector = tfidf_matrix[query_index:query_index + 1]  # Extract the TF-IDF vector for the 3rd document

cosine_similarities = linear_kernel(query_vector, tfidf_matrix).flatten()

# Exclude the similarity score with the 3rd document (it will be 1.0)
cosine_similarities[query_index] = -1.0

# Find the indices of the top-2 similar documents
top_indices = cosine_similarities.argsort()[-2:][::-1]

# Display the top-2 similar documents and their cosine similarity scores
print("Top-2 Similar Documents:")
for i, index in enumerate(top_indices, 1):
    print(f"{i}. Document {index + 1} (Cosine Similarity: {cosine_similarities[index]})")


Top-2 Similar Documents:
1. Document 1 (Cosine Similarity: 0.3050665531435638)
2. Document 2 (Cosine Similarity: 0.09766691363237201)
