<a href="https://colab.research.google.com/github/singlaG554/Mini-projects/blob/main/Cosine_similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Imagine turning each document, sentence, or item into a list of numbers (a vector). Cosine similarity checks how close in angle those two vectors are.
# If two items are very similar, their vectors point in almost the same direction, so cosine similarity is close to 1.
# If they are very different, their vectors point in very different directions, and cosine similarity is closer to 0 (TF-IDF vectors are non-negative, so scores here fall between 0 and 1).
# In search engines: It tells which document best matches your query.
# In recommendation engines: It helps recommend items similar to what you like.
# In text classification: It groups similar texts together.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import os

def read_file(filepath):
    """Return the entire contents of the file at ``filepath`` as text.

    The file is opened in text mode and decoded as UTF-8.
    """
    with open(filepath, encoding='utf-8') as source:
        text = source.read()
    return text

def compute_cosine_similarity(file_paths):
    """Compute pairwise cosine similarity between the texts of the given files.

    Each file is loaded with ``read_file``, the texts are converted to TF-IDF
    vectors using a default ``TfidfVectorizer``, and the resulting vectors are
    compared with ``cosine_similarity``.

    Args:
        file_paths: Iterable of paths to the text files to compare.

    Returns:
        The matrix produced by ``cosine_similarity``; entry ``[i][j]`` is the
        score for the i-th and j-th entries of ``file_paths``.
    """
    texts = list(map(read_file, file_paths))

    # Learn the vocabulary and vectorise the texts in a single step.
    vectors = TfidfVectorizer().fit_transform(texts)

    # With a single argument, every row is compared against every row.
    return cosine_similarity(vectors)

if __name__ == "__main__":
    # Example inputs; replace these with the paths of the files to compare.
    file_paths = [
        "/content/Arthur (1).txt",
        "/content/Ben (1).txt",
    ]

    # Only run the comparison when every input file is present on disk.
    if all(os.path.exists(fp) for fp in file_paths):
        similarity_matrix = compute_cosine_similarity(file_paths)
        labels = [os.path.basename(fp) for fp in file_paths]

        print("✅ Cosine Similarity Matrix:")
        # Report every ordered pair, including each file against itself.
        for i, first in enumerate(labels):
            for j, second in enumerate(labels):
                print(f"Similarity between {first} and {second}: {similarity_matrix[i][j]:.4f}")
    else:
        print("❌ One or more files do not exist. Please check the paths.")
