In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import math
import numpy as np

In [3]:
# Function to tokenize the text
def tokenize(text):
    return text.lower().split()

In [4]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
    return document.count(term) / len(document)

In [5]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [6]:
# Function to compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

In [7]:
# Function to compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

In [10]:
# Main function
def main():
    # Directory containing the text documents
    directory = '/content/drive/MyDrive/PBS BSc IT/Year 3/Information Retrievsl System (TECH 400)/Week 3/Assignment Lab/Text Documents'

    # Reading all files from the directory
    docs = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), "r") as file:
                content = file.read()
                docs.append(content)
                filenames.append(filename)

    # Hardcoded queries
    queries = ['time',
               'time universe',
               'complex time universe']

    # Tokenizing documents and queries
    tokenized_docs = [tokenize(doc) for doc in docs]
    tokenized_queries = [tokenize(query) for query in queries]

    # Building the vocabulary (unique words across all documents)
    vocab = sorted(set([word for doc in tokenized_docs for word in doc]))

    # Calculate TF-IDF vectors for documents and queries
    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
    query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

    # Calculate cosine similarities
    cosine_similarities = []
    for query_vector in query_tfidf_vectors:
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        cosine_similarities.append(similarities)

    # Write the ranked results to a text file
    with open("cosine_similarities_ranked_results.txt", "w") as output_file:
        for i, query in enumerate(queries):
            output_file.write(f"\nRanked cosine similarities for query '{query}':\n")

            # Pair document filenames with their similarity scores
            doc_similarity_pairs = list(zip(filenames, cosine_similarities[i]))
            # Sort by similarity in descending order
            ranked_docs = sorted(doc_similarity_pairs, key=lambda x: x[1], reverse=True)

            # Write ranked results
            for rank, (filename, score) in enumerate(ranked_docs, 1):
                output_file.write(f"Rank {rank}: Document {filename} - Score: {score:.4f}\n")

    # Optional: print ranked results for checking
    for i, query in enumerate(queries):
        print(f"\nRanked cosine similarities for query '{query}':")

        # Pair document filenames with their similarity scores
        doc_similarity_pairs = list(zip(filenames, cosine_similarities[i]))
        # Sort by similarity in descending order
        ranked_docs = sorted(doc_similarity_pairs, key=lambda x: x[1], reverse=True)

        # Print ranked results
        for rank, (filename, score) in enumerate(ranked_docs, 1):
            print(f"Rank {rank}: Document {filename} - Score: {score:.4f}")

if __name__ == "__main__":
    main()


Ranked cosine similarities for query 'time':
Rank 1: Document christopher_nolan.txt - Score: 0.1428
Rank 2: Document stanley_kubrick.txt - Score: 0.0338
Rank 3: Document quentin_tarantino.txt - Score: 0.0305
Rank 4: Document david_fincher.txt - Score: 0.0000
Rank 5: Document anurag_kashyap.txt - Score: 0.0000
Rank 6: Document denis_villeneuve.txt - Score: 0.0000
Rank 7: Document akira_kurosawa.txt - Score: 0.0000
Rank 8: Document boon_joon_ho.txt - Score: 0.0000
Rank 9: Document satyajit_ray.txt - Score: 0.0000
Rank 10: Document martin_scorsese.txt - Score: 0.0000

Ranked cosine similarities for query 'time universe':
Rank 1: Document christopher_nolan.txt - Score: 0.1428
Rank 2: Document stanley_kubrick.txt - Score: 0.0338
Rank 3: Document quentin_tarantino.txt - Score: 0.0305
Rank 4: Document david_fincher.txt - Score: 0.0000
Rank 5: Document anurag_kashyap.txt - Score: 0.0000
Rank 6: Document denis_villeneuve.txt - Score: 0.0000
Rank 7: Document akira_kurosawa.txt - Score: 0.0000
R