## Importing libraries and mounting Google Drive

In [42]:
import os
import numpy as np
import math

In [43]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# through the local filesystem.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime; this cell will fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Loading text files from folder

In [44]:
# Loading text files from the specified folder
def load_text_files(folder_path):
    """Read every ``.txt`` file found directly inside ``folder_path``.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        A ``(data, doc_id_to_filename)`` tuple where ``data`` is the list of
        file contents and ``doc_id_to_filename[i]`` is the filename whose
        content is stored at ``data[i]``.

    Raises:
        OSError: If the folder cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    data = []
    doc_id_to_filename = {}
    # os.listdir returns entries in arbitrary order; sorting makes the
    # document ids (and therefore the ranking output) reproducible.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):  # Ensure it's a text file
            continue
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
        # Use the position in `data` as the id so that skipped (non-.txt)
        # entries cannot shift ids away from list indices.
        doc_id_to_filename[len(data)] = filename
        data.append(text)
    return data, doc_id_to_filename

In [45]:
# Folder on the mounted Drive that is scanned for the .txt documents
folder_path = '/content/drive/MyDrive/dataset/wk3-docs'

In [46]:
# Loading documents from the folder: `docs` holds the raw text of each file,
# `doc_id_to_filename` maps the ids assigned by load_text_files to filenames
docs, doc_id_to_filename = load_text_files(folder_path)

## Defining sample queries and tokenizing

In [66]:
# Defining sample free-text queries; each one is later turned into a TF-IDF
# vector and ranked against the documents by cosine similarity
queries = [
    "traveling light", "plants hate math", "thermometer bring a scarf"
]

In [67]:
# Tokenizer: lowercase the text, then cut it into tokens on whitespace
def tokenize(text):
    """Return the lowercased, whitespace-separated tokens of ``text``.

    Punctuation is not stripped, so it stays attached to its token.
    """
    lowered = text.lower()
    tokens = lowered.split()
    return tokens

# Apply the tokenizer to every document and every query
tokenized_docs = list(map(tokenize, docs))
tokenized_queries = list(map(tokenize, queries))

## Building Vocabulary and calculating TF & IDF

In [68]:
# Vocabulary: the distinct, purely alphabetic tokens that occur in the
# documents (query tokens are not added; tokens carrying punctuation or
# digits fail isalpha() and are left out).
# Sorting fixes the term order, which is the dimension order of every
# TF-IDF vector built from this vocabulary.
vocab = sorted({word for doc in tokenized_docs for word in doc if word.isalpha()})
print("Vocabulary:", vocab)


Vocabulary: ['a', 'accelerating', 'all', 'already', 'always', 'an', 'are', 'astronaut', 'at', 'atoms', 'bacteria', 'bad', 'bags', 'because', 'bed', 'best', 'biologists', 'black', 'break', 'bring', 'charged', 'check', 'chemist', 'chemists', 'clothes', 'coffee', 'did', 'dig', 'dinosaur', 'do', 'down', 'electron', 'electrons', 'elevator', 'fail', 'feeling', 'felt', 'file', 'finding', 'fold', 'gain', 'get', 'go', 'going', 'good', 'goodbye', 'got', 'great', 'had', 'handle', 'hate', 'have', 'he', 'his', 'hole', 'how', 'in', 'it', 'its', 'job', 'jokes', 'keep', 'know', 'ladder', 'leaf', 'little', 'lose', 'lot', 'love', 'make', 'many', 'million', 'mitochondria', 'mix', 'molecules', 'multiply', 'needed', 'negative', 'never', 'new', 'of', 'on', 'paleontologist', 'part', 'particle', 'particles', 'photon', 'physicist', 'physicists', 'plants', 'play', 'police', 'powerhouse', 'problem', 'proton', 'reach', 'react', 'right', 'say', 'scientist', 'scientists', 'so', 'solving', 'some', 'square', 'stairs'

In [69]:
# Calculating term frequency (TF)
def term_frequency(term, document):
    """Return the relative frequency of ``term`` in ``document``.

    Args:
        term: Token to count.
        document: Sequence of tokens (e.g. the output of ``tokenize``).

    Returns:
        Occurrences of ``term`` divided by the number of tokens, or ``0.0``
        when ``document`` is empty.
    """
    # An empty document (blank file or blank query) has no terms at all;
    # report 0.0 instead of dividing by zero.
    if not document:
        return 0.0
    return document.count(term) / len(document)

In [70]:
# Calculating inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    """Return the smoothed IDF ``log(N / (1 + df))`` of ``term``.

    Args:
        term: Token whose document frequency is measured.
        all_documents: Collection of tokenized documents; ``N`` is its
            length and ``df`` the number of documents containing ``term``.

    Returns:
        The IDF as a float, or ``0.0`` when ``all_documents`` is empty.

    Note:
        Because of the ``1 +`` in the denominator the value is 0 when the
        term occurs in ``N - 1`` documents and negative when it occurs in
        all ``N`` documents.
    """
    total_docs = len(all_documents)
    # With no documents the ratio is 0 and math.log(0) raises ValueError;
    # an empty corpus carries no information, so report 0.0.
    if total_docs == 0:
        return 0.0
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(total_docs / (1 + num_docs_containing_term))

In [71]:
# TF-IDF vector of one token list, one component per vocabulary term
def compute_tfidf(document, all_documents, vocab):
    """Build the TF-IDF vector of ``document`` over ``vocab``.

    Args:
        document: Token list to vectorize (a document or a query).
        all_documents: Tokenized corpus passed to the IDF computation.
        vocab: Iterable of terms; its order is the order of the components.

    Returns:
        A NumPy array holding ``tf * idf`` for each term of ``vocab``.
    """
    weights = [
        term_frequency(term, document) * inverse_document_frequency(term, all_documents)
        for term in vocab
    ]
    return np.array(weights)

## Calculating cosine similarity and ranking documents

In [72]:
# Calculating cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either
        vector has zero length (norm).
    """
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    # A zero vector (e.g. a query with no vocabulary terms) has no direction.
    # Dividing would give nan plus a RuntimeWarning, and nan scores break
    # sorting, so treat the pair as having no similarity.
    if norm_product == 0:
        return 0.0
    return np.dot(vec1, vec2) / norm_product

In [73]:
# Vectorize the documents and the queries over the same vocabulary; IDF is
# always taken from the document collection, including for the queries
doc_tfidf_vectors = [
    compute_tfidf(doc_tokens, tokenized_docs, vocab)
    for doc_tokens in tokenized_docs
]
query_tfidf_vectors = [
    compute_tfidf(query_tokens, tokenized_docs, vocab)
    for query_tokens in tokenized_queries
]

In [91]:
# Score every document against every query: one inner list per query,
# holding the cosine similarities in document order
cosine_similarities = [
    [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    for query_vector in query_tfidf_vectors
]

# Report the three best-scoring documents for each query
for query, scores in zip(queries, cosine_similarities):
    print(f"\nTop 3 Cosine similarities for query '{query}':")

    # (document index, score) pairs ordered from highest to lowest score,
    # truncated to the first three
    top_matches = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:3]

    # Ranks are 1-based, and so is the document number that is shown
    for rank, (doc_idx, similarity) in enumerate(top_matches, start=1):
        print(f"Rank {rank}: Document{doc_idx + 1}: Score: {similarity:.4f}")


Top 3 Cosine similarities for query 'traveling light':
Rank 1: Document1: Score: 0.2596
Rank 2: Document2: Score: 0.0000
Rank 3: Document3: Score: 0.0000

Top 3 Cosine similarities for query 'plants hate math':
Rank 1: Document9: Score: 0.2849
Rank 2: Document10: Score: 0.2213
Rank 3: Document1: Score: 0.0000

Top 3 Cosine similarities for query 'thermometer bring a scarf':
Rank 1: Document7: Score: 0.2916
Rank 2: Document5: Score: 0.1293
Rank 3: Document8: Score: 0.0092


## Saving results to a .txt file

In [93]:
# Saving the top-3 cosine similarity results per query to a .txt file.
# The encoding is given explicitly (matching the UTF-8 used when reading the
# documents) so the output does not depend on the platform's default encoding.
with open('cosine_similarities_output.txt', 'w', encoding='utf-8') as file:

    for i, query in enumerate(queries):
        file.write(f"\nTop 3 Cosine similarities for query '{query}':\n")

        # Pairing document indices with similarity scores
        doc_sim_pairs = list(enumerate(cosine_similarities[i]))

        # Sorting the pairs by similarity scores in descending order and
        # keeping only the first three
        doc_sim_pairs_sorted = sorted(doc_sim_pairs, key=lambda x: x[1], reverse=True)[:3]

        # Writing the sorted similarities to the file with a 1-based rank
        # and a 1-based document number
        for rank, (doc_idx, similarity) in enumerate(doc_sim_pairs_sorted, start=1):
            file.write(f"Rank {rank}: Document {doc_idx + 1} Score {similarity:.4f}\n")

# Output saved in 'cosine_similarities_output.txt'.