In [40]:
import os
import numpy as np
import math

from google.colab import drive
# Mount Google Drive at /content/drive so files stored there can be read
# through the regular filesystem (this call is specific to Google Colab).
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:
# Loading text files from the folder
def load_text_files(folder_path):
    """Read every ``.txt`` file directly inside ``folder_path`` as UTF-8 text.

    Args:
        folder_path: Directory to scan (sub-directories are not searched).

    Returns:
        A ``(data, doc_id_to_filename)`` tuple. ``data`` is the list of file
        contents and ``doc_id_to_filename`` maps each position in ``data`` to
        the name of the file that content was read from.
    """
    # os.listdir returns entries in arbitrary order, so sort the names to keep
    # document ids stable from one run (or machine) to the next.
    txt_filenames = sorted(
        name for name in os.listdir(folder_path) if name.endswith('.txt')
    )

    data = []
    doc_id_to_filename = {}
    # Number only the files that are actually loaded. Numbering every
    # directory entry would skip ids whenever a non-.txt entry is present,
    # and the ids would no longer line up with positions in `data`.
    for doc_id, filename in enumerate(txt_filenames):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            data.append(file.read())
        doc_id_to_filename[doc_id] = filename
    return data, doc_id_to_filename

In [42]:
# Folder (on the mounted Google Drive) whose .txt files are loaded as the documents
folder_path = '/content/drive/MyDrive/documents'

In [43]:
# Loading the dataset: `docs` holds the raw text of each file and
# `doc_id_to_filename` maps document ids to the file names they were read from
docs, doc_id_to_filename = load_text_files(folder_path)

In [44]:
#  list of queries
queries = [
    'smart home devices disconnecting network troubleshooting',
    'MacBook performance slow overheating after software update',
    'smart TV Wi-Fi disconnecting streaming issue recent update',
]

In [45]:
# Tokenization
# Preprocessing documents and queries: lowercase and tokenize
def tokenize(text):
    """Lowercase ``text`` and split it on runs of whitespace.

    Punctuation is left attached to the neighbouring word, so ``"update."``
    and ``"update"`` are different tokens.

    Returns:
        The list of lowercase tokens (empty for blank input).
    """
    lowered = text.lower()
    return lowered.split()

# Apply the same tokenizer to documents and queries so their tokens are comparable
tokenized_docs = list(map(tokenize, docs))
tokenized_queries = list(map(tokenize, queries))

In [46]:
# Building vocabulary: the unique words across all documents.
# Words that occur only in a query are not part of it. The vocabulary is sorted
# so every TF-IDF vector uses the same, reproducible term order.
vocab = sorted({word for doc in tokenized_docs for word in doc})
print("Vocabulary:", vocab)



In [47]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
  """Return the share of tokens in ``document`` that are equal to ``term``.

  Args:
    term: Token to count.
    document: Sequence of tokens (as produced by ``tokenize``).

  Returns:
    ``count(term) / len(document)``, or ``0.0`` for an empty document.
  """
  # An empty document (e.g. an empty .txt file) has no terms at all; return 0
  # instead of dividing by zero.
  if len(document) == 0:
    return 0.0
  return document.count(term) / len(document)

In [48]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
  """Return the inverse document frequency of ``term`` over ``all_documents``.

  Computed as ``log(N / (1 + df))`` where ``N`` is the number of documents and
  ``df`` is the number of documents containing ``term``.

  Args:
    term: Token to look up.
    all_documents: Collection of tokenized documents.

  Returns:
    The IDF weight as a float, or ``0.0`` when ``all_documents`` is empty.
  """
  total_docs = len(all_documents)
  # With no documents the ratio is 0 and math.log(0) raises
  # "math domain error"; treat every term as carrying no weight instead.
  if total_docs == 0:
    return 0.0
  num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
  # The +1 keeps the denominator non-zero for unseen terms. As a consequence a
  # term present in every document gets a slightly negative weight.
  return math.log(total_docs / (1 + num_docs_containing_term))

In [49]:
# Computing TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
  """Build the TF-IDF vector of ``document`` over ``vocab``.

  Args:
    document: Tokenized text to vectorize.
    all_documents: Tokenized collection used for the IDF part.
    vocab: Ordered terms; entry ``k`` of the result belongs to ``vocab[k]``.

  Returns:
    A NumPy array with one TF * IDF weight per vocabulary term.
  """
  weights = [
      term_frequency(word, document) * inverse_document_frequency(word, all_documents)
      for word in vocab
  ]
  return np.array(weights)

In [50]:
# Computing cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
  """Return the cosine of the angle between ``vec1`` and ``vec2``.

  Args:
    vec1: First vector (NumPy array or array-like).
    vec2: Second vector, same length as ``vec1``.

  Returns:
    ``dot(vec1, vec2) / (|vec1| * |vec2|)``, or ``0.0`` when either vector has
    zero length.
  """
  norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  # An all-zero vector (e.g. a query sharing no term with the vocabulary) would
  # give 0/0 = NaN, and NaN scores cannot be sorted meaningfully. A vector with
  # no weight is treated as having no similarity to anything.
  if norm_product == 0:
    return 0.0
  return np.dot(vec1, vec2) / norm_product

In [51]:
# Calculating TF-IDF vectors for documents and queries.
# Both use the document collection for IDF and the same vocabulary, so the
# resulting vectors have matching components.
doc_tfidf_vectors = [
    compute_tfidf(doc_tokens, tokenized_docs, vocab)
    for doc_tokens in tokenized_docs
]
query_tfidf_vectors = [
    compute_tfidf(query_tokens, tokenized_docs, vocab)
    for query_tokens in tokenized_queries
]

In [55]:
# Path for the output file
output_file_path = "/content/cosine_similarities_output.txt"

# Opening the file in write mode (an existing file is overwritten)
with open(output_file_path, 'w') as f:
    # Similarity matrix: one row per query, one column per document
    cosine_similarities = [
        [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
        for query_vector in query_tfidf_vectors
    ]

    # One section per query, documents listed from least to most similar
    for i, query in enumerate(queries):
        f.write(f"\nCosine similarities for query '{query}':\n")

        # Pair each document index with its score, then order by score (ascending)
        ranked = sorted(enumerate(cosine_similarities[i]), key=lambda pair: pair[1])

        # Document numbers in the report are 1-based
        f.writelines(
            f"Document {doc_idx + 1}: {similarity:.4f}\n"
            for doc_idx, similarity in ranked
        )

# Confirming that the output has been saved
print(f"Output has been saved to {output_file_path}")


Output has been saved to /content/cosine_similarities_output.txt
