In [60]:
import nltk
import re
import logging
import math
import numpy as np
import os
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict, Counter

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Files Loading

In [61]:
def load_animals(folder_path):
    data = {}
    doc_id_to_filename = {}
    doc_id = 0
    for filename in os.listdir(folder_path):
        if filename.endswith(".txt"):
            with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
                data[doc_id] = file.read()
                doc_id_to_filename[doc_id] = filename
                logging.info(f"Loaded file: {filename} with doc_id: {doc_id}")
                doc_id += 1
    return data, doc_id_to_filename

## Text-cleaning

In [62]:
def clean_text(text):
  text = re.sub(r"[^a-zA-Z0-9 ]", "", text)
  text = text.lower()
  text = " ".join(text.split())
  return text

## Tokenization

In [63]:
def tokenize(text):
    return text.lower().split()

## Term Frequency (TF)

In [64]:
def term_frequency(term, document):
    return document.count(term) / len(document)

## Inverse Document Frequency (IDF)

In [65]:
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

## Compute TF-IDF

In [66]:
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

## Cosine Similarities

In [68]:
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2) if norm_vec1 * norm_vec2 != 0 else 0

## Queries Processing


In [79]:
def process_queries(query, all_documents, doc_tfidf_vectors, vocab, top_k=4):
    tokenized_query = clean_text(query)
    query_vector = compute_tfidf (tokenized_query, all_documents, vocab)

    similarities = []
    for doc_id, doc_vector in enumerate(doc_tfidf_vectors):
        similarity = cosine_similarity(query_vector, doc_vector)
        similarities.append((doc_id, similarity))

    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_k]

## Main Function

In [88]:
def convert_docs_ids_to_filenames(doc_id_to_filename):
  return [doc_id_to_filename[doc_id] for doc_id in doc_ids]

def main():
    folder_path = "/content/drive/MyDrive/Tech400_animals"

    # Load the animals data and document ID to filename mapping
    animals_data, doc_id_to_filename = load_animals(folder_path)

    # Input query
    queries = input("Enter the queries: ")

    # Tokenize and clean the documents
    tokenized_docs = [clean_text(doc) for doc in animals_data.values()]

    # Build a vocabulary
    vocab = sorted(set(word for doc in tokenized_docs for word in doc))

    # Compute TF-IDF vectors for the documents
    doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]

    # Search the query in the documents
    print(f"Searching results for '{queries}':")
    similarities = process_queries(queries, tokenized_docs, doc_tfidf_vectors, vocab)

    # Prepare results
    results = [(queries, similarities)]

    # If results are found, print the top 4 animals
    if similarities:
        print("\nTop 4 animals: ")
        for idx, (doc_id, score) in enumerate(similarities[:4], 1):
            animal_name = os.path.splitext(doc_id_to_filename[doc_id])[0]
            print(f"Animal {idx}: {animal_name}, Score: {score:.4f}")
    else:
        print("No results found.")

if __name__ == "__main__":
    main()

Enter the queries: wild
Searching results for 'wild':

Top 4 animals: 
Animal 1: Dogs, Score: 0.3132
Animal 2: Gorillas, Score: 0.3104
Animal 3: Bears, Score: 0.3045
Animal 4: Chimpanzees, Score: 0.3032
