# 1. Required Libraries:

In [None]:
import numpy as np
import math
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import string
import logging
import re
from collections import defaultdict, Counter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# 2. Loading Text Files:

In [None]:
# Read documents from uploaded files
file_paths = ["/content/drive/MyDrive/Information retrieval /week-3 Assignment, TF-IDF Vector Space Model"]

# Load documents into a list
docs = []
for file_path in file_paths:
    # Check if the path is a directory
    if os.path.isdir(file_path):
        # If it's a directory, iterate through its files
        for filename in os.listdir(file_path):
            filepath = os.path.join(file_path, filename)
            # Check if it's a file and then read
            if os.path.isfile(filepath):
                with open(filepath, 'r', encoding='utf-8') as file:
                    docs.append(file.read())
    # If it's not a directory, try reading it as a file
    elif os.path.isfile(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            docs.append(file.read())

# 3. Defining Queries:

In [None]:
 # Define the queries
 queries = [
       "Deep Learning ",
        "Data mining ",
        " machine learning",
        "Computer vision",
        "Artificial Inteliigence",
        " human "
    ]

# 4. Text Pre-Processing:

In [None]:
# Function to lowercase and tokenize text
def tokenize(text):
    return text.lower().split()

 # Convert to lowercase
    text = text.lower()
    # Remove special characters and punctuation
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text)
    # Remove digit
    text = re.sub(r"\d+", "", text)

In [None]:
# Tokenize documents and queries
tokenized_docs = [tokenize(doc) for doc in docs]
tokenized_queries = [tokenize(query) for query in queries]

# Create a vocabulary from the tokenized documents
vocab = list(set(term for doc in tokenized_docs for term in doc))


print(tokenized_docs)
print(tokenized_queries)
print(vocab)

[['{"nbformat":4,"nbformat_minor":0,"metadata":{"colab":{"provenance":[],"mount_file_id":"1jf5bku0pfe4ir-jwfolkx0wzttm1q638","authorship_tag":"abx9tymabycryeqdwnlheepnv36t"},"kernelspec":{"name":"python3","display_name":"python', '3"},"language_info":{"name":"python"}},"cells":[{"cell_type":"markdown","source":["#', '1.', 'required', 'libraries:"],"metadata":{"id":"wou_9ln_ptgj"}},{"cell_type":"code","execution_count":1,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"qw1806vioevo","executioninfo":{"status":"ok","timestamp":1726413423579,"user_tz":-345,"elapsed":7514,"user":{"displayname":"prabesh', 'pandey","userid":"10934286709721246056"}},"outputid":"74f130c2-d341-4eb4-90ab-c3458d1d9597"},"outputs":[{"output_type":"stream","name":"stderr","text":["[nltk_data]', 'downloading', 'package', 'stopwords', 'to', '/root/nltk_data...\\n","[nltk_data]', 'unzipping', 'corpora/stopwords.zip.\\n","[nltk_data]', 'downloading', 'package', 'punkt', 'to', '/root/nltk_data...\\n","[nl

# 5. Term Frequency (TF)

In [None]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
    return document.count(term) / len(document)

# 6. Inverse Document Frequency (IDF)

In [None]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
    num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
    return math.log(len(all_documents) / (1 + num_docs_containing_term))

# 7. Computing TF-IDF

In [None]:
# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
    tfidf_vector = []
    for term in vocab:
        tf = term_frequency(term, document)
        idf = inverse_document_frequency(term, all_documents)
        tfidf_vector.append(tf * idf)
    return np.array(tfidf_vector)

# 8. Cosine similarity between two vectors

In [None]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

# 9. Calculate TF-IDF vectors

In [None]:
# Calculate TF-IDF vectors for documents and queries
doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

# 10. Writing  Results to .textfile format:

In [None]:

file_path = "/content/drive/MyDrive/Information retrieval /week-3 Assignment, TF-IDF Vector Space Model/result_Prabesh.txt"

with open(file_path, "w") as result_file:
    # Calculate cosine similarities and rank top 3 documents for each query
    for i, query_vector in enumerate(query_tfidf_vectors):
        similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]

        # Rank documents by similarity score
        ranked_docs = sorted(enumerate(similarities), key=lambda x: x[1], reverse=True)[:3]  # Top 3

        # Prepare the result string
        result_str = f"\nTop 3 results for query '{queries[i]}':\n"
        for rank, (doc_index, score) in enumerate(ranked_docs, 1):
            result_str += f"Rank {rank}: Document {doc_index + 1} with score {score:.4f}\n"

        # Print to console and write to file
        print(result_str)
        result_file.write(result_str)



Top 3 results for query 'Deep Learning ':
Rank 1: Document 11 with score 0.2351
Rank 2: Document 4 with score 0.0950
Rank 3: Document 3 with score 0.0911


Top 3 results for query 'Data mining ':
Rank 1: Document 10 with score 0.6608
Rank 2: Document 9 with score 0.1209
Rank 3: Document 7 with score 0.1065


Top 3 results for query ' machine learning':
Rank 1: Document 2 with score 0.2157
Rank 2: Document 7 with score 0.1622
Rank 3: Document 6 with score 0.1328


Top 3 results for query 'Computer vision':
Rank 1: Document 5 with score 0.4220
Rank 2: Document 4 with score 0.0675
Rank 3: Document 1 with score 0.0000


Top 3 results for query 'Artificial Inteliigence':
Rank 1: Document 3 with score 0.3364
Rank 2: Document 1 with score 0.0000
Rank 3: Document 2 with score 0.0000


Top 3 results for query ' human ':
Rank 1: Document 3 with score 0.1282
Rank 2: Document 4 with score 0.1114
Rank 3: Document 1 with score 0.0000

