In [1]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import numpy as np
import string
import logging
import re # Import regular expressions library
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer
# Initialize the stop words and lemmatizer
STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [2]:
# Function to load documents from a specified directory
def load_documents(Songs):
    documents = {}
    for filename in os.listdir(Songs):
        if filename.endswith(".txt"):
            with open(os.path.join(Songs, filename), 'r') as file:
                documents[filename] = file.read()
    return documents

documents = load_documents('Songs')

In [3]:
# Function to clean and preprocess text (lowercase, tokenization, stopwords removal, and lemmatization)
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]
    return tokens

cleaned_documents = {filename: clean_text(content) for filename, content in documents.items()}

In [4]:
# Function to create an inverted index
def create_inverted_index(documents):
    inverted_index = defaultdict(set)
    for filename, tokens in documents.items():
        for word in tokens:
            inverted_index[word].add(filename)
    return inverted_index

inverted_index = create_inverted_index(cleaned_documents)

In [5]:
# Initialize all_documents with the set of all document filenames
all_documents = set(documents.keys())

# Function for 'AND' query (finds common documents for all terms)
def and_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result &= inverted_index.get(term, set())
    return result

In [6]:
# Function for 'OR' query (finds documents that contain any of the terms)
def or_query(terms, inverted_index):
    result = inverted_index.get(terms[0], set())
    for term in terms[1:]:
        result |= inverted_index.get(term, set())
    return result

In [7]:
# Function for 'NOT' query (finds documents that do not contain the specified term)
def not_query(term, inverted_index, all_documents):
    return all_documents - inverted_index.get(term, set())

In [8]:
# Function to convert document IDs (filenames) to a list
def convert_doc_ids_to_filenames(doc_ids):
    return list(doc_ids)

In [9]:
# Function to process the query and execute the appropriate Boolean operation
def process_query(query, inverted_index, all_documents):
    # Tokenize and preprocess the query
    terms = [LEMMATIZER.lemmatize(term) for term in word_tokenize(query.lower()) if term not in STOPWORDS]
    if 'and' in terms:
        terms.remove('and')
        result = and_query(terms, inverted_index)
    elif 'or' in terms:
        terms.remove('or')
        result = or_query(terms, inverted_index)
    elif 'not' in terms:
        terms.remove('not')
        result = not_query(terms[0], inverted_index, all_documents)
    else:
        result = inverted_index.get(terms[0], set())
    return convert_doc_ids_to_filenames(result)

In [10]:
# Example usage
query = "not see"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 7.txt', 'Song 3.txt', 'Song 6.txt', 'Song 10.txt']


In [11]:
# Example usage
query = "play and cool"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 7.txt']


In [12]:
# Example usage
query = "baby or know"
result = process_query(query, inverted_index, all_documents)
print(result)

['Song 9.txt', 'Song 7.txt', 'Song 6.txt', 'Song 10.txt']


Vector Space Model

In [13]:
# Directory containing the .txt files
songs_dir = 'Songs'

In [14]:
# Load documents from .txt files
documents = []
for filename in os.listdir(songs_dir):
  if filename.endswith('.txt'):
    with open(os.path.join(songs_dir, filename), 'r') as f:
      documents.append(f.read())

queries = ["change", "baby"]

In [15]:
# Preprocess documents and queries: lowercase and tokenize
def tokenize(text):
  return text.lower().split()

# Use documents instead of docs
tokenized_docs = [tokenize(doc) for doc in documents]
# Create a list containing the query
tokenized_queries = [tokenize(query) for query in queries]

In [16]:
# Vocabulary Creation
# Build vocabulary (unique words across all documents and queries)
vocab = set([word for doc in tokenized_docs for word in doc])
vocab = sorted(vocab) # Optional sorting for consistency
print("Vocabulary:", vocab)

Vocabulary: ['"baby,', '"do', '"there\'s', '"we\'re', "'bout", "'cause", "'em", "'give-a-fucks'", "'round", "'til", "(don't", '(feat.', '(gone,', '(ha-ha-ha-ha-ha,', '(i', '(luda!)', '(oh)', '(oh),', '(stupid)', '(uh-huh)', '(we', '(whoa)', '(woo)', '(yeah,', '(yes)', '(yo)', '(yo),', '(yo,', '-alan', '-bruno', '-charlie', '-coldplay', '-justin', '-sabrina', '-selena', '-shawn', '-taylor', '-the', '13,', 'a', 'about', 'above', 'across', 'act', 'adored', 'afraid', 'after', 'again', 'ah', 'aid', "ain't", 'air', 'album', 'alive', 'all', 'all,', 'almost', 'alone', 'along', 'always', "amazin'", 'amazing', 'an', 'and', 'another"', 'any', 'anymore', 'anything', 'anytime', 'apart', 'are', 'are,', 'around', 'around,', 'as', 'ask', 'asked', 'asking', 'asks', 'at', 'autumn', 'away?', 'awhile', 'baby', 'baby,', 'back', 'bad', 'bandit', 'be', 'beat', 'beautiful', 'beautiful,', 'become', 'bed', 'bee,', 'been', 'before', 'being', 'believe', 'believe,', 'belongs', 'better', 'between', 'bieber', 'bitch

In [17]:
# Function to calculate term frequency (TF)
def term_frequency(term, document):
  tf = document.count(term) / len(document)
  print (tf)
  return tf

In [18]:
# Function to calculate inverse document frequency (IDF)
def inverse_document_frequency(term, all_documents):
  num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
  return math.log(len(all_documents) / (1 + num_docs_containing_term))

In [19]:
# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
  tfidf_vector = []
  for term in vocab:
    tf = term_frequency(term, document)
    idf = inverse_document_frequency(term, all_documents)
    tfidf_vector.append(tf * idf)
  return np.array(tfidf_vector)

In [20]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
  dot_product = np.dot(vec1, vec2)
  norm_vec1 = np.linalg.norm(vec1)
  norm_vec2 = np.linalg.norm(vec2)
  return dot_product / (norm_vec1 * norm_vec2)

In [21]:
import math

In [22]:
# Calculate TF-IDF vectors for documents and queries
doc_tfidf_vectors = [compute_tfidf(doc, tokenized_docs, vocab) for doc in tokenized_docs]
query_tfidf_vectors = [compute_tfidf(query, tokenized_docs, vocab) for query in tokenized_queries]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003401360544217687
0.0
0.0
0.0
0.0
0.0
0.0
0.023809523809523808
0.0
0.030612244897959183
0.0
0.0
0.0
0.0
0.0
0.0
0.027210884353741496
0.006802721088435374
0.0
0.0
0.0
0.003401360544217687
0.0
0.003401360544217687
0.003401360544217687
0.0
0.0
0.0
0.01020408163265306
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006802721088435374
0.0
0.0
0.0
0.0
0.0
0.0
0.003401360544217687
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003401360544217687
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006802721088435374
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.013605442176870748
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003401360544217687
0.0
0.003401360544217687
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006802721088435374
0.003401360544217687
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006802721088435374
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0

In [23]:
# Calculate cosine similarities
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)

# Display the results
for i, query in enumerate(queries):
    print(f"\nCosine similarities for query '{query}':")
    for j, doc in enumerate(doc_tfidf_vectors):
        print(f"Document {j+1}: {cosine_similarities[i][j]:.4f}")

# Save results to a file
with open("result_romita.txt", "w") as f:
    for i, query in enumerate(queries):
        f.write(f"\nCosine similarities for query '{query}':\n")
        # Change documents to doc_tfidf_vectors to have the same number of items to iterate over
        for j, doc in enumerate(doc_tfidf_vectors):
            f.write(f"Document {j + 1}: {cosine_similarities[i][j]:.4f}\n")


Cosine similarities for query 'change':
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 6: 0.2135
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000

Cosine similarities for query 'baby':
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0511
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000


In [24]:
# Calculate cosine similarities
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)

# Display the results in ranked order
for i, query in enumerate(queries):
    print(f"\nCosine similarities for query '{query}' (ranked):")
    ranked_docs = sorted(enumerate(cosine_similarities[i]), key=lambda x: x[1], reverse=True)
    for j, similarity in ranked_docs:
        print(f"Document {j+1}: {similarity:.4f}")

# Save results to a file in ranked order
with open("result_romita.txt", "w") as f:
    for i, query in enumerate(queries):
        f.write(f"\nCosine similarities for query '{query}' (ranked):\n")
        ranked_docs = sorted(enumerate(cosine_similarities[i]), key=lambda x: x[1], reverse=True)
        for j, similarity in ranked_docs:
            f.write(f"Document {j + 1}: {similarity:.4f}\n")



Cosine similarities for query 'change' (ranked):
Document 6: 0.2135
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 4: 0.0000
Document 5: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000

Cosine similarities for query 'baby' (ranked):
Document 4: 0.0511
Document 1: 0.0000
Document 2: 0.0000
Document 3: 0.0000
Document 5: 0.0000
Document 6: 0.0000
Document 7: 0.0000
Document 8: 0.0000
Document 9: 0.0000
Document 10: 0.0000
