In [39]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
import os
import numpy as np
import string
import logging
import math
import re # Import regular expressions library
from collections import defaultdict , Counter
from nltk . corpus import stopwords
from nltk . tokenize import word_tokenize
from nltk . stem import WordNetLemmatizer
# Initialize the stop words and lemmatizer
STOPWORDS = set( stopwords . words ('english') )
LEMMATIZER = WordNetLemmatizer ()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
#Load documents
def load_documents(directory):
    documents = []
    file_names = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            with open(os.path.join(directory, filename), 'r', encoding='utf-8') as f:
                content = f.read().replace('\n', ' ')
                documents.append(content)
            file_names.append(filename)

    return documents, file_names

In [None]:
queries = ["baby", "pop", "selena"]

In [None]:
# Function to clean and preprocess text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W+', ' ', text)
    tokens = word_tokenize(text)
    tokens = [LEMMATIZER.lemmatize(token) for token in tokens if token not in STOPWORDS]
    return tokens

# Specify the directory containing your text files
directory = 'documents'

# Call load_documents to get the documents and filenames
documents, file_names = load_documents(directory)

# Create cleaned_documents
cleaned_documents = {}
for i in range(len(file_names)):
    cleaned_documents[file_names[i]] = clean_text(documents[i])

In [None]:
# Vocabulary Creation
# Build vocabulary (unique words across all documents and queries)
vocab = set([word for doc in cleaned_documents.values() for word in doc])
vocab = sorted(vocab)
print("Vocabulary:", vocab)

Vocabulary: ['13', 'across', 'act', 'adored', 'adult', 'afraid', 'age', 'air', 'alan', 'album', 'align', 'alive', 'almost', 'alone', 'along', 'already', 'always', 'amazin', 'amazing', 'another', 'anymore', 'anything', 'anytime', 'apart', 'appear', 'appeared', 'arena', 'ariana', 'arm', 'around', 'ask', 'asked', 'asking', 'asks', 'autumn', 'away', 'awhile', 'b', 'baby', 'back', 'bad', 'balance', 'ball', 'band', 'beat', 'beating', 'beautiful', 'become', 'bed', 'bedsheets', 'bee', 'behind', 'believe', 'belly', 'belongs', 'berry', 'better', 'beyond', 'bieber', 'bit', 'black', 'blame', 'bleed', 'bleeds', 'blind', 'blindly', 'blood', 'blue', 'body', 'bone', 'bother', 'bout', 'boy', 'brain', 'brand', 'break', 'breakin', 'breaking', 'breath', 'breathe', 'brewed', 'bringing', 'britpop', 'broke', 'broken', 'brown', 'bruno', 'burn', 'butterfly', 'buy', 'byway', 'call', 'called', 'calling', 'came', 'candle', 'car', 'care', 'carpenter', 'cast', 'casually', 'cause', 'ceiling', 'chance', 'change', 'ch

In [None]:
# Function to calculate TF
def term_frequency(term, document):
  tf = document.count(term) / len(document)
  print (tf)
  return tf

# Function to calculate IDF
def inverse_document_frequency(term, all_documents):
  num_docs_containing_term = sum(1 for doc in all_documents if term in doc)
  return math.log(len(all_documents) / (1 + num_docs_containing_term))

# Compute TF-IDF for a document
def compute_tfidf(document, all_documents, vocab):
  tfidf_vector = []
  for term in vocab:
    tf = term_frequency(term, document)
    idf = inverse_document_frequency(term, all_documents)
    tfidf_vector.append(tf * idf)
  return np.array(tfidf_vector)

# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
  dot_product = np.dot(vec1, vec2)
  norm_vec1 = np.linalg.norm(vec1)
  norm_vec2 = np.linalg.norm(vec2)
  return dot_product / (norm_vec1 * norm_vec2)

In [None]:
# Calculate TF-IDF vectors for documents and queries
doc_tfidf_vectors = [compute_tfidf(doc, cleaned_documents.values(), vocab) for doc in cleaned_documents.values()]

# Calculate TF-IDF vectors for each query separately
query_tfidf_vectors = [compute_tfidf(clean_text(query), cleaned_documents.values(), vocab) for query in queries]

# Calculate cosine similarities
cosine_similarities = []
for query_vector in query_tfidf_vectors:
    similarities = [cosine_similarity(query_vector, doc_vector) for doc_vector in doc_tfidf_vectors]
    cosine_similarities.append(similarities)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006557377049180328
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.006557377049180328
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.10163934426229508
0.0
0.0
0.0
0.0
0.0
0.009836065573770493
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.003278688524590164
0.003278688524590164
0.003278688524590164
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003278688524590164
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.0
0.0
0.006557377049180328
0.0
0.0
0.003278688524590164
0.0
0.0
0.0
0.006557377049180

In [None]:
# Display the results in ranked order (top 5 only)
for i, query in enumerate(queries):
    print(f"\nRanked Result for '{query}':")
    ranked_docs = sorted(enumerate(cosine_similarities[i]), key=lambda x: x[1], reverse=True)

    # Limiting the results to top 5 and filtering out documents with 0 similarity
    for j, similarity in ranked_docs[:5]:
        if similarity > 0:
            print(f"Document {j+1}: {similarity:.4f}")
        else:
            print(f"Document {j+1}: No similarity found")


Ranked Result for 'baby':
Document 18: 0.6145
Document 14: 0.1477
Document 20: 0.1334
Document 16: 0.1219
Document 2: 0.0510

Ranked Result for 'pop':
Document 13: 0.0174
Document 16: 0.0107
Document 20: 0.0073
Document 1: 0.0069
Document 4: 0.0064

Ranked Result for 'selena':
Document 21: 0.0345
Document 12: 0.0257
Document 1: No similarity found
Document 2: No similarity found
Document 3: No similarity found


In [None]:
relevant_docs_per_query = {
    "baby": {18, 14, 20, 16, 2},
    "pop": {13, 16, 20, 1, 4},
    "selena gomez": {21, 12}
}

def precision_at_k(ranked_docs, relevant_docs, k):
    ranked_docs = ranked_docs[:k]
    num_relevant_in_k = len(set(ranked_docs) & relevant_docs)
    if k == 0:
        return 0
    precision = num_relevant_in_k / k
    return precision

# Evaluate precision at k for each query
ranked_docs_ids_list = []
for i, query in enumerate(queries):
    print(f"\nRanked Result for '{query}':")
    ranked_docs_for_query = sorted(enumerate(cosine_similarities[i]), key=lambda x: x[1], reverse=True)

    # Adjust document IDs by adding 1
    ranked_docs_ids = [doc_id + 1 for doc_id, _ in ranked_docs_for_query]

    ranked_docs_ids_list.append(ranked_docs_ids)
    relevant_docs = relevant_docs_per_query.get(query, set())
    k = 5
    precision_k = precision_at_k(ranked_docs_ids, relevant_docs, k)
    print(f"Precision at {k}: {precision_k:.4f}")


Ranked Result for 'baby':
Precision at 5: 1.0000

Ranked Result for 'pop':
Precision at 5: 1.0000

Ranked Result for 'selena':
Precision at 5: 0.0000


Logistic Regression Model

In [None]:
class LogisticRegression:
    def __init__(self, learning_rate=0.01, epochs=1000):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self.weights = None
        self.bias = None

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        # Initialize weights and bias
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0

        # Gradient descent
        for _ in range(self.epochs):
            # Linear model
            linear_model = np.dot(X, self.weights) + self.bias
            # Apply sigmoid function
            y_predicted = self.sigmoid(linear_model)

            # Compute gradients
            dw = (1 / n_samples) * np.dot(X.T, (y_predicted - y))
            db = (1 / n_samples) * np.sum(y_predicted - y)

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        linear_model = np.dot(X, self.weights) + self.bias
        y_predicted = self.sigmoid(linear_model)
        return [1 if i > 0.5 else 0 for i in y_predicted]

# Training the model
model = LogisticRegression(learning_rate=0.01, epochs=1000)
model.fit(X, y)


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

y_pred = model.predict(X)

accuracy = accuracy_score(y, y_pred)
# Set average to 'micro', 'macro', 'weighted', or None
precision = precision_score(y, y_pred, average='micro')
recall = recall_score(y, y_pred, average='micro')

print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")

Accuracy: 0.84
Precision: 0.84
Recall: 0.84
