### Boolean Information Retrieval

In [1]:
# Sample documents
docs = {
    "doc1": "artificial intelligence and machine learning revolutionize modern technology",
    "doc2": "natural language processing enables computers to understand human language",
    "doc3": "computer vision algorithms analyze and interpret visual data"
}

# Preprocessing: lowercase + split
def preprocess(text):
    return text.lower().split()



In [2]:
# Build term-document matrix (binary)
terms = {}
doc_list = list(docs.keys())

for i, (doc, text) in enumerate(docs.items()):
    words = set(preprocess(text))
    for word in words:
        if word not in terms:
            terms[word] = [0] * len(docs)
        terms[word][i] = 1


In [3]:
# Show matrix
print("\nTerm-Document Matrix:")
for term, vec in terms.items():
    print(f"{term:15}: {vec}")

# Evaluate boolean query (supports AND, OR, NOT)
def evaluate(query):
    query = query.lower().split()
    result = None
    operator = None
    i = 0

    while i < len(query):
        if query[i] == 'not':
            word = query[i + 1]
            vec = terms.get(word, [0] * len(docs))
            vec = [1 - x for x in vec]
            i += 2
        elif query[i] in ['and', 'or']:
            operator = query[i]
            i += 1
            continue
        else:
            word = query[i]
            vec = terms.get(word, [0] * len(docs))
            i += 1

        if result is None:
            result = vec
        else:
            if operator == 'and':
                result = [a & b for a, b in zip(result, vec)]
            elif operator == 'or':
                result = [a | b for a, b in zip(result, vec)]

    return result



Term-Document Matrix:
machine        : [1, 0, 0]
revolutionize  : [1, 0, 0]
artificial     : [1, 0, 0]
learning       : [1, 0, 0]
technology     : [1, 0, 0]
intelligence   : [1, 0, 0]
modern         : [1, 0, 0]
and            : [1, 0, 1]
to             : [0, 1, 0]
language       : [0, 1, 0]
natural        : [0, 1, 0]
processing     : [0, 1, 0]
understand     : [0, 1, 0]
enables        : [0, 1, 0]
human          : [0, 1, 0]
computers      : [0, 1, 0]
data           : [0, 0, 1]
computer       : [0, 0, 1]
visual         : [0, 0, 1]
vision         : [0, 0, 1]
interpret      : [0, 0, 1]
analyze        : [0, 0, 1]
algorithms     : [0, 0, 1]


In [4]:
# Get user query
query = input("\nEnter Boolean query (AND, OR, NOT): ")
# Example queries to try:
# - artificial AND intelligence
# - machine OR learning
# - NOT language

# Evaluate
result_vec = evaluate(query)
matches = [doc_list[i] for i, v in enumerate(result_vec) if v == 1]

# Show result
print("\nMatching Documents:", matches if matches else "No match found.")


Enter Boolean query (AND, OR, NOT):  machine OR learning



Matching Documents: ['doc1']


### Term Weighthing Meachanism - TF, IDF, TF-IDF

In [5]:
import math

# Sample documents
documents = {
    "doc1": "deep learning neural networks transform artificial intelligence",
    "doc2": "software engineering principles guide system development",
    "doc3": "data science analytics reveal meaningful business insights"
}

def preprocess(text):
    return text.lower().split()

# Build vocabulary
vocab = set()
preprocessed_docs = {}

for name, text in documents.items():
    tokens = preprocess(text)
    preprocessed_docs[name] = tokens
    vocab.update(tokens)

print(f'Preprocessed docs: {preprocessed_docs}')
vocab = sorted(list(vocab))
print("Vocabulary:", vocab)




Preprocessed docs: {'doc1': ['deep', 'learning', 'neural', 'networks', 'transform', 'artificial', 'intelligence'], 'doc2': ['software', 'engineering', 'principles', 'guide', 'system', 'development'], 'doc3': ['data', 'science', 'analytics', 'reveal', 'meaningful', 'business', 'insights']}
Vocabulary: ['analytics', 'artificial', 'business', 'data', 'deep', 'development', 'engineering', 'guide', 'insights', 'intelligence', 'learning', 'meaningful', 'networks', 'neural', 'principles', 'reveal', 'science', 'software', 'system', 'transform']


### Term Frequency

In [6]:
def compute_tf(doc_tokens, vocab):
    tf = {}
    total_terms = len(doc_tokens)
    for term in vocab:
        tf[term] = doc_tokens.count(term) / total_terms
    return tf


### Inverse Document Frequency

In [7]:
def compute_idf(all_docs, vocab):
    N = len(all_docs)
    idf = {}
    for term in vocab:
        df = sum(1 for doc in all_docs.values() if term in doc)
        idf[term] = math.log(N / (df + 1)) + 1  # +1 smoothing
    return idf


### TF-IDF

In [8]:
def compute_tfidf(tf, idf):
    tfidf = {}
    for term in tf:
        tfidf[term] = tf[term] * idf[term]
    return tfidf


In [9]:
# Compute IDF once for all docs
idf = compute_idf(preprocessed_docs, vocab)

# For each document, compute TF and TF-IDF
for name, tokens in preprocessed_docs.items():
    tf = compute_tf(tokens, vocab)
    tfidf = compute_tfidf(tf, idf)

    print(f"\nDocument: {name}")
    print("TF:     ", {k: round(v, 3) for k, v in tf.items()})
    print("TF-IDF: ", {k: round(v, 3) for k, v in tfidf.items()})



Document: doc1
TF:      {'analytics': 0.0, 'artificial': 0.143, 'business': 0.0, 'data': 0.0, 'deep': 0.143, 'development': 0.0, 'engineering': 0.0, 'guide': 0.0, 'insights': 0.0, 'intelligence': 0.143, 'learning': 0.143, 'meaningful': 0.0, 'networks': 0.143, 'neural': 0.143, 'principles': 0.0, 'reveal': 0.0, 'science': 0.0, 'software': 0.0, 'system': 0.0, 'transform': 0.143}
TF-IDF:  {'analytics': 0.0, 'artificial': 0.201, 'business': 0.0, 'data': 0.0, 'deep': 0.201, 'development': 0.0, 'engineering': 0.0, 'guide': 0.0, 'insights': 0.0, 'intelligence': 0.201, 'learning': 0.201, 'meaningful': 0.0, 'networks': 0.201, 'neural': 0.201, 'principles': 0.0, 'reveal': 0.0, 'science': 0.0, 'software': 0.0, 'system': 0.0, 'transform': 0.201}

Document: doc2
TF:      {'analytics': 0.0, 'artificial': 0.0, 'business': 0.0, 'data': 0.0, 'deep': 0.0, 'development': 0.167, 'engineering': 0.167, 'guide': 0.167, 'insights': 0.0, 'intelligence': 0.0, 'learning': 0.0, 'meaningful': 0.0, 'networks': 0.0,

## Cosine Similarity

In [10]:
import math

# Sample documents
documents = [
    "python programming language supports machine learning development",
    "java enterprise applications require robust software architecture",
]

# Build vocabulary (unique words in all documents)
def build_vocab(docs):
    vocab_set = set()
    for doc in docs:
        vocab_set.update(doc.split())
    return sorted(vocab_set)

vocab = build_vocab(documents)
print("Vocabulary:", vocab)

Vocabulary: ['applications', 'architecture', 'development', 'enterprise', 'java', 'language', 'learning', 'machine', 'programming', 'python', 'require', 'robust', 'software', 'supports']


In [11]:
# Create Bag of Words vector for a document
def bow_vector(doc, vocab):
    words = doc.split()
    return [words.count(term) for term in vocab]

In [12]:
# Compute cosine similarity between two vectors
def cosine_similarity(vec1, vec2):
    dot_product = sum(a * b for a, b in zip(vec1, vec2))
    magnitude1 = math.sqrt(sum(a * a for a in vec1))
    magnitude2 = math.sqrt(sum(b * b for b in vec2))
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)

# Create BoW vectors for all documents
vectors = [bow_vector(doc, vocab) for doc in documents]

# Compute similarity between first and second documents
similarity = cosine_similarity(vectors[0], vectors[1])

print("\nBoW Vector for Document 1:", vectors[0])
print("BoW Vector for Document 2:", vectors[1])
print("Cosine Similarity between Doc1 and Doc2:", round(similarity, 4))


BoW Vector for Document 1: [0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1]
BoW Vector for Document 2: [1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0]
Cosine Similarity between Doc1 and Doc2: 0.0


**Kullback–Leibler (KL) divergence**

In [13]:
import math
import numpy as np

# Sample query and documents
query = "machine learning algorithms neural networks"
documents = {
    "doc1": "machine learning algorithms are used in artificial intelligence applications",
    "doc2": "neural networks deep learning models for pattern recognition",
    "doc3": "database management systems and data storage solutions",
    "doc4": "natural language processing and text mining techniques",
    "doc5": "computer vision algorithms for image recognition and analysis"
}

def preprocess(text):
    return text.lower().split()

# Build vocabulary from query and documents
vocab = set()
query_tokens = preprocess(query)
vocab.update(query_tokens)

doc_tokens = {}
for doc_id, text in documents.items():
    tokens = preprocess(text)
    doc_tokens[doc_id] = tokens
    vocab.update(tokens)

vocab = sorted(list(vocab))
print("Vocabulary:", vocab)
print("Query tokens:", query_tokens)

Vocabulary: ['algorithms', 'analysis', 'and', 'applications', 'are', 'artificial', 'computer', 'data', 'database', 'deep', 'for', 'image', 'in', 'intelligence', 'language', 'learning', 'machine', 'management', 'mining', 'models', 'natural', 'networks', 'neural', 'pattern', 'processing', 'recognition', 'solutions', 'storage', 'systems', 'techniques', 'text', 'used', 'vision']
Query tokens: ['machine', 'learning', 'algorithms', 'neural', 'networks']


In [14]:
# Calculate probability distribution for query model
def query_model(query_tokens, vocab, smoothing=0.1):
    """Create probability distribution for query using maximum likelihood with smoothing"""
    prob_dist = {}
    query_length = len(query_tokens)
    vocab_size = len(vocab)
    
    for term in vocab:
        term_count = query_tokens.count(term)
        # Laplace smoothing
        prob_dist[term] = (term_count + smoothing) / (query_length + smoothing * vocab_size)
    
    return prob_dist

# Calculate probability distribution for document model  
def document_model(doc_tokens, vocab, smoothing=0.1):
    """Create probability distribution for document using maximum likelihood with smoothing"""
    prob_dist = {}
    doc_length = len(doc_tokens)
    vocab_size = len(vocab)
    
    for term in vocab:
        term_count = doc_tokens.count(term)
        # Laplace smoothing
        prob_dist[term] = (term_count + smoothing) / (doc_length + smoothing * vocab_size)
    
    return prob_dist

# Calculate query model
P_Q = query_model(query_tokens, vocab)
print("\nQuery Model P(w|Q):")
for term, prob in P_Q.items():
    print(f"{term:12}: {prob:.4f}")


Query Model P(w|Q):
algorithms  : 0.1325
analysis    : 0.0120
and         : 0.0120
applications: 0.0120
are         : 0.0120
artificial  : 0.0120
computer    : 0.0120
data        : 0.0120
database    : 0.0120
deep        : 0.0120
for         : 0.0120
image       : 0.0120
in          : 0.0120
intelligence: 0.0120
language    : 0.0120
learning    : 0.1325
machine     : 0.1325
management  : 0.0120
mining      : 0.0120
models      : 0.0120
natural     : 0.0120
networks    : 0.1325
neural      : 0.1325
pattern     : 0.0120
processing  : 0.0120
recognition : 0.0120
solutions   : 0.0120
storage     : 0.0120
systems     : 0.0120
techniques  : 0.0120
text        : 0.0120
used        : 0.0120
vision      : 0.0120


In [15]:
# KL Divergence calculation
def kl_divergence(P, Q):
    """Calculate KL divergence from P to Q: D_KL(P || Q)"""
    kl_div = 0.0
    for term in P:
        if P[term] > 0:  # Only calculate for terms with non-zero probability in P
            if Q[term] > 0:  # Avoid log(0)
                kl_div += P[term] * math.log(P[term] / Q[term])
            else:
                # If Q[term] = 0 but P[term] > 0, KL divergence is infinite
                # In practice, we use smoothing to avoid this
                kl_div += float('inf')
    return kl_div

# Calculate document models and KL divergences
print("\nDocument Models and KL Divergences:")
print("="*50)

scores = {}
for doc_id, tokens in doc_tokens.items():
    P_D = document_model(tokens, vocab)
    
    print(f"\n{doc_id.upper()} Model P(w|D):")
    for term, prob in P_D.items():
        if prob > 0.01:  # Only show terms with reasonable probability
            print(f"{term:12}: {prob:.4f}")
    
    # Calculate KL divergence from Query to Document: D_KL(P_Q || P_D)
    kl_score = kl_divergence(P_Q, P_D)
    scores[doc_id] = kl_score
    
    print(f"KL Divergence D_KL(Q || {doc_id}): {kl_score:.4f}")

# Rank documents (lower KL divergence = better match)
print(f"\n{'='*50}")
print("DOCUMENT RANKING (Lower KL divergence = Better match):")
print("="*50)

ranked_docs = sorted(scores.items(), key=lambda x: x[1])
for rank, (doc_id, score) in enumerate(ranked_docs, 1):
    print(f"Rank {rank}: {doc_id} (KL divergence: {score:.4f})")
    print(f"         Text: '{documents[doc_id]}'")
    print()


Document Models and KL Divergences:

DOC1 Model P(w|D):
algorithms  : 0.0894
applications: 0.0894
are         : 0.0894
artificial  : 0.0894
in          : 0.0894
intelligence: 0.0894
learning    : 0.0894
machine     : 0.0894
used        : 0.0894
KL Divergence D_KL(Q || doc1): 0.8556

DOC2 Model P(w|D):
deep        : 0.0973
for         : 0.0973
learning    : 0.0973
models      : 0.0973
networks    : 0.0973
neural      : 0.0973
pattern     : 0.0973
recognition : 0.0973
KL Divergence D_KL(Q || doc2): 0.7997

DOC3 Model P(w|D):
and         : 0.1068
data        : 0.1068
database    : 0.1068
management  : 0.1068
solutions   : 0.1068
storage     : 0.1068
systems     : 0.1068
KL Divergence D_KL(Q || doc3): 1.6026

DOC4 Model P(w|D):
and         : 0.1068
language    : 0.1068
mining      : 0.1068
natural     : 0.1068
processing  : 0.1068
techniques  : 0.1068
text        : 0.1068
KL Divergence D_KL(Q || doc4): 1.6026

DOC5 Model P(w|D):
algorithms  : 0.0973
analysis    : 0.0973
and         : 0.09

In [16]:
# Alternative: KL divergence from Document to Query D_KL(P_D || P_Q)
print("\nAlternative Direction: D_KL(Document || Query)")
print("="*50)

alt_scores = {}
for doc_id, tokens in doc_tokens.items():
    P_D = document_model(tokens, vocab)
    kl_score_alt = kl_divergence(P_D, P_Q)
    alt_scores[doc_id] = kl_score_alt
    print(f"D_KL({doc_id} || Q): {kl_score_alt:.4f}")

# Compare both directions
print(f"\n{'='*60}")
print("COMPARISON OF BOTH DIRECTIONS:")
print("="*60)
print(f"{'Document':<8} {'D_KL(Q||D)':<12} {'D_KL(D||Q)':<12} {'Text'}")
print("-" * 60)

for doc_id in documents.keys():
    print(f"{doc_id:<8} {scores[doc_id]:<12.4f} {alt_scores[doc_id]:<12.4f} {documents[doc_id][:30]}...")

print(f"\nNote: In IR, we typically use D_KL(Q || D) where:")
print("- Lower divergence means the document model is closer to the query model")
print("- This measures how much 'information is lost' when using document model instead of query model")


Alternative Direction: D_KL(Document || Query)
D_KL(doc1 || Q): 0.8543
D_KL(doc2 || Q): 0.8161
D_KL(doc3 || Q): 1.4603
D_KL(doc4 || Q): 1.4603
D_KL(doc5 || Q): 1.2405

COMPARISON OF BOTH DIRECTIONS:
Document D_KL(Q||D)   D_KL(D||Q)   Text
------------------------------------------------------------
doc1     0.8556       0.8543       machine learning algorithms ar...
doc2     0.7997       0.8161       neural networks deep learning ...
doc3     1.6026       1.4603       database management systems an...
doc4     1.6026       1.4603       natural language processing an...
doc5     1.3775       1.2405       computer vision algorithms for...

Note: In IR, we typically use D_KL(Q || D) where:
- Lower divergence means the document model is closer to the query model
- This measures how much 'information is lost' when using document model instead of query model
