# Information Retrieval Lab: Indexing and Boolean Models
## Part 1: Document Processing and TF-IDF Computation

In [1]:
import os
import math
import nltk
import re 
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import FreqDist
from collections import defaultdict, deque

In [2]:
# === NLTK resources ===
# Download the NLTK "stopwords" package (no-op message if already present).
nltk.download('stopwords')

# === Input / output locations ===
collection_path = "../Collection"  # directory that holds the .txt corpus
output_dir = "exercice_1"          # every generated file is written here

# The output directory must exist before any result file is opened.
os.makedirs(output_dir, exist_ok=True)

# One path per generated artefact, all rooted in output_dir.
(
    output_doc_terms,
    output_inverted_index,
    output_term_freq,
    output_tfidf,
    output_term_doc_weight,
) = [
    os.path.join(output_dir, file_name)
    for file_name in (
        "Document_Terms.txt",
        "Inverted_Index.txt",
        "Term_Frequencies.txt",
        "TFIDF_Weights.txt",
        "Term_Doc_Weights.txt",
    )
]

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moous\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# === Initialize tools ===
# Regex tokenizer. The alternatives are tried left to right at each position,
# so the more specific patterns come before the generic word pattern.
tokenizer = RegexpTokenizer(
    r'(?:[A-Za-z]\.)+'            # abbreviations like U.S.A.
    r'|[A-Za-z]+[\-@]\d+(?:\.\d+)?'  # letters, then '-' or '@', then a number with optional decimals
    r'|\d+(?:[\.\,\-]\d+)*%?'     # digit groups joined by '.', ',' or '-', with an optional trailing '%'
    r'|[A-Za-z]+'                 # plain alphabetic words
)
# Stop words are kept in a set because they are only used for membership tests.
stop_words = set(stopwords.words('english'))
# Stemmer applied to every token that survives stop-word removal.
porter = PorterStemmer()

In [4]:
# === Step 1: Read all documents ===
# Maps doc_id (the file name without its final extension) to the raw text.
documents = {}

# os.listdir() returns entries in arbitrary order; sorting makes every run
# build the downstream dictionaries (and the output files) in the same order.
for filename in sorted(os.listdir(collection_path)):
    if filename.endswith(".txt"):
        # splitext only strips the last extension, so "report.v2.txt" becomes
        # "report.v2". Splitting on the first "." would give "report" and let
        # files such as "report.v1.txt" / "report.v2.txt" overwrite each other.
        doc_id = os.path.splitext(filename)[0]
        with open(os.path.join(collection_path, filename), "r", encoding="utf-8") as f:
            documents[doc_id] = f.read()

In [5]:
# === Step 2: Process documents ===
doc_terms = {}          # {doc_id: [stemmed terms, in document order]}
inverted_index = {}     # {term: {doc_ids}}  (posting sets)

for doc_id, text in documents.items():
    # Tokenize and lowercase first, so the stop-word test sees lowercase tokens.
    lowered_tokens = [token.lower() for token in tokenizer.tokenize(text)]

    # Drop stop words, then stem whatever is left.
    cleaned_terms = [
        porter.stem(token)
        for token in lowered_tokens
        if token not in stop_words
    ]
    doc_terms[doc_id] = cleaned_terms

    # Register this document in the posting set of each of its terms.
    for term in cleaned_terms:
        inverted_index.setdefault(term, set()).add(doc_id)

In [6]:
# === Step 3: Write document-terms and inverted index ===
# One "<doc_id> <term>" line per term occurrence, in document order.
with open(output_doc_terms, "w", encoding="utf-8") as f:
    for doc_id, terms in doc_terms.items():
        for term in terms:
            f.write(f"{doc_id} {term}\n")

# One "<term> <doc_id>" line per posting. Postings are stored in sets, whose
# iteration order for strings changes between interpreter runs (hash
# randomisation), so they are sorted to make the file reproducible.
with open(output_inverted_index, "w", encoding="utf-8") as f:
    for term, docs in inverted_index.items():
        for doc_id in sorted(docs):
            f.write(f"{term} {doc_id}\n")

print(f"Document-Terms saved to {output_doc_terms}")
print(f"Inverted Index saved to {output_inverted_index}")

Document-Terms saved to exercice_1\Document_Terms.txt
Inverted Index saved to exercice_1\Inverted_Index.txt


In [7]:
# === Step 4: Compute and save Term Frequencies ===
# Flatten the per-document term lists into one collection-wide list.
all_terms = []
for terms in doc_terms.values():
    all_terms.extend(terms)

# Number of occurrences of each term across the whole collection.
term_freq = FreqDist(all_terms)

# One "<term> <count>" line per distinct term.
with open(output_term_freq, "w", encoding="utf-8") as f:
    f.writelines(f"{term} {freq}\n" for term, freq in term_freq.items())

print(f"Term Frequencies saved to {output_term_freq}")

Term Frequencies saved to exercice_1\Term_Frequencies.txt


In [8]:
# === Step 5: Compute and save TF-IDF weights ===
N = len(documents)  # number of documents in the collection

# Document frequency: size of each term's posting set.
doc_freq = {}
for term, docs in inverted_index.items():
    doc_freq[term] = len(docs)

tfidf_weights = {}   # {doc_id: {term: weight}}
term_doc_data = []   # (term, doc_id, weight, frequency) rows for the combined file

for doc_id, terms in doc_terms.items():
    # Raw occurrence count of every term in this document.
    term_freq_doc = {}
    for term in terms:
        if term in term_freq_doc:
            term_freq_doc[term] += 1
        else:
            term_freq_doc[term] = 1

    # Highest count in the document; falls back to 1 when the document is empty.
    max_freq = max(term_freq_doc.values(), default=1)

    doc_weights = {}
    for term, freq_td in term_freq_doc.items():
        nt = doc_freq.get(term, 1)
        tf = freq_td / max_freq            # frequency normalised by the document maximum
        idf = math.log10((N / nt) + 1)
        weight = tf * idf
        doc_weights[term] = weight
        term_doc_data.append((term, doc_id, weight, freq_td))
    tfidf_weights[doc_id] = doc_weights

# One "<doc_id> <term> <weight>" line per pair, weight rounded to 3 decimals.
with open(output_tfidf, "w", encoding="utf-8") as f:
    for doc_id, weights in tfidf_weights.items():
        f.writelines(
            f"{doc_id} {term} {weight:.3f}\n" for term, weight in weights.items()
        )

print(f"TF-IDF Weights saved to {output_tfidf}")

TF-IDF Weights saved to exercice_1\TFIDF_Weights.txt


In [9]:
# === Step 6: Save combined Term-Doc-Weight-Frequency file ===
# Tab-separated table: a header row, then one row per (term, document) pair.
with open(output_term_doc_weight, "w", encoding="utf-8") as f:
    f.write("Term\tDocument\tWeight\tFrequency\n")
    f.writelines(
        f"{term}\t{doc_id}\t{weight:.3f}\t{freq}\n"
        for term, doc_id, weight, freq in term_doc_data
    )

print(f"Term-Doc-Weight-Frequency file saved to {output_term_doc_weight}")
print("\nAll files have been created successfully in:", os.path.abspath(output_dir))

Term-Doc-Weight-Frequency file saved to exercice_1\Term_Doc_Weights.txt

All files have been created successfully in: c:\Users\moous\Documents\M2\RI\Information-Retrival-RI-TPs\LAB2\exercice_1


---
## Part 2: Boolean Models Implementation

In [10]:
# === Query preprocessing ===
def preprocess_query(query_text):
    """Turn raw query text into a list of normalised terms.

    The text is tokenized with the module-level ``tokenizer``, each token is
    lowercased, tokens found in ``stop_words`` are discarded, and the
    remaining ones are stemmed with ``porter``.

    Args:
        query_text: Raw text to normalise.

    Returns:
        List of stemmed terms in their original order (possibly empty).
    """
    lowered = (token.lower() for token in tokenizer.tokenize(query_text))
    return [porter.stem(token) for token in lowered if token not in stop_words]

In [11]:
# === Boolean query parsing ===
# Lexer pattern for Boolean queries. It matches, in priority order: "(" and
# ")", the whole-word keywords AND / OR / NOT (case-insensitive because of
# re.IGNORECASE), and runs of ASCII letters/digits. Characters that match
# none of the alternatives (whitespace, punctuation) are not part of any token.
TOKEN_RE = re.compile(r"\(|\)|\bAND\b|\bOR\b|\bNOT\b|[A-Za-z0-9]+", re.IGNORECASE)

class TermNode:
    """Leaf of the query tree: a single query term."""
    def __init__(self, term: str) -> None:
        # Term string that the evaluators look up in the index / weight tables.
        self.term = term

class NotNode:
    """Negation of a single sub-expression of the query tree."""
    def __init__(self, child) -> None:
        # The node whose result is negated.
        self.child = child

class AndNode:
    """Conjunction of several sub-expressions of the query tree."""
    def __init__(self, children) -> None:
        # Sequence of operand nodes; the evaluators iterate over it.
        self.children = children

class OrNode:
    """Disjunction of several sub-expressions of the query tree."""
    def __init__(self, children) -> None:
        # Sequence of operand nodes; the evaluators iterate over it.
        self.children = children


def tokenize_boolean_query(query):
    """Split a Boolean query string into a flat list of tokens.

    Tokens are whatever ``TOKEN_RE`` finds in ``query``. The operators AND,
    OR and NOT are returned upper-cased regardless of how they were typed;
    every other token is returned unchanged.

    Args:
        query: Boolean query string.

    Returns:
        List of token strings in query order.
    """
    operators = ("AND", "OR", "NOT")
    tokens = []
    for raw_token in TOKEN_RE.findall(query):
        upper_token = raw_token.upper()
        tokens.append(upper_token if upper_token in operators else raw_token)
    return tokens


def parse_boolean_query(query):
    """Parse a Boolean query string into a tree of query nodes.

    Recursive-descent parser over the tokens produced by
    ``tokenize_boolean_query``. Precedence, from lowest to highest::

        expr     := and_expr ("OR" and_expr)*
        and_expr := not_expr ("AND" not_expr)*
        not_expr := "NOT" not_expr | operand
        operand  := "(" expr ")" | TERM

    Each TERM goes through ``preprocess_query`` and the first resulting term
    is used. If preprocessing returns nothing (for instance the token is a
    stop word), the lowercased raw token is used instead.

    Args:
        query: Boolean query string.

    Returns:
        A ``TermNode``, ``NotNode``, ``AndNode`` or ``OrNode``. A chain with a
        single operand is returned as that operand, without a wrapper node.

    Raises:
        ValueError: If the query is empty, an operand is missing, a
            parenthesis is unbalanced, an operator or ")" appears where a
            term is expected, or tokens remain after a complete expression.
    """
    tokens = deque(tokenize_boolean_query(query))
    # Tokens that can never start an operand ("NOT" is consumed by parse_not).
    non_operands = ("AND", "OR", ")")

    def parse_expr():
        return parse_or()

    def parse_or():
        children = [parse_and()]
        while tokens and tokens[0] == "OR":
            tokens.popleft()
            children.append(parse_and())
        if len(children) == 1:
            return children[0]
        return OrNode(children)

    def parse_and():
        children = [parse_not()]
        while tokens and tokens[0] == "AND":
            tokens.popleft()
            children.append(parse_not())
        if len(children) == 1:
            return children[0]
        return AndNode(children)

    def parse_not():
        if tokens and tokens[0] == "NOT":
            tokens.popleft()
            return NotNode(parse_not())
        return parse_term()

    def parse_term():
        if not tokens:
            raise ValueError("Unexpected end of query")
        tok = tokens.popleft()
        if tok == "(":
            node = parse_expr()
            if not tokens or tokens.popleft() != ")":
                raise ValueError("Missing closing parenthesis")
            return node
        if tok in non_operands:
            # Without this check "a AND OR b" or "()" would silently create a
            # TermNode named "or" / ")" that can never match anything.
            raise ValueError(f"Unexpected token {tok!r} where a term was expected")
        processed = preprocess_query(tok)
        if not processed:
            # Preprocessing removed the token; keep it lowercased.
            return TermNode(tok.lower())
        return TermNode(processed[0])

    tree = parse_expr()
    if tokens:
        # For example "a b" or "a) AND b": the rest of the query would
        # otherwise be dropped without any warning.
        raise ValueError(
            f"Unexpected token {tokens[0]!r} after the end of the expression"
        )
    return tree

In [12]:
# === Classic Boolean Model ===
def evaluate_classic_boolean(node, inverted_index, all_docs):
    """Evaluate a query tree under the classic (set-based) Boolean model.

    Args:
        node: Root of the query tree (``TermNode``, ``NotNode``, ``AndNode``
            or ``OrNode``).
        inverted_index: Mapping ``term -> collection of doc_ids``.
        all_docs: Set of every document id; used as the universe for NOT and
            as the starting point of AND.

    Returns:
        A new set of matching document ids. The result never aliases a
        posting set stored in ``inverted_index``.

    Raises:
        ValueError: If ``node`` is not one of the known node types.
    """
    if isinstance(node, TermNode):
        # Copy the postings: returning the stored set would let a caller that
        # mutates the result corrupt the index.
        return set(inverted_index.get(node.term, ()))

    if isinstance(node, NotNode):
        child_docs = evaluate_classic_boolean(node.child, inverted_index, all_docs)
        return all_docs - child_docs

    if isinstance(node, AndNode):
        result = all_docs.copy()
        for child in node.children:
            result &= evaluate_classic_boolean(child, inverted_index, all_docs)
        return result

    if isinstance(node, OrNode):
        result = set()
        for child in node.children:
            result |= evaluate_classic_boolean(child, inverted_index, all_docs)
        return result

    raise ValueError("Unknown node type")

In [13]:
# === Fuzzy Boolean Model ===
def get_term_scores(term, doc_terms, tfidf_weights):
    """Return the weight of ``term`` in every document.

    Args:
        term: Term to look up.
        doc_terms: Mapping keyed by doc_id; only its keys are used, to
            enumerate the documents.
        tfidf_weights: Mapping ``doc_id -> {term: weight}``.

    Returns:
        Dict ``doc_id -> weight`` with one entry per key of ``doc_terms``, in
        the same order. Documents that do not contain the term (or have no
        entry in ``tfidf_weights``) get ``0.0``.

    Note:
        Weights are returned as stored, without normalisation.
        NOTE(review): the Boolean evaluators compute ``1 - score``, which
        presumes weights lie in [0, 1] - confirm for large collections.
    """
    return {
        doc_id: tfidf_weights.get(doc_id, {}).get(term, 0.0)
        for doc_id in doc_terms
    }


def evaluate_fuzzy_boolean(node, doc_terms, tfidf_weights):
    """Score every document against a query tree with fuzzy-set operators.

    A term scores its stored weight, NOT is ``1 - score``, AND is the minimum
    of its operands and OR is the maximum.

    Args:
        node: Root of the query tree.
        doc_terms: Mapping keyed by doc_id; its keys define the documents.
        tfidf_weights: Mapping ``doc_id -> {term: weight}``.

    Returns:
        Dict ``doc_id -> score`` covering every document.

    Raises:
        ValueError: If ``node`` is not one of the known node types.
    """
    if isinstance(node, TermNode):
        return get_term_scores(node.term, doc_terms, tfidf_weights)

    doc_ids = set(doc_terms.keys())

    if isinstance(node, NotNode):
        inner = evaluate_fuzzy_boolean(node.child, doc_terms, tfidf_weights)
        complement = {}
        for doc_id in doc_ids:
            complement[doc_id] = 1.0 - inner.get(doc_id, 0.0)
        return complement

    if isinstance(node, (AndNode, OrNode)):
        # AND keeps the weakest operand, OR the strongest.
        combine = min if isinstance(node, AndNode) else max
        per_child = [
            evaluate_fuzzy_boolean(child, doc_terms, tfidf_weights)
            for child in node.children
        ]
        combined = {}
        for doc_id in doc_ids:
            combined[doc_id] = combine(
                scores.get(doc_id, 0.0) for scores in per_child
            )
        return combined

    raise ValueError("Unknown node type")

In [14]:
# === Extended Boolean Model (p-norm) ===
def evaluate_extended_boolean(node, doc_terms, tfidf_weights, p=2):
    """Score every document against a query tree with the p-norm model.

    For operand scores ``x_1..x_n`` of one document:

    * OR  -> ``(mean(x_i ** p)) ** (1 / p)``
    * AND -> ``1 - (mean((1 - x_i) ** p)) ** (1 / p)``
    * NOT -> ``1 - x``

    With ``p = math.inf`` the limits of these formulas are used: OR becomes
    ``max(x_i)`` and AND becomes ``min(x_i)``.

    Args:
        node: Root of the query tree.
        doc_terms: Mapping keyed by doc_id; its keys define the documents.
        tfidf_weights: Mapping ``doc_id -> {term: weight}``.
        p: Norm parameter; must be strictly positive (``math.inf`` allowed).

    Returns:
        Dict ``doc_id -> score`` covering every document.

    Raises:
        ValueError: If ``p`` is not strictly positive, or if ``node`` is not
            one of the known node types.
    """
    if p <= 0:
        # p == 0 would divide by zero in 1/p, and a negative p raises on
        # 0.0 ** p for any document that lacks a term.
        raise ValueError("p must be a positive number")

    def p_mean(values):
        # Generalised mean of order p. The literal formula breaks for an
        # infinite p (every value ** inf is 0 or 1 and the outer exponent
        # becomes 0), so its limit, the maximum, is returned instead.
        # NOTE(review): the limit assumes non-negative values - confirm that
        # weights stay within [0, 1].
        if math.isinf(p):
            return max(values)
        avg = sum(v**p for v in values) / len(values)
        return avg ** (1/p)

    all_docs = set(doc_terms.keys())

    if isinstance(node, TermNode):
        return get_term_scores(node.term, doc_terms, tfidf_weights)

    if isinstance(node, NotNode):
        child_scores = evaluate_extended_boolean(node.child, doc_terms, tfidf_weights, p)
        return {doc_id: 1.0 - child_scores.get(doc_id, 0.0) for doc_id in all_docs}

    if isinstance(node, AndNode):
        child_scores = [evaluate_extended_boolean(child, doc_terms, tfidf_weights, p)
                        for child in node.children]
        result = {}
        for doc_id in all_docs:
            # Distance from the ideal point where every operand scores 1.
            values = [1 - s.get(doc_id, 0.0) for s in child_scores]
            result[doc_id] = 1 - p_mean(values)
        return result

    if isinstance(node, OrNode):
        child_scores = [evaluate_extended_boolean(child, doc_terms, tfidf_weights, p)
                        for child in node.children]
        result = {}
        for doc_id in all_docs:
            # Distance from the worst point where every operand scores 0.
            values = [s.get(doc_id, 0.0) for s in child_scores]
            result[doc_id] = p_mean(values)
        return result

    raise ValueError("Unknown node type")

In [15]:
# === Display functions ===
def display_classic_results(matching_docs, query):
    """Print the documents matched by the classic Boolean model.

    Args:
        matching_docs: Collection of matching document ids.
        query: Query string echoed in the report header.

    Prints a banner, the query, the number of matches and then the matching
    ids in sorted order, or a "no match" line when there are none.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("CLASSIC BOOLEAN MODEL")
    print(separator)
    print(f"Query: {query}")
    print(f"\nMatching Documents: {len(matching_docs)}")

    if not matching_docs:
        print("  No documents match the query.")
        return

    for doc_id in sorted(matching_docs):
        print(f"  - {doc_id}")


def display_fuzzy_results(scores, query, threshold=0.0):
    """Print the fuzzy Boolean model ranking.

    Args:
        scores: Mapping ``doc_id -> score``.
        query: Query string echoed in the report header.
        threshold: Only documents with ``score > threshold`` are listed.

    Documents are listed by decreasing score. Equal scores are ordered by
    doc_id, so the listing does not depend on the insertion order of
    ``scores``.
    """
    print("\n" + "="*70)
    print("FUZZY BOOLEAN MODEL")
    print("="*70)
    print(f"Query: {query}")
    print(f"\nRanked Documents (score > {threshold}):")

    # Sorting on the score alone leaves ties in dict order, which upstream
    # derives from set iteration and therefore changes between runs.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    shown = [(doc_id, score) for doc_id, score in ranked if score > threshold]

    for doc_id, score in shown:
        print(f"  {doc_id}: {score:.4f}")
    if not shown:
        print("  No documents with positive relevance scores.")


def display_extended_results(scores, query, p=2, threshold=0.0):
    """Print the extended (p-norm) Boolean model ranking.

    Args:
        scores: Mapping ``doc_id -> score``.
        query: Query string echoed in the report header.
        p: Norm parameter shown in the banner (display only).
        threshold: Only documents with ``score > threshold`` are listed.

    Documents are listed by decreasing score. Equal scores are ordered by
    doc_id, so the listing does not depend on the insertion order of
    ``scores``.
    """
    print("\n" + "="*70)
    print(f"EXTENDED BOOLEAN MODEL (p = {p})")
    print("="*70)
    print(f"Query: {query}")
    print(f"\nRanked Documents (score > {threshold}):")

    # Sorting on the score alone leaves ties in dict order, which upstream
    # derives from set iteration and therefore changes between runs.
    ranked = sorted(scores.items(), key=lambda item: (-item[1], item[0]))
    shown = [(doc_id, score) for doc_id, score in ranked if score > threshold]

    for doc_id, score in shown:
        print(f"  {doc_id}: {score:.4f}")
    if not shown:
        print("  No documents with positive relevance scores.")

In [16]:
# === Run all models ===
def run_boolean_models(test_query="(query AND reformulation) OR (Language AND model)", p=2):
    """Evaluate one Boolean query with the three models and print the results.

    Reads the notebook-level ``documents``, ``inverted_index``, ``doc_terms``
    and ``tfidf_weights`` built in Part 1.

    Args:
        test_query: Boolean query string (AND / OR / NOT and parentheses).
            Defaults to the lab's sample query.
        p: Norm parameter for the extended Boolean model.

    Raises:
        ValueError: Propagated from ``parse_boolean_query`` when the query is
            malformed.
    """
    print("\n" + "#"*70)
    print("# BOOLEAN MODELS EVALUATION")
    print("#"*70)
    print(f"\nOriginal Query: {test_query}")

    # Parse once; the same tree is evaluated by all three models.
    query_tree = parse_boolean_query(test_query)
    all_docs = set(documents.keys())

    # Classic
    classic_results = evaluate_classic_boolean(query_tree, inverted_index, all_docs)
    display_classic_results(classic_results, test_query)

    # Fuzzy
    fuzzy_scores = evaluate_fuzzy_boolean(query_tree, doc_terms, tfidf_weights)
    display_fuzzy_results(fuzzy_scores, test_query)

    # Extended
    extended_scores = evaluate_extended_boolean(query_tree, doc_terms, tfidf_weights, p=p)
    display_extended_results(extended_scores, test_query, p=p, threshold=0.0)

    print("\n" + "#"*70)


# Run with the default sample query
run_boolean_models()


######################################################################
# BOOLEAN MODELS EVALUATION
######################################################################

Original Query: (query AND reformulation) OR (Language AND model)

CLASSIC BOOLEAN MODEL
Query: (query AND reformulation) OR (Language AND model)

Matching Documents: 6
  - D1
  - D2
  - D3
  - D4
  - D5
  - D6

FUZZY BOOLEAN MODEL
Query: (query AND reformulation) OR (Language AND model)

Ranked Documents (score > 0.0):
  D4: 0.2676
  D1: 0.2388
  D5: 0.1003
  D2: 0.0502
  D6: 0.0376
  D3: 0.0376

EXTENDED BOOLEAN MODEL (p = 2)
Query: (query AND reformulation) OR (Language AND model)

Ranked Documents (score > 0.0):
  D4: 0.2357
  D1: 0.2145
  D3: 0.1124
  D5: 0.0914
  D2: 0.0737
  D6: 0.0398

######################################################################
