In [25]:
import os, re, math, collections
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk

# ---------- CONFIG ----------
# Point these to your Lab1 outputs (Porter, weighted)
OUTPUT_DIR = "results"
INVERTED_WEIGHTED = os.path.join(OUTPUT_DIR, "inverted_index_weighted.txt")
# file format per Lab1: "<Term>\tDocID\t<TF_max>\t<Weight>" or space-separated

# Query to test (Task 1)
RAW_QUERY = "(query AND reformulation) OR (Language AND model)"

# ---------- PREPROC (reuse Lab1 pipeline) ----------
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))
porter = PorterStemmer()

# Tokenizer that preserves words and simple tokens; enough for queries
TOKENIZER = RegexpTokenizer(
    r'(?:[A-Za-z]\.){2,}'     # e.g., "e.g."
    r'|[A-Za-z]+(?:-[A-Za-z]+)+'  # hyphenated words
    r'|[A-Za-z]+'             # words
)

def remove_stopwords(tokens):
    # only drop stopwords for pure alpha tokens
    return [t for t in tokens if not (t.isalpha() and t in stop_words)]

def stem_porter(tokens):
    return [porter.stem(t) for t in tokens]

def preprocess_text_to_terms(text):
    toks = TOKENIZER.tokenize(text.lower())
    toks = remove_stopwords(toks)
    stems = stem_porter(toks)
    return stems

# ---------- LOAD INVERTED INDEX (from Lab1) ----------
# Build postings: term -> {docId(str like "D1"): degree in [0,1]}
# We'll take TF_max (already in [0,1]) as the membership degree μ_t(d)
postings = collections.defaultdict(dict)
all_docs = set()

def _split_line(line):
    # Accept both tab or space separated
    if "\t" in line:
        parts = line.strip().split("\t")
    else:
        parts = line.strip().split()
    return parts

with open(INVERTED_WEIGHTED, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip(): 
            continue
        parts = _split_line(line)
        # Expect 4 columns: term, doc, tf_norm, weight
        if len(parts) < 4:
            continue
        term, doc, tf_norm_str, _ = parts[0], parts[1], parts[2], parts[3]
        # Normalize doc like "D3" → "D3"
        doc = doc.strip()
        try:
            tf_norm = float(tf_norm_str)
        except:
            continue
        postings[term][doc] = tf_norm
        all_docs.add(doc)

all_docs = sorted(all_docs, key=lambda d: int(d[1:]))  # ["D1","D2",...]



In [26]:
# ---------- BOOLEAN PARSER ----------
# We’ll parse infix boolean expressions with precedence: NOT > AND > OR
# Supported tokens: terms, AND, OR, NOT, parentheses
OPERATORS = {"NOT": 3, "AND": 2, "OR": 1}  # precedence

def tokenize_boolean_query(q):
    # split on spaces and parentheses, keep parentheses
    q = q.replace("(", " ( ").replace(")", " ) ")
    raw = q.strip().split()
    return raw

def preprocess_boolean_terms(tokens):
    # Lower + stem non-operator tokens
    result = []
    for t in tokens:
        up = t.upper()
        if up in ("AND","OR","NOT","(",")"):
            result.append(up)
        else:
            # preprocess as a single-term text
            stemmed = preprocess_text_to_terms(t)
            if len(stemmed) == 0:
                continue
            # queries like "Language" -> ["languag"]
            result.append(stemmed[0])
    return result

def infix_to_rpn(tokens):
    # Shunting-yard to RPN
    out = []
    stack = []
    for tok in tokens:
        if tok in ("AND","OR","NOT"):
            while stack and stack[-1] in OPERATORS and OPERATORS[stack[-1]] >= OPERATORS[tok]:
                out.append(stack.pop())
            stack.append(tok)
        elif tok == "(":
            stack.append(tok)
        elif tok == ")":
            while stack and stack[-1] != "(":
                out.append(stack.pop())
            if stack and stack[-1] == "(":
                stack.pop()
        else:
            # term
            out.append(tok)
    while stack:
        out.append(stack.pop())
    return out

# ---------- CLASSIC BOOLEAN ----------
def eval_classic_boolean(rpn):
    # Evaluate to a SET of docs
    stack = []
    for tok in rpn:
        if tok == "NOT":
            a = stack.pop()
            res = set(all_docs) - a
            stack.append(res)
        elif tok == "AND":
            b = stack.pop(); a = stack.pop()
            stack.append(a & b)
        elif tok == "OR":
            b = stack.pop(); a = stack.pop()
            stack.append(a | b)
        else:
            # term -> set of docs that contain the term (non-zero degree)
            docs = set(postings.get(tok, {}).keys())
            stack.append(docs)
    return stack.pop() if stack else set()




In [27]:
# ---------- FUZZY BOOLEAN (min/max, NOT->1-x) ----------
def fuzzy_not(x): return 1.0 - x
def fuzzy_and(a, b): return min(a, b)
def fuzzy_or(a, b):  return max(a, b)

def eval_fuzzy_boolean(rpn):
    # Evaluate to a dict: doc -> degree in [0,1]
    stack = []
    for tok in rpn:
        if tok == "NOT":
            A = stack.pop()
            stack.append({d: fuzzy_not(A.get(d, 0.0)) for d in all_docs})
        elif tok in ("AND","OR"):
            B = stack.pop(); A = stack.pop()
            if tok == "AND":
                stack.append({d: fuzzy_and(A.get(d,0.0), B.get(d,0.0)) for d in all_docs})
            else:
                stack.append({d: fuzzy_or(A.get(d,0.0), B.get(d,0.0)) for d in all_docs})
        else:
            # term -> membership function μ_t(d) = tf_max(d,t)
            μ = {d: postings.get(tok, {}).get(d, 0.0) for d in all_docs}
            stack.append(μ)
        # Normalize each term's doc frequencies to [0,1]
        # for term, docs in postings.items():
        #     max_tf = max(docs.values()) if docs else 1.0
        #     postings[term] = {d: tf / max_tf for d, tf in docs.items()}

    return stack.pop() if stack else {d:0.0 for d in all_docs}

# ---------- EXTENDED BOOLEAN (p-norm; Salton) ----------
# For two inputs a,b in [0,1]:
# OR_p(a,b)  = ((a^p + b^p)/2)^(1/p)
# AND_p(a,b) = 1 - (((1-a)^p + (1-b)^p)/2)^(1/p)
def or_p(a, b, p):  return ((a**p + b**p)/2.0)**(1.0/p)
def and_p(a, b, p): return 1.0 - (((1.0-a)**p + (1.0-b)**p)/2.0)**(1.0/p)

def eval_extended_boolean(rpn, p=2.0):
    stack = []
    for tok in rpn:
        if tok == "NOT":
            A = stack.pop()
            stack.append({d: 1.0 - A.get(d,0.0) for d in all_docs})
        elif tok in ("AND","OR"):
            B = stack.pop(); A = stack.pop()
            if tok == "AND":
                stack.append({d: and_p(A.get(d,0.0), B.get(d,0.0), p) for d in all_docs})
            else:
                stack.append({d: or_p(A.get(d,0.0),  B.get(d,0.0), p) for d in all_docs})
        else:
            μ = {d: postings.get(tok, {}).get(d, 0.0) for d in all_docs}
            stack.append(μ)

    return stack.pop() if stack else {d:0.0 for d in all_docs}



In [28]:
# ---------- RUN TASK 1 ----------
def run_task1(raw_query=RAW_QUERY, p=2.0, topk=10):
    # 1) preprocess query like docs (per lab)
    toks = tokenize_boolean_query(raw_query)
    toks = preprocess_boolean_terms(toks)
    rpn  = infix_to_rpn(toks)

    # 2) Classic Boolean
    classic = eval_classic_boolean(rpn)

    # 3) Fuzzy Boolean (rank)
    fuzzy_scores = eval_fuzzy_boolean(rpn)
    fuzzy_rank = sorted(((d, fuzzy_scores[d]) for d in all_docs), key=lambda x: x[1], reverse=True)

    # 4) Extended Boolean (rank), p=2 by default
    ext_scores = eval_extended_boolean(rpn, p=p)
    ext_rank = sorted(((d, ext_scores[d]) for d in all_docs), key=lambda x: x[1], reverse=True)

    # pretty print
    print("Query (raw):", raw_query)
    print("Query (stemmed):", toks)
    print("\n=== Classic Boolean (exact match) ===")
    print("Matches:", " ".join(sorted(classic, key=lambda d: int(d[1:]))) if classic else "None")

    print("\n=== Fuzzy Boolean (min/max) — top docs ===")
    for d, s in fuzzy_rank[:topk]:
        print(f"{d}\t{s:.3f}")

    print("\n=== Extended Boolean (p-norm, p={p}) — top docs ===".format(p=p))
    for d, s in ext_rank[:topk]:
        print(f"{d}\t{s:.3f}")

In [29]:
run_task1()

Query (raw): (query AND reformulation) OR (Language AND model)
Query (stemmed): ['(', 'queri', 'AND', 'reformul', ')', 'OR', '(', 'languag', 'AND', 'model', ')']

=== Classic Boolean (exact match) ===
Matches: D1 D2 D3 D4 D5 D6

=== Fuzzy Boolean (min/max) — top docs ===
D4	4.000
D1	3.000
D5	2.000
D2	1.000
D3	1.000
D6	1.000

=== Extended Boolean (p-norm, p=2.0) — top docs ===
D4	3.571
D3	2.793
D1	1.000
D5	0.460
D2	0.359
D6	0.207
