
### Query Operations and Languages

In this lab we will be doing:

- Query expansion (with a thesaurus or WordNet and correlation matrix),  
- Spelling correction (Edit distance, K-  Gram indexes, Context sensitive spelling correction),  
- Query languages (Single-Word Queries, Context Queries, Boolean Queries, Structural Query, Natural Language)



In [1]:
import nltk
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from collections import defaultdict, Counter
import re

In [2]:
documents = [
    "Machine learning algorithms are transforming artificial intelligence applications.",
    "Deep neural networks enable advanced pattern recognition capabilities.",
    "Natural language processing helps computers understand human communication.",
    "Computer vision systems can analyze and interpret visual data effectively.",
    "Data mining techniques extract valuable insights from large datasets.",
    "Software engineering principles guide robust system development practices."
]

query = "machine learning"
vocab = ["machine", "learning", "algorithms", "neural", "networks", "data", "analysis", "artificial", "intelligence", "computer", "vision", "software", "engineering", "natural", "language", "processing"]
corpus = " ".join(documents)

## 1. Query Expansion (WordNet)


In [3]:
def query_expansion_wordnet(query):
    words = nltk.word_tokenize(query)
    expanded_query = set(words)
    for word in words:
        for syn in wn.synsets(word):
            for lemma in syn.lemmas():
                expanded_query.add(lemma.name().replace('_', ' '))
    return list(expanded_query)

print("1. Query Expansion (WordNet):")
print(query_expansion_wordnet(query))

1. Query Expansion (WordNet):
['get wind', 'discover', 'see', 'eruditeness', 'find out', 'take', 'learnedness', 'auto', 'automobile', 'car', 'machine', 'memorize', 'instruct', 'acquire', 'get a line', 'study', 'learning', 'check', 'ascertain', 'larn', 'get word', 'con', 'teach', 'acquisition', 'read', 'erudition', 'learn', 'simple machine', 'political machine', 'encyclopedism', 'encyclopaedism', 'motorcar', 'hear', 'memorise', 'watch', 'pick up', 'scholarship', 'determine']
['get wind', 'discover', 'see', 'eruditeness', 'find out', 'take', 'learnedness', 'auto', 'automobile', 'car', 'machine', 'memorize', 'instruct', 'acquire', 'get a line', 'study', 'learning', 'check', 'ascertain', 'larn', 'get word', 'con', 'teach', 'acquisition', 'read', 'erudition', 'learn', 'simple machine', 'political machine', 'encyclopedism', 'encyclopaedism', 'motorcar', 'hear', 'memorise', 'watch', 'pick up', 'scholarship', 'determine']


## 2. Spelling Correction Techniques

In [4]:
# A. Edit Distance
def edit_distance(w1, w2):
    dp = [[0] * (len(w2)+1) for _ in range(len(w1)+1)]
    for i in range(len(w1)+1):
        for j in range(len(w2)+1):
            if i == 0:
                dp[i][j] = j
            elif j == 0:
                dp[i][j] = i
            elif w1[i-1] == w2[j-1]:
                dp[i][j] = dp[i-1][j-1]
            else:
                dp[i][j] = 1 + min(dp[i-1][j], dp[i][j-1], dp[i-1][j-1])
    return dp[-1][-1]

def correct_by_edit_distance(word, vocab):
  min_dist = float('inf')
  correction = word
  for w in vocab:
      dist = edit_distance(word, w)
      if dist < min_dist:
          min_dist = dist
          correction = w
  return correction


print("\n2A. Edit Distance Correction:")
print(correct_by_edit_distance("machne", vocab))




2A. Edit Distance Correction:
machine


In [5]:
# B. K-Gram Index
def generate_k_grams(word, k=3):
    word = f"${word}$"
    return [word[i:i+k] for i in range(len(word)-k+1)]

def kgram_index(vocab, k=3):
    index = defaultdict(set)
    for word in vocab:
        grams = generate_k_grams(word, k)
        for g in grams:
            index[g].add(word)
    return index

def correct_by_kgram(word, index, k=3):
    grams = generate_k_grams(word, k)
    candidates = Counter()
    for g in grams:
        for cand in index.get(g, []):
            candidates[cand] += 1
    return candidates.most_common(1)[0][0] if candidates else word


print("\n2B. K-Gram Correction:")
k_index = kgram_index(vocab)
print(correct_by_kgram("lerning", k_index))


2B. K-Gram Correction:
learning


In [6]:
# C. Context Sensitive Correction (Bigram Based)
def train_bigram_model(corpus):
    tokens = nltk.word_tokenize(corpus.lower())
    bigrams = list(ngrams(tokens, 2))
    model = Counter(bigrams)
    return model

def correct_contextually(word_list, model):
    corrected = [word_list[0]]
    for i in range(1, len(word_list)):
        prev_word = corrected[-1]
        word = word_list[i]
        candidates = [word] + [correct_by_edit_distance(word, model.keys())]
        best_word = max(candidates, key=lambda w: model.get((prev_word, w), 0))
        corrected.append(best_word)
    return corrected

print("\n2C. Context Sensitive:")
bigram_model = train_bigram_model(corpus)
print(correct_contextually(["machine", "lerning", "algorithms"], bigram_model))



2C. Context Sensitive:
['machine', 'lerning', 'algorithms']


## 3. Query Language Interpreter

In [7]:
def single_word_query(word, documents):
    return [doc for doc in documents if word in doc.lower()]


print("\n3A. Single-word Query:")
print(single_word_query("neural", documents))



3A. Single-word Query:
['Deep neural networks enable advanced pattern recognition capabilities.']


In [8]:
def boolean_query(q, documents):
    terms = q.lower().split()
    result = set(documents)
    if "and" in terms:
        terms = [t for t in terms if t != "and"]
        result = [doc for doc in documents if all(t in doc.lower() for t in terms)]
    elif "or" in terms:
        terms = [t for t in terms if t != "or"]
        result = [doc for doc in documents if any(t in doc.lower() for t in terms)]
    elif "not" in terms:
        idx = terms.index("not")
        term = terms[idx+1]
        result = [doc for doc in documents if term not in doc.lower()]
    return result

print("\n3B. Boolean Query:")
print(boolean_query("machine AND learning", documents))



3B. Boolean Query:
['Machine learning algorithms are transforming artificial intelligence applications.']


In [9]:
def natural_language_query(nl_query, documents):
    tokens = nltk.word_tokenize(nl_query.lower())
    return [doc for doc in documents if any(t in doc.lower() for t in tokens)]

print("\n3C. Natural Language Query:")
print(natural_language_query("How does artificial intelligence work?", documents))


3C. Natural Language Query:
['Machine learning algorithms are transforming artificial intelligence applications.', 'Deep neural networks enable advanced pattern recognition capabilities.']


In [None]:
def structural_query(structure_query, documents):
    # Dummy: match title:xxx or body:xxx
    m = re.match(r"(title|body):(\w+)", structure_query.lower())
    if m:
        field, word = m.groups()
        return [doc for doc in documents if word in doc.lower()]
    return []


print("\n3D. Structural Query:")
print(structural_query("title:neural", documents))



3D. Structural Query:
[]
