## Query Operations

In [11]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import wordnet as wn
from nltk.util import ngrams
from collections import defaultdict, Counter
import re

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Pralo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

### Query Exapansion

#### Using WordNet 

In [12]:
from nltk.corpus import wordnet

In [13]:
def query_expansion(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))
    return list(synonyms)

### Spelling Correction

#### Edit distance

In [16]:
def edit_distance(w1, w2):
    m, n = len(w1), len(w2)
    dp = [[0] * (n + 1) for _ in range(m + 1)]

    for i in range(m + 1):
        dp[i][0] = i
    for j in range(n + 1):
        dp[0][j] = j

    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if w1[i-1] == w2[j-1] else 1
            dp[i][j] = min(
                dp[i-1][j] + 1,     # deletion
                dp[i][j-1] + 1,     # insertion
                dp[i-1][j-1] + cost # substitution
            )
    return dp[m][n]


def correct_word_edit_distance(word, dictionary):
    best_word = word
    min_dist = float("inf")
    for candidate in dictionary:
        dist = edit_distance(word, candidate)
        if dist < min_dist:
            min_dist = dist
            best_word = candidate
    return best_word


#### K-gram indexes(characters bigrams example)

In [17]:
def build_kgram_index(dictionary, k=2):
    index = defaultdict(set)
    for word in dictionary:
        grams = get_kgrams(word, k)
        for g in grams:
            index[g].add(word)
    return index


def get_kgrams(word, k=2):
    word = f"${word}$"
    return [word[i:i+k] for i in range(len(word)-k+1)]


def correct_word_kgram(word, dictionary, k=2):
    index = build_kgram_index(dictionary, k)
    grams = get_kgrams(word, k)

    candidates = defaultdict(int)
    for g in grams:
        for cand in index.get(g, []):
            candidates[cand] += 1

    if not candidates:
        return word

    return max(candidates, key=candidates.get)


### Query Languages

In [None]:
documents = {
    1: "I like to eat apple and mango",
    2: "Banana is yellow and sweet",
    3: "Grapes are small and juicy",
    4: "Orange and apple juice is tasty",
}

#### Single-Word Query

In [19]:
def search_single_word(word):
    results = []
    for doc_id, text in documents.items():
        if word.lower() in text.lower():
            results.append(doc_id)
    return results

#### Context Queries

In [20]:
def search_context(words):
    results = []
    for doc_id, text in documents.items():
        if all(w.lower() in text.lower() for w in words):
            results.append(doc_id)
    return results

#### Boolean Query

In [21]:
def search_boolean(query):
    query = query.replace("AND", "and").replace("OR", "or").replace("NOT", "not")
    results = []
    for doc_id, text in documents.items():
        expr = query
        for word in set(query.split()):
            if word.lower() not in ["and", "or", "not", "(", ")"]:
                expr = expr.replace(word, str(word.lower() in text.lower()))
        if eval(expr):
            results.append(doc_id)
    return results

#### Structual Query

In [22]:
def search_structural(query):
    return search_boolean(query)

#### Natural language Query

In [23]:
def search_natural_language(query):
    keywords = [w for w in query.lower().split() if w not in ["find", "documents", "about"]]
    return search_context(keywords)

### Output for Query Expansion

In [30]:
word = "apple"
print("Query Expansion for:", word)
print(query_expansion(word))

Query Expansion for: apple
['apple', 'Malus pumila', 'orchard apple tree']


### Outpur for Spelling Correction

In [28]:
dictionary = ["information", "retrieval", "system", "query", "expansion", "spelling", "correction"]

print("Corrected (Edit Distance):", correct_word_edit_distance("retriveal", dictionary))
print("Corrected (K-Gram):", correct_word_kgram("retriveal", dictionary))

Corrected (Edit Distance): retrieval
Corrected (K-Gram): retrieval


### Output for Query Languages

In [29]:
print("Single Word Query (apple):", search_single_word("apple"))
print("Context Query (apple, mango):", search_context(["apple", "mango"]))
print("Boolean Query ((apple AND mango) OR orange):", search_boolean("(apple AND mango) OR orange"))
print("Structural Query ((apple AND banana) OR grape):", search_structural("(apple AND banana) OR grape"))
print("Natural Language Query (find documents about apple and mango):", 
      search_natural_language("find documents about apple and mango"))

Single Word Query (apple): [1, 4]
Context Query (apple, mango): [1]
Boolean Query ((apple AND mango) OR orange): [4]
Structural Query ((apple AND banana) OR grape): [3]
Natural Language Query (find documents about apple and mango): [1]
