<a href="https://colab.research.google.com/github/pnabende/ahumain-big-data-course-development/blob/main/positional_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict

In [None]:
# Sample corpus: one string per document. A document's position in this list
# (0, 1, 2) is the ID under which it is indexed.
documents = [
    "Web mining is useful",
    "Usage mining applications",
    "Web structure mining studies the Web hyperlink structures",
]

In [None]:
# Define functions for tokenization and building positional index

def tokenize(text):
  """Lowercase ``text`` and split it on runs of whitespace.

  Punctuation stays attached to the neighbouring word, and an empty or
  whitespace-only string yields an empty list.
  """
  lowered = text.lower()
  return lowered.split()

def build_positional_index(docs):
  """Build a positional inverted index over ``docs``.

  Args:
    docs: Iterable of document strings. Each document's ID is its 0-based
      position in the iteration.

  Returns:
    A nested ``defaultdict`` mapping token -> doc_id -> list of 0-based token
    positions, in ascending order. Because both levels are defaultdicts,
    subscripting a missing key inserts an empty entry; use ``in`` to test
    for membership without modifying the index.
  """
  postings_by_term = defaultdict(lambda: defaultdict(list))
  for doc_id, text in enumerate(docs):
    for offset, term in enumerate(tokenize(text)):
      postings_by_term[term][doc_id].append(offset)
  return postings_by_term

In [None]:
# Build the positional index for the sample documents; document IDs are the
# 0-based positions of the texts in `documents`.
positional_index = build_positional_index(documents)


In [None]:
# Display the positional index

def print_positional_index(index):
  """Print each term followed by one indented line per document.

  ``index`` is read as a mapping of term -> {doc_id -> positions}. Terms and
  documents are printed in the mappings' own iteration order.
  """
  for term, postings in index.items():
    report = [f"Term: {term}"]
    report.extend(
      f"  Doc ID: {doc_id}, Positions: {positions}"
      for doc_id, positions in postings.items()
    )
    # The term header is always present, so the join never yields a stray
    # blank line.
    print("\n".join(report))

# Print the index built from the sample documents.
print_positional_index(positional_index)

Term: web
  Doc ID: 0, Positions: [0]
  Doc ID: 2, Positions: [0, 5]
Term: mining
  Doc ID: 0, Positions: [1]
  Doc ID: 1, Positions: [1]
  Doc ID: 2, Positions: [2]
Term: is
  Doc ID: 0, Positions: [2]
Term: useful
  Doc ID: 0, Positions: [3]
Term: usage
  Doc ID: 1, Positions: [0]
Term: applications
  Doc ID: 1, Positions: [2]
Term: structure
  Doc ID: 2, Positions: [1]
Term: studies
  Doc ID: 2, Positions: [3]
Term: the
  Doc ID: 2, Positions: [4]
Term: hyperlink
  Doc ID: 2, Positions: [6]
Term: structures
  Doc ID: 2, Positions: [7]


In [None]:
# Function to perform phrase search
def phrase_search(phrase, index, docs):
  """Return the IDs of documents that contain ``phrase`` as consecutive tokens.

  Args:
    phrase: Query text. It is split with ``tokenize``, the same function
      used when the index is built.
    index: Positional index mapping token -> doc_id -> list of positions.
    docs: Not used by the search; accepted so existing call sites keep
      working.

  Returns:
    A list of doc IDs, each at most once, in the iteration order of the
    first token's postings. An empty or whitespace-only phrase, or a phrase
    containing a token that is absent from the index, yields an empty list.
  """
  phrase_tokens = tokenize(phrase)
  # Without this guard an empty phrase would raise IndexError on the first
  # token lookup below.
  if not phrase_tokens:
    return []

  # Resolve each token's postings once. A single unknown token means the
  # phrase cannot occur anywhere.
  postings_per_token = []
  for token in phrase_tokens:
    # `in` never creates an entry, even when `index` is a defaultdict.
    if token not in index:
      return []
    postings_per_token.append(index[token])

  first_postings = postings_per_token[0]
  later_postings = postings_per_token[1:]

  result_docs = []
  for doc_id, start_positions in first_postings.items():
    # A candidate document must contain every remaining token.
    if any(doc_id not in postings for postings in later_postings):
      continue
    # Sets make each "is the next token at start + offset?" check O(1).
    later_positions = [set(postings[doc_id]) for postings in later_postings]
    for start in start_positions:
      if all(start + offset in positions
             for offset, positions in enumerate(later_positions, start=1)):
        result_docs.append(doc_id)
        break  # one occurrence is enough to report the document
  return result_docs

In [None]:
# Example phrase searches: run each query against the index and report the
# matching document IDs.
for phrase in ("web mining", "usage mining", "web structure"):
  result_docs = phrase_search(phrase, positional_index, documents)
  print(f"Phrase '{phrase}' found in documents: {result_docs}")

Phrase 'web mining' found in documents: [0]
Phrase 'usage mining' found in documents: [1]
Phrase 'web structure' found in documents: [2]
