<a href="https://colab.research.google.com/github/pnabende/ahumain-big-data-course-development/blob/main/ngram_index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from collections import defaultdict

In [None]:
# Sample documents: a small in-memory corpus. build_ngram_index enumerates this
# list, so each document's ID is its position here (0, 1, 2).

documents = [
    "Web mining is useful",
    "Usage mining applications",
    "Web structure mining studies the Web hyperlink structures"
]

In [None]:
# Function to generate 3-grams from a word

def generate_ngrams(word, n=3):
  """Return every contiguous substring of length ``n`` in ``word``, left to right.

  Args:
    word: String to slide the window over.
    n: Window length; must be a positive integer. Defaults to 3.

  Returns:
    A list of the ``len(word) - n + 1`` substrings of length ``n``, ordered by
    starting position. The list is empty when ``word`` is shorter than ``n``.

  Raises:
    ValueError: If ``n`` is less than 1.
  """
  # A non-positive window would produce empty or ragged slices; fail loudly instead.
  if n < 1:
    raise ValueError(f"n must be a positive integer, got {n!r}")
  return [word[start:start + n] for start in range(len(word) - n + 1)]

In [None]:
# Tokenize documents and build n-gram index

def tokenize(text):
  """Lower-case ``text`` and split it into tokens on runs of whitespace.

  Punctuation is not stripped, so it stays attached to the neighbouring token.

  Returns:
    A list of lower-cased tokens; empty when ``text`` holds no non-whitespace
    characters.
  """
  lowered = text.lower()
  pieces = lowered.split()
  return pieces

def build_ngram_index(docs, n=3):
  """Build an inverted index from character n-grams to the tokens containing them.

  Args:
    docs: Sequence of document strings; a document's position is its ID.
    n: N-gram length passed on to ``generate_ngrams``. Defaults to 3.

  Returns:
    A ``defaultdict`` mapping n-gram -> ``defaultdict`` mapping doc ID -> list of
    tokens. A token is appended once for every occurrence of the n-gram, so a
    token repeated in a document appears repeatedly in that document's list.
  """
  def _empty_postings():
    # Per-n-gram postings: doc ID -> list of tokens.
    return defaultdict(list)

  postings_by_gram = defaultdict(_empty_postings)
  for position, text in enumerate(docs):
    for word in tokenize(text):
      for gram in generate_ngrams(word, n):
        postings_by_gram[gram][position].append(word)
  return postings_by_gram

In [None]:
# Build the ngram index over the sample documents. n is left at its default of 3,
# so the keys are 3-character substrings of the lower-cased tokens; tokens shorter
# than 3 characters contribute no entries.

ngram_index = build_ngram_index(documents)

In [None]:
# Display the ngram index

def print_ngram_index(index):
  """Print the index: one header line per n-gram, then one line per document.

  Args:
    index: Mapping of n-gram -> mapping of doc ID -> list of tokens.
  """
  for gram in index:
    print(f"N-gram: {gram}")
    per_document = index[gram]
    for doc_number in per_document:
      print(f"  Document ID: {doc_number}, Tokens: {per_document[doc_number]}")

# Dump the whole index: every n-gram followed by its per-document token lists.
print_ngram_index(ngram_index)

N-gram: web
  Document ID: 0, Tokens: ['web']
  Document ID: 2, Tokens: ['web', 'web']
N-gram: min
  Document ID: 0, Tokens: ['mining']
  Document ID: 1, Tokens: ['mining']
  Document ID: 2, Tokens: ['mining']
N-gram: ini
  Document ID: 0, Tokens: ['mining']
  Document ID: 1, Tokens: ['mining']
  Document ID: 2, Tokens: ['mining']
N-gram: nin
  Document ID: 0, Tokens: ['mining']
  Document ID: 1, Tokens: ['mining']
  Document ID: 2, Tokens: ['mining']
N-gram: ing
  Document ID: 0, Tokens: ['mining']
  Document ID: 1, Tokens: ['mining']
  Document ID: 2, Tokens: ['mining']
N-gram: use
  Document ID: 0, Tokens: ['useful']
N-gram: sef
  Document ID: 0, Tokens: ['useful']
N-gram: efu
  Document ID: 0, Tokens: ['useful']
N-gram: ful
  Document ID: 0, Tokens: ['useful']
N-gram: usa
  Document ID: 1, Tokens: ['usage']
N-gram: sag
  Document ID: 1, Tokens: ['usage']
N-gram: age
  Document ID: 1, Tokens: ['usage']
N-gram: app
  Document ID: 1, Tokens: ['applications']
N-gram: ppl
  Document ID:

In [None]:
# Function to search for documents containing ngrams of the query

def ngram_search(query, index, docs, n=3):
  """Rank documents by how many of the query's n-grams they contain.

  The query is normalized with ``tokenize`` (lower-cased, split on whitespace),
  the same way documents are normalized when the index is built, and n-grams are
  generated per query token so that none of them spans whitespace.

  Args:
    query: Free-text query string.
    index: Mapping of n-gram -> mapping of doc ID -> tokens, as returned by
      ``build_ngram_index``.
    docs: Unused; retained so existing callers keep working.
    n: N-gram length; should match the value used to build ``index``. Defaults to 3.

  Returns:
    Document IDs sorted by descending count of matching query n-grams. Documents
    with equal counts keep the order in which they were first matched. The list
    is empty when no query n-gram is in the index.
  """
  matching_docs = defaultdict(int)

  for token in tokenize(query):
    for ngram in generate_ngrams(token, n):
      # Membership test first: indexing a defaultdict directly would insert the key.
      if ngram in index:
        for doc_id in index[ngram]:
          matching_docs[doc_id] += 1

  # Rank documents by the number of matching n-grams (sorted() is stable on ties)
  ranked_docs = sorted(matching_docs.items(), key=lambda x: x[1], reverse=True)
  return [doc_id for doc_id, _ in ranked_docs]



In [None]:
# Example ngram search. The query "usge" yields the 3-grams "usg" and "sge";
# neither is a key of the index built from the sample documents, so the
# result is an empty list.

query = "usge"
result_docs = ngram_search(query, ngram_index, documents)
print(f"Query '{query}' matched documents: {result_docs}")

Query 'usge' matched documents: []
