# Test Corpus Integration (Notebook)

This notebook mirrors the checks from `test_corpus_integration.py` and runs them interactively:

- Verify `corpus.py` uses `tokenizer.py`
- Tokenization consistency check
- Build/load inverted index from `corpus.csv` and search with test text
- Search engine tests on `corpus.csv`
- Build unigrams from `corpus.csv`

Requires files in project root:
- `corpus.csv`
- (Optional) `inverted_index.json`, `test_inverted_index.json` (will be built if missing)


In [1]:
# Setup
# !pip install pandas

import os
import pandas as pd

from newa_nlp import (
    tokenize_text,
    build_unigram,
    build_unigram_from_csv,
    InvertedIndex,
    build_inverted_index_from_csv,
    save_inverted_index,
    load_inverted_index,
    create_search_engine,
)

TEST_TEXT = "नेपाल भाषा नेवाः भाषा खः।"


In [2]:
# 1) Corpus-tokenizer integration and tokenization consistency

# Direct tokenization
tokens_direct = tokenize_text(TEST_TEXT, mode="regex")
print("Direct tokens:", tokens_direct)

# Unigram-based tokenization
unigrams = build_unigram([TEST_TEXT], tokenizer_mode="regex")
print("Unigram tokens:", [t for t, _ in unigrams])

print("Consistent:", set(tokens_direct) == set(t for t, _ in unigrams))


Direct tokens: ['नेपाल', 'भाषा', 'नेवाः', 'भाषा', 'खः']
Unigram tokens: ['भाषा', 'खः', 'नेपाल', 'नेवाः']
Consistent: True


In [3]:
# 2) Build or load inverted index from corpus.csv and search with test text

if not os.path.exists('corpus.csv'):
    raise FileNotFoundError("corpus.csv not found in current directory")

index_file = 'test_inverted_index.json'

if os.path.exists(index_file):
    print(f"Loading existing index from {index_file} ...")
    index = load_inverted_index(index_file)
else:
    print("Building new inverted index from corpus.csv ...")
    index = build_inverted_index_from_csv(
        csv_path='corpus.csv',
        doc_id_column='filename',
        content_column='content',
        tokenizer_mode='regex',
    )
    print("Saving index ...")
    save_inverted_index(index, index_file, format='json')

# Search with tokens from TEST_TEXT
test_tokens = tokenize_text(TEST_TEXT, mode='regex')
print("Test tokens:", test_tokens)

for term in test_tokens:
    docs = index.search([term])
    print(f"'{term}': {len(docs)} docs, sample: {list(docs)[:3]}")

if len(test_tokens) >= 2:
    and_docs = index.search(test_tokens[:2], operation='AND')
    or_docs = index.search(test_tokens[:2], operation='OR')
    print(f"AND '{test_tokens[0]}' & '{test_tokens[1]}': {len(and_docs)} docs")
    print(f"OR  '{test_tokens[0]}' | '{test_tokens[1]}': {len(or_docs)} docs")


Loading existing index from test_inverted_index.json ...
Inverted index loaded from: test_inverted_index.json
Test tokens: ['नेपाल', 'भाषा', 'नेवाः', 'भाषा', 'खः']
'नेपाल': 9364 docs, sample: ['wiki_13391.txt', 'wiki_47988.txt', 'wiki_13254.txt']
'भाषा': 31610 docs, sample: ['wiki_56380.txt', 'wiki_60684.txt', 'wiki_56674.txt']
'नेवाः': 1389 docs, sample: ['nepalmandal_2424.txt', 'nepalmandal_746.txt', 'nepalmandal_5232.txt']
'भाषा': 31610 docs, sample: ['wiki_56380.txt', 'wiki_60684.txt', 'wiki_56674.txt']
'खः': 49201 docs, sample: ['wiki_26281.txt', 'wiki_25178.txt', 'wiki_43590.txt']
AND 'नेपाल' & 'भाषा': 7277 docs
OR  'नेपाल' | 'भाषा': 33697 docs


In [4]:
# 3) Search engine tests on corpus.csv

se_index_file = 'inverted_index.json'

if not os.path.exists(se_index_file):
    print("Building main inverted index (once) ...")
    se_index = build_inverted_index_from_csv(
        csv_path='corpus.csv',
        doc_id_column='filename',
        content_column='content',
        tokenizer_mode='regex',
    )
    save_inverted_index(se_index, se_index_file, format='json')
else:
    print(f"Using existing {se_index_file}")

engine = create_search_engine(se_index_file, 'corpus.csv')

queries = ['नेपाल', 'भाषा', 'नेवाः', 'नेपाल भाषा']
for q in queries:
    docs = engine.search_documents(q, limit=5)
    print(f"Query '{q}': {len(docs)} docs, sample: {docs[:3]}")


Building main inverted index (once) ...
[1000/80379] Processed 1000 documents
[2000/80379] Processed 2000 documents
[3000/80379] Processed 3000 documents
[4000/80379] Processed 4000 documents
[5000/80379] Processed 5000 documents
[6000/80379] Processed 6000 documents
[7000/80379] Processed 7000 documents
[8000/80379] Processed 8000 documents
[9000/80379] Processed 9000 documents
[10000/80379] Processed 10000 documents
[11000/80379] Processed 11000 documents
[12000/80379] Processed 12000 documents
[13000/80379] Processed 13000 documents
[14000/80379] Processed 14000 documents
[15000/80379] Processed 15000 documents
[16000/80379] Processed 16000 documents
[17000/80379] Processed 17000 documents
[18000/80379] Processed 18000 documents
[19000/80379] Processed 19000 documents
[20000/80379] Processed 20000 documents
[21000/80379] Processed 21000 documents
[22000/80379] Processed 22000 documents
[23000/80379] Processed 23000 documents
[24000/80379] Processed 24000 documents
[25000/80379] Proc

In [5]:
# 4) Unigram tests on corpus.csv

print("Building top 20 unigrams (freq) ...")
unigrams = build_unigram_from_csv(
    csv_path='corpus.csv',
    content_column='content',
    tokenizer_mode='regex',
    sort_by='freq',
    top_k=20,
)
for i, (term, count) in enumerate(unigrams, 1):
    print(f"{i:2d}. {term}: {count:,}")

print("\nTop 10 terms (Devanagari sorted) ...")
unigrams_dev = build_unigram_from_csv(
    csv_path='corpus.csv',
    content_column='content',
    tokenizer_mode='regex',
    sort_by='dev',
    top_k=10,
)
for i, (term, count) in enumerate(unigrams_dev, 1):
    print(f"{i:2d}. {term}: {count:,}")


Building top 20 unigrams (freq) ...
 1. थ्व: 455,768
 2. दु: 211,478
 3. थाय्: 210,236
 4. व: 185,876
 5. भाषा: 170,387
 6. खः: 96,361
 7. ख: 81,789
 8. या: 79,930
 9. थासय्: 75,808
10. नं: 72,317
11. कथं: 66,987
12. छगू: 64,549
13. भारतया: 63,949
14. खने: 61,233
15. संकिपा: 61,068
16. निसें: 57,713
17. छ्येलिगु: 55,424
18. राज्यया: 54,910
19. कुल: 54,171
20. स्वापू: 52,797

Top 10 terms (Devanagari sorted) ...
 1. ँचयायाँ: 1
 2. ँन्हूगू: 2
 3. ँस्वगिनलय्: 1
 4. ं: 4
 5. ंं: 1
 6. ंःएइल्ल्: 1
 7. ंआॠआ: 1
 8. ंइयगव: 1
 9. ंऍ: 1
10. ंगुठी: 1
