In [2]:
import whoosh, glob, time, pickle
import whoosh.fields as wf
from whoosh.qparser import QueryParser
import src as swhoosh

In [3]:
class simple_schema(wf.SchemaClass):
    """Two-field schema used for both the swhoosh and the whoosh indexes in this notebook."""
    # Full document text, analyzed with whoosh's StandardAnalyzer.
    doc = wf.TEXT(analyzer=whoosh.analysis.StandardAnalyzer())
    # Path of the source file: declared unique, and stored so search hits can print it.
    filepath = wf.ID(unique=True, stored=True)

### Data

Both datasets are text collections from [this site](http://dhresourcesforprojectbuilding.pbworks.com/w/page/69244469/Data%20Collections%20and%20Datasets). 

- TCP-ECCO (170mb uncompressed) can be downloaded [here](https://github.com/Early-Modern-OCR/TCP-ECCO-texts/archive/master.zip)
- Lincoln (700kb uncompressed) can be downloaded [here](http://oldsite.english.ucsb.edu/faculty/ayliu/unlocked/lincoln/lincoln-speeches-and-writings.zip)

In [3]:
def _iter_text_files(pattern):
    """Yield one ``{'filepath': ..., 'doc': ...}`` dict per file matching ``pattern``.

    Files are visited in sorted path order so document numbering is reproducible
    across runs. Each file is read in text mode with the 'latin' codec.
    """
    for filepath in sorted(glob.glob(pattern)):
        with open(filepath, 'r', encoding='latin') as f:
            doc = f.read()
        yield {'filepath': filepath, 'doc': doc}


def get_lincoln():
    """Yield the Lincoln documents (``Lincoln/*.txt``) as ``{'filepath', 'doc'}`` dicts."""
    yield from _iter_text_files('Lincoln/*.txt')


def get_TCP():
    """Yield the TCP-ECCO documents (``TCP-ECCO/*.txt``) as ``{'filepath', 'doc'}`` dicts."""
    yield from _iter_text_files('TCP-ECCO/*.txt')

### Initialize swhoosh index

In [4]:
# Create the swhoosh index at 'randomIdx/randomIdx' with the two-field schema.
# NOTE(review): reset=True presumably discards any index already at that path -- confirm in src.
s = swhoosh.Index('randomIdx/randomIdx', simple_schema(), reset=True)
s.load()

### Add documents to swhoosh index

In [5]:
# Measure how long swhoosh takes to index the Lincoln collection (elapsed seconds).
t = time.time()
s.add_documents(get_lincoln())
print("TIME:", time.time() - t)

TIME: 0.3167388439178467


### Save and close, then reload swhoosh index

Note that the index MUST be saved -- it does not autosave!

In [6]:
# Save the index and close it (it does not autosave).
s.save_and_close()

# Reload the Index object from its pickled '.manager' file, then call load() on it.
# NOTE(review): pickle.load can execute arbitrary code -- only open manager files you created.
with open('randomIdx/randomIdx.manager', 'rb') as f:
    s = pickle.load(f)
    s.load()

### Wrap Index in an IndexReader and get statistics needed for BM25

In [7]:
r = s.reader()

# Collection-level and per-document statistics, printed in this order:
#   doc_count, doc_frequency of b'lincoln' in 'doc', 'doc' field length of document 21,
#   and the average 'doc' field length.
print(r.doc_count())
print(r.doc_frequency('doc',b'lincoln'))
print(r.doc_field_length(21, 'doc')) 
print(r.avg_field_length('doc'))

84
12
218
583.2619047619048


### Get term info and postings for 'lincoln'

In [8]:
# returns (overall frequency, num docs, start loc in postings file, postings size)
# NOTE: _idx is an underscore-prefixed (internal) attribute, read here for inspection only.
s._idx['doc'].terminfo(b'lincoln')

(13, 12, 2, 70)

In [9]:
# For every posting of b'lincoln', pass the positions bytes (a[2]) and the frequency (a[1])
# to postings.load2; the postings tuple layout is (docId, frequency, positions).
[swhoosh.postings.load2(a[2], a[1]) for a in s._idx['doc']._postings(b'lincoln')]

[array('I', [311]),
 array('I', [217]),
 array('I', [872]),
 array('I', [4746]),
 array('I', [96]),
 array('I', [46]),
 array('I', [76]),
 array('I', [32]),
 array('I', [157, 438]),
 array('I', [93]),
 array('I', [75]),
 array('I', [69])]

In [10]:
# what the postings look like: (docId, frequency, positions)
# positions are kept as an encoded byte string, one entry per matching document
s._idx['doc']._postings(b'lincoln')

[(0, 1, b'\xb7\x02'),
 (21, 1, b'\xd9\x01'),
 (22, 1, b'\xe8\x06'),
 (23, 1, b'\x8a%'),
 (28, 1, b'`'),
 (37, 1, b'.'),
 (40, 1, b'L'),
 (56, 1, b' '),
 (60, 2, b'\x9d\x01\x99\x02'),
 (61, 1, b']'),
 (78, 1, b'K'),
 (81, 1, b'E')]

### Run a BM25 search with Whoosh API

In [11]:
# Parse the one-word query against the 'doc' field, using the swhoosh index's schema.
qp = QueryParser("doc", schema=s._schema)
q = qp.parse("lincoln")

# The searcher is opened in a `with` block and only used inside it.
with s.searcher() as searcher:
    results = searcher.search(q)
    print(results)
    for hit in results:
        # Score with six decimals, then the stored filepath of the hit.
        print('{:f}'.format(hit.score), ' | ', hit['filepath'])

<Top 10 Results for Term('doc', 'lincoln') runtime=0.0001484961248934269>
4.477876  |  Lincoln/82-letter-to-thurlow-weed-1865.txt
4.448624  |  Lincoln/41-letter-to-george-ashmun-accepting-nomination-for-presidency-1860.txt
4.448624  |  Lincoln/79-letter-to-mrs-bixley-of-boston-nov-21-1864.txt
4.363118  |  Lincoln/62-letter-to-general-grant-july-13-1863.txt
4.224235  |  Lincoln/61-emancipation-proclamation-jan-1-1863.txt
3.934575  |  Lincoln/57-from-letter-to-august-belmont-1862.txt
3.852952  |  Lincoln/22-from-letter-to-george-robertson-1855.txt
3.539239  |  Lincoln/01-first-public-speech-1832.txt
3.296530  |  Lincoln/38-from-letter-to-jw-fell-1859.txt
2.383225  |  Lincoln/23-extracts-from-letter-to-joshua-f-speed-1855.txt


### Compare results to Whoosh

In [12]:
def make_clean_index(ix_dirname, paths, procs=1):
    """Build a fresh whoosh index in ``ix_dirname`` from the files in ``paths``.

    Args:
        ix_dirname: Directory that will hold the index; created if it does not exist.
        paths: Iterable of file paths; each one is added through ``add_doc``.
        procs: Number of processes, passed straight to ``ix.writer``.

    Returns:
        The index object returned by ``whoosh.index.create_in``.
    """
    import os  # local import: the notebook's import cell does not bring in os

    # create_in expects the target directory to exist already.
    os.makedirs(ix_dirname, exist_ok=True)
    ix = whoosh.index.create_in(ix_dirname, schema=simple_schema())

    # Used as a context manager, the writer commits on normal exit and cancels
    # if adding a document raises, so a failure does not leave the write lock held.
    with ix.writer(procs=procs) as writer:
        for filepath in paths:
            add_doc(writer, filepath)

    return ix

def add_doc(writer, filepath):
    """Read ``filepath`` as bytes, decode it with the 'latin' codec and add it to ``writer``.

    The decoded text goes into the ``doc`` field and the path into ``filepath``.
    """
    with open(filepath, 'rb') as handle:
        raw_bytes = handle.read()
    writer.add_document(doc=raw_bytes.decode('latin'), filepath=filepath)

In [13]:
# Measure how long plain whoosh takes to index the same Lincoln files into 'wind'.
t = time.time()
ix = make_clean_index('wind', sorted(glob.glob('Lincoln/*.txt')))
print("TIME:", time.time() - t)

TIME: 1.0337421894073486


In [14]:
# Run the query `q` parsed earlier against the whoosh index for comparison.
with ix.searcher() as searcher:
    results = searcher.search(q)
    print(results)
    for hit in results:
        # Score with six decimals, then the stored filepath of the hit.
        print('{:f}'.format(hit.score), ' | ', hit['filepath'])

<Top 10 Results for Term('doc', 'lincoln') runtime=0.000461772084236145>
4.468083  |  Lincoln/82-letter-to-thurlow-weed-1865.txt
4.434141  |  Lincoln/41-letter-to-george-ashmun-accepting-nomination-for-presidency-1860.txt
4.434141  |  Lincoln/79-letter-to-mrs-bixley-of-boston-nov-21-1864.txt
4.349186  |  Lincoln/62-letter-to-general-grant-july-13-1863.txt
4.191734  |  Lincoln/61-emancipation-proclamation-jan-1-1863.txt
3.934575  |  Lincoln/57-from-letter-to-august-belmont-1862.txt
3.849322  |  Lincoln/22-from-letter-to-george-robertson-1855.txt
3.536176  |  Lincoln/01-first-public-speech-1832.txt
3.291220  |  Lincoln/38-from-letter-to-jw-fell-1859.txt
2.381836  |  Lincoln/23-extracts-from-letter-to-joshua-f-speed-1855.txt


Note: the BM25 scores returned by whoosh's default settings are a tiny bit smaller because the default whoosh reader adds 1 to the current document length for some reason (I don't think this is correct). 

## Indexing a bigger collection

In [15]:
# Second swhoosh index, at 'randomIdx2/randomIdx2', for the larger TCP-ECCO collection.
s = swhoosh.Index('randomIdx2/randomIdx2', simple_schema(), reset=True)
s.load()

In [16]:
# Measure how long swhoosh takes to index TCP-ECCO with add_documents (elapsed seconds).
t = time.time()
s.add_documents(get_TCP())
print("TIME:", time.time() - t)

TIME: 66.6019983291626


And repeating with multiprocessing enabled:

In [17]:
# Recreate the index at the same path (reset=True) before the multiprocessing run.
s = swhoosh.Index('randomIdx2/randomIdx2', simple_schema(), reset=True)
s.load()

In [18]:
# Same TCP-ECCO indexing run, this time through add_documents_multiprocessing with num_procs=4.
t = time.time()
s.add_documents_multiprocessing(get_TCP(), num_procs=4)
print("TIME:", time.time() - t)

TIME: 27.684925317764282


In [19]:
# Save the index (it does not autosave); the query-performance section reloads it
# from 'randomIdx2/randomIdx2.manager'.
s.save()

Comparing to whoosh default:

In [19]:
# Measure how long whoosh takes to index TCP-ECCO into 'wind2' with the default procs=1.
t = time.time()
ix = make_clean_index('wind2', sorted(glob.glob('TCP-ECCO/*.txt')))
print("TIME:", time.time() - t)

TIME: 175.10155200958252


And to whoosh with multiprocessing enabled:

In [20]:
# Rebuild the whoosh index in the same 'wind2' directory, this time with procs=4.
t = time.time()
ix = make_clean_index('wind2', sorted(glob.glob('TCP-ECCO/*.txt')), procs=4)
print("TIME:", time.time() - t)

TIME: 147.75940346717834


Again, both indexes return the same top-10 documents in the same order (the scores differ slightly):

In [21]:
# Run the query `q` parsed earlier against the swhoosh TCP-ECCO index ...
with s.searcher() as searcher:
    results = searcher.search(q)
    print(results)
    for hit in results:
        print('{:f}'.format(hit.score), ' | ', hit['filepath'])

# Blank line separating the two result listings.
print('')

# ... and against the whoosh TCP-ECCO index, for a side-by-side comparison.
with ix.searcher() as searcher:
    results = searcher.search(q)
    print(results)
    for hit in results:
        print('{:f}'.format(hit.score), ' | ', hit['filepath'])

<Top 10 Results for Term('doc', 'lincoln') runtime=0.000666104257106781>
7.649273  |  TCP-ECCO/K014182.000.txt
7.340711  |  TCP-ECCO/K045058.000.txt
7.238742  |  TCP-ECCO/K020979.000.txt
7.001680  |  TCP-ECCO/K014363.000.txt
6.953463  |  TCP-ECCO/K023186.000.txt
6.825768  |  TCP-ECCO/K029698.000.txt
6.477195  |  TCP-ECCO/K047797.000.txt
6.400992  |  TCP-ECCO/K023195.000.txt
6.331053  |  TCP-ECCO/K021268.000.txt
6.239490  |  TCP-ECCO/K054680.002.txt

<Top 10 Results for Term('doc', 'lincoln') runtime=0.0009060436859726906>
7.646144  |  TCP-ECCO/K014182.000.txt
7.350743  |  TCP-ECCO/K045058.000.txt
7.237419  |  TCP-ECCO/K020979.000.txt
6.993126  |  TCP-ECCO/K014363.000.txt
6.949495  |  TCP-ECCO/K023186.000.txt
6.832681  |  TCP-ECCO/K029698.000.txt
6.481719  |  TCP-ECCO/K047797.000.txt
6.402560  |  TCP-ECCO/K023195.000.txt
6.335115  |  TCP-ECCO/K021268.000.txt
6.241638  |  TCP-ECCO/K054680.002.txt


## Query Performance (BM25)

To benchmark this, we extract tokens from one of Lincoln's speeches (not in TCP-ECCO) and build queries by selecting words at random from the resulting ~5000 tokens.

In [4]:
# Reload the swhoosh TCP-ECCO index from its pickled '.manager' file, then call load() on it.
# NOTE(review): pickle.load can execute arbitrary code -- only open manager files you created.
with open('randomIdx2/randomIdx2.manager', 'rb') as f:
    s = pickle.load(f)
    s.load()

In [5]:
# Open the whoosh index stored in the 'wind2' directory.
ix = whoosh.index.open_dir('wind2')

In [16]:
import numpy as np

# One searcher per index: s1 over the swhoosh index, s2 over the whoosh index.
# They are not closed here because the benchmark function below uses them.
s1 = s.searcher()
s2 = ix.searcher()
qp = QueryParser("doc", schema=s._schema)

# Source text for the random query vocabulary.
with open('Lincoln/24-speech-1856.txt', 'r', encoding='latin') as f:
    data = f.read()

# Run the 'doc' field's analyzer over the speech and keep each token's text.
# Duplicates are kept, so words that occur more often are sampled more often.
query_vocab = [t.text for t in s._schema['doc'].analyzer(data)]
print('Length of query vocab:',len(query_vocab))

def random_n_query(n):
    """Return a space-separated query of ``n`` words sampled at random from ``query_vocab``."""
    sampled_words = np.random.choice(query_vocab, size=n)
    return ' '.join(sampled_words)

def benchmark_n_query(n, trials):
    """Time ``trials`` random ``n``-word queries on both searchers and report the means.

    Each trial parses one random query with ``qp`` and runs it first on ``s1``
    (swhoosh) and then on ``s2`` (whoosh). Only the ``search`` calls are timed;
    query generation and parsing are excluded.

    Args:
        n: Number of words in each random query.
        trials: Number of queries to run; must be at least 1.

    Returns:
        ``(swhoosh_seconds_per_query, whoosh_seconds_per_query)``.

    Raises:
        ValueError: If ``trials`` is less than 1 (no mean can be computed).
    """
    if trials < 1:
        raise ValueError("trials must be at least 1")

    t_swhoosh, t_whoosh = 0, 0
    for _ in range(trials):
        q = qp.parse(random_n_query(n))
        # perf_counter is monotonic and high-resolution, which time.time() does not guarantee.
        t = time.perf_counter()
        s1.search(q)
        t_swhoosh += time.perf_counter() - t
        t = time.perf_counter()
        s2.search(q)
        t_whoosh += time.perf_counter() - t
    print('- Swhoosh time per query:', "{:.2f}".format(t_swhoosh / trials * 1000), "ms")
    print('- Whoosh time per query:', "{:.2f}".format(t_whoosh / trials * 1000), "ms")
    return t_swhoosh/trials, t_whoosh/trials

Length of query vocab: 4810


#### 3 word queries

In [17]:
# x, y = mean seconds per query for swhoosh and whoosh over 100 random 3-word queries;
# the printed figure is the relative reduction in query time, (y - x) / y.
x, y = benchmark_n_query(3, 100)
print('\nSwhoosh was', "{0:.0f}%".format(100*(y-x)/y), 'percent faster.')

- Swhoosh time per query: 3.83 ms
- Whoosh time per query: 9.07 ms

Swhoosh was 58% percent faster.


#### 6 word queries

In [18]:
# x, y = mean seconds per query for swhoosh and whoosh over 100 random 6-word queries;
# the printed figure is the relative reduction in query time, (y - x) / y.
x, y = benchmark_n_query(6, 100)
print('\nSwhoosh was', "{0:.0f}%".format(100*(y-x)/y), 'percent faster.')

- Swhoosh time per query: 5.54 ms
- Whoosh time per query: 14.36 ms

Swhoosh was 61% percent faster.


#### 30 word queries

In [19]:
# x, y = mean seconds per query for swhoosh and whoosh over 100 random 30-word queries;
# the printed figure is the relative reduction in query time, (y - x) / y.
x, y = benchmark_n_query(30, 100)
print('\nSwhoosh was', "{0:.0f}%".format(100*(y-x)/y), 'percent faster.')

- Swhoosh time per query: 48.19 ms
- Whoosh time per query: 92.54 ms

Swhoosh was 48% percent faster.


### Index sizes

The index created by swhoosh is larger than the one created by whoosh. On TCP-ECCO (170 MB uncompressed), whoosh's index is 180 MB, whereas swhoosh's index is 222 MB. Swhoosh does not compress its data as much as it could, which is one of the reasons it runs faster.