# Indexing using Xapian

In [2]:
import xapian
import string
import sys
import time
import glob

### Path to the wiki text files

In [None]:
# Directory that holds the wiki text shards (*.txt files).
path = "wiki-pages-text"
# The separator before "**" matters: glob only treats "**" as a recursive
# wildcard when it is a complete path component. Without the separator the
# pattern degrades to a single-level match (and also picks up sibling
# directories whose names merely start with `path`).
# Sorting makes the shard order -- and therefore the document ids assigned
# while indexing -- reproducible from run to run.
files = sorted(glob.glob(path + "/**/*.txt", recursive=True))

### Path where the index database is saved

In [None]:
# Create a brand-new writable index on disk. DB_CREATE refuses to reuse an
# existing database, so remove (or rename) a stale "db-index" before re-running.
db_path = "db-index"
database = xapian.WritableDatabase(db_path, xapian.DB_CREATE)

## Simple_Index

In [None]:
# Build the index: every line of every shard becomes one Xapian document.
# A line is expected to look like "<page_id> <sentence_id> <sentence text>".
start = time.time()
indexer = xapian.TermGenerator()
stemmer = xapian.Stem("english")
indexer.set_stemmer(stemmer)
index = 0  # running document id, shared across all shards
for i, shard_path in enumerate(files):

    print("FILE PROCESSED: ",i+1,"\t",shard_path)
    # Stream the shard line by line instead of loading it whole, and let the
    # context manager close the handle even if indexing a line fails.
    # NOTE(review): assumes the dump is UTF-8 encoded -- confirm; being
    # explicit keeps decoding independent of the machine's locale.
    with open(shard_path, 'r', encoding="utf-8") as fp:
        for line in fp:
            # Count every line so document ids keep tracking the cumulative
            # line number, even when a malformed line is skipped below.
            index += 1

            # Drop the line terminator before splitting so it can never end
            # up inside the sentence id / "Q" term.
            line_list = line.rstrip("\n").split(" ")
            if len(line_list) < 2:
                # Blank or truncated line: there is no sentence id to read,
                # so skip it rather than fail with an IndexError.
                continue

            page_id = line_list.pop(0)
            sentence_id = line_list[0]
            # The sentence id is deliberately left at the front of the list,
            # so it is indexed together with the sentence text.
            sentence = " ".join(line_list)

            doc = xapian.Document()
            indexer.set_document(doc)

            # Index fields without prefixes for general search.
            indexer.index_text(page_id.replace("_", " "))  # title
            # Leave a position gap between the title and the sentence terms.
            indexer.increase_termpos()
            indexer.index_text(sentence)  # text

            # Store the raw line for display purposes.
            doc.set_data(line)
            idterm = u"Q" + sentence_id
            doc.add_boolean_term(idterm)

            # Add the document to the database under the running id.
            database.replace_document(index, doc)
    # Elapsed time is reported in minutes.
    print("TIME ELAPSED: ",(time.time() - start)/60)

print("TOTAL TIME:", (time.time() - start)/60)
database.commit()
database.close()

In [None]:
# TOTAL TIME: 389.77472179730734 (minutes, recorded from a full indexing run)

# Simple_search

In [16]:
# Time the whole cell, from opening the index to printing the hits.
start = time.time()

# Read-only handle on the index written by the indexing cell.
database = xapian.Database("db-index/")

query_string = 'Roman Atwood is a content creator.'

# Set up the parser with the same English stemmer used at indexing time,
# then turn the free-text query into a Xapian query object.
stemmer = xapian.Stem("english")
qp = xapian.QueryParser()
qp.set_stemmer(stemmer)
qp.set_database(database)
qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
query = qp.parse_query(query_string)
print('query:', query)

# Run the query and fetch at most five matches, starting from the first.
enquire = xapian.Enquire(database)
enquire.set_query(query)
matches = enquire.get_mset(0, 5)

# Summary line, then one line per hit: rank, score, document id, stored line.
print ("%i results found." % matches.get_matches_estimated())
print ("Results 1-%i:" % matches.size(),'\n')

for m in matches:
    hit = (m.rank + 1, m.percent, m.docid, m.document.get_data())
    print ("%i: %i%% docid=%i [%s]" % hit, '\n')

# Elapsed time is reported in minutes.
print("TOTAL TIME:", (time.time() - start)/60)

query: Query((roman@1 OR atwood@2 OR Zis@3 OR Za@4 OR Zcontent@5 OR Zcreator@6))
13000000 results found.
Results 1-5: 

1: 50% docid=21388834 [b'Bedside_Press 2 Bedside Press has published new content by up-and-coming artists/writers and established creators including Margaret Atwood , Trina Robbins , and Roberta Gregory .\n'] 

2: 47% docid=12747166 [b'Roman_Atwood 0 Roman Bernard Atwood -LRB- born May 28 , 1983 -RRB- is an American YouTube personality , comedian , vlogger and pranker .\n'] 

3: 42% docid=22952871 [b'Brett_Atwood 0 Brett Atwood is a website editor , content strategist and former print and online journalist whose writings have appeared in Billboard , Rolling Stone , Vibe , The Hollywood Reporter and other publications .\n'] 

4: 42% docid=5490985 [b'Television_program_creators 21 Who merits creator credit is sometimes a matter of contention .\n'] 

5: 38% docid=16846851 [b'Patreon 0 Patreon is an Internet-based platform that allows content creators to build their own s