In [1]:
import xml.etree.ElementTree as ET
import re
import pickle
from operator import itemgetter
import os.path


def preprocess_article(filename):
    """Load one article XML file and reduce its text node to plain lower-case words.

    The file is parsed with ElementTree and the text is read from the fixed
    position ``root[1][3][7]``. Wiki/HTML markup, URLs, parenthesised spans and
    every character that is not a lower-case letter, digit or whitespace are
    then stripped.

    Args:
        filename: Path of the article XML file.

    Returns:
        The cleaned text with runs of spaces collapsed and no leading space,
        or ``""`` when the text node is empty.
    """
    # Get the text part of the article xml by using Python's XML parser
    tree = ET.parse(filename) # parse the xml tree
    # Positional lookup: relies on the text node always sitting at [1][3][7].
    # NOTE(review): presumably page -> revision -> text of a MediaWiki export — confirm.
    text = tree.getroot()[1][3][7].text # get the contents of the text-node

    # ElementTree reports an empty element as None; treat it as an empty article
    # instead of failing further down.
    if text is None:
        return ""

    # Clean the string (`text` is already a str, no per-character copy needed)
    string = re.sub(r'\n', ' ', text) # remove new-line
    string = re.sub(r'\{\{.*?\}\}', ' ', string) # remove any {{...}} sections (metadata)
    string = re.sub(r'\<.*?\>', ' ', string) # remove HTML tags
    string = re.sub(r'\[\[([a-zA-Z\d])*\|', ' ', string) # remove the target of wiki-markup links to other articles
    string = re.sub(r'\=.*?\=', ' ', string) # removes =...=
    string = re.sub(r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)', ' ', string) # remove urls
    string = string.lower() # lower case everything
    string = re.sub(r'\(.*?\)', ' ', string) # removes parentheses and their content
    # The original pattern ended in a lazy `.*?`, which always matches the empty
    # string; dropping it removes exactly the same characters.
    string = re.sub(r'[^a-z\d\s]', '', string) # remove any remaining non-alphanumeric characters
    string = re.sub(' +', ' ', string) # collapse runs of spaces into a single space
    string = re.sub('^ ', '', string) # remove any space at the front of the string

    return string

def hash_tree(string, article, tree = None, clean = False):
    """Index every word of ``string`` into the per-letter tree files under ``trees/``.

    Words are the ``\\S+`` tokens of ``string`` together with their start
    offset. The first character of a word selects the file
    ``trees/<char>.tree`` (the character is used verbatim in the file name);
    the rest of the word is inserted into that file's tree via
    ``add_word_to_tree`` together with the offset and ``article``.

    Args:
        string: Text to index.
        article: Name under which the word positions are recorded.
        tree: Ignored. Kept only so existing calls that pass a tree keep
            working; the trees on disk are what gets updated.
        clean: When true, each letter tree touched by this call starts empty
            instead of being loaded from its existing file.

    Returns:
        The tree of the last first-character processed (words are handled in
        sorted order), or ``{}`` when ``string`` contains no words.
    """
    words = [[m.group(0), m.start()] for m in re.finditer(r'\S+', string)]
    if not words:
        # Nothing to index. Without this guard the final save below would
        # reference a first character that was never assigned.
        return {}

    os.makedirs('trees', exist_ok=True) # first run: the output directory may not exist yet
    words.sort(key=itemgetter(0)) # sort so every letter file is loaded and saved only once

    current_char = ""
    tree = {}
    for token, position in words:
        char = token[0]

        if (current_char != char):
            if (current_char != ""):
                _save_letter_tree(current_char, tree) # finished with the previous letter
            current_char = char
            tree = {} if clean else _load_letter_tree(char)

        add_word_to_tree(tree, token[1:], position, article)

    _save_letter_tree(current_char, tree)
    return tree


def _load_letter_tree(char):
    """Return the tree stored in ``trees/<char>.tree``, or ``{}`` if the file does not exist."""
    path = 'trees/'+char+'.tree'
    if not os.path.isfile(path):
        return {}
    # NOTE: pickle.load can execute arbitrary code; only read tree files written by this notebook.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _save_letter_tree(char, tree):
    """Write ``tree`` to ``trees/<char>.tree``, replacing any previous content."""
    with open('trees/'+char+'.tree', 'wb') as file:
        pickle.dump(tree, file)
    
def add_word_to_tree(tree, word, index, article):
    """Record one occurrence of ``word`` in the nested-dict tree, in place.

    Each character of ``word`` is a key leading one level deeper. The node
    reached after the last character holds a ``'nodes'`` dict mapping article
    names to the list of positions at which the word occurs. An empty ``word``
    records the occurrence directly on ``tree``.

    Args:
        tree: Dict to update.
        word: Remaining characters of the word to insert (may be empty).
        index: Position of the occurrence; appended to the article's list.
        article: Article name the occurrence belongs to.
    """
    # Walk down iteratively: one recursion level per character would hit
    # Python's recursion limit on very long tokens, and slicing the word at
    # every level is quadratic.
    node = tree
    for letter in word:
        node = node.setdefault(letter, {}) # create the branch if not present

    # Child keys are single characters, so 'nodes' cannot collide with one.
    occurrences = node.setdefault('nodes', {})
    occurrences.setdefault(article, []).append(index)

def search(s):
    """Look up the word ``s`` in the tree files under ``trees/``.

    The first character of ``s`` selects the file ``trees/<char>.tree``; the
    remaining characters are resolved inside the loaded tree.

    Args:
        s: Word to look up.

    Returns:
        The ``{article: [positions]}`` dict stored for the word, or ``{}`` when
        the query is empty or the word is not indexed.
    """
    if (len(s) < 1):
        print('Invalid search string')
        return {}

    path = 'trees/'+s[0]+'.tree'
    if not os.path.isfile(path):
        # No word starting with this character has been indexed.
        return {}

    # Read-only access is enough; `with` closes the file even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code; only read tree files written by this notebook.
    with open(path, 'rb') as file:
        tree = pickle.load(file)

    if (len(s) == 1):
        # One-letter words are recorded on the root of their letter tree,
        # because the first character is stripped before insertion.
        return tree.get('nodes', {})

    return search_helper(s[1:], tree)

def search_helper(s, tree):
    """Follow the characters of ``s`` through ``tree`` and return the stored occurrences.

    Args:
        s: Characters still to resolve (the word without its first letter).
        tree: Nested dict keyed by single characters, with a ``'nodes'`` entry
            on nodes where a word ends.

    Returns:
        The ``'nodes'`` dict of the node reached, or ``{}`` when ``s`` is
        empty, a character is missing, or no word ends at the node reached.
    """
    if (len(s) < 1):
        print('Invalid search string')
        return {}

    node = tree
    for letter in s:
        node = node.get(letter)
        if node is None:
            # Word is not in the tree; same result for any missing letter,
            # including the last one.
            return {}

    # A node without 'nodes' means `s` is only a prefix of indexed words.
    return node.get('nodes', {})


In [2]:

# Reduce both sample articles to cleaned plain text before indexing them.
cat_string = preprocess_article('articles/Cat')
dog_string = preprocess_article('articles/Dog')
string_test = 'a cat is not nice' # small hand-written sample; not used in this cell

tree = {} # placeholder; overwritten by the hash_tree calls below
# Earlier variant that passed the tree explicitly, kept for reference:
#tree = hash_tree(cat_string, 'Cat', tree)
#tree = hash_tree(dog_string, 'Dog', tree)
# clean = True makes every letter tree touched by the Cat article start from an
# empty dict instead of being loaded from its existing file.
tree = hash_tree(cat_string, 'Cat', clean = True)
# Without clean, the existing per-letter files are loaded and the Dog entries are added to them.
tree = hash_tree(dog_string, 'Dog')




In [3]:
path = 'trees/c.tree'

# Read-only access is enough to inspect a tree, and `with` closes the file
# even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only read tree files written by this notebook.
with open(path, 'rb') as file_read:
    tree = pickle.load(file_read)

# search() opens the tree file for the word's first letter on its own, so the
# `tree` loaded above is only available for manual inspection.
search('mammal')

{'Cat': [56, 19595, 33098], 'Dog': [5039, 5849, 23972, 29811]}

In [4]:

import xml.sax

class WikiContentHandler(xml.sax.ContentHandler):
    """SAX handler that prints the title and text content of the parsed XML.

    Three boolean flags record whether the parser is currently inside a
    ``page``, ``title`` or ``text`` element. Character data is printed while
    the ``title`` or ``text`` flag is set.
    """

    # Element names that have a same-named boolean flag on the instance.
    _TRACKED_ELEMENTS = ("page", "title", "text")

    def __init__(self):
        super().__init__() # let the base class set up its own state
        self.page = False
        self.text = False
        self.title = False

    def _set_flag(self, name, value):
        """Set the flag for ``name`` to ``value``; other element names are ignored."""
        if name in self._TRACKED_ELEMENTS:
            setattr(self, name, value)

    def startElement(self, name, attrs):
        """Mark the element ``name`` as open."""
        self._set_flag(name, True)

    def endElement(self, name):
        """Mark the element ``name`` as closed.

        Closing ``text`` clears the ``text`` flag (it used to clear ``title``
        instead, so everything after the first text element kept being printed).
        """
        self._set_flag(name, False)

    def characters(self, content):
        """Print ``content`` when inside a title (followed by a separator) or a text element."""
        if (self.title):
            print(content)
            print("--------\n")
        if (self.text):
            print(content)
        
# Stream the Cat article through the handler, which prints its title and text.
parser = xml.sax.make_parser()
parser.setContentHandler(WikiContentHandler())
# Open the file in a `with` block so the handle is closed once parsing ends or fails.
with open("articles/Cat","r") as article_file:
    parser.parse(article_file)


Cat
--------

{{about|the cat species that is commonly kept as a pet|the cat family|Felidae|other uses|Cat (disambiguation)|and|Cats (disambiguation)}}


{{pp-semi-indef|small=yes}}


{{pp-move-indef|small=yes}}


{{technical reasons|Cat #1|that album|Cat 1 (album)}}


{{Use dmy dates|date=March 2017}}


{{Good article}}


<
!-- Per [[WP:ENGVAR]], articles should conform to one overall spelling style of English, typically the one it was created in when, as here, the topic has no strong national ties. This article was created with American English.--
>


{{subspeciesbox


| name                = Domestic cat
<
ref name=
"
MSW3fc
"
 /
>


| status              = DOM


<
!-- There has been extensive discussion about the choice of image in this infobox. Before replacing this image with something else, consider if it actually improves on the ENCYCLOPEDIC CRITERIA which led to this choice. See [[Talk:Cat]] and [[Talk:Cat/Lead photo]] and if in doubt, DISCUSS IT FIRST! --
>


| image         