## Import libraries

In [2]:
import os
from whoosh.index import create_in, open_dir
from whoosh.writing import BufferedWriter, AsyncWriter
from whoosh.analysis import StandardAnalyzer, SimpleAnalyzer
from whoosh.qparser import QueryParser
from whoosh.fields import *
import xml.etree.ElementTree as ET
import numpy as np
import time
import concurrent

## Define model variables and settings

In [3]:
# Record-level XML tags that are treated as documents to index.  Every value
# is the placeholder 1; the code in this notebook section only tests key
# membership.
doc_type = dict.fromkeys(
    ('inproceedings', 'incollection', 'book', 'proceedings', 'article'), 1)

# Index field name -> child XML tags whose text feeds that field.
# Every value must be a real tuple: ('year') without the trailing comma is
# just the string 'year', which turns `tag in features['year']` into a
# substring test (so 'ear' or 'y' would match).
features = {
    'title': ('title', 'booktitle'),
    'year': ('year',),
    'author': ('author', 'editor'),
    'publication_venue': ('publisher', 'series', 'school', 'journal'),
}

In [4]:
# Analyzer instances shared by the text fields of the index schema.
simple_analyzer = SimpleAnalyzer()
standard_analyzer = StandardAnalyzer()

In [5]:
# Index schema.  Every field is declared with stored=True.  Title and venue
# use the standard analyzer, author uses the simple analyzer, and year is a
# numeric field.
schema = Schema(
    title=TEXT(stored=True, analyzer=standard_analyzer),
    author=TEXT(stored=True, analyzer=simple_analyzer),
    year=NUMERIC(stored=True),
    publication_venue=TEXT(stored=True, analyzer=standard_analyzer),
)

In [6]:
# Make sure the index directory exists.  exist_ok=True avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
os.makedirs("indexdir", exist_ok=True)

## Indexing

In [7]:
# Create the index named "dblp" inside the indexdir directory.
ix = create_in(
    "indexdir",
    schema=schema,
    indexname="dblp",
)
class Parser():
    """Stream publication records from a DBLP XML dump into the ``ix`` index.

    Relies on the module-level names ``ix`` (the index written to),
    ``doc_type`` (record tags to index) and ``features`` (index field name ->
    child tags feeding that field).

    Attributes:
        idx: Number of record elements seen since the last commit.
        instance: Values collected for the record being processed, keyed by
            the field names in ``features``.
        time: Timestamp taken when the parser was constructed; every entry in
            ``timing`` is measured from it.
        timing: Elapsed seconds appended at each commit and once more when
            parsing ends (starts as ``[0]``).
        percent: Number of batches committed so far.  The progress message
            prints it as a percentage, which presumably assumes the default
            batch size is about 1% of the corpus -- confirm.
        flag: True while ``instance`` holds at least one collected value.
    """

    def __init__(self):
        self.idx = 0
        self.instance = {k: list() for k in features.keys()}
        self.time = time.time()
        self.timing = list([0])
        self.percent = 0
        self.flag = False

    def parse(self, file='./dblp.xml', dtd='./dblp.dtd', batch_size=67867):
        """Index every record of ``file`` and commit it in batches.

        Args:
            file: Path of the XML dump to read.
            dtd: Forwarded unchanged as ``load_dtd`` to ``iterparse``.
            batch_size: Number of record elements after which the pending
                documents are committed.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        field_of = self._field_lookup()
        writer = None
        # NOTE(review): `lxml` is not imported by the import cell visible in
        # this section -- confirm it is imported elsewhere in the notebook.
        # NOTE(review): lxml documents `load_dtd` as a boolean flag, so the
        # path is presumably only used for its truthiness -- confirm.
        records = lxml.etree.iterparse(file, dtd_validation=True, events=['end'], load_dtd=dtd)
        try:
            for _, element in records:
                if element.tag not in doc_type:
                    continue
                self.idx += 1
                self._collect(element, field_of)
                # The record has been read; free it so the tree does not grow
                # with the size of the dump.
                self._release(element)
                if not self.flag:
                    continue
                # Open the writer lazily, once per batch, instead of on every
                # parser event that happens to see idx == 0.
                if writer is None:
                    writer = AsyncWriter(ix, writerargs={'procs': 6})
                writer.add_document(**self._document())
                self._reset()
                # `>=` rather than `==`: a record without any field bumps idx
                # without reaching this check, and an exact match could then
                # be skipped forever.
                if self.idx >= batch_size:
                    pending, writer = writer, None
                    self._commit(pending)
            # Flush the last, partially filled batch.  The original end-of-file
            # check sat behind `self.flag` and therefore never fired.
            if writer is not None:
                pending, writer = writer, None
                self._commit(pending)
        finally:
            # Only reached with an open writer when parsing failed; abandon
            # the uncommitted batch instead of leaving the writer open.
            if writer is not None:
                writer.cancel()
        self.timing.append(time.time() - self.time)
        print(f"-- Finish indexing the corpus in {self.timing[-1]} seconds")

    @staticmethod
    def _field_lookup():
        """Return a mapping from child tag to the index field it feeds."""
        lookup = {}
        for field, tags in features.items():
            # Tolerate a bare string such as ('year'), which is not a tuple.
            if isinstance(tags, str):
                tags = (tags,)
            for tag in tags:
                lookup.setdefault(tag, field)
        return lookup

    def _collect(self, element, field_of):
        """Append the non-empty text of each known child to ``instance``."""
        for child in element:
            field = field_of.get(child.tag)
            if field is not None and child.text:
                self.instance[field].append(child.text)
                self.flag = True

    def _document(self):
        """Build the keyword arguments for ``add_document`` from ``instance``."""
        collected = self.instance
        years = collected['year']
        return {
            'title': ', '.join(collected['title']),
            'author': ', '.join(collected['author']),
            # Use the first year only; a joined "2000, 2001" is not a single
            # number for the numeric field.  '0' marks a missing year.
            'year': years[0] if years else '0',
            'publication_venue': ', '.join(collected['publication_venue']),
        }

    def _reset(self):
        """Clear the per-record state once a document has been queued."""
        self.instance = {k: list() for k in features.keys()}
        self.flag = False

    def _commit(self, writer):
        """Record progress, start a new batch count and commit ``writer``."""
        self.percent += 1
        self.timing.append(time.time() - self.time)
        print(f"-- Finish indexing {self.percent}% of the corpus in {self.timing[-1]} seconds")
        self.idx = 0
        writer.commit(optimize=True)

    @staticmethod
    def _release(element):
        """Free a processed record and, at top level, its earlier siblings."""
        element.clear()
        parent = element.getparent()
        # Only prune siblings directly under the root, where everything before
        # the current record has already been fully processed.
        if parent is None or parent.getparent() is not None:
            return
        while element.getprevious() is not None:
            del parent[0]

In [None]:
# Run the indexing pass with the parser's default file and DTD paths.
parser = Parser()
timing = []
parser.parse()