In [3]:
import os, os.path
import yaml

import sqlalchemy as sql
from sqlalchemy.orm import sessionmaker
from whoosh import index, fields, analysis, query
from whoosh.qparser import QueryParser, OrGroup
from whoosh.filedb.filestore import FileStorage

from src import Config
from src.orm import Monographs

In [4]:
# Project-local reader used below to query the text index.
from src.text_index import IndexReader

# Load the project configuration. `index_dir` is reused by the later cells
# that create and open the index.
config = Config()
index_dir = config['text_index_path']

# Run a sample publisher-name query through the packaged reader.
# NOTE(review): the recorded output shows an OR-of-n-grams query restricted to
# type:publisher and an empty result list — confirm whether the index was
# populated when this cell was executed.
ir = IndexReader(config)
ir.query_monographs('Instytut badań systemowych polskiej akademii nauk')


((name:inst OR name:nsty OR name:styt OR name:tytu OR name:ytut OR name:bada OR name:adań OR name:syst OR name:yste OR name:stem OR name:temo OR name:emow OR name:mowy OR name:owyc OR name:wych OR name:pols OR name:olsk OR name:lski OR name:skie OR name:kiej OR name:akad OR name:kade OR name:adem OR name:demi OR name:emii OR name:nauk) REQUIRE type:publisher)


[]

In [6]:
# Whoosh schema describing one entry (journal/conference/publisher) in the text index.
schema = fields.Schema(
    # Identifier of this entry in the respective DB table.
    # Stored, so it can be read back from a search hit.
    id=fields.ID(stored=True),
    # Name (or names) of this journal/conference/publisher.
    # STORED (stored=True): the search cell reads hit['name'] back from results.
    # Indexed as per-word n-grams; queryor=True asks Whoosh to combine the
    # query's n-grams with OR (see the Whoosh NGRAMWORDS documentation).
    name=fields.NGRAMWORDS(queryor=True, stored=True),
    # Type of entry (journal/conference/publisher)
    # NOT STORED, used only for filtering
    type=fields.ID(),
    # Names of science domains for this entry, given as a comma-separated,
    # lowercased keyword list.
    # NOT STORED, used only for scoring
    domains=fields.KEYWORD(lowercase=True, commas=True, scorable=True),
)

class EntryType:
    """String constants naming the kind of an indexed entry.

    The indexing code passes one of these as the value of the schema's
    `type` field.
    """
    # Entry describes a conference.
    Conference = 'conference'
    # Entry describes a journal.
    Journal = 'journal'
    # Entry describes a publisher.
    Publisher = 'publisher'

In [4]:
# Make sure the index directory exists, then initialise a Whoosh index in it.
os.makedirs(index_dir, exist_ok=True)
storage = FileStorage(index_dir)
# storage.clean()
# NOTE(review): create_index() is called unconditionally, so re-running this
# cell presumably re-initialises the index instead of reusing the existing
# one — confirm before running it against an index you want to keep.
storage.create_index(schema)

FileIndex(FileStorage('./text_index'), 'MAIN')

In [5]:
# Connect to the SQLite database file named by config['db_file'].
engine = sql.create_engine(f"sqlite:///{config['db_file']}")
# Session factory bound to that engine, plus the single session instance
# used by the indexing cell to read Monographs rows.
Session = sessionmaker(bind=engine)
session = Session()

In [6]:
# Open the index stored in `index_dir`; `ix` is used by the indexing and search cells.
ix = index.open_dir(index_dir)

In [7]:
# Index every monograph's publisher name as a `publisher` entry.
writer = ix.writer()

try:
    for monograph in session.query(Monographs):
        # One index document per Monographs row.
        doc_fields = {
            'id': str(monograph.id),
            'name': monograph.publisher_name,
            'type': EntryType.Publisher,
        }
        writer.add_document(**doc_fields)

    # Committing inside the try block means a failed commit is cancelled too.
    writer.commit()
except BaseException:
    # Catch everything (including KeyboardInterrupt) so the writer is always
    # cancelled, then propagate the original error.
    writer.cancel()
    print('Failed to index documents')
    raise

In [11]:
# Look a free-text name up in the `name` field and collect the stored fields
# of each hit together with its score.
with ix.searcher() as s:
    to_find = 'Instytut Badania Systemowych Polskiej Akademii Nauk'

    # No parser plugins are passed and terms are grouped with OR.
    name_parser = QueryParser('name', ix.schema, plugins=[], group=OrGroup)
    q = query.Or([name_parser.parse(to_find)])
    results = s.search(q)

    # Materialise the hits while the searcher is still open.
    v = [
        dict(id=hit['id'], name=hit['name'], score=hit.score)
        for hit in results
    ]

In [9]:
# Keep a second reference to the search results.
# Plain assignment: `v2` and `v` name the same list object, not a copy.
v2 = v