In [8]:
from tqdm import tqdm
from whoosh import scoring
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, TEXT, ID
from whoosh.analysis import StemmingAnalyzer
from whoosh.index import open_dir
from whoosh.qparser import QueryParser, OrGroup, AndGroup
import os
import json
import pandas as pd

In [9]:
def load_hp_synonyms():
    # load hpo from json
    hpo_json = json.load(open('hp.json'))
    synonym_dict_list = []
    nodes = hpo_json['graphs'][0]['nodes']
    for node in nodes:
        # "id" : "http://purl.obolibrary.org/obo/HP_0000016"
        try:
            id_component_list = node['id'].split('/')
            if 'HP_' in id_component_list[-1]:
                synonym_dict = {}
                synonym_dict['hp_id'] = id_component_list[-1]
                synonym_dict['name'] = node['lbl']
                synonyms = node['meta']['synonyms']
                synonym_dict['synonyms'] = []
                for synonym in synonyms:
                    synonym_dict['synonyms'].append(synonym['val'])
                synonym_dict_list.append(synonym_dict)
        except Exception as e:
            pass
    return synonym_dict_list

def create_index(index_dir, synonym_dict_list):
    # Create an index and schema
    stem_ana = StemmingAnalyzer()
    custom_schema = Schema(hp_id=ID(stored=True),
                hp_desc=TEXT(stored=True, analyzer=stem_ana),
    )
    # create if not exist
    if not os.path.exists(index_dir):
        os.mkdir(index_dir)
    else:
        # delete forecely and recreate
        import shutil
        shutil.rmtree(index_dir)
        os.mkdir(index_dir)
    index = create_in(index_dir, custom_schema)

    # Open the index
    index = open_dir(index_dir)

    # Create a writer to add documents to the index
    writer = index.writer()

    # Add documents to the index
    for i in tqdm(range(len(synonym_dict_list))):
        phrase = synonym_dict_list[i]
        synonyms = phrase['synonyms']
        synonyms.extend([phrase['name']])
        for synonym in synonyms:
            writer.add_document(hp_id=str(phrase['hp_id']),
                                hp_desc=synonym
                                )
    writer.commit()
    return 0

In [20]:
def query_terms(index_dir, query_name_list, output, top_k=1):
    def custom_scoring(searcher, fieldname, text, matcher):
        frequency = scoring.Frequency().scorer(searcher, fieldname, text).score(matcher)
        tfidf =  scoring.TF_IDF().scorer(searcher, fieldname, text).score(matcher)
        bm25 = scoring.BM25F().scorer(searcher, fieldname, text).score(matcher)
        return frequency + tfidf + bm25
    
    query_results = []
    index = open_dir(index_dir)
    my_weighting = scoring.FunctionWeighting(custom_scoring)
    # scoring.BM25F(B=10, K1=0.1)
    searcher = index.searcher(weighting=scoring.BM25F(B=10, K1=0.1))
    query_parser = QueryParser('hp_desc', schema=index.schema, group=OrGroup)
    # Tokenize the query text using the same analyzer
    for query_name in tqdm(query_name_list):
        query = query_parser.parse('{}'.format(query_name))
        results = searcher.search(query, limit=10, scored=True)
        # Retrieve the top k matching phrase
        if len(results) > 0:
            # Retrieve and print the ranked matching phrases
            # top k only
            results = results[:top_k]
            for i, result in enumerate(results):
                hp_desc = result['hp_desc']
                hp_id = result['hp_id']
                score = result.score
                query_results.append({'query_name': query_name, 'hp_desc': hp_desc, 'hp_id': hp_id, 'score': score})
        else:
            query_results.append({'query_name': query_name, 'hp_desc': '', 'hp_id': '', 'score': -1})
    searcher.close()
    query_results_df = pd.DataFrame(query_results)
    query_results_df.to_csv(output, index=False)   
    return query_results_df

In [14]:
synonym_dict_list = load_hp_synonyms()
index_dir = './woosh_index'
create_index(index_dir, synonym_dict_list)

100%|██████████| 10445/10445 [00:03<00:00, 2968.11it/s]


0

In [21]:
query_name_list = pd.read_csv('synonym_df.csv')['synonym'].tolist()
output = './woosh_query_results.csv'
query_terms(index_dir, query_name_list, output, top_k=1)

  0%|          | 0/498 [00:00<?, ?it/s]

100%|██████████| 498/498 [00:15<00:00, 32.94it/s]


Unnamed: 0,query_name,hp_desc,hp_id,score
0,Liver fibrosis,Liver fibrosis,HP_0001395,23.713547
1,Hepatic fibrosis,Hepatic fibrosis,HP_0001395,24.282408
2,Portal tract fibrosis,Portal fibrosis,HP_0006580,26.451651
3,Fibrosis of the liver,Liver fibrosis,HP_0001395,23.713547
4,Fibrotic liver disease,Liver disease,HP_0001392,20.715952
...,...,...,...,...
493,Bovine milk hypersensitivity,Hypersensitivity,HP_0041092,24.677188
494,Dairy allergy,Allergy to dairy,HP_0410327,25.211736
495,Allergy to cow's milk,Cow milk allergy,HP_0100327,27.288984
496,Milk protein allergy,Milk allergy,HP_0100327,23.474081
