In [1]:
# Standard-library and third-party imports used throughout the notebook
import sys
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# turn off DeprecationWarning
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# Make modules in the parent directory importable; presumably this is where the
# project-local HpoFactory / HpoDatabase modules imported by later cells live -- TODO confirm
sys.path.append('..')
# Path of the HPO ontology file handed to HpoFactory in a later cell
obo_path = '../hp.obo'

In [14]:
# Load all synonyms generated by GPT-4o-mini.
# The folder can be overridden through the HPO_SYNONYM_FOLDER environment
# variable; the default is the location that was originally hard-coded here.
synonym_folder = os.environ.get(
    'HPO_SYNONYM_FOLDER',
    '/Users/cl3720/Desktop/hpo-parser-fine-tuned-gemini/hpo_synonyms/gpt-4o-mini-2024-07-18',
)
expanded_hpo_name_dict = {}
if os.path.isdir(synonym_folder):
    # Each file is a JSON object of the form
    # {"HP:XXXX": ["synonym1", "synonym2", "synonym3"]}.
    # Sorting makes the merge order deterministic (os.listdir order is arbitrary).
    for file in sorted(os.listdir(synonym_folder)):
        file_path = os.path.join(synonym_folder, file)
        # Skip hidden entries (e.g. .DS_Store) and sub-directories: they are not
        # synonym files and would make json.load fail.
        if file.startswith('.') or not os.path.isfile(file_path):
            continue
        with open(file_path, "r") as f:
            expanded_hpo_name_dict.update(json.load(f))
else:
    # Surface a missing folder instead of silently reporting zero terms.
    warnings.warn(f"Synonym folder not found: {synonym_folder}")
print(f"There are {len(expanded_hpo_name_dict)} HPO terms with synonyms")

# Load the HPO ontology (HpoFactory is a project-local module).
from HpoFactory import HpoFactory
hpoF = HpoFactory(obo_path)
hpo_tree = hpoF.build_hpo_tree()
hpo_ancestors = hpoF.get_hpo_ancestors(hpo_tree)
hpo_levels = hpoF.get_hpo_levels(hpo_tree)
hpo_ancestors_df = pd.DataFrame(hpo_ancestors.items(), columns=['hpo_id', 'ancestor'])
# Prepend every term to its own ancestor list, so a term counts as part of its
# own lineage (assumes each ancestor value is a list -- TODO confirm against HpoFactory).
hpo_ancestors_df['ancestor'] = hpo_ancestors_df.apply(lambda x: [x.hpo_id] + x.ancestor, axis=1)
# The key is named hit_hpo_id so this table can be joined onto retrieved hits.
hpo_ancestors_df = hpo_ancestors_df.rename(columns={'hpo_id': 'hit_hpo_id'})

# One row per (query term, synonym) pair.
expanded_hpo_name_df = (
    pd.DataFrame(expanded_hpo_name_dict.items(), columns=['query_hpo_id', 'synonyms'])
    .explode('synonyms')
    .rename(columns={'synonyms': 'query_synonym'})
)

There are 18330 HPO terms with synonyms


### Performance in Top 1 exactly matched results

In [11]:
# Query results of the top-1 Whoosh evaluation run (per the file name)
result_json = './rag_evaluation_query_results_top1_whoosh.json'
with open(result_json, 'r') as results_file:
    query_results = json.load(results_file)

# Disabled alternative, kept for reference: derive the top-1 hits from the
# top-5 embedding run instead.
# # # top 5 also contains top 1 results
# result_json = './rag_evaluation_query_results_top5_text-embedding-3-large.json'
# # load the query results from the json file
# with open(result_json, 'r') as f:
#     query_results = json.load(f)
#     # convert to top 1 results
#     query_results = {k: [v[i]['parsed_results'][0]['hpo_id'] if i < len(v) else None for i in range(5)] for k, v in query_results.items()}
#     # for some reason, the query results are not the same.
#     # for some reason, the query results are not the same.

In [12]:
# Per-term synonym lists, kept under their own name: reusing
# `expanded_hpo_name_df` here would replace the exploded table (with its
# `query_synonym` column) that other cells merge on.
synonym_lists_df = pd.DataFrame(expanded_hpo_name_dict.items(), columns=['query_hpo_id', 'synonyms'])
# Per-term hit lists returned by the retrieval run
query_results_df = pd.DataFrame(query_results.items(), columns=['query_hpo_id', 'hit_hpo_id'])
# Join on the query term, then unpack both lists in lock-step so that each row
# is one (synonym, hit) pair; this needs both lists to have equal length per term.
merged_df = pd.merge(synonym_lists_df, query_results_df, on='query_hpo_id')
merged_df = merged_df.explode(['synonyms','hit_hpo_id'])
# A row is an exact hit when the retrieved id equals the query id
# (a missing hit, i.e. None, compares as False).
is_exact_hit = merged_df.query_hpo_id == merged_df.hit_hpo_id
overall_accuracy = is_exact_hit.mean()
print(f"Overall accuracy: {overall_accuracy:.2f}")
# Count the exact hits of every query term once, then threshold that count.
exact_hits_per_term = (
    merged_df.assign(is_exact_hit=is_exact_hit)
    .groupby('query_hpo_id')['is_exact_hit']
    .sum()
)
for at_leat_i_hit in range(1, 6):
    # Share of query terms with at least `at_leat_i_hit` exactly matched synonyms
    at_least_x_hit_accuracy = (exact_hits_per_term >= at_leat_i_hit).mean()
    # NOTE: `:1f` is a width-1 spec, so the default six decimals are printed.
    print(f"At least {at_leat_i_hit} hit accuracy: {at_least_x_hit_accuracy:1f}")

Overall accuracy: 0.06
At least 1 hit accuracy: 0.218603
At least 2 hit accuracy: 0.054610
At least 3 hit accuracy: 0.012220
At least 4 hit accuracy: 0.002728
At least 5 hit accuracy: 0.000327


In [13]:
# Inspect the per-term hit lists (one list of hit ids, or None, per query term)
query_results_df

Unnamed: 0,query_hpo_id,hit_hpo_id
0,HP:0030429,"[None, None, HP:0030429, None, None]"
1,HP:0004875,"[None, None, None, None, None]"
2,HP:0032414,"[HP:0031991, None, None, None, None]"
3,HP:5200239,"[None, None, None, None, None]"
4,HP:0100607,"[None, HP:0032149, None, HP:0003710, None]"
...,...,...
18325,HP:0030778,"[HP:0030778, None, None, None, None]"
18326,HP:0011025,"[None, None, None, None, None]"
18327,HP:0008204,"[None, None, None, None, None]"
18328,HP:0030282,"[None, None, None, None, None]"


### Performance in Top 1 descendants matched results

In [18]:
# Attach the `ancestor` list of every retrieved hit; hits that have no entry in
# hpo_ancestors_df end up with NaN in that column (right join keeps all hits).
merged_ancestor_df = hpo_ancestors_df.merge(merged_df, how='right')


def _query_in_hit_lineage(row):
    """Return True when the query term appears in the hit's `ancestor` list."""
    lineage = row['ancestor']
    # NaN is a float: guard against "TypeError: argument of type 'float' is not iterable"
    if not isinstance(lineage, (list, set)):
        return False
    return row['query_hpo_id'] in lineage


# True when the query term is found in the ancestor list of the retrieved hit
merged_ancestor_df['query_is_ancestor'] = merged_ancestor_df.apply(_query_in_hit_lineage, axis=1)
# Row-level accuracy under the lineage criterion
ancestor_accuracy = merged_ancestor_df.query_is_ancestor.mean()
print(f"Accuracy by ancestors: {ancestor_accuracy:2f}")
# Number of lineage hits per query term, thresholded at 1..5
lineage_hits_per_term = merged_ancestor_df.groupby('query_hpo_id')['query_is_ancestor'].sum()
for at_leat_i_hit in range(1, 6):
    at_least_x_hit_accuracy = (lineage_hits_per_term >= at_leat_i_hit).mean()
    print(f"At least {at_leat_i_hit} hit accuracy by ancestors: {at_least_x_hit_accuracy:2f}")

Accuracy by ancestors: 0.639356
At least 1 hit accuracy by ancestors: 0.911348
At least 2 hit accuracy by ancestors: 0.797054
At least 3 hit accuracy by ancestors: 0.660011
At least 4 hit accuracy by ancestors: 0.503219
At least 5 hit accuracy by ancestors: 0.325150


### Performance in Top-k (k = 1 to 25) exactly matched results

In [3]:
# Query results of the text-embedding-3-large run (the file name indicates 25 hits per query)
result_json = './rag_evaluation_query_results_top25_text-embedding-3-large.json'
with open(result_json) as results_file:
    query_results = json.load(results_file)

In [21]:
# Flatten the nested query results into one row per (query term, synonym, hit).
rows = [
    {
        'query_hpo_id': key,
        'query_synonym': entry.get('synonym', ''),
        'hit_hpo_id': result['hpo_id'],
        'hit_hpo_name': result['hpo_name'],
        'top_k': result['top_k'],
        'distance': result['distance']
    }
    for key, value in query_results.items()
    for entry in value
    for result in entry.get('parsed_results', [])
]
query_results_df = pd.DataFrame(rows)

# Keep the hits whose (term, synonym) pair also exists in the synonym table.
merged_df = pd.merge(expanded_hpo_name_df, query_results_df, on=['query_hpo_id','query_synonym'])
pair_keys = ['query_hpo_id', 'query_synonym']
# Denominator: number of distinct (term, synonym) query pairs
total_pairs = merged_df.groupby(pair_keys).ngroups
print(f"Total pairs: {total_pairs}")

# Rows whose retrieved term equals the query term; this does not depend on k.
exact_hits_df = merged_df[merged_df['query_hpo_id'] == merged_df['hit_hpo_id']]
for k in range(25):
    # `top_k <= k` keeps the k+1 best-ranked hits (ranks appear to start at 0).
    filtered_df = exact_hits_df[exact_hits_df['top_k'] <= k]
    # A pair counts once, however many of its hits match.
    count = filtered_df.groupby(pair_keys).ngroups
    percentage = (count / total_pairs) * 100
    print(f"Top {k+1} accuracy: {percentage:.2f}%")

Total pairs: 91649
Top 1 accuracy: 60.94%
Top 2 accuracy: 72.79%
Top 3 accuracy: 78.40%
Top 4 accuracy: 81.81%
Top 5 accuracy: 84.11%
Top 6 accuracy: 85.77%
Top 7 accuracy: 87.00%
Top 8 accuracy: 87.98%
Top 9 accuracy: 88.75%
Top 10 accuracy: 89.43%
Top 11 accuracy: 90.05%
Top 12 accuracy: 90.56%
Top 13 accuracy: 90.97%
Top 14 accuracy: 91.35%
Top 15 accuracy: 91.68%
Top 16 accuracy: 91.99%
Top 17 accuracy: 92.28%
Top 18 accuracy: 92.55%
Top 19 accuracy: 92.79%
Top 20 accuracy: 93.00%
Top 21 accuracy: 93.19%
Top 22 accuracy: 93.37%
Top 23 accuracy: 93.54%
Top 24 accuracy: 93.67%
Top 25 accuracy: 93.83%


### Performance in Top 5 descendants matched results

In [8]:
# Attach the `ancestor` list of every retrieved hit (right join keeps all hits;
# hits without an entry in hpo_ancestors_df get NaN).
merged_ancestor_df = hpo_ancestors_df.merge(merged_df, how='right')
# True when the query term is found in the ancestor list of the retrieved hit;
# the isinstance guard treats a NaN (missing) list as "no match".
merged_ancestor_df['query_is_ancestor'] = [
    isinstance(lineage, (list, set)) and query_id in lineage
    for query_id, lineage in zip(merged_ancestor_df['query_hpo_id'], merged_ancestor_df['ancestor'])
]
pair_keys = ['query_hpo_id', 'query_synonym']
# Denominator: number of distinct (term, synonym) query pairs
total_pairs = merged_ancestor_df.groupby(pair_keys).ngroups
print(f"Total pairs: {total_pairs}")
# Rows that satisfy the lineage criterion; this does not depend on k.
lineage_hits_df = merged_ancestor_df[merged_ancestor_df['query_is_ancestor'] == True]
for k in range(5):
    # `top_k <= k` keeps the k+1 best-ranked hits.
    filtered_df = lineage_hits_df[lineage_hits_df['top_k'] <= k]
    # A pair counts once, however many of its hits match.
    count = filtered_df.groupby(pair_keys).ngroups
    percentage = (count / total_pairs) * 100
    print(f"Top {k+1} accuracy: {percentage:.2f}%")

Total pairs: 91649
Top 1 accuracy: 63.94%
Top 2 accuracy: 74.24%
Top 3 accuracy: 78.89%
Top 4 accuracy: 81.79%
Top 5 accuracy: 83.72%


In [21]:
# Plot the distance distribution by query_is_ancestor and test whether the two
# groups differ.
from scipy.stats import mannwhitneyu, ttest_ind

# `distance` only exists when merged_ancestor_df was built from query results
# that carry distances; fail with an actionable message otherwise.
if 'distance' not in merged_ancestor_df.columns:
    raise KeyError(
        "merged_ancestor_df has no 'distance' column; rebuild it from query "
        "results that include distances before running this cell"
    )

# .assign() works on a new frame, so the string cast never writes into a slice
# of merged_ancestor_df (direct column assignment raised SettingWithCopyWarning).
for_plot_df = merged_ancestor_df[['query_is_ancestor', 'distance']].assign(
    query_is_ancestor=lambda frame: frame['query_is_ancestor'].astype(str)
)
sns.violinplot(x='query_is_ancestor', y='distance', data=for_plot_df)
plt.show()

# Distances of the hits that do / do not satisfy the lineage criterion
query_is_ancestor = for_plot_df.loc[for_plot_df['query_is_ancestor'] == "True", 'distance']
query_is_not_ancestor = for_plot_df.loc[for_plot_df['query_is_ancestor'] == "False", 'distance']
# print summary statistics
print(query_is_ancestor.describe())
print(query_is_not_ancestor.describe())
# Student's t-test on the two distance samples
t_test = ttest_ind(query_is_ancestor, query_is_not_ancestor)
print(t_test)
# Mann-Whitney U test as the rank-based counterpart
mannwhitneyu_test = mannwhitneyu(query_is_ancestor, query_is_not_ancestor)
print(mannwhitneyu_test)

KeyError: "['distance'] not in index"

### Compare with GPT-4's reranking

In [69]:
# Load the GPT responses stored for the sampled (query term, synonym) pairs.
nonir_gpt_response_folder = './nonir_gpt_response_top5'
all_nonir_gpt_responses = []
# Sorted for a reproducible row order (os.listdir order is arbitrary).
for file in sorted(os.listdir(nonir_gpt_response_folder)):
    file_path = os.path.join(nonir_gpt_response_folder, file)
    # Skip hidden entries (e.g. .DS_Store) and sub-directories, which are not response files.
    if file.startswith('.') or not os.path.isfile(file_path):
        continue
    with open(file_path, "r") as f:
        all_nonir_gpt_responses.append(json.load(f))
# One row per response file
nonir_gpt_responses_df = pd.DataFrame(all_nonir_gpt_responses)
# Presumably gpt_response is the 1-based rank GPT selected; shift it so it is
# comparable with the 0-based `top_k` column -- TODO confirm.
nonir_gpt_responses_df['gpt_top_k'] = nonir_gpt_responses_df['gpt_response'] - 1

# Restrict the IR results to exactly the sampled (term, synonym) pairs.
# Matching the two columns independently would also keep rows whose id and
# synonym were each sampled, but as parts of different pairs.
pair_keys = ['query_hpo_id', 'query_synonym']
sampled_pairs = pd.MultiIndex.from_frame(nonir_gpt_responses_df[pair_keys])
is_sampled_pair = pd.MultiIndex.from_frame(merged_df[pair_keys]).isin(sampled_pairs)
# A boolean mask keeps merged_df's original row labels.
ir_subset_df = merged_df[is_sampled_pair]

In [70]:
# Inspect the loaded GPT responses (one row per sampled query pair)
nonir_gpt_responses_df

Unnamed: 0,query_hpo_id,query_synonym,gpt_response,gpt_hit_hpo_id,gpt_top_k
0,HP:0032681,Focal seizure with awareness,1,HP:0002349,0
1,HP:0031486,Vascular lesion of the lip,1,HP:0031486,0
2,HP:0009512,Triangular growth at the tip of the second digit,2,HP:0009512,1
3,HP:0012036,Sternocleidomastoid weakness,1,HP:0003722,0
4,HP:0033435,Irregular levels of circulating keto acids,1,HP:0033435,0
...,...,...,...,...,...
95,HP:0033314,Proliferative visceral epithelium,1,HP:0033314,0
96,HP:0025413,Urethral narrowing at the navicular fossa,1,HP:0025413,0
97,HP:0032616,Interstitial kidney immune globulin deposits,1,HP:0032616,0
98,HP:5000033,SOX1 neutralizing antibody,1,HP:5000033,0


In [71]:
# Share of sampled pairs for which the term GPT returned equals the query term
gpt_is_exact = nonir_gpt_responses_df['gpt_hit_hpo_id'].eq(nonir_gpt_responses_df['query_hpo_id'])
gpt_top1_accuracy = gpt_is_exact.mean()
print(f"GPT top 1 accuracy: {gpt_top1_accuracy:.2f}")

GPT top 1 accuracy: 0.82


In [8]:
# Best-ranked IR hit (top_k == 0) of every sampled pair
ir_subset_df_top1 = ir_subset_df.loc[ir_subset_df['top_k'].eq(0)]
# Share of those best-ranked hits that equal the query term
ir_is_exact = ir_subset_df_top1['hit_hpo_id'].eq(ir_subset_df_top1['query_hpo_id'])
ir_top1_accuracy = ir_is_exact.mean()
print(f"IR top 1 accuracy: {ir_top1_accuracy:.2f}")

IR top 1 accuracy: 0.71


# Generate sampled 100 pairs for human evaluation.

In [9]:
# Inspect the IR hits retained for the sampled query pairs
ir_subset_df

Unnamed: 0,query_hpo_id,query_synonym,hit_hpo_id,hit_hpo_name,top_k,distance
14205,HP:0032681,Focal seizure with awareness,HP:0002349,Focal aware seizure,0,0.523132
14206,HP:0032681,Focal seizure with awareness,HP:0032681,Focal aware cognitive seizure,1,0.571581
14207,HP:0032681,Focal seizure with awareness,HP:0032754,Focal aware sensory seizure,2,0.571813
14208,HP:0032681,Focal seizure with awareness,HP:0020217,Focal aware motor seizure,3,0.618296
14209,HP:0032681,Focal seizure with awareness,HP:0032864,Focal aware sensory seizure with auditory feat...,4,0.619292
...,...,...,...,...,...,...
458050,HP:0100556,Unilateral atrophy,HP:0008717,Unilateral renal atrophy,0,0.694840
458051,HP:0100556,Unilateral atrophy,HP:0100557,Hemiatrophy of lower limb,1,0.811439
458052,HP:0100556,Unilateral atrophy,HP:0100558,Hemiatrophy of upper limb,2,0.830153
458053,HP:0100556,Unilateral atrophy,HP:0100556,Hemiatrophy,3,0.848824


In [63]:
# Load the stored results of the two text-embedding-3-large runs (top 5 and
# top 25, per the file names) so that their rankings can be compared.
top5_results_path = './rag_evaluation_query_results_top5_text-embedding-3-large.json'
with open(top5_results_path) as top5_file:
    top5_results = json.load(top5_file)

top25_results_path = './rag_evaluation_query_results_top25_text-embedding-3-large.json'
with open(top25_results_path) as top25_file:
    top25_results = json.load(top25_file)

In [10]:
def query_results_json_to_df(query_results):
    """Flatten nested query results into one DataFrame row per retrieved hit.

    Args:
        query_results: mapping of query HPO id -> list of entries. Every entry
            is a dict with an optional 'synonym' string (default '') and an
            optional 'parsed_results' list of dicts that hold the keys
            'hpo_id', 'hpo_name', 'top_k' and 'distance'.

    Returns:
        pandas.DataFrame with the columns query_hpo_id, query_synonym,
        hit_hpo_id, hit_hpo_name, top_k and distance. The columns are present
        even when there is no hit at all, so downstream merges on them keep
        working for empty input.

    Raises:
        KeyError: if a parsed result lacks one of its four required keys.
    """
    columns = ['query_hpo_id', 'query_synonym', 'hit_hpo_id', 'hit_hpo_name', 'top_k', 'distance']
    rows = []
    for key, value in query_results.items():
        for entry in value:
            synonym = entry.get('synonym', '')
            # `or []` also covers an explicit None stored under 'parsed_results'
            for result in entry.get('parsed_results') or []:
                rows.append({
                    'query_hpo_id': key,
                    'query_synonym': synonym,
                    'hit_hpo_id': result['hpo_id'],
                    'hit_hpo_name': result['hpo_name'],
                    'top_k': result['top_k'],
                    'distance': result['distance']
                })

    # Passing `columns` fixes the schema (and its order) even when `rows` is empty.
    query_results_df = pd.DataFrame(rows, columns=columns)
    return query_results_df


In [64]:
pair_keys = ['query_hpo_id', 'query_synonym']
# Flattened hits of the top-25 run
query_results_top25_df = query_results_json_to_df(top25_results)
# "top-5 prime": the five best-ranked hits (top_k 0-4) taken from the top-25 run
query_results_top5_prime_df = query_results_top25_df[query_results_top25_df['top_k'] < 5]
# Flattened hits of the genuine top-5 run
query_results_top5_df = query_results_json_to_df(top5_results)
# Attach each set of hits to the (term, synonym) pairs of the synonym table
merged_top5_df = pd.merge(expanded_hpo_name_df, query_results_top5_df, on=pair_keys)
merged_top5_prime_df = pd.merge(expanded_hpo_name_df, query_results_top5_prime_df, on=pair_keys)
# The top-25 accuracy cell reads merged_top25_df, which no cell defined; build it
# here from the full top-25 hits so that cell works on a fresh kernel.
merged_top25_df = pd.merge(expanded_hpo_name_df, query_results_top25_df, on=pair_keys)

In [65]:
pair_keys = ['query_hpo_id', 'query_synonym']
# Denominator: number of distinct (term, synonym) pairs in the top-5 run
total_pairs_top5 = merged_top5_df.groupby(pair_keys).ngroups
print(f"Total pairs: {total_pairs_top5}")
# k = 0 restricts the evaluation to the best-ranked hit
k = 0
within_top_k = merged_top5_df['top_k'].le(k)
is_exact_hit = merged_top5_df['hit_hpo_id'].eq(merged_top5_df['query_hpo_id'])
filtered_df_top5 = merged_top5_df.loc[within_top_k & is_exact_hit]
# Number of pairs with an exact match among the retained hits
count_top5 = filtered_df_top5.groupby(pair_keys).ngroups
percentage = (count_top5 / total_pairs_top5) * 100
print(f"Top {k+1} accuracy: {percentage:.2f}%")

Total pairs: 91649
Top 1 accuracy: 59.01%


In [27]:
pair_keys = ['query_hpo_id', 'query_synonym']
# Denominator: number of distinct (term, synonym) pairs in the top-25 run
total_pairs_top25 = merged_top25_df.groupby(pair_keys).ngroups
print(f"Total pairs: {total_pairs_top25}")
# k = 0 restricts the evaluation to the best-ranked hit
k = 0
within_top_k = merged_top25_df['top_k'].le(k)
is_exact_hit = merged_top25_df['hit_hpo_id'].eq(merged_top25_df['query_hpo_id'])
filtered_df_top25 = merged_top25_df.loc[within_top_k & is_exact_hit]
# Number of pairs with an exact match among the retained hits
count_top25 = filtered_df_top25.groupby(pair_keys).ngroups
percentage = (count_top25 / total_pairs_top25) * 100
print(f"Top {k+1} accuracy: {percentage:.2f}%")

Total pairs: 91649
Top 1 accuracy: 60.94%


In [61]:
# Number of pairs whose best-ranked hit is exact in the top-5 run
count_top5

54086

In [62]:
# Number of pairs whose best-ranked hit is exact in the top-25 run
count_top25

55855

In [46]:
# Exact top-1 hits that the top-5 run found but the top-25 run did not:
# in the outer merge below the top-25 frame is the left side and the top-5 frame
# the right side, so 'right_only' marks rows present only in the top-5 frame.
match_cols = ['query_hpo_id', 'query_synonym', 'hit_hpo_id']
# Narrow both frames to the identifying columns before comparing them
filtered_df_top25 = filtered_df_top25[match_cols]
filtered_df_top5 = filtered_df_top5[match_cols]
diff_df = (
    pd.merge(filtered_df_top25, filtered_df_top5, on=match_cols, how='outer', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'right_only']
)
diff_df

Unnamed: 0,query_hpo_id,query_synonym,hit_hpo_id,_merge
190,HP:0000092,Kidney tubular degeneration,HP:0000092,right_only
489,HP:0000262,Elevated cranial structure,HP:0000262,right_only
920,HP:0000545,Poor distance vision,HP:0000545,right_only
1021,HP:0000612,Iris cleft,HP:0000612,right_only
1770,HP:0001050,excess,HP:0001050,right_only
...,...,...,...,...
53432,HP:6000421,Decorative neckbands,HP:6000421,right_only
53566,HP:6000458,Mercaptan abscess,HP:6000458,right_only
55296,HP:6000901,Positive T-cell activation assay for TB infection,HP:6000901,right_only
55917,HP:6001085,Posterior cranial fossa growth,HP:6001085,right_only


In [48]:
# Flattened top-25 rows of HP:0000092, one of the terms listed in diff_df
query_results_top25_df[query_results_top25_df['query_hpo_id'] == 'HP:0000092']

Unnamed: 0,query_hpo_id,query_synonym,hit_hpo_id,hit_hpo_name,top_k,distance
414125,HP:0000092,Kidney tubular degeneration,HP:6000429,Renal tubular karyomegaly,0,0.832372
414126,HP:0000092,Kidney tubular degeneration,HP:0000092,Renal tubular atrophy,1,0.837288
414127,HP:0000092,Kidney tubular degeneration,HP:0008682,Renal tubular epithelial necrosis,2,0.855447
414128,HP:0000092,Kidney tubular degeneration,HP:0000124,Renal tubular dysfunction,3,0.870575
414129,HP:0000092,Kidney tubular degeneration,HP:0032647,Renal tubular epithelial cell apoptosis,4,0.870792
...,...,...,...,...,...,...
414245,HP:0000092,Tubular dysfunction,HP:0001969,Abnormal tubulointerstitial morphology,20,1.004958
414246,HP:0000092,Tubular dysfunction,HP:0033774,Impaired renal tubular reabsorption of uric acid,21,1.008977
414247,HP:0000092,Tubular dysfunction,HP:0041050,Renal tubular cyst,22,1.012497
414248,HP:0000092,Tubular dysfunction,HP:0012491,Abnormal dense tubular system,23,1.018737


In [49]:
# Flattened top-5 rows of the same term, for a side-by-side comparison
query_results_top5_df[query_results_top5_df['query_hpo_id'] == 'HP:0000092']

Unnamed: 0,query_hpo_id,query_synonym,hit_hpo_id,hit_hpo_name,top_k,distance
82825,HP:0000092,Kidney tubular degeneration,HP:0000092,Renal tubular atrophy,0,0.837274
82826,HP:0000092,Kidney tubular degeneration,HP:0008682,Renal tubular epithelial necrosis,1,0.855429
82827,HP:0000092,Kidney tubular degeneration,HP:0000124,Renal tubular dysfunction,2,0.870529
82828,HP:0000092,Kidney tubular degeneration,HP:0032647,Renal tubular epithelial cell apoptosis,3,0.870808
82829,HP:0000092,Kidney tubular degeneration,HP:0032952,Usual-type tubular atrophy,4,0.870852
82830,HP:0000092,Tubular necrosis,HP:0008682,Renal tubular epithelial necrosis,0,0.62868
82831,HP:0000092,Tubular necrosis,HP:0032632,Renal papillary necrosis,1,0.833587
82832,HP:0000092,Tubular necrosis,HP:0032647,Renal tubular epithelial cell apoptosis,2,0.839011
82833,HP:0000092,Tubular necrosis,HP:0025418,Renal cortical necrosis,3,0.858579
82834,HP:0000092,Tubular necrosis,HP:0000092,Renal tubular atrophy,4,0.901397


In [57]:
# Stored top-5 hits of the first synonym entry of HP:0000092
top5_results['HP:0000092'][0]['parsed_results']

[{'hpo_id': 'HP:0000092',
  'hpo_name': 'Renal tubular atrophy',
  'top_k': 0,
  'distance': 0.8372736573219299},
 {'hpo_id': 'HP:0008682',
  'hpo_name': 'Renal tubular epithelial necrosis',
  'top_k': 1,
  'distance': 0.855429470539093},
 {'hpo_id': 'HP:0000124',
  'hpo_name': 'Renal tubular dysfunction',
  'top_k': 2,
  'distance': 0.870528519153595},
 {'hpo_id': 'HP:0032647',
  'hpo_name': 'Renal tubular epithelial cell apoptosis',
  'top_k': 3,
  'distance': 0.8708081841468811},
 {'hpo_id': 'HP:0032952',
  'hpo_name': 'Usual-type tubular atrophy',
  'top_k': 4,
  'distance': 0.8708524703979492}]

In [59]:
# Stored top-25 hits of the first synonym entry of HP:0000092
top25_results['HP:0000092'][0]['parsed_results']

[{'hpo_id': 'HP:6000429',
  'hpo_name': 'Renal tubular karyomegaly',
  'top_k': 0,
  'distance': 0.8323719501495361},
 {'hpo_id': 'HP:0000092',
  'hpo_name': 'Renal tubular atrophy',
  'top_k': 1,
  'distance': 0.8372879028320312},
 {'hpo_id': 'HP:0008682',
  'hpo_name': 'Renal tubular epithelial necrosis',
  'top_k': 2,
  'distance': 0.8554474711418152},
 {'hpo_id': 'HP:0000124',
  'hpo_name': 'Renal tubular dysfunction',
  'top_k': 3,
  'distance': 0.870574951171875},
 {'hpo_id': 'HP:0032647',
  'hpo_name': 'Renal tubular epithelial cell apoptosis',
  'top_k': 4,
  'distance': 0.8707923293113708},
 {'hpo_id': 'HP:0032952',
  'hpo_name': 'Usual-type tubular atrophy',
  'top_k': 5,
  'distance': 0.8709280490875244},
 {'hpo_id': 'HP:0005583',
  'hpo_name': 'Tubular basement membrane disintegration',
  'top_k': 6,
  'distance': 0.8833180069923401},
 {'hpo_id': 'HP:0032595',
  'hpo_name': 'Renal tubular epithelial cell detachment',
  'top_k': 7,
  'distance': 0.8964729309082031},
 {'hpo_i

In [60]:
# Re-run a single query live against the HPO database to check whether asking
# for 5 or for 25 results changes which hits come first.
db_path = '../hpo_chroma_db'
obo_path = '../hp.obo'
whoosh_path = '../hpo_whoosh_db'  # not used below: the database is opened with woosh_path=None
from HpoDatabase import HpoDatabase
hpo_db = HpoDatabase(obo_path=obo_path, db_path=db_path, woosh_path=None, embedding_model='text-embedding-3-large')
synonym = "Kidney tubular degeneration"
# The live results get their own names: assigning them to top5_results /
# top25_results would overwrite the JSON dictionaries loaded from disk that the
# comparison cells read.
live_top5_results = hpo_db.query_hpo(synonym, n_results=5)
top5_parsed_n_results = hpo_db.parse_results_n_results(live_top5_results)
print(top5_parsed_n_results)
live_top25_results = hpo_db.query_hpo(synonym, n_results=25)
top25_parsed_n_results = hpo_db.parse_results_n_results(live_top25_results)
print(top25_parsed_n_results)

    

[{'hpo_id': 'HP:0000092', 'hpo_name': 'Renal tubular atrophy', 'top_k': 0, 'distance': 0.8372879028320312}, {'hpo_id': 'HP:0008682', 'hpo_name': 'Renal tubular epithelial necrosis', 'top_k': 1, 'distance': 0.8554474711418152}, {'hpo_id': 'HP:0000124', 'hpo_name': 'Renal tubular dysfunction', 'top_k': 2, 'distance': 0.870574951171875}, {'hpo_id': 'HP:0032647', 'hpo_name': 'Renal tubular epithelial cell apoptosis', 'top_k': 3, 'distance': 0.8707923293113708}, {'hpo_id': 'HP:0032952', 'hpo_name': 'Usual-type tubular atrophy', 'top_k': 4, 'distance': 0.8709280490875244}]
[{'hpo_id': 'HP:6000429', 'hpo_name': 'Renal tubular karyomegaly', 'top_k': 0, 'distance': 0.8323719501495361}, {'hpo_id': 'HP:0000092', 'hpo_name': 'Renal tubular atrophy', 'top_k': 1, 'distance': 0.8372879028320312}, {'hpo_id': 'HP:0008682', 'hpo_name': 'Renal tubular epithelial necrosis', 'top_k': 2, 'distance': 0.8554474711418152}, {'hpo_id': 'HP:0000124', 'hpo_name': 'Renal tubular dysfunction', 'top_k': 3, 'distance'