In [14]:
# Standard-library and third-party imports used throughout the notebook
import sys
import os
import json
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# turn off DeprecationWarning so deprecation notices do not clutter the cell outputs
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
# make modules in the parent directory importable
# (presumably where HpoFactory, imported in a later cell, lives -- TODO confirm)
sys.path.append('..')
# path of the OBO file handed to HpoFactory when the HPO ontology is loaded
obo_path = '../hp.obo'

In [15]:
# load all synonyms generated by GPT-4o-mini
# The location can be overridden with the HPO_SYNONYM_FOLDER environment variable;
# the original author's local path is kept as the default for backward compatibility.
synonym_folder = os.environ.get(
    'HPO_SYNONYM_FOLDER',
    '/Users/cl3720/Desktop/hpo-parser-fine-tuned-gemini/hpo_synonyms/gpt-4o-mini-2024-07-18',
)


def load_synonym_folder(folder):
    """Merge every synonym JSON file found in ``folder`` into one dictionary.

    Each file is expected to hold a JSON object of the form
    ``{"HP:XXXX": ["synonym1", "synonym2", "synonym3"]}``.

    Parameters
    ----------
    folder : str
        Directory containing the synonym files.

    Returns
    -------
    dict
        Mapping of HPO id to its list of synonyms. Empty when ``folder`` does
        not exist (a warning is emitted so the empty result is not silent).
    """
    synonyms = {}
    if not os.path.isdir(folder):
        warnings.warn(f"Synonym folder not found, no synonyms loaded: {folder}")
        return synonyms
    # sorted() makes the merge order (and therefore which file wins on a
    # duplicated HPO id) reproducible; os.listdir order is arbitrary.
    for file_name in sorted(os.listdir(folder)):
        file_path = os.path.join(folder, file_name)
        # skip hidden entries (.DS_Store, .ipynb_checkpoints) and sub-directories,
        # which would otherwise make open()/json.load fail
        if file_name.startswith('.') or not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding="utf-8") as f:
            synonyms.update(json.load(f))
    return synonyms


expanded_hpo_name_dict = load_synonym_folder(synonym_folder)
print(f"There are {len(expanded_hpo_name_dict)} HPO terms with synonyms")
# load the HPO ontology
from HpoFactory import HpoFactory
hpoF = HpoFactory(obo_path)
hpo_tree = hpoF.build_hpo_tree()
hpo_ancestors = hpoF.get_hpo_ancestors(hpo_tree)
hpo_levels = hpoF.get_hpo_levels(hpo_tree)
# One row per HPO term: the 'ancestor' list holds the term itself followed by its
# ancestors, and the id column is named 'hit_hpo_id' so the frame can be joined
# onto retrieval hits.
hpo_ancestors_df = pd.DataFrame(
    [
        (term_id, [term_id] + term_ancestors)
        for term_id, term_ancestors in hpo_ancestors.items()
    ],
    columns=['hit_hpo_id', 'ancestor'],
)

There are 18330 HPO terms with synonyms


### Performance in Top 1 exactly matched results

In [11]:
# Read the top-1 query results (whoosh run, per the file name) into `query_results`.
result_json = './rag_evaluation_query_results_top1_whoosh.json'
with open(result_json, 'r') as results_file:
    query_results = json.load(results_file)

# Disabled alternative, kept for reference: derive the top-1 hits from the top-5
# embedding results instead (the top-5 file also contains the top-1 hit).
# result_json = './rag_evaluation_query_results_top5_text-embedding-3-large.json'
# with open(result_json, 'r') as f:
#     query_results = json.load(f)
#     # convert to top 1 results
#     query_results = {k: [v[i]['parsed_results'][0]['hpo_id'] if i < len(v) else None for i in range(5)] for k, v in query_results.items()}
#     # NOTE: for some reason, the query results obtained this way are not the same.

In [12]:
# Pair every synonym of a query HPO term with the hit returned for it, then score
# exact matches (hit id == query id).
# convert expanded_hpo_name_dict into pandas dataframe (one row per query HPO id)
expanded_hpo_name_df = pd.DataFrame(expanded_hpo_name_dict.items(), columns=['query_hpo_id', 'synonyms'])
# convert query_results into pandas dataframe (one row per query HPO id, list of hits)
query_results_df = pd.DataFrame(query_results.items(), columns=['query_hpo_id', 'hit_hpo_id'])
# inner merge: only query ids present in both the synonym dict and the results are kept
merged_df = pd.merge(expanded_hpo_name_df, query_results_df, on='query_hpo_id')
# expand the parallel synonym / hit lists into one row per (synonym, hit) pair
merged_df = merged_df.explode(['synonyms','hit_hpo_id'])
# a missing hit (None) never equals the query id, so it counts as a miss
is_exact_hit = merged_df['query_hpo_id'] == merged_df['hit_hpo_id']
# overall accuracy: share of (synonym, hit) rows where the hit is the query term
overall_accuracy = is_exact_hit.mean()
# round 2 decimal places
print(f"Overall accuracy: {overall_accuracy:.2f}")
# Number of exact hits per query term, computed once instead of re-running a
# per-group lambda for every threshold. Grouping by the raw values avoids any
# index alignment on the duplicated index left behind by explode().
exact_hits_per_query = is_exact_hit.groupby(merged_df['query_hpo_id'].to_numpy()).sum()
# share of query terms with at least i exact hits among their synonyms
for at_least_i_hit in range(1, 6):
    at_least_x_hit_accuracy = (exact_hits_per_query >= at_least_i_hit).mean()
    # '.6f' states the six-decimal output explicitly (the former '1f' was a width spec)
    print(f"At least {at_least_i_hit} hit accuracy: {at_least_x_hit_accuracy:.6f}")

Overall accuracy: 0.06
At least 1 hit accuracy: 0.218603
At least 2 hit accuracy: 0.054610
At least 3 hit accuracy: 0.012220
At least 4 hit accuracy: 0.002728
At least 5 hit accuracy: 0.000327


In [13]:
# Inspect the raw query results: one row per query HPO id, with the list of hit
# HPO ids returned for it (None where no hit was returned).
query_results_df

Unnamed: 0,query_hpo_id,hit_hpo_id
0,HP:0030429,"[None, None, HP:0030429, None, None]"
1,HP:0004875,"[None, None, None, None, None]"
2,HP:0032414,"[HP:0031991, None, None, None, None]"
3,HP:5200239,"[None, None, None, None, None]"
4,HP:0100607,"[None, HP:0032149, None, HP:0003710, None]"
...,...,...
18325,HP:0030778,"[HP:0030778, None, None, None, None]"
18326,HP:0011025,"[None, None, None, None, None]"
18327,HP:0008204,"[None, None, None, None, None]"
18328,HP:0030282,"[None, None, None, None, None]"


### Performance in Top 1 descendants matched results

In [18]:
# accuracy counted by ancestors
# Attach to every hit the list "hit term + its ancestors". The right join keeps
# every query/hit row, so hits without an entry in hpo_ancestors_df (including
# missing hits) get a NaN 'ancestor'.
merged_ancestor_df = hpo_ancestors_df.merge(merged_df, on='hit_hpo_id', how='right')
# Flag rows where the query term is the hit itself or one of the hit's ancestors,
# i.e. the hit is the query term or a descendant of it. The isinstance check
# guards against NaN ancestors (TypeError: argument of type 'float' is not iterable).
merged_ancestor_df['query_is_ancestor'] = [
    isinstance(ancestors, (list, set)) and query_id in ancestors
    for query_id, ancestors in zip(merged_ancestor_df['query_hpo_id'], merged_ancestor_df['ancestor'])
]
# accuracy by ancestors
ancestor_accuracy = merged_ancestor_df['query_is_ancestor'].mean()
# '.6f' states the six-decimal output explicitly (the former '2f' was a width spec)
print(f"Accuracy by ancestors: {ancestor_accuracy:.6f}")
# number of ancestor-level hits per query term, computed once for all thresholds
ancestor_hits_per_query = merged_ancestor_df.groupby('query_hpo_id')['query_is_ancestor'].sum()
# share of query terms with at least i ancestor-level hits among their synonyms
for at_least_i_hit in range(1, 6):
    at_least_x_hit_accuracy = (ancestor_hits_per_query >= at_least_i_hit).mean()
    print(f"At least {at_least_i_hit} hit accuracy by ancestors: {at_least_x_hit_accuracy:.6f}")

Accuracy by ancestors: 0.639356
At least 1 hit accuracy by ancestors: 0.911348
At least 2 hit accuracy by ancestors: 0.797054
At least 3 hit accuracy by ancestors: 0.660011
At least 4 hit accuracy by ancestors: 0.503219
At least 5 hit accuracy by ancestors: 0.325150


### Performance in Top 5 exactly matched results

In [16]:
# Read the embedding-based query results (top-25 hits per synonym, per the file
# name) into `query_results`; the top-k accuracies below are derived from them.
result_json = './rag_evaluation_query_results_top25_text-embedding-3-large.json'
with open(result_json, 'r') as results_file:
    query_results = json.load(results_file)

In [17]:
# Top-k exact-match accuracy: for each (query HPO id, synonym) pair, is the query
# term among the first k+1 hits?
pair_keys = ['query_hpo_id', 'query_synonym']
# one row per (query HPO id, synonym)
expanded_hpo_name_df = (
    pd.DataFrame(expanded_hpo_name_dict.items(), columns=['query_hpo_id', 'synonyms'])
    .explode('synonyms')
    .rename(columns={'synonyms': 'query_synonym'})
)
# Flatten the nested results: one row per (query HPO id, synonym, returned hit)
rows = [
    {
        'query_hpo_id': key,
        'query_synonym': entry.get('synonym', ''),
        'hit_hpo_id': result['hpo_id'],
        'hit_hpo_name': result['hpo_name'],
        'top_k': result['top_k'],
        'distance': result['distance']
    }
    for key, value in query_results.items()
    for entry in value
    for result in entry.get('parsed_results', [])
]
query_results_df = pd.DataFrame(rows)
# inner merge: pairs without any returned hit drop out of merged_df and therefore
# out of the denominator below
merged_df = pd.merge(expanded_hpo_name_df, query_results_df, on=pair_keys)
# denominator: number of distinct (query HPO id, synonym) pairs with results
total_pairs = merged_df.groupby(pair_keys).ngroups
# Best (smallest) 0-based rank at which each pair hit its own query term. Computed
# once; a pair counts for top k+1 exactly when this rank is <= k, which matches
# re-filtering and re-grouping the frame for every k.
exact_hits_df = merged_df[merged_df['query_hpo_id'] == merged_df['hit_hpo_id']]
best_exact_rank = exact_hits_df.groupby(pair_keys)['top_k'].min()
for k in range(25):
    # number of pairs whose query term appears within the first k+1 hits
    count = int((best_exact_rank <= k).sum())
    # Calculate the percentage
    percentage = (count / total_pairs) * 100
    print(f"Top {k+1} accuracy: {percentage:.2f}%")

Top 1 accuracy: 60.94%
Top 2 accuracy: 72.79%
Top 3 accuracy: 78.40%
Top 4 accuracy: 81.81%
Top 5 accuracy: 84.11%


### Performance in Top 5 descendants matched results

In [8]:
# accuracy counted by ancestors
pair_keys = ['query_hpo_id', 'query_synonym']
# Attach to every hit the list "hit term + its ancestors"; the right join keeps
# every row of merged_df even when the hit has no entry in hpo_ancestors_df.
merged_ancestor_df = hpo_ancestors_df.merge(merged_df, on='hit_hpo_id', how='right')
# Flag rows where the query term is the hit itself or one of the hit's ancestors,
# i.e. the hit is the query term or a descendant of it. The isinstance check
# guards against NaN ancestors left by the right join.
merged_ancestor_df['query_is_ancestor'] = [
    isinstance(ancestors, (list, set)) and query_id in ancestors
    for query_id, ancestors in zip(merged_ancestor_df['query_hpo_id'], merged_ancestor_df['ancestor'])
]
# denominator: number of distinct (query HPO id, synonym) pairs
total_pairs = merged_ancestor_df.groupby(pair_keys).ngroups
print(f"Total pairs: {total_pairs}")
# Best (smallest) 0-based rank at which each pair got an ancestor-level hit,
# computed once; a pair counts for top k+1 exactly when this rank is <= k.
ancestor_hits_df = merged_ancestor_df[merged_ancestor_df['query_is_ancestor']]
best_ancestor_rank = ancestor_hits_df.groupby(pair_keys)['top_k'].min()
for k in range(5):
    # number of pairs with an ancestor-level hit within the first k+1 results
    count = int((best_ancestor_rank <= k).sum())
    # Calculate the percentage
    percentage = (count / total_pairs) * 100
    print(f"Top {k+1} accuracy: {percentage:.2f}%")

Total pairs: 91649
Top 1 accuracy: 63.94%
Top 2 accuracy: 74.24%
Top 3 accuracy: 78.89%
Top 4 accuracy: 81.79%
Top 5 accuracy: 83.72%


In [21]:
# plot distance distribution by query_is_ancestor
from scipy.stats import mannwhitneyu, ttest_ind

# merged_ancestor_df is built by more than one cell in this notebook, and only
# the version derived from the top-k embedding results carries 'distance'.
# Fail with an actionable message instead of a bare indexing error.
if 'distance' not in merged_ancestor_df.columns:
    raise KeyError(
        "merged_ancestor_df has no 'distance' column; re-run the top-k cells that "
        "rebuild merged_df and merged_ancestor_df before running this cell"
    )
# .assign() returns a new frame, so the string cast never writes into a slice of
# merged_ancestor_df (which triggered SettingWithCopyWarning)
for_plot_df = merged_ancestor_df[['query_is_ancestor', 'distance']].assign(
    query_is_ancestor=lambda frame: frame['query_is_ancestor'].astype(str)
)
sns.violinplot(x='query_is_ancestor', y='distance', data=for_plot_df)
plt.show()
# split the distances by whether the hit matched at ancestor level
query_is_ancestor = for_plot_df.query('query_is_ancestor == "True"')['distance']
query_is_not_ancestor = for_plot_df.query('query_is_ancestor == "False"')['distance']
# print summary statistics
print(query_is_ancestor.describe())
print(query_is_not_ancestor.describe())
# do t test to show the differences
t_test = ttest_ind(query_is_ancestor, query_is_not_ancestor)
print(t_test)
# do mannwhitneyu test to show the differences
mannwhitneyu_test = mannwhitneyu(query_is_ancestor, query_is_not_ancestor)
print(mannwhitneyu_test)

KeyError: "['distance'] not in index"

### Compare with GPT-4's reranking

In [6]:
# load the GPT responses for all sampled (query HPO id, synonym) pairs
nonir_gpt_response_folder = './nonir_gpt_response'
# loop through all files in the folder (sorted so the row order is reproducible)
all_nonir_gpt_responses = []
for file in sorted(os.listdir(nonir_gpt_response_folder)):
    file_path = os.path.join(nonir_gpt_response_folder, file)
    # skip hidden entries (.DS_Store, .ipynb_checkpoints) and sub-directories,
    # which would otherwise make open()/json.load fail
    if file.startswith('.') or not os.path.isfile(file_path):
        continue
    with open(file_path, "r") as f:
        all_nonir_gpt_responses.append(json.load(f))
# convert to pandas dataframe
nonir_gpt_responses_df = pd.DataFrame(all_nonir_gpt_responses)
# shift gpt_response down by one; presumably it is a 1-based choice and this
# aligns it with the 0-based top_k of the retrieval results -- TODO confirm
nonir_gpt_responses_df['gpt_top_k'] = nonir_gpt_responses_df['gpt_response'] - 1
# Subset merged_df to exactly the sampled (query_hpo_id, query_synonym) pairs.
# Matching on the pair (instead of two independent isin() filters) prevents rows
# whose id and synonym each occur in the sample, but in different pairs, from
# slipping in. Boolean-mask indexing keeps merged_df's original index.
sampled_pairs = pd.MultiIndex.from_frame(nonir_gpt_responses_df[['query_hpo_id', 'query_synonym']])
merged_pairs = pd.MultiIndex.from_frame(merged_df[['query_hpo_id', 'query_synonym']])
ir_subset_df = merged_df[merged_pairs.isin(sampled_pairs)]

In [7]:
# GPT top 1 accuracy: share of sampled pairs where the HPO id chosen by GPT
# equals the query HPO id
gpt_is_correct = nonir_gpt_responses_df['gpt_hit_hpo_id'].eq(nonir_gpt_responses_df['query_hpo_id'])
gpt_top1_accuracy = gpt_is_correct.mean()
print(f"GPT top 1 accuracy: {gpt_top1_accuracy:.2f}")

GPT top 1 accuracy: 0.82


In [8]:
# IR top 1 accuracy: keep only the first-ranked hit (top_k == 0) of every sampled
# pair and measure how often it is the query HPO id itself
ir_subset_df_top1 = ir_subset_df.loc[ir_subset_df['top_k'].eq(0)]
ir_is_correct = ir_subset_df_top1['hit_hpo_id'].eq(ir_subset_df_top1['query_hpo_id'])
ir_top1_accuracy = ir_is_correct.mean()
print(f"IR top 1 accuracy: {ir_top1_accuracy:.2f}")

IR top 1 accuracy: 0.71


# Generate 100 sampled pairs for human evaluation

In [9]:
# Inspect the retrieval hits kept for the sampled pairs: one row per returned hit,
# with its 0-based rank (top_k) and distance.
ir_subset_df

Unnamed: 0,query_hpo_id,query_synonym,hit_hpo_id,hit_hpo_name,top_k,distance
14205,HP:0032681,Focal seizure with awareness,HP:0002349,Focal aware seizure,0,0.523132
14206,HP:0032681,Focal seizure with awareness,HP:0032681,Focal aware cognitive seizure,1,0.571581
14207,HP:0032681,Focal seizure with awareness,HP:0032754,Focal aware sensory seizure,2,0.571813
14208,HP:0032681,Focal seizure with awareness,HP:0020217,Focal aware motor seizure,3,0.618296
14209,HP:0032681,Focal seizure with awareness,HP:0032864,Focal aware sensory seizure with auditory feat...,4,0.619292
...,...,...,...,...,...,...
458050,HP:0100556,Unilateral atrophy,HP:0008717,Unilateral renal atrophy,0,0.694840
458051,HP:0100556,Unilateral atrophy,HP:0100557,Hemiatrophy of lower limb,1,0.811439
458052,HP:0100556,Unilateral atrophy,HP:0100558,Hemiatrophy of upper limb,2,0.830153
458053,HP:0100556,Unilateral atrophy,HP:0100556,Hemiatrophy,3,0.848824
