In [1]:
import pandas as pd
import numpy as np
import math

# Read in the data
df1 = pd.read_csv('./Data/HPO_input/Results/GPT4_top_prediction_RE.csv')
# split columns into multiple columns
df1['dataset'] = df1['file_path'].str.split('/', expand=True)[4]
df2 = pd.read_csv('./Data/simulated_pt_input/Results/GPT4_top_prediction_RE.csv')
df2['dataset'] = df2['file_path'].str.split('/', expand=True)[4]
df1['input'] = 'HPO input'
df2['input'] = 'Simulation HPO input'
# Read in the data
gpt_df = pd.read_csv('./Data/free_text_input/Results/GPT4_top_prediction_RE.csv')
# split columns into multiple columns
# gpt_df['pt_id'] = gpt_df['file_path']
gpt_df['PMID'] = gpt_df['file_path'].apply(lambda x: x.split('_')[0])
probe_info_df = pd.read_csv('./Data/HPO_input/Original_data/probe_info', sep = '\t', header=None)
probe_info_df.columns = ['dataset','pt_id','dx_gene']
probe_info_df['PMID'] = probe_info_df['pt_id'].apply(lambda x: str(x).split('_')[0])
probe_info_df = probe_info_df[['dataset','PMID']].drop_duplicates()
df3 = gpt_df.merge(probe_info_df, how='left')
df3['dataset'] = df3['dataset'].fillna('Others')
df3['input'] = 'free text input'

In [42]:
# format float to 2 decimal places
def prediction_accuracy_summary(df):
    w = df.shape[0]
    x = df[['predict_correct_in_top_5']].values.sum()
    y = df[['predict_made_in_top_10']].values.sum()
    print('{y} out of {w} predictions made in top 5 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
    print('{x} out of {y} predictions correct in top 5 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))

    x = df[['predict_correct_in_top_10']].values.sum()
    y = df[['predict_made_in_top_10']].values.sum()
    print('{y} out of {w} predictions made in top 10 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
    print('{x} out of {y} predictions correct in top 10 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))

    x = df[['predict_correct_in_top_50']].values.sum()
    y = df[['predict_made_in_top_50']].values.sum()
    print('{y} out of {w} predictions made in top 50 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
    print('{x} out of {y} predictions correct in top 50 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))

def prediction_accuracy_summary_by_subset(df):
    results_dict = {}
    # subset df by dataset
    dataset_list = df['dataset'].unique()
    for dataset in dataset_list:
        results_dict[dataset] = []
        print(dataset)
        df_subset = df[df['dataset']==dataset]
        w = df_subset.shape[0]
        x = df_subset[['predict_correct_in_top_5']].values.sum()
        y = df_subset[['predict_made_in_top_5']].values.sum()
        results_dict[dataset].append({'top_5_made': f"{y} / {w} ({np.round(y/w,4)*100:.2f}%)"})
        results_dict[dataset].append({'top_5_correct': f"{x} / {y} ({np.round(x/y,4)*100:.2f}%)"})
        print('{y} out of {w} predictions made in top 5 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
        print('{x} out of {y} predictions correct in top 5 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))

        x = df_subset[['predict_correct_in_top_10']].values.sum()
        y = df_subset[['predict_made_in_top_10']].values.sum()
        results_dict[dataset].append({'top_10_made': f"{y} / {w} ({np.round(y/w,4)*100:.2f}%)"})
        results_dict[dataset].append({'top_10_correct': f"{x} / {y} ({np.round(x/y,4)*100:.2f}%)"})
        print('{y} out of {w} predictions made in top 10 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
        print('{x} out of {y} predictions correct in top 10 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))

        x = df_subset[['predict_correct_in_top_50']].values.sum()
        y = df_subset[['predict_made_in_top_50']].values.sum()
        results_dict[dataset].append({'top_50_made': f"{y} / {w} ({np.round(y/w,4)*100:.2f}%)"})
        results_dict[dataset].append({'top_50_correct': f"{x} / {y} ({np.round(x/y,4)*100:.2f}%)"})
        print('{y} out of {w} predictions made in top 50 ({z}%)'.format(y=y, w=w, z=np.round(y/w,4)*100))
        print('{x} out of {y} predictions correct in top 50 ({z}%)'.format(x=x, y=y, z=np.round(x/y,4)*100))
    return results_dict

In [43]:
result_list = []
for df in [df1, df2, df3]:
    results_dict = {}
    print(f"dataset: {df['input'].tolist()[0]}")
    results_dict['input'] = df['input'].tolist()[0]
    prediction_accuracy_summary(df)
    results_dict['results'] = prediction_accuracy_summary_by_subset(df)
    result_list.append(results_dict)



dataset: HPO input
266 out of 276 predictions made in top 5 (96.38%)
28 out of 266 predictions correct in top 5 (10.530000000000001%)
266 out of 276 predictions made in top 10 (96.38%)
36 out of 266 predictions correct in top 10 (13.530000000000001%)
206 out of 276 predictions made in top 50 (74.64%)
35 out of 206 predictions correct in top 50 (16.99%)
ColumbiaU
27 out of 27 predictions made in top 5 (100.0%)
6 out of 27 predictions correct in top 5 (22.220000000000002%)
27 out of 27 predictions made in top 10 (100.0%)
7 out of 27 predictions correct in top 10 (25.929999999999996%)
22 out of 27 predictions made in top 50 (81.47999999999999%)
6 out of 22 predictions correct in top 50 (27.27%)
CSH
72 out of 72 predictions made in top 5 (100.0%)
12 out of 72 predictions correct in top 5 (16.669999999999998%)
68 out of 72 predictions made in top 10 (94.44%)
14 out of 68 predictions correct in top 10 (20.59%)
53 out of 72 predictions made in top 50 (73.61%)
15 out of 53 predictions correct 

In [44]:
# nested_dict_list is assumed to be available
df_list = []

for item in result_list:
    input_val = item['input']
    results_dict = item['results']
    for dataset, dataset_result_list in results_dict.items():
        for dataset_item in dataset_result_list:
            for top, value in dataset_item.items():
                row_dict = {'input': input_val, 'dataset': dataset, 'top': top, 'value': value}
                df_list.append(row_dict)
df = pd.DataFrame(df_list)
wide_df = df.pivot(index=['input','dataset'], columns='top', values='value').reset_index().sort_values(by=['dataset','input'])
wide_df

top,input,dataset,top_10_correct,top_10_made,top_50_correct,top_50_made,top_5_correct,top_5_made
0,HPO input,AJHG,0 / 76 (0.00%),76 / 78 (97.44%),1 / 52 (1.92%),52 / 78 (66.67%),0 / 78 (0.00%),78 / 78 (100.00%)
5,Simulation HPO input,AJHG,0 / 78 (0.00%),78 / 78 (100.00%),0 / 31 (0.00%),31 / 78 (39.74%),0 / 78 (0.00%),78 / 78 (100.00%)
10,free text input,AJHG,0 / 72 (0.00%),72 / 72 (100.00%),0 / 45 (0.00%),45 / 72 (62.50%),0 / 72 (0.00%),72 / 72 (100.00%)
1,HPO input,CSH,14 / 68 (20.59%),68 / 72 (94.44%),15 / 53 (28.30%),53 / 72 (73.61%),12 / 72 (16.67%),72 / 72 (100.00%)
6,Simulation HPO input,CSH,1 / 72 (1.39%),72 / 72 (100.00%),1 / 27 (3.70%),27 / 72 (37.50%),0 / 72 (0.00%),72 / 72 (100.00%)
11,free text input,CSH,11 / 49 (22.45%),49 / 49 (100.00%),11 / 31 (35.48%),31 / 49 (63.27%),12 / 49 (24.49%),49 / 49 (100.00%)
2,HPO input,ColumbiaU,7 / 27 (25.93%),27 / 27 (100.00%),6 / 22 (27.27%),22 / 27 (81.48%),6 / 27 (22.22%),27 / 27 (100.00%)
7,Simulation HPO input,ColumbiaU,0 / 27 (0.00%),27 / 27 (100.00%),0 / 8 (0.00%),8 / 27 (29.63%),0 / 27 (0.00%),27 / 27 (100.00%)
3,HPO input,DGD,15 / 82 (18.29%),82 / 85 (96.47%),13 / 67 (19.40%),67 / 85 (78.82%),10 / 85 (11.76%),85 / 85 (100.00%)
8,Simulation HPO input,DGD,1 / 85 (1.18%),85 / 85 (100.00%),0 / 20 (0.00%),20 / 85 (23.53%),0 / 84 (0.00%),84 / 85 (98.82%)


In [51]:
def probability_of_picking_specific_card(total_cards, specific_cards, draws):
    probability_not_picking = 1

    for i in range(draws):
        non_specific_cards = total_cards - specific_cards - i
        remaining_cards = total_cards - i
        probability_not_picking *= non_specific_cards / remaining_cards

    probability_picking = 1 - probability_not_picking
    return probability_picking
# calculate the probability of select the correct gene in top 5

total_cards = 30000
specific_cards = 1
# what is the probability of selecting the correct gene in top 5
draws = 5
probability = probability_of_picking_specific_card(total_cards, specific_cards, draws)
print('Probability of selecting the correct gene in top 5: {}%'.format(probability*100))
draws = 10
probability = probability_of_picking_specific_card(total_cards, specific_cards, draws)
print('Probability of selecting the correct gene in top 10: {}%'.format(probability*100))
draws = 50
probability = probability_of_picking_specific_card(total_cards, specific_cards, draws)
print('Probability of selecting the correct gene in top 50: {}%'.format(probability*100))

Probability of selecting the correct gene in top 5: 0.01666666666666483%
Probability of selecting the correct gene in top 10: 0.033333333333340764%
Probability of selecting the correct gene in top 50: 0.16666666666667052%


In [52]:
# This table is generated using ChatGPT-3

| Dataset | Top 5 Predictions | Top 5 Correct Predictions | Top 10 Predictions | Top 10 Correct Predictions | Top 50 Predictions | Top 50 Correct Predictions |
|---------|------------------|--------------------------|--------------------|----------------------------|--------------------|----------------------------|
| True HPO input|
| Overall | 266/276 (96.38%) | 28/266 (10.53%) | 266/276 (96.38%) | 36/266 (13.53%) | 206/276 (74.64%) | 35/206 (16.99%) |
| ColumbiaU | 27/27 (100.00%) | 6/27 (22.22%) | 27/27 (100.00%) | 7/27 (25.93%) | 22/27 (81.48%) | 6/22 (27.27%) |
| CSH | 72/72 (100.00%) | 12/72 (16.67%) | 68/72 (94.44%) | 14/68 (20.59%) | 53/72 (73.61%) | 15/53 (28.30%) |
| AJHG | 78/78 (100.00%) | 0/78 (0.00%) | 76/78 (97.44%) | 0/76 (0.00%) | 52/78 (66.67%) | 1/52 (1.92%) |
| DGD | 85/85 (100.00%) | 10/85 (11.76%) | 82/85 (96.47%) | 15/82 (18.29%) | 67/85 (78.82%) | 13/67 (19.40%) |
| TAF1 | 14/14 (100.00%) | 0/14 (0.00%) | 13/14 (92.86%) | 0/13 (0.00%) | 12/14 (85.71%) | 0/12 (0.00%) |
| Simulated HPO input    |
| Overall | 274/276 (99.28%) | 26/274 (9.49%) | 274/276 (99.28%) | 32/274 (11.68%) | 208/276 (75.36%) | 45/208 (21.63%) |
| ColumbiaU | 27/27 (100.00%) | 5/27 (18.52%) | 27/27 (100.00%) | 6/27 (22.22%) | 23/27 (85.19%) | 7/23 (30.43%) |
| CSH | 72/72 (100.00%) | 11/72 (15.28%) | 72/72 (100.00%) | 13/72 (18.06%) | 56/72 (77.78%) | 19/56 (33.93%) |
| AJHG | 78/78 (100.00%) | 0/78 (0.00%) | 78/78 (100.00%) | 0/78 (0.00%) | 48/78 (61.54%) | 0/48 (0.00%) |
| DGD | 82/85 (96.47%) | 10/82 (12.20%) | 83/85 (97.65%) | 13/83 (15.66%) | 72/85 (84.71%) | 19/72 (26.39%) |
| TAF1 | 13/14 (92.86%) | 0/13 (0.00%) | 14/14 (100.00%) | 0/14 (0.00%) | 9/14 (64.29%) | 0/9 (0.00%) |
| Free text input |
| Overall | 125/125 (100.00%) | 13/125 (10.40%) | 125/125 (100.00%) | 12/125 (9.60%) | 79/125 (63.20%) | 12/79 (15.19%) |
| AJHG | 72/72 (100.00%) | 0/72 (0.00%) | 72/72 (100.00%) | 0/72 (0.00%) | 45/72 (62.50%) | 0/45 (0.00%) |
| CSH | 49/49 (100.00%) | 12/49 (24.49%) | 49/49 (100.00%) | 11/49 (22.45%) | 31/49 (63.27%) | 11/31 (35.48%) |
| Others | 4/4 (100.00%) | 1/4 (25.00%) | 4/4 (100.00%) | 1/4 (25.00%) | 3/4 (75.00%) | 1/3 (33.33%) |