In [96]:
import json
from ontology_learner.coordinate_extraction.extraction_utils import get_sorted_coords

In [13]:
import os
from dotenv import load_dotenv
from pathlib import Path
import pandas as pd
import numpy as np
load_dotenv()

datadir = Path(os.getenv('DATADIR'))
print(datadir)

jsondir = datadir / 'json'


/Users/poldrack/Dropbox/data/ontology-learner/data


In [7]:
coord_df_gpt4 = pd.read_csv(datadir / 'coordinate_extraction/coords_df_gpt4.csv')
coord_df_llama3 = pd.read_csv(datadir / 'coordinate_extraction/coords_df_llama3.csv', low_memory=False)


llama2_pmids = set(coord_df_llama3.pmid.unique())
gpt4_pmids = set(coord_df_gpt4.pmid.unique())

common_pmids = llama2_pmids.intersection(gpt4_pmids)
print(len(common_pmids))

# restrict to common pmids
coord_df_llama3 = coord_df_llama3[coord_df_llama3.pmid.isin(common_pmids)]
coord_df_gpt4 = coord_df_gpt4[coord_df_gpt4.pmid.isin(common_pmids)]




2499


In [94]:


def get_matching_coords(llama3_coord, gpt4_coord):
    # return data frames with matching x/y/z values
    llama3_coord_xyz = llama3_coord[['x','y','z']]
    gpt4_coord_xyz = gpt4_coord[['x','y','z']]
    # Columns to compare
    columns_to_compare = ['x','y','z']

    # Find common rows based on the specified columns
    common_values = llama3_coord_xyz.merge(
        gpt4_coord_xyz, on=columns_to_compare)

    # Filter the original DataFrames based on the common values
    llama3_matching = llama3_coord[llama3_coord_xyz.apply(
        tuple, axis=1).isin(common_values.apply(tuple, axis=1))]
    # sometimes there are multiple rows with the same x/y/z values, so we need to drop duplicates
    llama3_matching = llama3_matching.drop_duplicates(subset=columns_to_compare)
    
    gpt4_matching = gpt4_coord[gpt4_coord_xyz[columns_to_compare].apply(
        tuple, axis=1).isin(common_values.apply(tuple, axis=1))]
    gpt4_matching = gpt4_matching.drop_duplicates(subset=columns_to_compare)

    assert(llama3_matching.x.values == gpt4_matching.x.values).all()
    assert(llama3_matching.y.values == gpt4_matching.y.values).all()
    assert(llama3_matching.z.values == gpt4_matching.z.values).all()
    return(llama3_coord.loc[llama3_matching.index], gpt4_coord.loc[gpt4_matching.index])

def compare_coords(llama3_coord, gpt4_coord):
    llama3_coord = llama3_coord[['x','y','z']]
    gpt4_coord = gpt4_coord[['x','y','z']]
    llama3_coord_set = set([tuple(x) for x in llama3_coord.to_numpy().tolist()])
    gpt4_coord_set = set([tuple(x) for x in gpt4_coord.to_numpy().tolist()])

    return(len(llama3_coord_set), len(gpt4_coord_set), len(llama3_coord_set.intersection(gpt4_coord_set)))

def compare_cluster_sizes(llama3_coords, gpt4_coords):
    llama3_matching, gpt4_matching = get_matching_coords(llama3_coords, gpt4_coords)
    llama3_cluster_sizes = llama3_matching['cluster_size']
    gpt4_cluster_sizes = gpt4_matching['cluster_size']

    # get number of non-None cluster sizes for each
    llama3_n_sizes = llama3_cluster_sizes.notna().sum()
    gpt4_n_sizes = gpt4_cluster_sizes.notna().sum()

    same_n_clustsize = llama3_n_sizes == gpt4_n_sizes
    if same_n_clustsize:
        # check if all values of cluster_size are the same in order
        same_clust_sizes = (llama3_cluster_sizes.dropna().values == gpt4_cluster_sizes.dropna().values).all()
    else:
        same_clust_sizes = False
    return(llama3_n_sizes, gpt4_n_sizes, same_n_clustsize, same_clust_sizes)

def compare_stats(llama3_coords, gpt4_coords):
    # compare statistic_type
    llama3_matching, gpt4_matching = get_matching_coords(llama3_coords, gpt4_coords)
    llama3_stat_type = [str(i).lower() for i in llama3_matching['statistic_type']]
    gpt4_stat_type = [str(i).lower() for i in gpt4_matching['statistic_type']]
    same_stat_type = all(i == j for i, j in zip(llama3_stat_type, gpt4_stat_type))
    llama3_stat_value = llama3_matching['statistic_value']
    gpt4_stat_value = gpt4_matching['statistic_value']
    same_stat_value = (llama3_stat_value.values == gpt4_stat_value.values).all()
    return(same_stat_type, same_stat_value)


In [95]:
coord_comparison = []


for pmid in common_pmids:
    llama3_coords = coord_df_llama3[coord_df_llama3.pmid == pmid]
    gpt4_coords = coord_df_gpt4[coord_df_gpt4.pmid == pmid]
    llama3_coords, gpt4_coords = get_sorted_coords(llama3_coords, gpt4_coords)
    coord_comp = compare_coords(llama3_coords, gpt4_coords)
    coord_match = coord_comp[0] == coord_comp[1] and coord_comp[0] == coord_comp[2]
    clust_comp = compare_cluster_sizes(llama3_coords, gpt4_coords)
    stat_comp = compare_stats(llama3_coords, gpt4_coords)
    coord_comparison.append([
        pmid, 
        *coord_comp,
        coord_match,
        *clust_comp,
        *stat_comp
 ])

coord_comparison_df = pd.DataFrame(coord_comparison, columns=['pmid', 'llama3_count', 'gpt4_count', 'coord_intersection_count', 
                                                              'coord_match', 'llama3_n_clustsize', 'gpt4_n_clustsize', 
                                                              'same_n_clustsize', 'same_clust_sizes',
                                                              'same_stat_type', 'same_stat_value'])

In [91]:
coord_comparison_df.mean()


pmid                        3.161610e+06
llama3_count                2.674070e+01
gpt4_count                  2.481192e+01
coord_intersection_count    2.399000e+01
coord_match                 8.207283e-01
llama3_n_clustsize          1.592397e+01
gpt4_n_clustsize            1.527051e+01
same_n_clustsize            9.259704e-01
same_clust_sizes            9.015606e-01
same_stat_type              7.499000e-01
same_stat_value             8.787515e-01
dtype: float64

In [52]:
llama3_coords


Unnamed: 0,x,y,z
49939,-42.0,-28.0,14.0
49946,-42.0,-28.0,14.0
49938,-32.0,-14.0,-14.0
49944,-26.0,-16.0,-20.0
49936,-26.0,2.0,-22.0
49942,-22.0,0.0,-18.0
49941,-22.0,38.0,-12.0
49949,-22.0,40.0,-12.0
49951,-6.0,-38.0,-30.0
49953,-6.0,-38.0,-30.0


In [22]:
np.mean(coord_comparison)

0.0