Convert coordinate extraction results from llama3 into a data frame

In [2]:
import json
from pathlib import Path
from ontology_learner.publication import Publication
from dotenv import load_dotenv
import os

import pandas as pd

# load_dotenv() is called before the environment is read so that any values it
# provides are visible to the os.getenv() calls below.
load_dotenv()
api_key = os.getenv("OPENAI")

# DATADIR is required: every path used below is built from it. Fail with an
# explicit message instead of the opaque TypeError that Path(None) would raise.
datadir_env = os.getenv('DATADIR')
if datadir_env is None:
    raise RuntimeError(
        'DATADIR environment variable is not set; '
        'define it in the environment or in a .env file'
    )
datadir = Path(datadir_env)
print(datadir)

jsondir = datadir / 'json'


def get_messages(prompt):
    """Build the two-message chat payload for a single prompt.

    Args:
        prompt: Text placed in the ``content`` of the user message.

    Returns:
        A list of two dicts with ``role`` and ``content`` keys: a fixed
        system message followed by the user message carrying ``prompt``.
    """
    system_message = {
        "role": "system",
        "content": "You are an expert in neuroimaging research.",
    }
    user_message = dict(role="user", content=prompt)
    return [system_message, user_message]



/Users/poldrack/Dropbox/data/ontology-learner/data


In [3]:
# Read the saved responses. The encoding is given explicitly so that decoding
# does not depend on the platform's default locale encoding.
with open(datadir / 'coordinate_extraction/coord_results_llama3.json', 'r', encoding='utf-8') as f:
    responses = json.load(f)
print(f'loaded {len(responses)} responses from file')


loaded 2499 responses from file


### Process extracted coordinates

In [4]:
# Keep only the entries whose response is non-empty.
coord_results_clean = {}
for response_key, response_value in responses.items():
    if len(response_value) > 0:
        coord_results_clean[response_key] = response_value

print(
    f'found {len(coord_results_clean)} tables with coordinates in {len(responses)} tables'
)



found 2499 tables with coordinates in 2499 tables


In [5]:
# Check that each result really is a list whose elements are all dictionaries.
# Testing only isinstance(result, list) would also count lists of strings or
# numbers, which contradicts the message printed below.
result_is_dict = [
    isinstance(result, list) and all(isinstance(item, dict) for item in result)
    for result in coord_results_clean.values()
]
print(f'{sum(result_is_dict)} results of {len(coord_results_clean)} are lists of dictionaries')

def is_good_coord(coord):
    """Return True when ``coord`` contains all of the keys 'x', 'y' and 'z'.

    Only key membership is tested; the values stored under those keys are
    not inspected.
    """
    return all(axis in coord for axis in ('x', 'y', 'z'))

# Accumulators read by the summary prints and the data frame construction.
all_coords = []                    # every coordinate dictionary encountered
good_coords = []                   # those for which is_good_coord() is true
coords_with_contrast = []
coords_with_cluster_size = []
coords_with_label = []
coords_with_statistic_type = []
coords_with_statistic_value = []
coords_with_coordinate_type = []
coords_list = []                   # one flat record per coordinate dictionary

# Optional fields, each mapped to the list that collects the coordinate
# dictionaries in which that field is present and not None.
coords_by_optional_field = {
    'contrast': coords_with_contrast,
    'cluster_size': coords_with_cluster_size,
    'label': coords_with_label,
    'statistic_type': coords_with_statistic_type,
    'statistic_value': coords_with_statistic_value,
    'coordinate_type': coords_with_coordinate_type,
}

for result_key, coord_dicts in coord_results_clean.items():
    for coord_dict in coord_dicts:
        all_coords.append(coord_dict)

        # Start every record from an all-None template. Previously
        # statistic_type and statistic_value were never reset (a stray
        # `statistic` variable was reset instead), so a coordinate lacking
        # them silently inherited the values of an earlier coordinate.
        record = {
            'pmid': result_key,
            'contrast': None,
            'x': None,
            'y': None,
            'z': None,
            'cluster_size': None,
            'label': None,
            'statistic_type': None,
            'statistic_value': None,
            'coordinate_type': None,
        }

        if is_good_coord(coord_dict):
            good_coords.append(coord_dict)
            record['x'] = coord_dict['x']
            record['y'] = coord_dict['y']
            record['z'] = coord_dict['z']

        for field, matching_coords in coords_by_optional_field.items():
            if field in coord_dict and coord_dict[field] is not None:
                record[field] = coord_dict[field]
                matching_coords.append(coord_dict)

        coords_list.append(record)

# Report how many coordinate dictionaries fell into each bucket, relative to
# the total number of coordinate dictionaries seen.
for description, bucket in [
    ('good coordinates', good_coords),
    ('coordinates with contrast', coords_with_contrast),
    ('coordinates with cluster size', coords_with_cluster_size),
    ('coordinates with label', coords_with_label),
    ('coordinates with statistic type', coords_with_statistic_type),
    ('coordinates with statistic value', coords_with_statistic_value),
    ('coordinates with coordinate type', coords_with_coordinate_type),
]:
    print(f'found {len(bucket)} {description} out of {len(all_coords)} total coordinates')


2499 results of 2499 are lists of dictionaries
found 70557 good coordinates out of 70557 total coordinates
found 70378 coordinates with contrast out of 70557 total coordinates
found 44995 coordinates with cluster size out of 70557 total coordinates
found 69295 coordinates with label out of 70557 total coordinates
found 60404 coordinates with statistic type out of 70557 total coordinates
found 59950 coordinates with statistic value out of 70557 total coordinates
found 64191 coordinates with coordinate type out of 70557 total coordinates


In [6]:
# Build the frame with an explicit column list so the expected columns exist,
# in this order, even when coords_list is empty. Without it an empty list
# yields a frame with no columns and later coords_df['x'] lookups fail.
coords_df = pd.DataFrame(
    coords_list,
    columns=[
        'pmid', 'contrast', 'x', 'y', 'z', 'cluster_size', 'label',
        'statistic_type', 'statistic_value', 'coordinate_type',
    ],
)
coords_df.head()

Unnamed: 0,pmid,contrast,x,y,z,cluster_size,label,statistic_type,statistic_value,coordinate_type
0,166149,Rate Effect,-64.0,-16.0,8.0,386.0,HG,Z,10.0,MNI
1,166149,Rate Effect,-60.0,-4.0,4.0,,PP,Z,7.7,MNI
2,166149,Rate Effect,-44.0,-28.0,12.0,,PT,Z,7.6,MNI
3,166149,Rate Effect,56.0,-8.0,-4.0,430.0,PP,Z,7.1,MNI
4,166149,Rate Effect,64.0,-20.0,12.0,,HG,Z,7.0,MNI


In [7]:
def _is_valid_coord_value(value, limit=200):
    """Return True when ``value`` is a real number within [-limit, limit].

    None, NaN, strings, lists and other non-numeric objects are rejected.
    Booleans are rejected explicitly: ``bool`` is a subclass of ``int``, so a
    plain isinstance(value, (int, float)) test would accept True/False.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return False
    # NaN fails both comparisons, so missing values are dropped here as well.
    return -limit <= value <= limit


full_length = len(coords_df)

# Keep a row only when x, y and z all hold a valid value. The same test is
# applied to each of the three columns, so a list or string in y or z is
# handled exactly like one in x.
valid_rows = pd.Series(True, index=coords_df.index)
for axis in ('x', 'y', 'z'):
    valid_rows = valid_rows & coords_df[axis].map(_is_valid_coord_value).astype(bool)
coords_df = coords_df[valid_rows]


print(f'cleaup removed {full_length - len(coords_df)} rows out of {full_length} ({len(coords_df)} rows remaining)')

coords_df.to_csv(datadir / 'coordinate_extraction/coords_df_llama3.csv', index=False)


cleaup removed 231 rows out of 70557 (70326 rows remaining)
