## Ontology learning using GPT-4

Used GPT-4-o1-preview to generate an initial ontology.  See notes in [README_gpt4.md]().

Results from this were stored in [chatgpt/gpt4_task_ontology.json]().

In [42]:
#autoreload
%load_ext autoreload
%autoreload 2

import json
import os
from openai import OpenAI
from dotenv import load_dotenv
from ontology_learner.gpt4_batch_utils import get_batch_results, save_batch_results
from ontology_learner.json_utils import load_jsonl, parse_jsonl_results
from llm_query.chat_client import ChatClientFactory
from tqdm import tqdm
from pathlib import Path
import fasttext
import time
import hashlib
import secrets
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from collections import defaultdict
from ontology_learner.utils import scale_df, generate_random_hash
from gpt_term_mining import (
    clean_task_name,
    clean_task_ontology,
    get_construct_task_dict_from_task_ontology,
    get_construct_prompt,
    mk_batch_script,
    run_batch_request,
    wait_for_batch_completion,
    get_main_construct_dict,
    get_task_prompt,
    get_task_cluster_prompt,
    get_task_singleton_prompt)

# Load environment variables from .env file
load_dotenv()

datadir = Path(os.getenv('DATADIR'))
print(datadir)




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/Users/poldrack/Dropbox/data/ontology-learner/data


### Load task ontology results generated using the ChatGPT console

In [43]:
with open(datadir / 'gpt4/chatgpt/gpt4_o1-preview_task_ontology.json') as f:
    task_ontology = json.load(f)

print(len(task_ontology.keys()))

144


Clean up the ontology - make lower case and move acronyms into a separate list

In [44]:
ontology_clean = clean_task_ontology(task_ontology)

Extract all of the constructs from the task annotation for further annotation.

In [45]:
construct_task_dict = get_construct_task_dict_from_task_ontology(ontology_clean)

# save json list of constructs

with open(datadir / 'gpt4/gpt4_intitial_construct_list.json', 'w') as f:
    json.dump(list(construct_task_dict.keys()), f, indent=2)

This list was then fed into GPT-4-o1-preview via ChatGPT to expand the construct list.

the result was saved to [chatgpt/gpt4_o1-preview_expanded_constructs.json]().

In [46]:
with open(datadir / 'gpt4/chatgpt/gpt4_o1-preview_expanded_constructs.json') as f:
    expanded_constructs = json.load(f)

print(f'{len(construct_task_dict)} constructs before expansion')

expanded_constructs = list(set(expanded_constructs))
print(f'{len(expanded_constructs)} constructs after expansion')


186 constructs before expansion
807 constructs after expansion


I tried to further expand these but the chatgpt interface wouldn't do it due to the length of the list, so we then move to the API.  We also switch to GPT-4o due to cost of o1-preview.

### Construct refinement using GPT-4o

create batch submission using the construct expansion prompt

In [47]:

construct_refinement_dir = datadir / 'gpt4/construct_refinement_results'
construct_refinement_dir.mkdir(exist_ok=True, parents=True)
construct_refinement_result_file = list(construct_refinement_dir.glob('*.jsonl'))[0]
batchfile = datadir / 'gpt4/gpt4_construct_expansion_batch.jsonl'

if not construct_refinement_result_file.exists():
    mk_batch_script(batchfile, expanded_constructs, get_construct_prompt)
    batch_metadata = run_batch_request(batchfile)

    wait_for_batch_completion(batch_metadata)
    batch_results = get_batch_results(batch_client, batch_metadata.id)
    outfile = save_batch_results(batch_results, batch_metadata.id, construct_refinement_dir)

Load construct refinement results

In [48]:


construct_refinement_result_file = get_jsonl_file(construct_refinement_dir)

construct_refinement_results = parse_jsonl_results(load_jsonl(construct_refinement_result_file))

# exclude non-constructs
construct_refinement_results = {k: v for k, v in construct_refinement_results.items() if v['type'] == 'construct'}
print(f'{len(construct_refinement_results)} constructs after refinement')

741 constructs after refinement


Create a main list of constructs by combining the construct refinement result keys with any related constructs that are not in the main list




In [49]:

main_construct_dict = get_main_construct_dict(construct_refinement_results, construct_task_dict)

print(f'{len(main_construct_dict)} constructs after inclusion of task results')

1524 constructs after inclusion of task results


Some of the constructs (those identified in the last round of refinement) will not yet be annotated, so we identify and annotate those.

In [50]:
unannotated_constructs = [k for k, v in main_construct_dict.items() if len(v) == 0]
print(f'{len(unannotated_constructs)} constructs to be annotated')


783 constructs to be annotated


In [51]:
construct_expansion_unannotated_dir = datadir / 'gpt4/construct_expansion_unannotated_results'
construct_expansion_unannotated_dir.mkdir(exist_ok=True, parents=True)
batchfile = datadir / 'gpt4/gpt4_construct_expansion_unannotated_batch.jsonl'

try:
    construct_expansion_result_file = get_jsonl_file(construct_expansion_unannotated_dir)
except FileNotFoundError:
    mk_batch_script(batchfile, unannotated_constructs, get_construct_prompt)

    batch_metadata, batch_client = run_batch_request(batchfile)
    wait_for_batch_completion(batch_metadata, batch_client)

    batch_results = get_batch_results(batch_client, batch_metadata.id)
    outfile = save_batch_results(batch_results, batch_metadata.id, outdir)
    construct_expansion_result_file = get_jsonl_file(construct_expansion_unannotated_dir)

construct_expansion_results = parse_jsonl_results(load_jsonl(construct_expansion_result_file))

print(f'{len(construct_expansion_results)} constructs after expansion')


783 constructs after expansion


Combine expansion results with refinement results.

In [52]:
# exclude non-constructs
construct_expansion_results = {k: v for k, v in construct_expansion_results.items() if v['type'] == 'construct'}
print(f'{len(construct_expansion_results)} expanded constructs after exclusion of non-constructs')

full_construct_results = {**construct_refinement_results, **construct_expansion_results}
print(f'{len(full_construct_results)} constructs after combining refinement and expansion results')


720 expanded constructs after exclusion of non-constructs
1461 constructs after combining refinement and expansion results


### Task expansion

Now we find all of the tasks listed in the construct dicts and add them to the task ontology if they don't already exist - i.e. just the same as we did above for constructs.

In [53]:
expanded_task_ontology = ontology_clean.copy()
unexpanded_tasks = []

for construct, result in full_construct_results.items():
    result = result.copy()
    if result['type'] == 'construct':
        for task in result['tasks']:
            taskname_clean, acronym = clean_task_name(task)
            if taskname_clean not in expanded_task_ontology:
                unexpanded_tasks.append(taskname_clean)
                expanded_task_ontology[taskname_clean] = {
                    'description': None,
                    'constructs': [construct],
                    'acronym': [acronym]
                }
            elif construct not in expanded_task_ontology[taskname_clean]['constructs']:
                expanded_task_ontology[taskname_clean]['constructs'].append(construct)

print(f'{len(ontology_clean)} tasks prior to expansion')
print(f'{len(expanded_task_ontology)} tasks after expansion')

143 tasks prior to expansion
3167 tasks after expansion


Use GPT-4 to clean up the expanded tasks

In [54]:
batchfile = datadir / 'gpt4/gpt4_task_expansion_unannotated_batch.jsonl'
task_expansion_dir = datadir / 'gpt4/task_expansion_unannotated_results'

try:
    task_expansion_result_file = get_jsonl_file(task_expansion_dir)
except FileNotFoundError:
    mk_batch_script(batchfile, unexpanded_tasks, get_task_prompt)

    batch_metadata, batch_client = run_batch_request(batchfile)

    wait_for_batch_completion(batch_metadata, batch_client)

    batch_results = get_batch_results(batch_client, batch_metadata.id)
    outfile = save_batch_results(batch_results, batch_metadata.id, outdir)
    task_expansion_result_file = get_jsonl_file(task_expansion_dir)

task_expansion_results = parse_jsonl_results(load_jsonl(task_expansion_result_file))
print(f'{len(task_expansion_results)} tasks after expansion')

error decoding distance matching task
3023 tasks after expansion


In [55]:
expanded_task_ontology['multi-voxel pattern analysis']

{'description': None, 'constructs': ['decoding'], 'acronym': ['mvpa']}

In [56]:
task_expansion_results['multi-voxel pattern analysis']

{'type': 'other'}

In [57]:

# exclude non-tasks
full_task_results = expanded_task_ontology.copy()
tasks = list(full_task_results.keys())
for task in tasks:
    v = full_task_results[task]
    if task in task_expansion_results.keys():
        if task_expansion_results[task]['type'] not in ['task', 'survey']:
            del full_task_results[task]
        else:
            full_task_results[task] = task_expansion_results[task]

print(f'{len(task_expansion_results)} tasks after exclusion of non-task')

print(f'{len(full_task_results)} total task results after combination')


3023 tasks after exclusion of non-task
2698 total task results after combination


In [58]:
full_task_results['stroop task']

{'description': "A cognitive task where participants name the ink color of a word that may spell out a different color (e.g., the word 'Red' printed in blue ink), measuring the ability to inhibit cognitive interference.",
 'constructs': ['Attention',
  'Cognitive Control',
  'Inhibitory Control',
  'Executive Function',
  'Bilingual Cognitive Advantage',
  'Response Inhibition',
  'Resource Allocation',
  'Goal Maintenance',
  'Cognitive Resources',
  'Information Processing',
  'Monitoring',
  'Cognitive Flexibility',
  'Behavioral Control',
  'Executive Control',
  'Top-Down Processing',
  'Reaction Time',
  'Performance',
  'Dual-Task Performance',
  'Task Switching',
  'Executive Attention',
  'Selective Attention',
  'Conflict Monitoring',
  'Attentional Set',
  'Self-Regulation',
  'Error Monitoring',
  'Cognitive Processing',
  'Attention Bias',
  'Set Shifting',
  'Central Executive',
  'Response Selection',
  'Ego Depletion',
  'Behavioral Regulation',
  'Cognitive Inhibition'

### Clustering tasks

Generate a text embedding using fasttext for all of the concepts and tasks, for use in identifying overlapping items.

In [59]:
# first generate text files with all of the concepts and tasks
# as required for fasttext

with open(datadir / 'gpt4/gpt4_full_text_for_embedding_concepts.txt', 'w') as f:

    for k, v in full_construct_results.items():
        v = v.copy()
        f.write(f'construct_{k.replace(' ', '_')}: {k} {json.dumps(v)}\n')


with open(datadir / 'gpt4/gpt4_full_text_for_embedding_tasks.txt', 'w') as f:
    for k, v in full_task_results.items():
        v = v.copy()
        if 'acronym' in v:
            del v['acronym']
        v['constructs'] = list(set([i.lower() for i in v['constructs']]))
        v['type'] = 'task'
        f.write(f'task_{k.replace(' ', '_')}: {k} {json.dumps(v)}\n')



In [60]:
# generate a fasttext model from the task names plus additional info
task_model_file = datadir / 'gpt4/gpt4_task_model.bin'
if not task_model_file.exists():
    task_model = fasttext.train_unsupervised(
        (datadir / 'gpt4/gpt4_full_text_for_embedding_tasks.txt').as_posix(), 
        dim=200)
    task_model.save_model(task_model_file.as_posix())
else:
    print(f'Loading task model from {task_model_file}')
    task_model = fasttext.load_model(task_model_file.as_posix())


Loading task model from /Users/poldrack/Dropbox/data/ontology-learner/data/gpt4/gpt4_task_model.bin


In [61]:
# create embeddings for the task names
task_embeddings = {}
for k in full_task_results.keys():
    task_embeddings[k] = task_model.get_sentence_vector(k)

task_embeddings_df = pd.DataFrame(task_embeddings).T
# scale the embeddings

task_embeddings_scaled_df = scale_df(task_embeddings_df)
print(task_embeddings_scaled_df.shape)


(2698, 200)


In [62]:
# cluster the task names
cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=7)
cluster.fit(task_embeddings_scaled_df)
print(f'Found {len(set(cluster.labels_))} clusters')
# print out the clusters
task_cluster_dict = defaultdict(list)

task_singleton_dict = {}
for i in set(cluster.labels_):
    if np.sum(cluster.labels_ == i) > 1:
        #print(f'Cluster {i}')
        for j in task_embeddings_scaled_df.index[cluster.labels_ == i]:
            #print(f'  {j}')
            task_cluster_dict[i].append(j)
    else:
        task_singleton_dict[i] = task_embeddings_scaled_df.index[cluster.labels_ == i][0]

print(f'found {len(task_cluster_dict)} multi-task clusters')
print(f'found {len(task_singleton_dict)} singleton clusters')


Found 2294 clusters
found 344 multi-task clusters
found 1950 singleton clusters


In [63]:
batchfile = datadir / 'gpt4/gpt4_task_clustering_batch.jsonl'
task_clustering_result_dir = datadir / 'gpt4/task_clustering_results'
task_clustering_result_dir.mkdir(exist_ok=True, parents=True)
if batchfile.exists():
    batchfile.unlink()

#  include descriptions when available
try:
    task_clustering_result_file = get_jsonl_file(task_clustering_result_dir)
except FileNotFoundError:

    ids = []
    tasks = []
    for k, v in task_cluster_dict.items():

        ids.append(k)
        tasks.append(json.dumps(v))

    mk_batch_script(batchfile, tasks, 
                    get_task_cluster_prompt,
                    custom_ids=tasks)

    batch_metadata, batch_client = run_batch_request(batchfile, 'task clustering')

    wait_for_batch_completion(batch_metadata, batch_client)

    batch_results = get_batch_results(batch_client, batch_metadata.id)
    outfile = save_batch_results(batch_results, batch_metadata.id, task_clustering_result_dir)

    task_clustering_result_file = get_jsonl_file(task_clustering_result_dir)


In [64]:
task_clustering_results = parse_jsonl_results(load_jsonl(task_clustering_result_file))



Get kind_of and description for singletons

In [65]:
batchfile = datadir / 'gpt4/gpt4_task_singleton_batch.jsonl'
task_singleton_result_dir = datadir / 'gpt4/task_singleton_results'
task_singleton_result_dir.mkdir(exist_ok=True, parents=True)
if batchfile.exists():
    batchfile.unlink()

#  include descriptions when available
try:
    task_singleton_result_file = get_jsonl_file(task_singleton_result_dir)
except FileNotFoundError:

    ids = []
    tasks = []
    for k, v in task_singleton_dict.items():

        ids.append(k)
        tasks.append(v)

    mk_batch_script(batchfile, tasks, 
                    get_task_singleton_prompt,
                    custom_ids=tasks)

    batch_metadata, batch_client = run_batch_request(batchfile, 'task clustering')

    wait_for_batch_completion(batch_metadata, batch_client)

    batch_results = get_batch_results(batch_client, batch_metadata.id)
    outfile = save_batch_results(batch_results, batch_metadata.id, task_singleton_result_dir)

    task_singleton_result_file = get_jsonl_file(task_singleton_result_dir)

In [66]:
task_singleton_results = parse_jsonl_results(load_jsonl(task_singleton_result_file))

print(f'{len(task_singleton_results)} singleton results')



2340 singleton results


In [67]:
task_singleton_results

{'genderbread person model evaluations': {'name': 'Genderbread Person Model Evaluations',
  'description': "The Genderbread Person Model Evaluations involve using a visual tool to assess individuals' understanding and personal identification with the concepts of gender identity, gender expression, biological sex, and sexual orientation. The model visually represents these aspects independently, and evaluations typically focus on exploring the nuances of how individuals perceive their own and others' gender-related identities.",
  'kind_of': 'Gender Identity Evaluation Task'},
 'the stress appraisal measure': {'name': 'Stress Appraisal Measure',
  'description': 'The Stress Appraisal Measure (SAM) is a psychological tool used to assess how individuals evaluate stressors in their environment. It evaluates dimensions such as perceived threat, challenge, and resource availability in relation to a specific stressor.',
  'kind_of': 'Appraisal Task'},
 'test of gross motor development': {'nam

Combine singleton and other results.

In [68]:
# first create a dict that goes from original task names to harmonized/clustered labels
# for singletons, we will use the original task name as the label
# for clusters, we will use the cluster label
# also create a dict from the new labels to unique IDs (hashes) and from the original labels to the IDs

# first add all of the singletons

orig_task_to_harmonized_dict = {k:k for k in task_singleton_results.keys()}
print(f'{len(orig_task_to_harmonized_dict)} singleton orig task to harmonized dict')

# then add the clusters
# ["picture naming task", "naming task"]': {'Naming Task': {'items': ['picture naming task',
#     'naming task'],
#    'description': 'Tasks that involve identifying and naming objects, pictures, or stimuli, often used to assess language, memory, or visual processing abilities.',
#    'kind_of': ['language task']}},
for k, v in task_clustering_results.items():
    for label, tasks in v.items():
        for task in tasks['items']:
            orig_task_to_harmonized_dict[task] = label

print(f'{len(orig_task_to_harmonized_dict)} orig task to harmonized dict after clustering')

new_task_labels = list(set(orig_task_to_harmonized_dict.values()))

new_task_id_dict = {k: 'task_' + generate_random_hash() for k in new_task_labels}

print(f'{len(new_task_labels)} new task labels')
print(f'{len(set(new_task_id_dict.values()))} unique task ids')

orig_task_to_id_dict = {k: new_task_id_dict[orig_task_to_harmonized_dict[k]] 
                        for k in orig_task_to_harmonized_dict.keys()}
print(f'{len(orig_task_to_id_dict)} orig task to id dict')

2340 singleton orig task to harmonized dict
3166 orig task to harmonized dict after clustering
2903 new task labels
2903 unique task ids
3166 orig task to id dict


In [69]:
with open(datadir / 'gpt4/gpt4_new_task_id_dict.json', 'w') as f:
    json.dump(new_task_id_dict, f, indent=2)

with open(datadir / 'gpt4/gpt4_orig_task_to_id_dict.json', 'w') as f:
    json.dump(orig_task_to_id_dict, f, indent=2)


Create the full task dict, keyed by unique id

In [70]:
all_task_results = task_singleton_results.copy()
all_task_results.update(full_task_results)


full_task_results = {}
for k, v in all_task_results.items():
    # a few of the tasks seem to have been screwed up by the expansion so exclude hose
    if k in orig_task_to_id_dict:
        full_task_results[orig_task_to_id_dict[k]] = v
        full_task_results[orig_task_to_id_dict[k]]['name'] = k
    else:
        print(f'task {k} not found in orig_task_to_id_dict')

with open(datadir / 'gpt4/gpt4_full_task_results.json', 'w') as f:
    json.dump(full_task_results, f, indent=2)



task reading the mind in the eyes test: measures the ability to understand others' emotions by looking at their eyes. not found in orig_task_to_id_dict
task strange stories test: evaluates the ability to comprehend social situations and infer mental states. not found in orig_task_to_id_dict
task categorical perception task: often used to explore how language affects perception of color or phonemes. not found in orig_task_to_id_dict
task time perception task: used to investigate how different linguistic tenses influence cognitive perception of time. not found in orig_task_to_id_dict
task montgomery-åsberg depression rating scale not found in orig_task_to_id_dict


create a full construct dict keyed by unique ID

In [71]:
construct_to_id_dict = {}
for k in full_construct_results.keys():
    construct_to_id_dict[k] = 'concept_' + generate_random_hash()
    construct_to_id_dict[k.lower()] = construct_to_id_dict[k]

with open(datadir / 'gpt4/gpt4_construct_to_id_dict.json', 'w') as f:
    json.dump(construct_to_id_dict, f, indent=2)

full_construct_dict = {}
for k, v in full_construct_results.items():
    full_construct_dict[construct_to_id_dict[k]] = v
    full_construct_dict[construct_to_id_dict[k]]['name'] = k

with open(datadir / 'gpt4/gpt4_full_construct_dict.json', 'w') as f:
    json.dump(full_construct_dict, f, indent=2)


### Link task IDs to main construct dict and construct IDs to task dict

In [72]:

for construct, v in full_construct_results.items():
    # take advantage of fact that iterated items are not copies
    if 'task_ids' not in v:
        v['task_ids'] = []
    for task in v['tasks']:
            taskname_clean, _ = clean_task_name(task)
            if taskname_clean in orig_task_to_id_dict:
                v['task_ids'].append(orig_task_to_id_dict[taskname_clean])
            else:
                print(f'task {taskname_clean} not found in orig_task_to_id_dict')

with open(datadir / 'gpt4/gpt4_full_construct_results.json', 'w') as f:
    json.dump(full_construct_results, f, indent=2)



task reading the mind in the eyes test: measures the ability to understand others' emotions by looking at their eyes. not found in orig_task_to_id_dict
task strange stories test: evaluates the ability to comprehend social situations and infer mental states. not found in orig_task_to_id_dict
task categorical perception task: often used to explore how language affects perception of color or phonemes. not found in orig_task_to_id_dict
task time perception task: used to investigate how different linguistic tenses influence cognitive perception of time. not found in orig_task_to_id_dict
task montgomery-åsberg depression rating scale not found in orig_task_to_id_dict


In [73]:
for task, v in full_task_results.items():
    if 'constructs' not in v:
        continue
    if 'construct_ids' not in v:
        v['construct_ids'] = []
    for construct in v['constructs']:
        if construct in construct_to_id_dict:
            v['construct_ids'].append(construct_to_id_dict[construct])

with open(datadir / 'gpt4/gpt4_full_task_results.json', 'w') as f:
    json.dump(full_task_results, f, indent=2)



## add to database



## OLDER

Used GPT-4-o1-preview to generate an initial ontology.  See notes in [README_gpt4.md]().

Results from this were stored in [chatgpt/gpt4_task_ontology.json]().

In [74]:

clustered_task_dict = {}
orig_task_to_cluster_dict = {}
for k, v in task_clustering_results.items():
    for label, tasks in v.items():
        clustered_task_dict[label] = tasks
        for task in tasks:
            orig_task_to_cluster_dict[task] = label

for k, v in clustered_task_dict.items():
    if len(v) > 1:
        print(k)
        for task in v:
            print(f'  {task}')
        print()



Naming Task
  items
  description
  kind_of

Conditioned Taste Aversion
  items
  description
  kind_of

Eriksen Flanker Task
  items
  description
  kind_of

Inescapable Shock Task
  items
  description
  kind_of

Achievement Motivation Scale
  items
  description
  kind_of

Academic Motivation Scale
  items
  description
  kind_of

Praxis Subtest of Western Aphasia Battery
  items
  description
  kind_of

Bruininks-Oseretsky Test of Motor Proficiency
  items
  description
  kind_of

Emotion Regulation Questionnaires
  items
  description
  kind_of

Reading the Mind in the Eyes Test
  items
  description
  kind_of

Strange Stories Test
  items
  description
  kind_of

Acoustic Startle Reflex/Response
  items
  description
  kind_of

Digit Span Task
  items
  description
  kind_of

Matrix Span Task
  items
  description
  kind_of

Matching-to-Sample Task
  items
  description
  kind_of

Hardiness Scale
  items
  description
  kind_of

Shyness Scale
  items
  description
  kind_of

Soci

### perform concept clustering

First do agglomerative clustering on concepts


In [75]:
# fit embeddings for concepts
concept_model_file = datadir / 'gpt4/gpt4_concept_model.bin'
if not concept_model_file.exists():
    concept_model = fasttext.train_unsupervised(
        (datadir / 'gpt4/gpt4_full_text_for_embedding_concepts.txt').as_posix(), 
        dim=200)
    concept_model.save_model(concept_model_file.as_posix())
else:
    print(f'Loading concept model from {concept_model_file}')
    concept_model = fasttext.load_model(concept_model_file.as_posix())



Loading concept model from /Users/poldrack/Dropbox/data/ontology-learner/data/gpt4/gpt4_concept_model.bin


In [76]:
# generate embeddings for concepts
concept_embeddings = {}
for k, v in full_construct_results.items():
    if v['type'] == 'construct':
        concept_embeddings[k] = task_model.get_sentence_vector(k)

concept_embeddings_df = pd.DataFrame(concept_embeddings).T
# scale the embeddings
scaler = StandardScaler()
concept_embeddings_scaled = scaler.fit_transform(concept_embeddings_df)
concept_embeddings_scaled_df = pd.DataFrame(concept_embeddings_scaled, index=concept_embeddings_df.index)
print(concept_embeddings_scaled_df.shape)


(1461, 200)


In [77]:
# cluster the task names
cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=7)
cluster.fit(concept_embeddings_scaled_df)
print(f'Found {len(set(cluster.labels_))} clusters')
# print out the clusters
concept_cluster_dict = defaultdict(list)

for i in set(cluster.labels_):
    if np.sum(cluster.labels_ == i) > 1:
        #print(f'Cluster {i}')
        for j in concept_embeddings_scaled_df.index[cluster.labels_ == i]:
            #print(f'  {j}')
            concept_cluster_dict[i].append(j)

concept_cluster_dict

Found 1327 clusters


defaultdict(list,
            {0: ['Executive Attention', 'Selective Attention'],
             1: ['Servant Leadership', 'Leadership'],
             2: ['Curiosity', 'grandiosity'],
             3: ['non-declarative memory', 'retrospective memory'],
             4: ['cohesion', 'team cohesion'],
             5: ['group cohesion', 'group cohesiveness'],
             6: ['Cognitive Reframing', 'Cognitive Aging'],
             7: ['Metaphor Processing', 'Language Processing'],
             8: ['Cultural Competence', 'Cultural Intelligence'],
             9: ['linguistic awareness',
              'syntactic awareness',
              'phonemic awareness'],
             10: ['substance use disorder', 'substance use'],
             11: ['urgency', 'agency'],
             12: ['Social Identity', 'Social Identity Theory'],
             13: ['tactile perception',
              'color perception',
              'shape perception'],
             14: ['Substance Use Disorders', 'Substance Abuse'],


Examination of these clusters showed that they rarely contained synonymous terms; more often, they contained contrasting terms. Thus, we don't do any further refinement.

In [78]:
constructs = sorted(list(full_construct_results.keys()))
for k in constructs:
    print(k)


ADHD Symptoms
Abstract Reasoning
Abstract Thinking
Acceptance and Commitment
Acculturation
Acculturative Stress
Action Selection
Actor-Observer Bias
Adaptive Coping
Affect
Affective Empathy
Affective Forecasting
Affective Symptoms
Aggression
Agnosia
Agreeableness
Alcohol Use
Alerting
Alexithymia
Altruism
Ambivalent Sexism
Analogical Reasoning
Anchoring
Anger
Anima and Animus
Animal Cognition
Anosognosia
Anterograde Amnesia
Antisocial Behavior
Anxiety
Anxiety Disorders
Aphasia
Apraxia
Archetypes
Associative Learning
Associative Networks
Associative Thinking
Attachment
Attachment Behaviors
Attachment Disorders
Attachment Security
Attachment Styles
Attention
Attention Bias
Attention Deficit
Attention Deficits
Attention Restoration
Attention to Detail
Attentional Blink
Attentional Control
Attentional Set
Attitude Change
Attitudes
Attribution
Auditory Processing
Authentic Leadership
Authority Influence
Automatic Thoughts
Autonomy
Availability Heuristic
Avoidance
Avoidance Behaviors
Avoidant

In [79]:

scaler = StandardScaler()
task_embeddings_scaled = scaler.fit_transform(task_embeddings_df)
task_embeddings_scaled_df = pd.DataFrame(task_embeddings_scaled, index=task_embeddings_df.index)

cluster = AgglomerativeClustering(n_clusters=None, distance_threshold=10)
cluster.fit(task_embeddings_scaled)
print(f'Found {len(set(cluster.labels_))} clusters')

Found 1571 clusters


In [80]:
# print out the clusters
for i in set(cluster.labels_):
    if np.sum(cluster.labels_ == i) > 1:
        print(f'Cluster {i}')
        for j in task_embeddings_scaled_df.index[cluster.labels_ == i]:
            print(f'  {j}')


Cluster 0
  gratitude resentment and appreciation test
  convergent and divergent response test
Cluster 1
  team diagnostic survey
  teacher efficacy survey
  job diagnostic survey
Cluster 2
  role-playing scenarios
  role-playing exercises
Cluster 3
  leader authenticity scale
  conger-kanungo charismatic leadership scale
Cluster 4
  celebrity attitude scale
  responsibility attitudes scale
Cluster 5
  behavior assessment system for children
  vineland adaptive behavior scales
  adaptive behavior assessment system
Cluster 6
  obsessive beliefs questionnaire
  immersive tendencies questionnaire
Cluster 7
  standard anchoring task
  price anchoring task
  simulated crime task
  anchoring task
Cluster 8
  conflict resolution questionnaire
  interpersonal conflict resolution task
  conflict resolution style inventory
Cluster 9
  discourse completion task
  arithmetic verification task
  prototype distortion task
  property verification task
Cluster 10
  cyberbullying prevention and respon

Create a full task ontology, keyed by a hash rather than task labels.  

In [81]:
full_task_ontology = {}

for k, v in expanded_task_ontology.items():
    # create a random hash for the id
    v = v.copy()
    task_id = k
    while task_id in expanded_task_ontology:
        task_id = 'task_' + generate_random_hash()
    v['name'] = k
    v['construct_names'] = [i.lower() for i in v['constructs']]
    v['construct_ids'] = []
    full_task_ontology[task_id] = v
    

assert len(full_task_ontology) == len(expanded_task_ontology)



Do the same for constructs.

In [82]:
full_construct_ontology = {}

def task_name_to_hash(taskname, full_task_ontology):
    return [k for k, v in full_task_ontology.items() if taskname_clean == v['name']][0]

for k, v in construct_refinement_results.items():
    v = v.copy()
    if v['type'] == 'construct':
        construct_id = k
        while construct_id in construct_refinement_results:
            construct_id = 'concept_' + generate_random_hash()
        v['name'] = k
        v['task_ids'] = []
        full_construct_ontology[construct_id] = v
        for task in v['tasks']:
            taskname_clean, _ = clean_task_name(task)
            task_hash = task_name_to_hash(taskname_clean, full_task_ontology)
            v['task_ids'].append(task_hash)
            if task_hash not in full_task_ontology:
                print(f'task {taskname_clean} not found in full task ontology')
            elif construct not in full_task_ontology[task_hash]['construct_names']:
                full_task_ontology[task_hash]['construct_ids'].append(construct_id)
                full_task_ontology[task_hash]['construct_names'].append(k)

# now replace constructs in full_task_ontology with the new construct ids


with open(datadir / 'gpt4/gpt4_full_construct_ontology.json', 'w') as f:
    json.dump(full_construct_ontology, f, indent=2)

with open(datadir / 'gpt4/gpt4_full_task_ontology.json', 'w') as f:
    json.dump(full_task_ontology, f, indent=2)
