In [1]:
import pandas as pd
import numpy as np
from collections import Counter, defaultdict
import pickle
import urllib.request


# File dependencies:
#   - the parquet file read below (loaded into `model_data`)
#   - the pickle file named by DESCRIPTOR_DEFINITION_FILE (opened in a later cell)
DESCRIPTOR_DEFINITION_FILE = "descriptor_definition_dict.pkl"
model_data = (
    pd.read_parquet("model_data_1014_MeSH_terms.parquet")
    .reset_index()  # keeps the previous index as a regular 'index' column
)

In [2]:
# Display the loaded frame (the last expression of a cell is rendered as its output).
model_data

Unnamed: 0,index,name,target,intercept,beta_scaling_factor,beta_unit_vector
0,0,Humans,D006801,2.691081,21.267866,"[0.01726822,-0.010626245,-0.027259905,0.000992..."
1,1,Female,D005260,0.572347,8.489115,"[-0.008675035,0.004081206,-0.01882576,-0.01899..."
2,2,Male,D008297,0.405352,8.966750,"[0.0035645857,-0.038743176,0.021385735,0.05438..."
3,3,Animals,D000818,-1.375631,23.026026,"[0.025583062,0.031996664,0.038607825,0.0200140..."
4,4,Adult,D000328,-0.445872,9.237172,"[0.0039388733,0.0067770644,-0.010149287,-0.009..."
...,...,...,...,...,...,...
1009,1009,Memory B Cells,D000091245,-6.356035,82.287761,"[0.021362,0,-0.070894,0.019598,-0.060034,0,0.0..."
1010,1010,Overdiagnosis,D000088522,-5.906156,29.307291,"[0,0,0,-0.095356,0,0,0,0,-0.145987,0.025026,0...."
1011,1011,Disinformation,D000087862,-4.841627,31.897502,"[0.01806,0,-0.069801,0.007722,-0.139574,-0.019..."
1012,1012,Biosecurity,D000089062,-6.307432,36.034649,"[0,0,0,0,0,0,0,0.026964,0,0,0.021554,0.002459,..."


In [3]:

# from https://github.com/rmhorton/ThoughtGraph
def add_cluster_cols(df, embedding_col='embedding', prefix='cluster', letters='ABCDE', max_threshold=1):
    """Add hierarchical cluster-label columns to ``df`` and return it sorted by them.

    The embeddings in ``df[embedding_col]`` are clustered with Ward linkage over
    pairwise cosine distances. The resulting tree is cut once per character of
    ``letters``: the cut distance is ``max_threshold`` for the first letter and
    is divided by 1.414 for each following letter, so later letters give the
    same or finer clusters.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with one embedding per row. The new columns are added to this
        frame IN PLACE; pass ``df.copy()`` to keep the caller's frame untouched.
    embedding_col : str
        Column holding the embeddings, either as numeric sequences or as the
        string form of a Python list literal (e.g. ``"[0.1,-0.2]"``).
    prefix : str
        Prefix of the generated columns, which are named ``f'{prefix}_{letter}'``.
    letters : str
        One character per clustering level, coarsest level first.
    max_threshold : float
        Cut distance used for the first (coarsest) level.

    Returns
    -------
    pandas.DataFrame
        A sorted copy of ``df``, ordered by every column whose name starts with
        ``f'{prefix}_'`` (this includes any such column that already existed).

    Raises
    ------
    ValueError, SyntaxError
        If a string embedding is not a plain Python literal, or the embeddings
        do not form a rectangular numeric matrix.
    """
    import ast
    from scipy.cluster.hierarchy import ward, fcluster
    from scipy.spatial.distance import pdist

    def _as_vector(value):
        # literal_eval only parses literals, so a crafted string cannot run code
        # (unlike eval). Values that are already sequences are used as they are.
        return ast.literal_eval(value) if isinstance(value, str) else value

    # cluster the sentence vectors at various levels
    X = np.array([_as_vector(v) for v in df[embedding_col]], dtype=float)
    z = ward(pdist(X, metric='cosine'))

    for i, letter in enumerate(letters):
        col_name = f'{prefix}_{letter}'
        cluster_id = fcluster(z, max_threshold / (1.414) ** i, criterion='distance')
        # Zero-pad the ids to a common width so the labels sort correctly as text.
        digits = len(str(int(cluster_id.max())))
        df[col_name] = [col_name + str(cid).zfill(digits) for cid in cluster_id]

    cluster_cols = [c for c in df.columns if isinstance(c, str) and c.startswith(f'{prefix}_')]
    return df.sort_values(by=cluster_cols)



In [4]:
# Cluster on the 'beta_unit_vector' column at seven levels (A-G), working on a
# copy so that `model_data` itself does not receive the new columns.
model_data_clusters = add_cluster_cols(
    model_data.copy(),
    embedding_col='beta_unit_vector',
    prefix='cluster',
    letters='ABCDEFG',
    max_threshold=4,
)

# Names of the cluster-label columns, in frame order.
cluster_cols = list(filter(lambda c: c.startswith('cluster_'), model_data_clusters.columns))

model_data_clusters[['name', 'target', *cluster_cols]]

Unnamed: 0,name,target,cluster_A,cluster_B,cluster_C,cluster_D,cluster_E,cluster_F,cluster_G
384,Motivation,D009042,cluster_A1,cluster_B01,cluster_C01,cluster_D01,cluster_E001,cluster_F001,cluster_G001
537,Perception,D010465,cluster_A1,cluster_B01,cluster_C01,cluster_D01,cluster_E001,cluster_F002,cluster_G002
640,Social Behavior,D012919,cluster_A1,cluster_B01,cluster_C01,cluster_D01,cluster_E001,cluster_F003,cluster_G003
985,Choice Behavior,D002755,cluster_A1,cluster_B01,cluster_C01,cluster_D01,cluster_E001,cluster_F003,cluster_G003
374,Communication,D003142,cluster_A1,cluster_B01,cluster_C01,cluster_D01,cluster_E001,cluster_F004,cluster_G004
...,...,...,...,...,...,...,...,...,...
222,Peptides,D010455,cluster_A7,cluster_B12,cluster_C26,cluster_D71,cluster_E185,cluster_F555,cluster_G905
479,Amino Acids,D000596,cluster_A7,cluster_B12,cluster_C26,cluster_D71,cluster_E185,cluster_F555,cluster_G906
156,Proteomics,D040901,cluster_A7,cluster_B12,cluster_C26,cluster_D71,cluster_E186,cluster_F556,cluster_G907
315,Proteome,D020543,cluster_A7,cluster_B12,cluster_C26,cluster_D71,cluster_E186,cluster_F556,cluster_G907


In [5]:
# Add definitions
# NOTE(review): pickle.load can execute code embedded in the file; only load a trusted pickle here.
with open(DESCRIPTOR_DEFINITION_FILE, "rb") as fh:
    # Looking up a term that has no entry yields an empty string.
    def_dd = defaultdict(str, pickle.load(fh))

# One definition per row, looked up by the row's 'name' value.
model_data_clusters['definition'] = list(map(def_dd.__getitem__, model_data_clusters['name']))

In [6]:
# Add trees info

trees_url = 'https://nlmpubs.nlm.nih.gov/projects/mesh/MESH_FILES/meshtrees/mtrees2024.bin'

trees_dict = {}  # key=term, value=list of tree strings
# Use the response as a context manager so the connection is closed once the
# file has been read, even if parsing fails part-way through.
with urllib.request.urlopen(trees_url) as response:
    for raw_line in response:
        line = raw_line.decode('utf8').strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack below
        # Each line is parsed as '<term>;<tree string>'. Splitting on the last ';'
        # keeps a term intact if it contains ';' itself
        # (assumes the tree string never contains ';' - TODO confirm).
        term, tree_str = line.rsplit(';', 1)
        trees_dict.setdefault(term, []).append(tree_str)
# Terms that are absent from the trees file resolve to '' (joined below to '').
trees_dd = defaultdict(lambda: '', trees_dict)

# All tree strings of a term, joined with '|'.
model_data_clusters['trees'] = ['|'.join(trees_dd[term]) for term in model_data_clusters['name']]


In [7]:

# Write the selected columns to an Excel workbook in the working directory.
(
    model_data_clusters[['name', 'target', 'definition', 'trees', *cluster_cols]]
    .to_excel('concept_clusters.xlsx')
)

In [8]:

# Show the ten most frequent 'cluster_F' labels together with their row counts.
Counter(model_data_clusters['cluster_F']).most_common(10)

[('cluster_F270', 8),
 ('cluster_F236', 7),
 ('cluster_F326', 7),
 ('cluster_F295', 6),
 ('cluster_F517', 6),
 ('cluster_F039', 5),
 ('cluster_F080', 5),
 ('cluster_F083', 5),
 ('cluster_F093', 5),
 ('cluster_F098', 5)]