In [18]:
import pandas as pd
import json
from huggingface_hub import hf_hub_download
import collections

# Set Up

In [12]:
# Download the CLMBR vocabulary dictionary from the Hugging Face Hub.
# The returned value is the local path of the downloaded file, which is
# printed below and then opened for reading.
path_to_config_json = hf_hub_download(
    repo_id="StanfordShahLab/clmbr-t-base",
    filename="clmbr_v8_original_dictionary.json"
)
print(f"Saved to: {path_to_config_json}")

# Read the CLMBR vocab config.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so the file is
# decoded the same way on every platform instead of using the locale default.
with open(path_to_config_json, 'r', encoding='utf-8') as f:
    config = json.load(f)

Saved to: /Users/mwornow/.cache/huggingface/hub/models--StanfordShahLab--clmbr-t-base/snapshots/33d720931853fb1e0f77ffb50244219c9158db4b/clmbr_v8_original_dictionary.json


In [13]:
# Show the top-level sections of the vocabulary config
top_level_keys = config.keys()
print(top_level_keys)

dict_keys(['age_stats', 'is_hierarchical', 'regular'])


# Overview

What does each token in the vocabulary look like?

In [None]:
# Display the first entry of the 'regular' token list as a representative example
config["regular"][0]

{'code_string': 'SNOMED/3950001',
 'text_string': '',
 'type': 'code',
 'val_end': 0.0,
 'val_start': 0.0,
 'weight': -0.18811663446051943}

Next, calculate the number of unique tokens in the vocabulary. First filter out any tokens that are ignored, i.e. those whose `type` is `"unused"`.

In [None]:
# Keep only the entries that are real tokens: anything whose type is 'unused' is dropped
vocab = list(filter(lambda token: token["type"] != "unused", config["regular"]))

# Summarize the config: age statistics, hierarchy flag, and the filtered vocabulary size
print("Age stats:", config["age_stats"])
print("Is hierarchical:", config["is_hierarchical"])
print("Vocab size:", len(vocab))

Age stats: {'mean': 18144688.37614148, 'std': 13883171.660255756}
Is hierarchical: False
Vocab size: 39811


There are a total of 39811 unique tokens in the CLMBR-t-base vocabulary.

# Token Types

Let's count the number of tokens of each type.

In [32]:
# Tally how many vocabulary tokens fall under each token type
token_types = [ token['type'] for token in vocab ]
token_counts = collections.Counter(token_types)

# Build the summary table and order it by count, largest first
df = pd.DataFrame(
    token_counts.items(), columns=['Token Type', 'Count']
).sort_values(by='Count', ascending=False)
df

Unnamed: 0,Token Type,Count
0,code,25667
2,numeric,11183
1,text,2961


There are 3 different types of tokens in the vocabulary:
    
| Token Type | Count |
|------------|-------|
| code    | 25667 |
| numeric     | 11183     |
| text    | 2961    |

The `code` token type represents a **code without considering any metadata** associated with it. It maps an occurrence of a specific code (e.g. "LOINC/7094-6") directly to a token.

The `text` token type represents a **categorical variable**. It maps a code + its discrete value (e.g. "SNOMED/228490006" with the value "N") to a token.

The `numeric` token type represents a **numerical variable**. It maps a code + a value that falls within a specific range (e.g. "LOINC/8867-4" with a value in the range [69.0, 76.0]) to a token.

Here is an example of each token type:

In [24]:
# Pick the first vocabulary entry of each token type to serve as a worked example
code = list(filter(lambda token: token['type'] == 'code', vocab))[0]
text = list(filter(lambda token: token['type'] == 'text', vocab))[0]
numeric = list(filter(lambda token: token['type'] == 'numeric', vocab))[0]

# Print the three examples side by side for comparison
print("`Code` token example: ", code)
print("`Text` token example: ", text)
print("`Numeric` token example: ", numeric)

`Code` token example:  {'code_string': 'SNOMED/3950001', 'text_string': '', 'type': 'code', 'val_end': 0.0, 'val_start': 0.0, 'weight': -0.18811663446051943}
`Text` token example:  {'code_string': 'SNOMED/228490006', 'text_string': 'N', 'type': 'text', 'val_end': 0.0, 'val_start': 0.0, 'weight': -0.0353444009671022}
`Numeric` token example:  {'code_string': 'LOINC/8867-4', 'text_string': '', 'type': 'numeric', 'val_end': 69.0, 'val_start': -1.7976931348623157e+308, 'weight': -0.007933071663676598}


# Ontologies

Let's now count how many tokens are from each OMOP ontology.

In [31]:
# Count vocabulary tokens per source ontology. The ontology name is the part of
# `code_string` before the first '/', e.g. 'SNOMED/3950001' -> 'SNOMED'.
# A generator expression keeps the loop variable out of the notebook namespace,
# so this cell does not overwrite the `code` example token bound in an earlier
# cell (the previous `for code in vocab` loop did).
counter = collections.Counter(
    token['code_string'].split('/')[0] for token in vocab
)

# Build the summary table and order it by count, largest first.
# sort_values returns a new frame here instead of sorting in place.
df = (
    pd.DataFrame(counter.items(), columns=['Ontology', 'Count'])
    .sort_values(by='Count', ascending=False)
)
df

Unnamed: 0,Ontology,Count
0,SNOMED,16299
7,LOINC,13837
11,RxNorm,4678
9,CPT4,3728
8,CARE_SITE,396
15,RxNorm Extension,255
19,ICD10PCS,233
20,ICD9Proc,196
16,HCPCS,54
18,ICDO3,52


SNOMED is the most commonly represented ontology in the CLMBR vocabulary with 16299 derived tokens, followed by LOINC with 13837 and RxNorm with 4678.