# Immune Data Tokenization

In [33]:
from config import ScImmuneConfig
from model import ScImmuneModel
from tokenizer import ScImmuneTokenizer # refactored version
import torch
import os
import shutil
from utils import generate_metadata_embeddings, generate_metadata_tokens
from gensim.models import Word2Vec
import anndata as ad
import pandas as pd
import scanpy as sc
import json

In [None]:
# Folder containing the CELLxGENE-derived data files.
data_path = "../data/cellxgene_data"

# Read the merged immune dataset into an AnnData object.
immune_h5ad_file = f"{data_path}/immune_1M_merged.h5ad"
adata_immune = sc.read_h5ad(immune_h5ad_file)

## Modify tokens and vocab


In [36]:
# Stage the config and the original weights inside the local model folder.
# shutil.copy does not create missing parent directories, so make sure the
# destination folder exists first (no-op when it is already there).
os.makedirs("scimmune-model", exist_ok=True)
shutil.copy("config.json", "scimmune-model/config.json")
shutil.copy("og_model.bin", "scimmune-model/pytorch_model.bin")

'scimmune-model/pytorch_model.bin'

In [37]:
# Everything below is resolved from files on disk.
model_dir = "scimmune-model"

# Configuration read from the local model folder.
local_config = ScImmuneConfig.from_pretrained(model_dir)

# Model instantiated from that configuration.
# NOTE(review): only the config is passed here; confirm where (or whether)
# the weights in pytorch_model.bin get loaded.
local_model = ScImmuneModel(local_config)

# Tokenizer built from the local vocabulary file.
local_tokenizer = ScImmuneTokenizer(vocab_file="vocab.json")

In [38]:
# Size reported by the tokenizer at this point in the notebook.
len(local_tokenizer)


60698

In [39]:
# Columns of adata_immune.obs whose ontology IDs feed the metadata
# tokenization protocol.
ontology_fields = [
    "cell_type_ontology_term_id",
    "self_reported_ethnicity_ontology_term_id",
    "tissue_general_ontology_term_id",
    "development_stage_ontology_term_id",
    # "disease" # use DOID:4 - ID for root
]

# Map each field to the sorted list of its distinct, non-missing term IDs.
# The original author noted that dropna() makes no difference on this
# dataset, i.e. every entry is annotated.
ontology_ids = {}
for field in ontology_fields:
    distinct_terms = adata_immune.obs[field].dropna().unique().tolist()
    distinct_terms.sort()
    ontology_ids[field] = distinct_terms

In [40]:
# Display the sorted term IDs collected for each ontology field.
ontology_ids

{'cell_type_ontology_term_id': ['CL:0000034',
  'CL:0000037',
  'CL:0000038',
  'CL:0000049',
  'CL:0000050',
  'CL:0000051',
  'CL:0000057',
  'CL:0000064',
  'CL:0000066',
  'CL:0000071',
  'CL:0000077',
  'CL:0000081',
  'CL:0000084',
  'CL:0000094',
  'CL:0000097',
  'CL:0000113',
  'CL:0000115',
  'CL:0000125',
  'CL:0000138',
  'CL:0000151',
  'CL:0000158',
  'CL:0000160',
  'CL:0000165',
  'CL:0000185',
  'CL:0000186',
  'CL:0000187',
  'CL:0000192',
  'CL:0000232',
  'CL:0000233',
  'CL:0000235',
  'CL:0000236',
  'CL:0000313',
  'CL:0000319',
  'CL:0000322',
  'CL:0000359',
  'CL:0000432',
  'CL:0000451',
  'CL:0000492',
  'CL:0000499',
  'CL:0000542',
  'CL:0000545',
  'CL:0000546',
  'CL:0000549',
  'CL:0000550',
  'CL:0000552',
  'CL:0000553',
  'CL:0000556',
  'CL:0000557',
  'CL:0000559',
  'CL:0000576',
  'CL:0000577',
  'CL:0000583',
  'CL:0000623',
  'CL:0000624',
  'CL:0000625',
  'CL:0000646',
  'CL:0000669',
  'CL:0000738',
  'CL:0000763',
  'CL:0000764',
  'CL:0000

In [41]:
# Generate the metadata tokens for every ontology field.
# The tag passed to generate_metadata_tokens is the field name without its
# trailing '_ontology_term_id'. The suffix is stripped only when it is
# actually present, so a field with a different name (e.g. "disease") keeps
# its full name instead of being truncated by a fixed-width slice.
ontology_suffix = "_ontology_term_id"
ontology_tokens = {
    ontology: generate_metadata_tokens(
        ontology_ids[ontology],
        tag=(
            ontology[: -len(ontology_suffix)]
            if ontology.endswith(ontology_suffix)
            else ontology
        ),
    )
    for ontology in ontology_fields
}

# Report how many tokens were produced per field.
for category, tokens in ontology_tokens.items():
    print(category, len(tokens))

cell_type_ontology_term_id 228
self_reported_ethnicity_ontology_term_id 15
tissue_general_ontology_term_id 5
development_stage_ontology_term_id 99


In [42]:
# Display the keys of the tokenizer's vocabulary mapping.
local_tokenizer.get_vocab().keys()

dict_keys(['<unk>', '<eoc>', '<cls>', '<pad>', 'RP5-973N23.5', 'RP11-22E12.2', 'RP11-182N22.10', 'RP11-15L13.5', 'CTB-53D8.3', 'FLJ43315', 'XGY2', 'AC008079.12', 'RP11-326C3.17', 'RP5-1087E8.6', 'CTD-3214H19.17', 'RP11-439H13.3', 'RP11-680A11.7', 'CTA-246H3.13', 'CH17-476P10.1', 'LLNLR-271E8.1', 'LLNLR-265D5.1', 'CTC-359D24.6', 'RP1-138B7.8', 'RP5-967N21.13', 'RP5-1171I10.8', 'RP11-231C14.10', 'RP11-1212A22.9', 'RP11-127I20.9', 'RP11-473M10.4', 'RP11-766N7.5', 'RP11-107C16.2', 'RP11-348J12.6', 'RP11-80K21.4', 'RP11-770J1.8', 'RP11-554A11.11', 'RP11-867O8.9', 'TMX2-CTNND1', 'RP11-732A19.10', 'OR4F8BP', 'RP11-1C8.8', 'C8orf44-SGK3', 'GS1-273L24.6', 'RP11-641J8.4', 'LL0XNC01-30I4.1', 'RP11-478H11.3-001', 'CTB-163K11.1', 'RP11-730B22.5', 'RNF216P1_ENSG00000288620', 'RP11-383A19.1', 'XXbac-BPG254B15.11', 'XXbac-BPG118E17.11', 'CTD-2537M22.2', 'RP11-115L11.3', 'RP13-511M20.1', 'RP5-876B10.7', 'RP11-422P24.15', 'CH17-159N18.5', 'RP11-118B23.9', 'RP1-224A6.11', 'RP11-426C22.12', 'RP11-556O5.7'