# Immune Data Embeddings

In [63]:
from config import ScImmuneConfig
from model import ScImmuneModel
from tokenizer import ScImmuneTokenizer # refactored version

import torch
import os
import shutil
from utils import generate_metadata_embeddings, generate_metadata_tokens, assign_ontology_embeddings
from gensim.models import Word2Vec
import anndata as ad
import pandas as pd
import scanpy as sc
import json

## Modify tokens and embeddings

In [64]:
# Stage the original config and weights in the folder the model is loaded from.
# Create the folder first so the copies do not fail when it does not exist yet
# (e.g. on a fresh checkout followed by Restart & Run All).
os.makedirs("scimmune-model", exist_ok=True)
shutil.copy("config.json", "scimmune-model/config.json")
shutil.copy("og_model.bin", "scimmune-model/pytorch_model.bin")

'scimmune-model/pytorch_model.bin'

In [65]:
# Load the config from the staged local folder.
local_config = ScImmuneConfig.from_pretrained("scimmune-model")

# Load the model together with the checkpoint staged as
# scimmune-model/pytorch_model.bin. Constructing ScImmuneModel(local_config)
# directly never reads that file, so the staged weights would go unused.
# NOTE(review): assumes ScImmuneModel follows the Hugging Face PreTrainedModel
# API (it is used with resize_token_embeddings / save_pretrained further down)
# — confirm against model.py.
local_model = ScImmuneModel.from_pretrained("scimmune-model", config=local_config)

# Initialize the tokenizer from the vocabulary file vocab_with_metadata.json.
local_tokenizer = ScImmuneTokenizer(vocab_file="vocab_with_metadata.json")

In [66]:
# Vocabulary size after adding 350 metadata tokens: 60698 + 350 = 61048
new_vocab_len = len(local_tokenizer)

# Grow the embedding matrix to the new vocabulary size; the returned layer is displayed
local_model.resize_token_embeddings(new_vocab_len)

Embedding(61048, 512, padding_idx=0)

In [67]:
local_model # inspect full model (displays the module tree, incl. the resized embedding)

ScImmuneModel(
  (gene_encoder): ModuleDict(
    (embedding): Embedding(61048, 512, padding_idx=0)
    (enc_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  )
  (value_encoder): ModuleDict(
    (linear1): Linear(in_features=1, out_features=512, bias=True)
    (linear2): Linear(in_features=512, out_features=512, bias=True)
    (norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-11): 12 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=512, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (linear2): Linear(in_features=512, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNor

## Re-initialize metadata tokens with Node2Vec vectors

In [68]:
# Keep a handle on the model's input embedding layer; it is read again in the
# spot-check further down.
embedding_layer = local_model.get_input_embeddings()
# Display the layer to confirm its size.
embedding_layer

Embedding(61048, 512, padding_idx=0)

In [69]:
# Folder holding the Node2Vec model files (one file per ontology)
n2vmodel_path = "../utils/obo_models"

# Read each ontology's Node2Vec model from disk
cl_embeddings = Word2Vec.load(f"{n2vmodel_path}/cl_node2vec.model")
hancestro_embeddings = Word2Vec.load(f"{n2vmodel_path}/hancestro_node2vec.model")
hsapdv_embeddings = Word2Vec.load(f"{n2vmodel_path}/hsapdv_node2vec.model")
doid_embeddings = Word2Vec.load(f"{n2vmodel_path}/doid_node2vec.model")
uberon_embeddings = Word2Vec.load(f"{n2vmodel_path}/uberon_node2vec.model")

In [70]:
# Parse the vocabulary file into a plain dict so it can be used for lookups
with open("vocab_with_metadata.json", "r") as vocab_file:
    vocab_with_metadata_dict = json.load(vocab_file)

In [71]:
# <cell_type=...> tokens: assign the cell type ontology Node2Vec embeddings
assign_ontology_embeddings(
    model=local_model,
    tokenizer=local_tokenizer,
    tag="cell_type",
    node2vec_model_path=f"{n2vmodel_path}/cl_node2vec.model",
)

[WARN] No Node2Vec embedding found for CL:0010003, skipping.
[WARN] No Node2Vec embedding found for unknown, skipping.
[INFO] Assigned 226 Node2Vec embeddings for tag <cell_type=...>


In [72]:
# <self_reported_ethnicity=...> tokens: assign the self-reported ethnicity ontology embeddings
assign_ontology_embeddings(
    model=local_model,
    tokenizer=local_tokenizer,
    tag="self_reported_ethnicity",
    node2vec_model_path=f"{n2vmodel_path}/hancestro_node2vec.model",
)

[WARN] No Node2Vec embedding found for unknown, skipping.
[INFO] Assigned 14 Node2Vec embeddings for tag <self_reported_ethnicity=...>


In [73]:
# <development_stage=...> tokens: assign the human development stage ontology embeddings
assign_ontology_embeddings(
    model=local_model,
    tokenizer=local_tokenizer,
    tag="development_stage",
    node2vec_model_path=f"{n2vmodel_path}/hsapdv_node2vec.model",
)

[INFO] Assigned 99 Node2Vec embeddings for tag <development_stage=...>


In [74]:
# <disease=...> tokens: assign disease ontology embeddings.
# Only "normal" is handled here -> it is assigned the root embeddings.
assign_ontology_embeddings(
    model=local_model,
    tokenizer=local_tokenizer,
    tag="disease",
    node2vec_model_path=f"{n2vmodel_path}/doid_node2vec.model",
)

[INFO] Assigned 1 Node2Vec embeddings for tag <disease=...>


In [75]:
# <tissue_general=...> tokens: assign the tissue (general) ontology embeddings
assign_ontology_embeddings(
    model=local_model,
    tokenizer=local_tokenizer,
    tag="tissue_general",
    node2vec_model_path=f"{n2vmodel_path}/uberon_node2vec.model",
)

[INFO] Assigned 5 Node2Vec embeddings for tag <tissue_general=...>


In [76]:
# Spot-check one metadata token: its row in the model's embedding matrix
# should equal the Node2Vec vector of the same ontology ID.

token = "<self_reported_ethnicity=HANCESTRO:0014>"
# Strip the angle brackets and split once, so the ID is kept whole even if it
# ever contains an "=" itself.
oid = token[1:-1].split("=", 1)[1] # get OID
token_id = local_tokenizer.convert_tokens_to_ids(token)

print(token)
print(token_id)

n2v_embedding = hancestro_embeddings.wv[oid]

# detach() gives a gradient-free tensor without going through the legacy .data attribute
model_embedding = embedding_layer.weight.detach()[token_id]

# Verify the match programmatically instead of comparing printed values by eye.
n2v_tensor = torch.tensor(
    n2v_embedding, dtype=model_embedding.dtype, device=model_embedding.device
)
assert model_embedding.shape == n2v_tensor.shape and torch.allclose(
    model_embedding, n2v_tensor
), f"Embedding for {token} does not match its Node2Vec vector"

<self_reported_ethnicity=HANCESTRO:0014>
60929


In [77]:
# Display the embedding row read from the model for the checked token
model_embedding

tensor([ 0.0497,  0.0331,  0.1329,  0.1752, -0.4556, -0.2980,  0.2807,  0.0052,
        -0.0163, -0.2098,  0.1667, -0.0566,  0.3336, -0.1207,  0.2712,  0.0131,
         0.0078,  0.4117, -0.0967,  0.3313, -0.1634, -0.0084,  0.1873, -0.2433,
        -0.0813,  0.1544,  0.2501,  0.0731, -0.0667, -0.5940,  0.2701, -0.5355,
        -0.0975, -0.2299,  0.0164, -0.2136, -0.0139, -0.1354, -0.4513, -0.4926,
        -0.4118,  0.2164, -0.2083, -0.1436,  0.5083,  0.1289,  0.2874, -0.4932,
         0.3537,  0.0788,  0.3032, -0.2332,  0.4344,  0.0944, -0.0626,  0.1657,
         0.0728,  0.0890, -0.1415, -0.1978,  0.0555, -0.4609, -0.2341,  0.2670,
        -0.1322,  0.3451,  0.2069, -0.0756, -0.1633,  0.0342,  0.1766, -0.1364,
        -0.1535, -0.6566,  0.4714, -0.0353,  0.1606,  0.1889, -0.1495,  0.2821,
         0.4582, -0.0344,  0.1385,  0.7225, -0.1365, -0.0119,  0.2960, -0.1565,
         0.0286,  0.1415,  0.1369, -0.1873,  0.4340, -0.0682,  0.3822,  0.1592,
        -0.1205, -0.0395, -0.0531,  0.23

In [78]:
# Display the Node2Vec vector looked up for the same ontology ID, for comparison
n2v_embedding

array([ 0.04970636,  0.03314682,  0.13287885,  0.17518593, -0.4556167 ,
       -0.29798904,  0.28066406,  0.00518188, -0.01629672, -0.20980982,
        0.16674723, -0.05661917,  0.3335988 , -0.1206904 ,  0.271185  ,
        0.01309168,  0.00780445,  0.41172418, -0.09669392,  0.33133084,
       -0.16342807, -0.008433  ,  0.1873368 , -0.24328186, -0.08125986,
        0.15443397,  0.25009   ,  0.07314015, -0.06665672, -0.5940084 ,
        0.2701056 , -0.5355188 , -0.09754764, -0.22986086,  0.01638571,
       -0.21357164, -0.01386153, -0.13539767, -0.45130175, -0.4925683 ,
       -0.41183546,  0.2163881 , -0.2082856 , -0.14359395,  0.508321  ,
        0.12891372,  0.28744298, -0.4931968 ,  0.35366276,  0.07875969,
        0.30317426, -0.23324612,  0.4343534 ,  0.09436501, -0.06263129,
        0.16574658,  0.07284207,  0.08897053, -0.14151163, -0.1977543 ,
        0.05552471, -0.4609493 , -0.23409963,  0.26699138, -0.13215336,
        0.3451376 ,  0.20689224, -0.07558941, -0.16332223,  0.03

In [79]:
# Save the model, including the re-initialized metadata token embeddings,
# under "scImmune_metadata_model".
# NOTE(review): only the model is saved in this cell — confirm the tokenizer /
# vocab_with_metadata.json is shipped alongside it elsewhere.
local_model.save_pretrained("scImmune_metadata_model")