# Immune Data Embeddings

In [1]:
from config import ScImmuneConfig
from model import ScImmuneModel
from tokenizer import ScImmuneTokenizer # refactored version

import torch
import os
import shutil
from utils import generate_metadata_embeddings, generate_metadata_tokens, assign_ontology_embeddings
from gensim.models import Word2Vec
import anndata as ad
import pandas as pd
import scanpy as sc
import json

  from .autonotebook import tqdm as notebook_tqdm


ImportError: /home/s5srinivasan/immune-foundational-model/.venv/lib64/python3.9/site-packages/flash_attn_2_cuda.cpython-39-x86_64-linux-gnu.so: undefined symbol: _ZN3c104cuda9SetDeviceEa

## Modify tokens and embeddings

In [None]:
# Stage the config and the original checkpoint under scimmune-model/ using the
# file names that from_pretrained("scimmune-model") looks for.
# shutil.copy does not create missing parent directories, so make sure the
# target folder exists first (no-op when it is already there).
os.makedirs("scimmune-model", exist_ok=True)
shutil.copy("config.json", "scimmune-model/config.json")
shutil.copy("og_model.bin", "scimmune-model/pytorch_model.bin")

In [None]:
# Load the configuration stored in the local scimmune-model/ directory.
local_config = ScImmuneConfig.from_pretrained("scimmune-model")

# Load the architecture AND the checkpoint staged at
# scimmune-model/pytorch_model.bin. Calling ScImmuneModel(local_config)
# directly would only build the network with freshly initialised weights and
# never read the checkpoint file.
# NOTE(review): assumes ScImmuneModel is a Hugging Face PreTrainedModel
# subclass (it is used with resize_token_embeddings / save_pretrained below)
# -- confirm in model.py.
local_model = ScImmuneModel.from_pretrained("scimmune-model", config=local_config)

# Build the tokenizer from the vocabulary file that already contains the
# metadata tokens.
local_tokenizer = ScImmuneTokenizer(vocab_file="vocab_with_metadata.json")

In [None]:
# Resize the model's token embeddings to the tokenizer's vocabulary size.
# Expected size per the original note: 60698 + 350 new metadata tokens = 61048.
new_vocab_len = len(local_tokenizer)
local_model.resize_token_embeddings(new_vocab_len)

In [None]:
# Evaluate the model object as the cell's last expression so the notebook
# shows its repr.
local_model # inspect full model

## Re-initialize metadata tokens with Node2Vec vectors

In [None]:
# Keep a handle on the model's input embedding layer; the spot check further
# down reads rows of its weight matrix through this variable.
embedding_layer = local_model.get_input_embeddings()
embedding_layer

In [None]:
# Folder that holds the per-ontology Node2Vec models
n2vmodel_path = "../utils/obo_models"


def _load_node2vec(ontology):
    """Load the saved `<ontology>_node2vec.model` file from `n2vmodel_path`."""
    return Word2Vec.load(f"{n2vmodel_path}/{ontology}_node2vec.model")


# One model per ontology prefix, loaded in the same order as before
doid_embeddings = _load_node2vec("doid")
cl_embeddings = _load_node2vec("cl")
hancestro_embeddings = _load_node2vec("hancestro")
hsapdv_embeddings = _load_node2vec("hsapdv")
uberon_embeddings = _load_node2vec("uberon")

In [None]:
# Read the vocabulary (including metadata tokens) into a plain dict for lookup.
# The encoding is given explicitly so the result does not depend on the
# platform's locale default; JSON files are UTF-8.
with open("vocab_with_metadata.json", "r", encoding="utf-8") as f:
    vocab_with_metadata_dict = json.load(f)

In [None]:
# Assign cell type ontology embeddings from the `cl` Node2Vec model.
cl_model_file = f"{n2vmodel_path}/cl_node2vec.model"
assign_ontology_embeddings(
    tokenizer=local_tokenizer,
    model=local_model,
    node2vec_model_path=cl_model_file,
    tag="cell_type",
)

In [None]:
# Assign self-reported ethnicity ontology embeddings from the `hancestro`
# Node2Vec model.
hancestro_model_file = f"{n2vmodel_path}/hancestro_node2vec.model"
assign_ontology_embeddings(
    tokenizer=local_tokenizer,
    model=local_model,
    node2vec_model_path=hancestro_model_file,
    tag="self_reported_ethnicity",
)

In [None]:
# Assign human development stage ontology embeddings from the `hsapdv`
# Node2Vec model.
hsapdv_model_file = f"{n2vmodel_path}/hsapdv_node2vec.model"
assign_ontology_embeddings(
    tokenizer=local_tokenizer,
    model=local_model,
    node2vec_model_path=hsapdv_model_file,
    tag="development_stage",
)

In [None]:
# Assign disease ontology embeddings from the `doid` Node2Vec model.
# NOTE(review): the original note reads: only for "normal" -> assign root
# embeddings. Confirm how assign_ontology_embeddings handles that case.
doid_model_file = f"{n2vmodel_path}/doid_node2vec.model"
assign_ontology_embeddings(
    tokenizer=local_tokenizer,
    model=local_model,
    node2vec_model_path=doid_model_file,
    tag="disease",
)

In [None]:
# Assign tissue (general) ontology embeddings from the `uberon` Node2Vec model.
uberon_model_file = f"{n2vmodel_path}/uberon_node2vec.model"
assign_ontology_embeddings(
    tokenizer=local_tokenizer,
    model=local_model,
    node2vec_model_path=uberon_model_file,
    tag="tissue_general",
)

In [None]:
# Spot-check a single metadata token: fetch its Node2Vec vector and the row
# the model currently holds for it, so the two can be inspected side by side.
token = "<self_reported_ethnicity=HANCESTRO:0014>"

# Drop the surrounding "<" / ">" and keep what follows "=" (the ontology ID).
oid = token[1:-1].partition("=")[2]
token_id = local_tokenizer.convert_tokens_to_ids(token)

print(token, token_id, sep="\n")

# Vector stored in the hancestro Node2Vec model for this ontology ID
n2v_embedding = hancestro_embeddings.wv[oid]

# Row of the model's input embedding matrix at the token's index
model_embedding = embedding_layer.weight.data[token_id]

In [None]:
# Show the embedding row the model holds for the spot-checked token.
model_embedding

In [None]:
# Show the Node2Vec vector for the same ontology ID, for visual comparison
# with the model's row.
n2v_embedding

In [None]:
# Save the modified model under the scImmune_metadata_model directory.
local_model.save_pretrained("scImmune_metadata_model")