Generate gene embeddings with scGPT, adapted from the scGPT GRN tutorial: https://github.com/bowang-lab/scGPT/blob/main/tutorials/Tutorial_GRN.ipynb

In [None]:
import copy
import json
import os
from pathlib import Path
import sys
import warnings

import torch
# from anndata import AnnData
# import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pandas as pd
import tqdm

# from torchtext.vocab import Vocab
# from torchtext._torchtext import (
#     Vocab as VocabPybind,
# )

sys.path.insert(0, "../")
import scgpt as scg
from scgpt.tasks import GeneEmbedding
from scgpt.tokenizer.gene_tokenizer import GeneVocab
from scgpt.model import TransformerModel
from scgpt.preprocess import Preprocessor
from scgpt.utils import set_seed 

# Ignore every Python warning raised from here on (keeps notebook output clean).
warnings.filterwarnings(action='ignore')
# Presumably silences the Intel OpenMP (KMP) runtime's own warnings -- confirm.
os.environ["KMP_WARNINGS"] = "off"

In [None]:
# Fixed seed passed to scGPT's set_seed helper (presumably for reproducibility).
set_seed(42)

# Special tokens that must exist in the vocabulary; pad_token is the first one.
pad_token = "<pad>"
special_tokens = [pad_token, "<cls>", "<eoc>"]

# n_hvg = 1200
# Number of expression bins; the model is built with the same count of input bins.
n_bins = 51
n_input_bins = n_bins

# Sentinel values for masked and padded positions.
mask_value, pad_value = -1, -2

In [None]:
# Location of the pre-trained scGPT checkpoint downloaded from the scGPT repository.
model_dir = Path("/scratch/ssd004/scratch/znavidi/cell_painting/scgpt/scGPT_human")
model_config_file, model_file, vocab_file = (
    model_dir / name for name in ("args.json", "best_model.pt", "vocab.json")
)

# Load the gene vocabulary and append any special token it does not contain yet.
vocab = GeneVocab.from_file(vocab_file)
for s in special_tokens:
    if s in vocab:
        continue
    vocab.append_token(s)

# Read the hyper-parameters that were saved next to the checkpoint.
with open(model_config_file, "r") as f:
    model_configs = json.loads(f.read())
print(
    f"Resume model from {model_file}, the model args will override the "
    f"config {model_config_file}."
)
# Note: the config key is "nheads" while the local variable is `nhead`.
embsize, nhead, d_hid, nlayers, n_layers_cls = (
    model_configs[key]
    for key in ("embsize", "nheads", "d_hid", "nlayers", "n_layers_cls")
)

# Token -> index mapping used later to look up the gene embeddings.
gene2idx = vocab.get_stoi()

In [None]:
# Build the scGPT transformer from the saved hyper-parameters and load the
# pre-trained weights into it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

ntokens = len(vocab)  # size of vocabulary
model = TransformerModel(
    ntokens,
    embsize,
    nhead,
    d_hid,
    nlayers,
    vocab=vocab,
    pad_value=pad_value,
    n_input_bins=n_input_bins,
)

# Read the checkpoint from disk once, onto the CPU; the model is moved to
# `device` only after the weights are in place.
pretrained_dict = torch.load(model_file, map_location=torch.device('cpu'))
try:
    model.load_state_dict(pretrained_dict)
    print(f"Loading all model params from {model_file}")
except RuntimeError:
    # load_state_dict raises RuntimeError when keys or tensor sizes do not
    # match; fall back to loading only the params that exist in the model
    # and have the same shape.
    model_dict = model.state_dict()
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items()
        if k in model_dict and v.shape == model_dict[k].shape
    }
    for k, v in pretrained_dict.items():
        print(f"Loading params {k} with shape {v.shape}")
    # Merge and load once, after the report loop, instead of once per param.
    model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)

model.to(device)

In [None]:
# Retrieve the data-independent gene embeddings from scGPT.
# gene_ids follows the iteration order of gene2idx, so row i of the result
# belongs to the i-th token of gene2idx.
gene_ids = np.array(list(gene2idx.values()))
# Inference only: skip autograd bookkeeping for the whole-vocabulary lookup,
# which also makes an explicit .detach() unnecessary.
with torch.no_grad():
    gene_embeddings = model.encoder(torch.tensor(gene_ids, dtype=torch.long).to(device))
gene_embeddings = gene_embeddings.cpu().numpy()

In [None]:
# Filter on the intersection between the rohban et al genes and scGPT's 30+K foundation model vocab
# NOTE(review): metadata_df is not defined in the visible part of this notebook;
# it must already exist with a 'gene_name' column before this cell runs.
# Build the lookup set once instead of recomputing .unique() for every vocab token.
rohban_genes = set(metadata_df['gene_name'].unique())
gene_embeddings_rohban = {
    gene: gene_embeddings[i]
    for i, gene in enumerate(gene2idx)
    if gene in rohban_genes
}
print(f'Retrieved gene embeddings for {len(gene_embeddings_rohban)} genes.')
print(type(gene_embeddings_rohban))

In [None]:
# Genes used in experiment 01.
gene_list = ['RAC1', 'KRAS', 'CDC42', 'RHOA', 'PAK1']

# Print every gene of gene_list that has no scGPT embedding; nothing is
# printed when all of them are covered.
for gene in gene_list:
    if gene not in gene_embeddings_rohban:
        print(gene)

In [None]:
import numpy as np
import csv

def write_dict_to_csv(input_dict, filename):
    """Write a mapping of name -> sequence of values as header-less CSV rows.

    Each entry becomes one line: the lower-cased key followed by the
    ``str()`` of every value, separated by commas and terminated by ``\\n``.

    Args:
        input_dict: Mapping from a string key (e.g. a gene name) to an
            iterable of values (e.g. an embedding vector).
        filename: Path of the CSV file to create or overwrite. The parent
            directory must already exist.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        # Use the values of the mapping that was passed in, not a global dict,
        # so the function works for any input.
        file.writelines(
            ','.join([key.lower(), *map(str, values)]) + '\n'
            for key, values in input_dict.items()
        )

# Persist the filtered embeddings, one lower-cased gene per row.
# The 'required_file' directory must already exist; open() does not create it.
write_dict_to_csv(
    input_dict=gene_embeddings_rohban,
    filename='required_file/perturbation_embedding_rohban_check.csv',
)

In [None]:
# Read the embeddings back as a sanity check; header=None because the file
# was written without a header row.
pert_to_embedding = pd.read_csv(
    'required_file/perturbation_embedding_rohban_check.csv',
    header=None,
)

In [None]:
# Name the first column 'gene_name' and the remaining ones '0', '1', ...
# The count is taken from the frame itself rather than hard-coding 512, so
# checkpoints with a different embedding size also work.
pert_to_embedding.columns = ['gene_name'] + [
    str(i) for i in range(pert_to_embedding.shape[1] - 1)
]
pert_to_embedding