# Task 2 (ISP): Apply Perturbations with Geneformer InSilicoPerturber

This optional variant uses Geneformer's native InSilicoPerturber (ISP) to quantify cosine-similarity shifts in CLS embeddings.



In [None]:
# Setup paths
import os
import sys
from pathlib import Path

# The repository root is taken to be the parent of the current working
# directory; it goes to the front of sys.path unless it is already listed.
repo_root = Path.cwd().parent
_root_entry = str(repo_root)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)

# Data locations, all derived from the repository root.
DATA_DIR = repo_root.joinpath('als-perturb-geneformer', 'als-perturb-geneformer', 'data')
AD_DIR = DATA_DIR.joinpath('adata')
ISP_DIR = DATA_DIR.joinpath('isp')

# Only the ISP output directory is created here (with any missing parents);
# the AnnData directory is merely referenced.
ISP_DIR.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for _label, _location in (
    ('Repo root:', repo_root),
    ('AnnData dir:', AD_DIR),
    ('ISP out dir:', ISP_DIR),
):
    print(_label, _location)



In [None]:
# Imports and model/tokenizer mapping
import numpy as np
import anndata as ad
import pickle
from geneformer import TranscriptomeTokenizer, InSilicoPerturber, ENSEMBL_DICTIONARY_FILE

# Map ALS symbols to Ensembl IDs
ALS_GENES = ["C9orf72","SOD1","TARDBP","FUS","TBK1","NEK1"]
# ENSEMBL_DICTIONARY_FILE is a pickle exposed by the geneformer package
# (imported above); presumably it maps gene name -> Ensembl ID — TODO confirm.
with open(ENSEMBL_DICTIONARY_FILE, 'rb') as f:
    name_to_id = pickle.load(f)
# Upper-case every key so the lookup below is case-insensitive
# (e.g. "C9orf72" is looked up as "C9ORF72").
name_to_id = {str(k).upper(): v for k, v in name_to_id.items()}
# Best-effort mapping: a symbol with no (truthy) entry is passed through as-is.
ALS_ENSEMBL = [name_to_id.get(g.upper()) or g for g in ALS_GENES]
# Report pass-through symbols explicitly instead of leaving the reader to spot
# a non-Ensembl entry in the printed list; downstream cells hand ALS_ENSEMBL
# to InSilicoPerturber as genes_to_perturb.
ALS_UNMAPPED = [g for g in ALS_GENES if not name_to_id.get(g.upper())]
if ALS_UNMAPPED:
    print('WARNING: no Ensembl ID found for:', ALS_UNMAPPED, '- symbol kept unchanged')
print('ALS genes (Ensembl):', ALS_ENSEMBL)



In [None]:
# Tokenize baseline.h5ad to a Geneformer Dataset with metadata 'Condition'
from pathlib import Path

# The baseline AnnData file must already exist; nothing below can run without it.
baseline_path = AD_DIR / 'baseline.h5ad'
if not baseline_path.exists():
    raise FileNotFoundError('Missing baseline.h5ad in data/adata')

# Tokenized output lives next to the other ISP artefacts. Tokenization is
# skipped when the directory is already present, so delete it to force a
# fresh run. Cell sub-sampling is left to ISP (max_ncells) rather than done here.
TOKENIZED_DIR = ISP_DIR / 'tokenized.dataset'
if not TOKENIZED_DIR.exists():
    # Carry the 'Condition' attribute through under the same name so that
    # ISP can filter on it later.
    tk = TranscriptomeTokenizer(
        custom_attr_name_dict={"Condition": "Condition"},
        model_version='V2',
    )
    # Tokenize the single h5ad file directly, then build and persist the dataset.
    tokenized_cells, cell_metadata, tokenized_counts = tk.tokenize_anndata(
        baseline_path, file_format='h5ad'
    )
    ds = tk.create_dataset(
        tokenized_cells, cell_metadata, tokenized_counts, use_generator=False
    )
    ds.save_to_disk(str(TOKENIZED_DIR))
    print('Saved tokenized dataset to:', TOKENIZED_DIR)
else:
    print('Tokenized dataset exists at:', TOKENIZED_DIR)



In [None]:
# Run ISP: ALS_down (simulate knock-down via delete)
# NOTE: a misspelt duplicate import ("InSilicoPerturper") used to precede this
# line and aborted the cell with ImportError; only the correct name is imported.
from geneformer import InSilicoPerturber

# Results for this run go into their own sub-directory of the ISP output dir.
isp_out_als = ISP_DIR / 'als_down'
isp_out_als.mkdir(parents=True, exist_ok=True)

# Perturbation setup:
#   - perturb_type='delete' with the ALS gene list as the genes to perturb
#   - emb_mode='cls' on the 'Pretrained' V2 model
#   - only cells whose 'Condition' metadata equals "ALS" are used
#     (presumably the disease group — verify against the dataset labels)
#   - at most 2000 cells, processed with 8 worker processes
isp = InSilicoPerturber(
    perturb_type='delete',
    genes_to_perturb=ALS_ENSEMBL,
    model_type='Pretrained',
    emb_mode='cls',
    filter_data={"Condition":["ALS"]},
    max_ncells=2000,
    model_version='V2',
    nproc=8,
)

# Run against the tokenized dataset written by the tokenization cell;
# output files are prefixed with 'als_down'.
isp.perturb_data(
    model_directory='ctheodoris/Geneformer',
    input_data_file=str(TOKENIZED_DIR),
    output_directory=str(isp_out_als),
    output_prefix='als_down',
)
print('ALS_down ISP complete →', isp_out_als)



In [None]:
# Run ISP: Healthy_up (simulate knock-up via overexpress)
# Results for this run go into their own sub-directory of the ISP output dir.
isp_out_h = ISP_DIR.joinpath('healthy_up')
isp_out_h.mkdir(exist_ok=True)

# Perturber settings: overexpress the ALS gene list, using CLS embeddings of
# the 'Pretrained' V2 model, restricted to cells whose 'Condition' metadata
# equals "PN" (presumably the healthy/control label — verify against the
# dataset), capped at 2000 cells with 8 worker processes.
_healthy_up_settings = {
    'perturb_type': 'overexpress',
    'genes_to_perturb': ALS_ENSEMBL,
    'model_type': 'Pretrained',
    'emb_mode': 'cls',
    'filter_data': {"Condition": ["PN"]},
    'max_ncells': 2000,
    'model_version': 'V2',
    'nproc': 8,
}
isp_h = InSilicoPerturber(**_healthy_up_settings)

# Inputs/outputs of the run: model identifier, the tokenized dataset on disk,
# and where (and under which file prefix) the results are written.
_healthy_up_io = {
    'model_directory': 'ctheodoris/Geneformer',
    'input_data_file': str(TOKENIZED_DIR),
    'output_directory': str(isp_out_h),
    'output_prefix': 'healthy_up',
}
isp_h.perturb_data(**_healthy_up_io)
print('Healthy_up ISP complete →', isp_out_h)

