In [1]:
import scanpy as sc
import numpy as np
import os
import subprocess as sp
from scipy.sparse import csr_matrix

%load_ext autoreload
%autoreload 2

In [None]:
# Raw dataset location: file FrangiehIzar2021_RNA.h5ad from Zenodo record 7041849.
data_url = 'https://zenodo.org/records/7041849/files/FrangiehIzar2021_RNA.h5ad?download=1'
data_cache_dir = '../perturbench_data' ## Change this to your local data directory

# exist_ok=True makes the cell idempotent without a separate existence check.
os.makedirs(data_cache_dir, exist_ok=True)

tmp_data_dir = f'{data_cache_dir}/frangieh21_downloaded.h5ad'

if not os.path.exists(tmp_data_dir):
    # Download to a side file first: `wget -O` creates its target even when the
    # transfer fails, and a truncated file at the final path would be mistaken
    # for a valid cache by the existence check above on the next run.
    partial_path = f'{tmp_data_dir}.part'
    try:
        # Argument list (no shell): the '?' in the URL and any spaces in the
        # path are passed through verbatim. check=True raises
        # CalledProcessError if wget exits with a non-zero status.
        sp.run(['wget', data_url, '-O', partial_path], check=True)
        os.replace(partial_path, tmp_data_dir)
    finally:
        # After a successful os.replace the partial file no longer exists;
        # this only removes leftovers from a failed or interrupted download.
        if os.path.exists(partial_path):
            os.remove(partial_path)

In [3]:
# Read the downloaded .h5ad file into an AnnData object.
adata = sc.read_h5ad(tmp_data_dir)
# Bare expression so the notebook displays the AnnData summary.
adata

AnnData object with n_obs × n_vars = 218331 × 23712
    obs: 'library_preparation_protocol', 'perturbation_2', 'MOI', 'sgRNA', 'UMI_count', 'guide_id', 'umap_x', 'umap_y', 'perturbation', 'tissue_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation_type_2', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [4]:
# Tally the cells per value of the source `celltype` annotation.
adata.obs['celltype'].value_counts()

celltype
melanocytes    218331
Name: count, dtype: int64

In [5]:
# The source `celltype` column holds a single value ('melanocytes') for every
# cell, so the new `cell_type` column is filled with one constant label.
adata.obs['cell_type'] = 'melanocyte'

In [6]:
# Tally the cells per value of `perturbation_2` (used below to build `treatment`).
adata.obs['perturbation_2'].value_counts()

perturbation_2
IFNγ          87590
Co-culture    73114
Control       57627
Name: count, dtype: int64

In [7]:
# Relabel selected `perturbation_2` values; any value without an entry in the
# map (e.g. 'IFNγ') is carried over unchanged.
treatment_map = {
    'Co-culture': 'co-culture',
    'Control': 'none',
}

adata.obs['treatment'] = [
    treatment_map.get(label, label) for label in adata.obs['perturbation_2']
]
adata.obs['treatment'].value_counts()

treatment
IFNγ          87590
co-culture    73114
none          57627
Name: count, dtype: int64

In [8]:
# Tally the cells per value of the `perturbation` column.
adata.obs['perturbation'].value_counts()

perturbation
control    57605
ACTA2       1458
B2M         1379
A2M         1331
AEBP1       1302
           ...  
SNRPF         37
RACK1         36
PSMA7         28
UBL5          18
TUBB          12
Name: count, Length: 249, dtype: int64

In [9]:
# `condition` is an independent copy of the `perturbation` column.
adata.obs['condition'] = adata.obs['perturbation'].copy()
# NOTE(review): `perturbation_type` already exists in the loaded obs table, so
# this overwrites the source values with a constant — confirm 'CRISPRi' is the
# intended label for this dataset.
adata.obs['perturbation_type'] = 'CRISPRi'
# Constant dataset identifier for every cell.
adata.obs['dataset'] = 'frangieh21'

In [10]:
# obs columns that must exist; each one is also checked for missing values.
required_cols = [
    'condition',
    'cell_type',
    'treatment',
    'perturbation_type',
    'dataset',
    'ngenes',
    'ncounts',
]

# Report every absent column at once instead of failing with a bare
# AssertionError on the first one.
missing_cols = [col for col in required_cols if col not in adata.obs.columns]
assert not missing_cols, f'Missing required obs columns: {missing_cols}'

# Series.isnull() is an alias of Series.isna(), so a single check per column
# is enough; checking both printed each offending column twice.
for col in required_cols:
    if adata.obs[col].isna().any():
        print(col)

In [11]:
# Preview the first rows of the gene-level (var) annotation table.
adata.var.head()

Unnamed: 0_level_0,ensembl_id,ncounts,ncells
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,ENSG00000121410,41265.0,34705
A1BG-AS1,ENSG00000268895,13133.0,12463
A1CF,ENSG00000148584,114.0,114
A2M,ENSG00000175899,219.0,193
A2M-AS1,ENSG00000245105,62.0,62


In [12]:
# Replace the expression matrix with a SciPy CSR sparse matrix built from it.
adata.X = csr_matrix(adata.X)

In [13]:
# Save the annotated AnnData object next to the downloaded file.
processed_path = f'{data_cache_dir}/frangieh21_processed.h5ad'
adata.write_h5ad(processed_path)