In [1]:
!pip install mygene statannotations scrublet scanpy scvelo decoupler goatools gseapy scperturb chembl_webresource_client biomart PyComplexHeatmap statsmodels omnipath git+https://github.com/saezlab/pypath.git --quiet

In [2]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
# `display` must come from IPython.display: importing it from IPython.core.display
# has been deprecated since IPython 7.14 and fails on recent IPython releases.
from IPython.display import clear_output, display, HTML
# Inject CSS that sets elements with class `container` to 95% width.
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
# Put the directory two levels up on the module search path so the `utils` import below can resolve.
sys.path.insert(1, '../../')
# NOTE(review): star import; the names it brings in are not visible from this notebook.
from utils import *

# scperturb package
# Position 1 on sys.path is searched early; presumably this makes the in-repo copy under
# ../../package/src/ win over a pip-installed `scperturb` (TODO confirm).
sys.path.insert(1, '../../package/src/')
# NOTE(review): second star import; later names may shadow those imported from `utils`.
from scperturb import *

# NOTE(review): `Path` is already imported earlier in this cell; this re-import is redundant but harmless.
from pathlib import Path
# Figures directory, relative to the notebook's working directory.
figure_path = Path('../../figures/')

In [3]:
# Base directories used by the cells below.
# Each can be overridden with an environment variable so the notebook runs on other
# machines; the defaults are the original hard-coded locations.
TEMPDIR = Path(os.environ.get('SCPERTURB_TEMPDIR', '/scratch/peidli/scPerturb/'))
DATADIR = Path(os.environ.get('SCPERTURB_DATADIR', '/home/peidli/data/scPerturb/'))

In [4]:
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [19]:
# Read the count matrix plus its barcode (obs) and feature (var) tables.
dataset_dir = TEMPDIR / 'SunshineHein2023'
X = mmread(dataset_dir / 'matrix.mtx')
obs = pd.read_csv(dataset_dir / 'barcodes.tsv.gz', index_col=0, sep='\t', names=['cell_barcode'])
var = pd.read_csv(dataset_dir / 'features.tsv.gz', index_col=1, sep='\t', names=['ensembl_id', 'gene_symbol', 'feature_type'])
ids = pd.read_csv(dataset_dir / 'cell_identities.csv', index_col=0)

# Attach the per-cell identities to the barcodes with a LEFT join on the barcode index.
# An outer concat could append (or, on older pandas, re-sort) rows, so the metadata would
# no longer line up one-to-one with the rows of the matrix. Fail loudly instead.
unknown_barcodes = ids.index.difference(obs.index)
if len(unknown_barcodes) > 0:
    raise ValueError(
        f"{len(unknown_barcodes)} barcodes in cell_identities.csv are missing from barcodes.tsv.gz, "
        f"e.g. {list(unknown_barcodes[:3])}"
    )
cell_metadata = obs.join(ids, how='left')

# The matrix is transposed so its rows pair with `obs` (barcodes) and its columns with `var` (features).
adata = sc.AnnData(X=csr_matrix(X).T, obs=cell_metadata, var=var)
adata.var = adata.var.drop(columns='feature_type')  # trivial
adata.var_names_make_unique()

# move non-gene features to obsm
# Features whose name starts with 'SCV_' or 'lenti_' are copied into dense DataFrames in
# `adata.obsm` and then removed from the variable axis.
# `.toarray()` is used instead of the `.A` shorthand, which recent SciPy releases removed
# from sparse matrices.
group1 = adata.var.index.str.startswith('SCV_')
non_genes = list(adata.var_names[group1])
adata.obsm['SCOV_expression'] = pd.DataFrame(adata[:, non_genes].X.toarray(), index=adata.obs_names, columns=non_genes)
adata = adata[:, ~group1].copy()

# The mask is recomputed here because the variable axis changed in the step above.
group2 = adata.var.index.str.startswith('lenti_')
non_genes = list(adata.var_names[group2])
adata.obsm['lentivirus_capture'] = pd.DataFrame(adata[:, non_genes].X.toarray(), index=adata.obs_names, columns=non_genes)
adata = adata[:, ~group2].copy()  # reuse the mask computed just above

# harmonize metadata
# Build one perturbation label per cell from its ';'-separated guide identities:
# keep the first '_'-delimited token of every guide (presumably the target name, TODO confirm),
# de-duplicate and sort the tokens, join them with '_', rename 'non-targeting' to 'control',
# and strip 'control' when it is joined to other tokens. Non-string entries map to None.
perturbation_labels = []
for guides in adata.obs.guide_identity:
    if type(guides) != str:
        perturbation_labels.append(None)
        continue
    targets = np.unique([guide.split('_')[0] for guide in guides.split(';')])
    label = '_'.join(targets).replace('non-targeting', 'control')
    label = label.replace('control_', '').replace('_control', '')
    perturbation_labels.append(label)
adata.obs['perturbation'] = perturbation_labels

# Rename the guide columns.
adata.obs = adata.obs.rename(columns={'guide_identity': 'guide_id', 'number_of_guides': 'nperts'})

# Dataset-wide annotations: the same value is written for every cell.
constant_annotations = {
    'perturbation_type': 'CRISPR-cas9',
    'disease': "lung adenocarcinoma and SARS-CoV-2",
    'cancer': True,
    'tissue_type': "cell_line",
    "cell_line": "Calu-3",
    "celltype": 'lung epithelial cells',
    'organism': 'human',
}
for column, value in constant_annotations.items():
    adata.obs[column] = value

# Helpers not defined in this notebook (presumably provided by the star imports above):
# add QC annotations, then check the annotations.
annotate_qc(adata)
assert_annotations(adata)