In [7]:
!pip install chembl_webresource_client scanpy scperturb --quiet

In [8]:
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as pl
import seaborn as sns
import os
import sys
from tqdm.auto import tqdm
from pathlib import Path

In [10]:
# Read the shared YAML configuration and derive the data directories from it.
import yaml
with open('../../configuration/config.yaml') as file:  # text read mode is the default
    config = yaml.safe_load(file)
DOWNDIR, TEMPDIR = (Path(config[key]) for key in ('DOWNDIR', 'TEMPDIR'))
# Make the directory two levels up importable; the utils module is resolved from there.
sys.path.insert(1, '../../')
from utils import *

In [11]:
# Alphabetical listing of everything inside the XuCao2023 folder of TEMPDIR.
sorted(entry.name for entry in (TEMPDIR / 'XuCao2023').glob('*'))

['GSM6752591_on_target_cell_metadata.csv.gz',
 'GSM6752591_on_target_nascent_tx.Barcodes.tsv.gz',
 'GSM6752591_on_target_nascent_tx.Genes.tsv.gz',
 'GSM6752591_on_target_nascent_tx_count_matrix.mtx.gz',
 'GSM6752591_on_target_sgRNA.Barcodes.tsv.gz',
 'GSM6752591_on_target_sgRNA.Genes.tsv.gz',
 'GSM6752591_on_target_sgRNA_count_matrix.mtx.gz',
 'GSM6752591_on_target_whole_tx.Barcodes.tsv.gz',
 'GSM6752591_on_target_whole_tx.Genes.tsv.gz',
 'GSM6752591_on_target_whole_tx_count_matrix.mtx.gz',
 'GSM6752591_whole_txome_sgRNA_sample_name_barcode_table.csv.gz']

In [29]:
from scipy.io import mmread
from scipy.sparse import csr_matrix

In [51]:
suffix = 'whole_tx'
X = csr_matrix(mmread(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}_count_matrix.mtx.gz'))
obs = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Barcodes.tsv.gz', index_col=0, names=['cell_barcode'])
var = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Genes.tsv.gz', index_col=0, names=['gene_symbol'])
adata = sc.AnnData(X.T, obs, var)

In [46]:
suffix = 'nascent_tx'
X = csr_matrix(mmread(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}_count_matrix.mtx.gz'))
obs = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Barcodes.tsv.gz', index_col=0, names=['cell_barcode'])
var = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Genes.tsv.gz', index_col=0, names=['gene_symbol'])
ndata = sc.AnnData(X.T, obs, var)

In [47]:
suffix = 'sgRNA'
X = csr_matrix(mmread(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}_count_matrix.mtx.gz'))
obs = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Barcodes.tsv.gz', index_col=0, names=['cell_barcode'])
var = pd.read_csv(TEMPDIR / 'XuCao2023' / f'GSM6752591_on_target_{suffix}.Genes.tsv.gz', index_col=0, names=['gene_symbol'])
sdata = sc.AnnData(X.T, obs, var)

In [93]:
# Sanity check before merging: the nascent data must share gene and cell order
# with the whole-transcriptome data, and the sgRNA data must share its cell order.
assert (adata.var_names == ndata.var_names).all()
assert (adata.obs_names == ndata.obs_names).all()
assert (adata.obs_names == sdata.obs_names).all()

In [96]:
# Store the nascent counts as an extra layer next to the whole-transcriptome X.
adata.layers['nascent_counts'] = ndata.X.copy()
# Keep the per-cell sgRNA counts as a sparse-typed DataFrame in obsm.
# `.toarray()` is used instead of the `.A` shorthand, a deprecated alias that recent SciPy releases no longer provide.
adata.obsm['sgRNA_counts'] = pd.DataFrame(sdata.X.toarray(), index=sdata.obs_names, columns=sdata.var_names, dtype=pd.SparseDtype("int", 0))
# Replace the barcode-only obs with the full per-cell metadata table, after checking
# that it lists the cells in the same order.
full_obs = pd.read_csv(TEMPDIR / 'XuCao2023' / 'GSM6752591_on_target_cell_metadata.csv.gz', index_col=0)
assert all(adata.obs_names==full_obs.index)
adata.obs = full_obs.copy()

In [97]:
# harmonize metadata
# Rename the dataset's own column names; the original 'target' column becomes 'guide_id'
# so that 'target' can be re-created from the perturbation label below.
adata.obs = adata.obs.rename(columns={'UMI_counts': 'ncounts', 'target_genes': 'perturbation', 'target': 'guide_id'})
# Explicit assignment instead of a chained `inplace=True` call on the column:
# the chained form does not reliably write back to the frame (it has no effect under pandas Copy-on-Write).
adata.obs['perturbation'] = adata.obs['perturbation'].replace('NO-TARGET', 'control')
cols = ['perturbation', 'ncounts', 'nascent_UMI_counts', 'nascent_ratio', 'guide_id', 'gRNA_UMI_counts', 'nascent_MT_ratio', 'Cell_cycle_phase', 'whole_exon_ratio', 'new_exon_ratio']
# Copy the column subset so the assignments below write to an independent frame, not a slice view.
adata.obs = adata.obs[cols].copy()
adata.obs['target'] = adata.obs['perturbation'].copy()

# Constant, dataset-level annotations.
adata.obs['perturbation_type'] = 'CRISPRi'
# NOTE(review): disease is "healthy" while cancer is True — these look inconsistent; confirm the intended values.
adata.obs['disease'] = "healthy"
adata.obs['cancer'] = True
adata.obs['tissue_type']="cell_line"
adata.obs["cell_line"] = "HEK293"
adata.obs["celltype"] = 'embryonic kidney cells'
adata.obs['organism'] = 'human'
# Number of perturbations per cell: '_'-separated entries in the label minus occurrences of
# 'control' (so a plain 'control' yields 0); non-string labels count as 0.
adata.obs['nperts'] = [p.count('_')+1-p.count('control') if isinstance(p, str) else 0 for p in adata.obs.perturbation]
# Project helper, presumably provided by the utils star import; its exact effect is not visible here.
annotate_qc(adata, species='human')
adata.obs.index.name = 'cell_barcode'

In [98]:
# Final check of the harmonized object with the project helper.
# NOTE(review): its implementation is not visible here — presumably it fails when expected annotations are missing; confirm in utils.
assert_annotations(adata)