In [1]:
import subprocess
import os
import sys
import matplotlib.backends.backend_pdf
import scanpy as sc
import matplotlib.pyplot as pl
import anndata as ad
import pandas as pd
import numpy as np
import seaborn as sns
import scvelo as scv
scv.settings.verbosity=1

from pathlib import Path

# Jupyter stuff
from tqdm.notebook import tqdm
from IPython.display import clear_output
# `display` and `HTML` are both exported by the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject CSS that sets the notebook's `.container` element to 95% of the page width.
display(HTML("<style>.container { width:95% !important; }</style>"))

%matplotlib inline

# Custom functions
# Put the parent directory on the module search path before importing `utils`
# (presumably `utils` lives there — verify against the repository layout).
sys.path.insert(1, '../')
# NOTE(review): wildcard import — the names it brings into the notebook are not visible here.
from utils import *

# scperturb package
# Same pattern for the in-repo package sources under ../package/src/.
sys.path.insert(1, '../package/src/')
# NOTE(review): wildcard import — may shadow names imported above.
from scperturb import *

from pathlib import Path
# Figures directory, relative to the notebook's working directory.
figure_path = Path('../figures/')

In [4]:
# Root folder holding the downloaded raw data. The default is the original
# author's scratch directory; set the PERTURBATION_RESOURCE_TEMPDIR environment
# variable to run the notebook on another machine without editing this cell.
TEMPDIR = Path(os.environ.get('PERTURBATION_RESOURCE_TEMPDIR',
                              '/fast/scratch/users/peidlis_c/perturbation_resource_paper/'))

In [5]:
# Alphabetical listing of the entry names found in the McFalineTrapnell2023 folder under TEMPDIR.
sorted(entry.name for entry in (TEMPDIR / 'McFalineTrapnell2023').glob('*'))

[]

Expression matrices, cell annotations and gene annotations were generated according to the protocol described in our manuscript as well as in Cao et al. 2019 Nature 566(7745) p. 496-502 (three level sci-RNA-seq).

Nuclear barcodes "hash" sequences were extracted from fastq files if the first 10 bp of read 2 match a hash from the appropriate hashSampleSheet.txt (2nd column) within a hamming distance of 2, the last column of the hashSampleSheet.txt file pertains to whether 1 or 2 hashes were used to identify a condition (only single hashes were used, associated metadata to deconvolve hash combinations can be found in the appropriate hashTable_metadata.txt file). 

Reads were then deduplicated by cell barcode and UMI to create a file of hash UMI counts (hashTable.out files) for each nuclear transcriptome in the experiment. Tab-delimited columns of hashTable.out files correspond to a sample id, the cell barcode identified for that hash, a "_" delimited string with treatment metadata, an axis id for each hash (not used in these experiments), and UMI counts for each hash. The first tab-delimited column of the appropriate hashSampleSheet.txt files contains metadata for the experiment divided by underscores. For sciPlexGxE_1, the first "_" delimited string corresponds to the hash plate used, and the second to the well within the hash plate, the third denotes the cell line, the fourth the CRISPR system (CRISPRi for gene knockdown and CRISPRa for overexpression), the fifth to the experiment (MMR or HPRT1 perturbation), the sixth to the dose of the drug and the seventh to the specific drug, and the last "_" string corresponds to the well format of multiwell plates. For sciPlexGxE_2, only the hash oligo was specified. Please refer to the separate hash metadata file (sciPlexGxE_2_hash_metadata.txt). For sciPlex_3, the first "_" delimited string corresponds to the hash plate used, the second to the well within the hash plate, the third denotes the cell line, the fourth to the specific drug, the fifth to the dose of the drug and the sixth and seventh to biological replicate. For sciPlex_4, the first "_" delimited string corresponds to the hash plate used, the second to the dose of drug, the third denotes the specific drug, the fourth the concentration of trametinib used for combinatorial exposure, and the fifth to the biological replicate. Note that the cell line was assigned based on the RT barcode during the first indexing step of combinatorial indexing RNA-seq.

Single-guide RNA (sgRNA) sequences were extracted from fastq files if a sequence contained a TGTGG at position 3-7 of read 2 and the 18 base pairs spanning position 24-42 of read 2 match a sgRNA from the appropriate gRNASampleSheet.txt (2nd column) within a hamming distance of 2; the last column of the gRNASampleSheet.txt file pertains to whether 1 or 2 sgRNAs were expected (see K562_hashTable_metadata.txt). Reads were then deduplicated by cell barcode and UMI to create a file of sgRNA read counts (gRNATable.out files). The first tab-delimited column of the appropriate gRNASampleSheet.txt file contains sgRNA metadata where the first “_” delimited string denotes the targeted gene, non-targeting control (NTC) or random control (radnom), and the second the sgRNA rank from Horlbeck et al. 2016 eLife 5:e19760. The second column refers to the 19 bp protospacer sequence in positions 24-42 of read 2. The full protospacer sequences can be found in the appropriate gRNA_whitelist.txt file.
Assembly: GRCh38
Supplementary files format and content: Expression matrices are output by our pipeline in matrix market exchange format and the cell and gene annotation files that accompany these files are in tsv file format.

In [14]:
def load_adata(prefix):
    """Assemble an AnnData object from the three per-sample files sharing `prefix`.

    The files are read from ``TEMPDIR / 'McFalineTrapnell2023'`` as headerless,
    tab-separated, gzipped tables:

    * ``{prefix}_cell.annotations.txt.gz`` -> columns ``cell_barcode``, ``orig_ident``
    * ``{prefix}_gene.annotations.txt.gz`` -> columns ``gene_id``, ``gene_symbol``
    * ``{prefix}_UMI.count.matrix.gz``     -> columns ``gene``, ``cell``, ``value``

    Parameters
    ----------
    prefix : str
        File-name prefix common to the three files.

    Returns
    -------
    AnnData
        Sparse (CSR) matrix with one row per cell annotation and one column per
        gene annotation; ``obs`` is indexed by ``cell_barcode`` and ``var`` by
        ``gene_symbol``.
    """
    from scipy.sparse import csr_matrix

    cell_annot = pd.read_csv(
        TEMPDIR / f'McFalineTrapnell2023/{prefix}_cell.annotations.txt.gz',
        sep='\t',
        header=None,
        names=['cell_barcode', 'orig_ident'],
        index_col=0,
    )
    gene_annot = pd.read_csv(
        TEMPDIR / f'McFalineTrapnell2023/{prefix}_gene.annotations.txt.gz',
        sep='\t',
        header=None,
        names=['gene_id', 'gene_symbol'],
        index_col=1,
    )
    triplets = pd.read_csv(
        TEMPDIR / f'McFalineTrapnell2023/{prefix}_UMI.count.matrix.gz',
        sep='\t',
        header=None,
        names=['gene', 'cell', 'value'],
    )

    # The cell/gene indices stored in the file are shifted down by one to get
    # 0-based row/column positions.
    row_pos = triplets['cell'] - 1
    col_pos = triplets['gene'] - 1
    counts = csr_matrix(
        (triplets['value'], (row_pos, col_pos)),
        shape=(len(cell_annot), len(gene_annot)),
    )
    return sc.AnnData(counts, cell_annot, gene_annot)

# GSM7056149_sciPlexGxE_2

In [None]:
prefix = 'GSM7056149_sciPlexGxE_2'
# Bind the result: the cells below read `adata`, which would otherwise be
# undefined on a fresh kernel (NameError under Restart & Run All).
adata = load_adata(prefix)
# Keep the object as the last expression so its summary is still displayed.
adata

In [None]:
# Dimensions of the loaded object (rows = cell annotations, columns = gene annotations).
adata.shape

In [None]:
# Peek at the first rows of the per-cell annotation table.
adata.obs.head()

In [None]:
# Hash UMI-count table for this sample, indexed by the cell barcode (second column of the file).
hashTable = pd.read_csv(
    TEMPDIR / f'McFalineTrapnell2023/{prefix}_hashTable.out.txt.gz',
    sep='\t',
    header=None,
    names=['orig_ident', 'cell_barcode', 'plate_id', 'all_ones', 'counts'],
    index_col=1,
)
# Wide layout: one row per cell barcode, one column per plate_id, missing pairs filled with 0.
# NOTE(review): pivot_table aggregates repeated (cell_barcode, plate_id) rows with its
# default aggfunc (mean) — confirm such rows cannot occur, or whether a sum is intended.
df_hash = pd.pivot_table(hashTable, index='cell_barcode', columns='plate_id', values='counts', fill_value=0)
# Column position of the largest count in each row (the first position wins on ties).
most_counts = df_hash.to_numpy().argmax(axis=1)
# Translate the positions back into plate_id labels, one per cell barcode.
best_plates = pd.DataFrame({'best_plate_id': list(df_hash.columns[most_counts])}, index=df_hash.index)

In [None]:
# Attach each cell's top hash as `best_plate_id` via a left join on the cell
# barcode index; cells absent from `best_plates` get NaN.
# A `best_plate_id` column left over from an earlier run of this cell is dropped
# first, so re-running the cell does not create suffixed duplicates
# (`best_plate_id_x` / `best_plate_id_y`).
adata.obs = (
    adata.obs
    .drop(columns=list(best_plates.columns), errors='ignore')
    .join(best_plates, how='left')
)

In [None]:
# Show the per-cell annotation table after the hash assignment was merged in.
adata.obs

In [None]:
# Hash sample sheet for this sample: headerless, tab-separated, indexed by its first column.
# Open question from the original author: is this table needed at all?
hash_sample_sheet = pd.read_csv(
    TEMPDIR / f'McFalineTrapnell2023/{prefix}_hash_sample_sheet.txt.gz',
    sep='\t',
    header=None,
    index_col=0,
)
hash_sample_sheet.head()

In [None]:
# sgRNA sequence table for this sample: tab-separated, first row taken as the header,
# indexed by its third column.
# Open question from the original author: is this table needed at all?
sgRNA_sequences = pd.read_csv(
    TEMPDIR / f'McFalineTrapnell2023/{prefix}_sgRNA_sequences.txt.gz',
    sep='\t',
    index_col=2,
)
sgRNA_sequences.head()