# Run ChromVAR with ```scprinter```

## 0. Imports

In [1]:
%load_ext autoreload
%autoreload 2
import scprinter as scp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import time
import pandas as pd
import numpy as np
import os
import pickle
import torch
import matplotlib as mpl
# Use Type 42 (TrueType) fonts in PDF output instead of matplotlib's default Type 3,
# so that text in saved figures stays editable in vector-graphics tools.
mpl.rcParams['pdf.fonttype'] = 42
from scanpy.plotting.palettes import zeileis_28
from tqdm.contrib.concurrent import *
from tqdm.auto import *
import anndata
import scanpy as sc
import statistics as stat
import json
import csv
import re
import copy
from sklearn.preprocessing import OneHotEncoder

In [2]:
import snapatac2 as snap

In [3]:
# Show the installed scprinter version so the run can be reproduced later
scp.__version__

'1.0.0a'

### 0.1 Setup

In [4]:
# Specify the reference genome. This must match the genome build of your ATAC fragments file.
genome = scp.genome.mm10

# Display the genome object to confirm the selection
genome

<scprinter.genome.Genome at 0x7fed2c3c3ad0>

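Define the objective: the label set below selects the analysis (`'chromvar'` here) and is used to name the output directory.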

In [5]:
# Label for this analysis; it is interpolated into the output directory name
# (f'1d_{chromVAR_or_seq2PRINT}_Outputs') when the paths are built.
chromVAR_or_seq2PRINT = 'chromvar'

## 1. Paths

### 1.1 Data directories

In [6]:
# Root directory of the project data; every path defined in this notebook is built from it.
# NOTE(review): absolute, machine-specific path — adjust it when running elsewhere.
master_data_dir = '/bap/bap/collab_asthma_multiome/'

**TODO:** The path definitions below are messy and should be cleaned up.

In [7]:
# Existing analysis outputs that this notebook reads as inputs:
# the saved scPrinter object and the merged narrowPeak BED file.
printer_h5ad_output_dir = os.path.join(master_data_dir, 'ATAC', '2_Analysis_Outputs', '1b_ChromVAR_scPrinter_object')
scprinter_obj_path = os.path.join(printer_h5ad_output_dir, 'Asthma_Multiome_Collab_scPrinter.h5ad')
peak_path = os.path.join(master_data_dir, 'ATAC', '2_Analysis_Outputs', '1c_chromvar_scPrinter_MACS_peaks', 'chromVAR_preset_Asthma_Multiome_scPrinter_cleaned_merged_narrowPeak.bed')

# Directory this notebook writes to; its name depends on the chosen objective label.
output_dir = os.path.join(master_data_dir, 'ATAC', '2_Analysis_Outputs', f'1d_{chromVAR_or_seq2PRINT}_Outputs')

# Create the output directory (and any missing parents). exist_ok=True makes the cell
# idempotent and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(output_dir, exist_ok=True)

In [8]:
# Display the resolved output directory as a sanity check
output_dir

'/bap/bap/collab_asthma_multiome/ATAC/2_Analysis_Outputs/1d_chromvar_Outputs'

## 2. ```scPrinter``` analysis

### 2.1 Load the scPrinter object

When you finish using the object, run ```printer.close()```; otherwise, you won't be able to load it properly next time.

In [9]:
# Load the saved scPrinter object for the chosen genome.
# It has to be released with printer.close() at the end of the notebook.
printer = scp.load_printer(scprinter_obj_path, genome)

In [10]:
# Display a summary of the loaded object (its obs / uns / obsm fields)
printer

head project
AnnData object with n_obs x n_vars = 7418 x 0 backed at '/bap/bap/collab_asthma_multiome/ATAC/2_Analysis_Outputs/1b_ChromVAR_scPrinter_object/Asthma_Multiome_Collab_scPrinter.h5ad'
    obs: 'sample', 'n_fragment', 'frac_dup', 'frac_mito', 'frag_path', 'frag_sample_name', 'tsse', 'predicted_cluster'
    uns: 'bias_bw', 'bias_path', 'binding score', 'gff_db', 'reference_sequences', 'insertion', 'peak_calling', 'unique_string', 'footprints', 'genome'
    obsm: 'insertion_chr14', 'insertion_chr16', 'insertion_chrY', 'insertion_chr1', 'insertion_chr6', 'insertion_chr15', 'insertion_chr12', 'insertion_chr4', 'insertion_chr8', 'insertion_chr7', 'insertion_chr13', 'insertion_chr3', 'insertion_chr11', 'insertion_chr10', 'insertion_chr18', 'insertion_chrX', 'insertion_chr19', 'insertion_chr17', 'insertion_chr5', 'insertion_chr9', 'insertion_chr2'




### 2.2 Get peak-by-cell count matrix

In [11]:
# Build the peak-by-cell ATAC count matrix from the peaks listed in `peak_path`.
# NOTE(review): region_width=300 presumably resizes every peak to 300 bp, and leaving
# cell_grouping / group_names as None presumably keeps one row per cell — confirm
# against the scprinter documentation.
adata = scp.pp.make_peak_matrix(
    printer,
    regions=peak_path,
    region_width=300,
    cell_grouping=None,
    group_names=None,
    sparse=True,
)

Loading insertion profiles


Making peak matrix:   0%|          | 0/370848 [00:00<?, ?it/s]

In [12]:
# Checkpoint the peak matrix to disk; the next section reloads it from this file
adata.write(f'{output_dir}/cell_peak.h5ad')



## 3. Calculate chromVAR motif scores

In [13]:
# Reload the checkpointed peak matrix and keep only peaks with > 0 total coverage
adata = anndata.read_h5ad(f'{output_dir}/cell_peak.h5ad')

# Summing a sparse matrix over axis 0 yields a 2-D (1 x n_peaks) np.matrix;
# flatten it so the column mask below is an unambiguous 1-D boolean array.
coverage = np.asarray(adata.X.sum(axis=0)).ravel()

# .copy() materialises the subset as a real AnnData instead of a view. The later
# chromVAR steps appear to store results on `adata` (e.g. adata.uns['bg_freq'] is
# read back afterwards), and writing to a view triggers an implicit copy with a warning.
adata = adata[:, coverage > 0].copy()

adata

View of AnnData object with n_obs × n_vars = 7418 × 370848

Before running the code below, check which GPUs are available (e.g., with the `nvtop` command in a terminal).

In [14]:
# We can calculate chromVAR motif scores using either GPU (device = "cuda", much faster) or CPU (device = "cpu", slower)
device = "cuda"

# GPU-only setup: route CuPy memory allocations through an RMM memory pool.
if device == "cuda":
    import warnings
    # NOTE(review): this silences every warning for the rest of the session,
    # not only GPU-related ones — consider narrowing the filter.
    warnings.filterwarnings("ignore")
    # NOTE(review): scanpy, anndata and time are already imported in the first cell,
    # and `cpx` is not referenced in any visible cell.
    import scanpy as sc
    import anndata
    import cupy as cp
    import cupyx as cpx
    import time
    import rmm
    from rmm.allocators.cupy import rmm_cupy_allocator
    # Configure RMM first, then hand its allocator to CuPy (below).
    rmm.reinitialize(
        managed_memory=True, # Allows oversubscription
        pool_allocator=True, # default is False
        devices=1, # GPU device ID(s) to register (here: GPU 1). By default only GPU 0 is registered. Pick a free GPU.
    )
    # Make CuPy allocate device memory through RMM
    cp.cuda.set_allocator(rmm_cupy_allocator)

In [15]:
# Draw background peaks for every peak using the 'chromvar' method (250 iterations).
# The returned index array is the cell's last expression, so it is displayed below.
# NOTE(review): the sampling is presumably random and no seed is set here —
# confirm whether repeated runs give identical background sets.
scp.chromvar.sample_bg_peaks(
    adata,
    genome=genome,
    method='chromvar',
    niterations=250,
)

Fetching GC content:   0%|          | 0/370848 [00:00<?, ?it/s]

Sampling nearest neighbors
NNDescent (2500, 2)


Sampling background peaks:   0%|          | 0/2500 [00:00<?, ?it/s]

array([[357297, 228870,  77812, ..., 121568, 259832, 178674],
       [ 80562,  77593, 213698, ..., 158649,  24883, 335991],
       [ 27500, 160648, 355589, ..., 288352,  46076, 191749],
       ...,
       [194190,  71307, 147671, ..., 115542, 295490, 214375],
       [347670, 357115, 277522, ...,  89195,  50265, 106244],
       [363925,   1265, 324463, ..., 192177, 368372,   6300]])

In [16]:
# Scan the peaks for motif matches.
# A single p-value threshold is shared by the motif set and its scanner so the two stay in sync.
motif_pvalue = 5e-5
# Background frequencies stored on `adata` by the background-peak sampling step
bg_freq = list(adata.uns['bg_freq'])

motif = scp.motifs.FigR_Mouse_Motifs(
    genome,
    bg=bg_freq,
    n_jobs=100,
    pvalue=motif_pvalue,
    mode='motifmatchr',
)

motif.prep_scanner(None, pvalue=motif_pvalue)
motif.chromvar_scan(adata)

  0%|          | 0/741696 [00:00<?, ?it/s]

In [17]:
# Compute motif scores for single cells.
# `device` is the "cuda" / "cpu" choice made in the GPU setup cell.
chromvar = scp.chromvar.compute_deviations(adata, chunk_size=50000, device=device)

Computing expectation reads per cell and peak...


Processing chunks:   0%|          | 0/1 [00:00<?, ?it/s]

Processing background peaks:   0%|          | 0/250 [00:00<?, ?it/s]

In [18]:
# Display a summary of the resulting AnnData object
chromvar

AnnData object with n_obs × n_vars = 7418 × 890

In [19]:
# Save the computed motif scores to the output directory for later use
chromvar.write(f'{output_dir}/chromvar_FigR.h5ad')

## Close object

In [20]:
# Release the scPrinter object so that it can be loaded properly next time
printer.close()

# END