In [3]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from mudata import MuData
import mudata as md
from anndata import AnnData
# import pyranges as pr
import bioframe as bf

In [4]:
def set_coord(adata, range_df):
    adata.varm['coord'] = range_df.set_index(adata.var_names)

In [26]:
def subset_by_overlap(adata, granges):
    # coord = pr.PyRanges(adata.varm['coord'].reset_index())
    varm = adata.varm['coord'].copy()
    varm['idx'] = varm.index
    idx = bf.overlap(varm, granges, how='inner')['idx']
    return adata[:, idx]

In [20]:
def slice_granges(adata, chrom, start, end):
    idx = bf.select(adata.varm['coord'], f"{chrom}:{start}-{end}").index
    return adata[:, idx]

In [39]:
class RangeAnnData(AnnData):
    def set_coord(self, range_df):
        self.varm['coord'] = range_df.set_index(self.var_names)

    def subset_by_overlap(self, granges):
        varm = self.varm['coord'].copy()
        varm['idx'] = varm.index
        idx = bf.overlap(varm, granges, how='inner')['idx']
        return self[:, idx]

    def slice_granges(self, chrom, start, end):
        idx = bf.select(self.varm['coord'], f"{chrom}:{start}-{end}").index
        return self[:, idx]

In [40]:
counts = csr_matrix(np.random.poisson(1, size=(100, 1000)), dtype=np.float32)
exons, gr = pr.data.exons().df, pr.data.cpg().df
exons.columns = ['chrom', 'start', 'end'] + list(exons.columns[3:])
gr.columns = ['chrom', 'start', 'end']+ list(gr.columns[3:])

adata = RangeAnnData(counts)
adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)]
adata.set_coord(exons)

In [41]:
adata.slice_granges('chrX', 1, 1500000).varm['coord']

Unnamed: 0,chrom,start,end,Name,Score,Strand
Gene_12,chrX,1475113,1475229,NM_001267713_exon_4_0_chrX_1475114_f,0,+
Gene_18,chrX,1393647,1393735,NM_172249_exon_1_0_chrX_1393648_f,0,+
Gene_22,chrX,1419383,1419519,NM_001161531_exon_9_0_chrX_1419384_f,0,+
Gene_124,chrX,1424338,1424420,NM_006140_exon_11_0_chrX_1424339_f,0,+
Gene_140,chrX,1407651,1407781,NM_001161532_exon_3_0_chrX_1407652_f,0,+
Gene_183,chrX,1404670,1404813,NM_172245_exon_3_0_chrX_1404671_f,0,+
Gene_276,chrX,1393647,1393735,NM_006140_exon_1_0_chrX_1393648_f,0,+
Gene_338,chrX,585078,585337,NM_000451_exon_0_0_chrX_585079_f,0,+
Gene_345,chrX,1424338,1424420,NM_001161530_exon_10_0_chrX_1424339_f,0,+
Gene_410,chrX,1414319,1414349,NM_172245_exon_8_0_chrX_1414320_f,0,+


In [43]:
adata.subset_by_overlap(gr).varm['coord']

Unnamed: 0,chrom,start,end,Name,Score,Strand
Gene_811,chrX,17879217,17879457,NM_001172739_exon_2_0_chrX_17879218_r,0,-
Gene_477,chrX,34147868,34150447,NM_203408_exon_0_0_chrX_34147869_r,0,-
Gene_477,chrX,34147868,34150447,NM_203408_exon_0_0_chrX_34147869_r,0,-
Gene_520,chrX,45016959,45017133,NM_176819_exon_2_0_chrX_45016960_r,0,-
Gene_114,chrX,47039278,47039436,NM_001204467_exon_9_0_chrX_47039279_f,0,+
...,...,...,...,...,...,...
Gene_879,chrY,15591393,15592550,NR_047610_exon_27_0_chrY_15591394_r,0,-
Gene_912,chrY,15591393,15592550,NR_047607_exon_29_0_chrY_15591394_r,0,-
Gene_923,chrY,15591393,15592550,NM_001258269_exon_29_0_chrY_15591394_r,0,-
Gene_972,chrY,15591393,15592550,NR_047599_exon_28_0_chrY_15591394_r,0,-


In [46]:
counts = csr_matrix(np.random.poisson(1, size=(100, 10000)), dtype=np.float32)
chipseq, gr = pr.data.chipseq().df, pr.data.cpg().df
chipseq.columns = ['chrom', 'start', 'end'] + list(chipseq.columns[3:])
gr.columns = ['chrom', 'start', 'end']+ list(gr.columns[3:])

bdata = RangeAnnData(counts)
bdata.obs_names = [f"Cell_{i:d}" for i in range(bdata.n_obs)]
bdata.var_names = [f"Gene_{i:d}" for i in range(bdata.n_vars)]
bdata.set_coord(chipseq)

In [47]:
mdata = MuData({"Exon": adata, "Chipseq": bdata})



In [57]:
MuData({k: mdata.mod[k].subset_by_overlap(gr) for k in mdata.mod.keys()})



In [65]:
class RangeMuData(MuData):
    def set_coord(self, prange):
        self.varm['coord'] = prange.df.set_index(adata.var_names)

    def subset_by_overlap(self, granges):
        return MuData({k: mdata.mod[k].subset_by_overlap(granges) for k in mdata.mod.keys()})

    def slice_granges(self, chrom, start, end):
        return MuData({k: mdata.mod[k].slice_granges(chrom, start, end) for k in mdata.mod.keys()})


In [67]:
mdata = RangeMuData({"Exon": adata, "Chipseq": bdata})



In [63]:
mdata.subset_by_overlap(gr)



In [69]:
mdata.slice_granges('chrX', 1, 10000000)