In [1]:
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
from mudata import MuData
import mudata as md
from anndata import AnnData
# import pyranges as pr
import bioframe as bf
import os

In [2]:
def set_coord(adata, range_df):
    """Attach a coordinate table to the variables of ``adata``.

    ``range_df`` is re-indexed with ``adata.var_names`` through
    ``DataFrame.set_index`` (which returns a new frame, so ``range_df`` itself
    is left untouched) and the result is stored under ``adata.varm['coord']``.
    Nothing is returned.
    """
    coord_table = range_df.set_index(adata.var_names)
    adata.varm['coord'] = coord_table

In [3]:
def subset_by_overlap(adata, granges):
    """Keep the variables of ``adata`` that overlap at least one interval in ``granges``.

    Parameters
    ----------
    adata
        Object whose ``varm['coord']`` holds one interval row per variable,
        indexed by variable name (as stored by ``set_coord``).
    granges
        Interval table handed to ``bioframe.overlap`` as the second frame.

    Returns
    -------
    ``adata[:, mask]`` where ``mask`` is a boolean array over the variables.
    Every overlapping variable appears exactly once, in its original order.
    """
    coord = adata.varm['coord'].copy()
    # Carry the variable names through the join as an ordinary column.
    coord['idx'] = coord.index
    hits = bf.overlap(coord, granges, how='inner')['idx']
    # The inner join yields one row per overlapping *pair*, so a variable that
    # hits several query intervals would otherwise be selected several times.
    # A membership mask removes those repeats and keeps the variable order.
    keep = coord.index.isin(hits)
    return adata[:, keep]

In [4]:
def slice_granges(adata, chrom, start, end):
    """Restrict ``adata`` to the variables that ``bioframe.select`` returns for one region.

    The region is handed to bioframe as a ``chrom:start-end`` string; the index
    labels of the selected rows of ``adata.varm['coord']`` are then used to
    column-slice ``adata``.
    """
    region = f"{chrom}:{start}-{end}"
    selected = bf.select(adata.varm['coord'], region)
    return adata[:, selected.index]

In [5]:
class RangeAnnData(AnnData):
    """AnnData with helpers for variables that carry genomic coordinates.

    The coordinates are kept in ``self.varm['coord']``: one row per variable,
    indexed by ``var_names``.

    NOTE(review): whether ``self[:, ...]`` yields a ``RangeAnnData`` or a plain
    ``AnnData`` depends on anndata's ``__getitem__`` - confirm before chaining
    the range helpers on a sliced result.
    """

    def set_coord(self, range_df):
        """Store ``range_df`` as this object's coordinate table.

        ``range_df`` is re-indexed with ``self.var_names`` via
        ``DataFrame.set_index``; the caller's frame is not modified.
        """
        self.varm['coord'] = range_df.set_index(self.var_names)

    def subset_by_overlap(self, granges):
        """Return the variables that overlap at least one interval in ``granges``.

        ``granges`` is passed to ``bioframe.overlap`` as the second frame.
        Every overlapping variable is returned exactly once, in its original
        order.
        """
        coord = self.varm['coord'].copy()
        # Carry the variable names through the join as an ordinary column.
        coord['idx'] = coord.index
        hits = bf.overlap(coord, granges, how='inner')['idx']
        # The inner join has one row per overlapping pair; a membership mask
        # avoids selecting a variable once per query interval it touches.
        keep = coord.index.isin(hits)
        return self[:, keep]

    def slice_granges(self, chrom, start, end):
        """Return the variables ``bioframe.select`` finds for ``chrom:start-end``."""
        idx = bf.select(self.varm['coord'], f"{chrom}:{start}-{end}").index
        return self[:, idx]

In [40]:
class RangeMuData(MuData):
    """MuData that forwards range queries to each of its modalities.

    ``subset_by_overlap`` and ``slice_granges`` call the method of the same
    name on every entry of ``self.mod``, so each modality has to provide them
    (e.g. by being a ``RangeAnnData``).
    """

    def set_coord(self, prange):
        """Store a coordinate table in ``self.varm['coord']``.

        ``prange`` may be a pandas DataFrame or any object exposing the table
        as a ``.df`` attribute (the form the original code required). The
        table is re-indexed with this object's own ``var_names``.
        """
        range_df = prange if isinstance(prange, pd.DataFrame) else prange.df
        # Index by *this* object's variables, not by a notebook-level `adata`.
        self.varm['coord'] = range_df.set_index(self.var_names)

    def subset_by_overlap(self, granges):
        """Return a ``MuData`` with every modality subset to variables overlapping ``granges``."""
        # Iterate over self.mod rather than a global `mdata`, so the method
        # works on whichever instance it is called on.
        return MuData({
            name: modality.subset_by_overlap(granges)
            for name, modality in self.mod.items()
        })

    def slice_granges(self, chrom, start, end):
        """Return a ``MuData`` with every modality sliced to ``chrom:start-end``."""
        return MuData({
            name: modality.slice_granges(chrom, start, end)
            for name, modality in self.mod.items()
        })

In [40]:
# pyranges is used here only for its bundled example intervals. Its import in
# the first cell is commented out, so without this line `pr` is undefined on a
# fresh kernel.
import pyranges as pr

exons, gr = pr.data.exons().df, pr.data.cpg().df
exons.columns = ['chrom', 'start', 'end'] + list(exons.columns[3:])
gr.columns = ['chrom', 'start', 'end'] + list(gr.columns[3:])

# One variable per exon row so that set_coord can align the table with
# var_names; seeded so the toy counts are identical on every run.
rng = np.random.default_rng(0)
counts = csr_matrix(rng.poisson(1, size=(100, len(exons))), dtype=np.float32)

adata = RangeAnnData(counts)
adata.obs_names = [f"Cell_{i:d}" for i in range(adata.n_obs)]
adata.var_names = [f"Gene_{i:d}" for i in range(adata.n_vars)]
adata.set_coord(exons)

In [39]:
def _read_range_table(path):
    """Read a tab-separated file and rename its first three columns to chrom/start/end."""
    table = pd.read_csv(path, sep="\t")
    table.columns = ['chrom', 'start', 'end'] + list(table.columns[3:])
    return table


# Expression: a single observation holding `num_reads`, variables named by gene_id.
rna = _read_range_table('data/tumor01_OE0260_AMPLIFY-NEOVAC_NOA21_02-001.fpkm_tpm.featureCounts.tsv')
rna_data = RangeAnnData(rna[['num_reads']].T)
rna_data.var_names = rna['gene_id']
rna_data.set_coord(rna)

# Structural variants: only the table is loaded in this cell.
sv = _read_range_table('data/svs_OE0260_AMPLIFY-NEOVAC_NOA21_02-001_tumor01-blood01_filtered_somatic.tsv')

# SNVs: the third column is overwritten so that every record ends one position
# after its start.
# NOTE(review): confirm that start/end follow the coordinate convention bioframe expects.
snv = _read_range_table('data/snvs_OE0260_AMPLIFY-NEOVAC_NOA21_02-001_somatic_snvs_conf_8_to_10.vcf')
snv['end'] = snv['start'] + 1
snv_data = RangeAnnData(snv[['QUAL']].T)
snv_data.set_coord(snv)

# Indels: same start/end treatment as the SNVs.
indel = _read_range_table('data/indel_OE0260_AMPLIFY-NEOVAC_NOA21_02-001_somatic_indels_conf_8_to_10.vcf')
indel['end'] = indel['start'] + 1
# NOTE(review): left disabled by the author; the record printed under this cell
# shows QUAL as '.', which may be the reason - confirm before re-enabling.
# indel_data = RangeAnnData(indel[['QUAL']].transpose())
# indel_data.set_coord(indel)


  rna_data = RangeAnnData(rna[['num_reads']].transpose())
  snv_data = RangeAnnData(snv[['QUAL']].transpose())


chrom                                                                    1
start                                                             70505384
end                                                                      .
REF                                                                     AC
ALT                                                                      A
QUAL                                                                     .
FILTER                                                                PASS
INFO                     SOMATIC;BRF=0.35;FR=0.2500;HP=1;HapScore=1;MGO...
FORMAT                                                  GT:GL:GOF:GQ:NR:NV
CONTROL                                   0/0:0.0,-35.65,-299.4:5:99:119:0
TUMOR                                   0/1:-209.66,0.0,-299.7:4:99:203:64
DBSNP                                                                    .
1K_GENOMES                                                               .
ExAC                     

In [33]:
# Rebuild the SNV container from the QUAL column (one observation, one variable
# per record) and attach the record table as its coordinates.
snv_data = RangeAnnData(snv.loc[:, ['QUAL']].T)
snv_data.set_coord(snv)

  snv_data = RangeAnnData(snv[['QUAL']].transpose())


In [36]:
# Display the coordinate table that set_coord stored for the SNV object.
snv_data.varm['coord']

Unnamed: 0,chrom,start,end,REF,ALT,QUAL,FILTER,INFO,FORMAT,/omics/odcf/project/OE0260/amplify-neovac/sequencing/exon_sequencing/view-by-pid/OE0260_AMPLIFY-NEOVAC_NOA21_02-001/tumor01/paired/merged-alignment/.merging_0/tumor01_OE0260_AMPLIFY-NEOVAC_NOA21_02-001_merged.mdup.bam,...,CpGislands,TFBScons,ENCODE_DNASE,miRNAs_snoRNAs,miRBase18,COSMIC,miRNAtargets,CgiMountains,phastConsElem20bp,ENCODE_TFBS
0,1,871192,.,C,T,4.800000e+01,PASS,DP=165;VDB=2.331949e-01;RPB=1.645767e+00;AF1=0...,GT:PL:GQ,"0/1:78,0,255:81",...,CpG_17,.,39,.,.,.,.,CGI(17%);score=83,.,EZH2&KAP1&SUZ12
1,1,902035,.,C,T,3.540000e+00,PASS,DP=186;VDB=1.140054e-01;RPB=7.859256e-01;AF1=0...,GT:PL:GQ,"0/1:31,0,211:30",...,CpG_20,.,118,.,.,.,.,CGI(7%);score=93,.,POLR2A&MAX&EZH2&TFAP2A&TFAP2C&PHF8&SIN3AK20&MX...
2,1,907516,.,C,T,-0.000000e+00,PASS,DP=172;VDB=1.302682e-01;RPB=9.739171e-01;AF1=0...,GT:PL:GQ,"0/0:0,215,255:99",...,.,.,.,.,.,.,.,.,.,.
3,1,908658,.,G,A,1.380000e+02,PASS,DP=275;VDB=1.719430e-01;RPB=1.030886e+00;AF1=0...,GT:PL:GQ,"0/1:168,0,255:99",...,.,.,.,.,.,.,.,.,.,.
4,1,915554,.,C,T,-0.000000e+00,PASS,DP=219;VDB=1.066174e-01;RPB=6.524372e-01;AF1=0...,GT:PL:GQ,"0/0:0,255,255:99",...,.,.,8,.,.,.,.,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7997,X,153997596,.,G,A,4.350000e-09,PASS,DP=97;VDB=1.226685e-01;RPB=1.308506e+00;AF1=0;...,GT:PL:GQ,"0/0:0,60,255:99",...,.,.,.,.,.,.,.,CGI(39%);score=61,.,.
7998,X,155227354,.,G,A,1.630000e+02,PASS,DP=201;VDB=4.034074e-01;RPB=1.430359e+00;AF1=0...,GT:PL:GQ,"0/1:193,0,255:99",...,.,.,.,.,.,.,.,.,.,GATA1&POLR2A&TAF1&EP300&POU2F2&TEAD4&RUNX3&EGR...
7999,Y,2655739,.,A,G,1.060000e+02,PASS,DP=27;VDB=1.226496e-01;RPB=5.484828e-01;AF1=0....,GT:PL:GQ,"0/1:136,0,136:99",...,.,.,.,.,.,.,.,.,.,EZH2
8000,Y,14107589,.,C,T,2.250000e+02,PASS,DP=91;VDB=6.335098e-02;RPB=4.660041e-01;AF1=0....,GT:PL:GQ,"0/1:255,0,255:99",...,.,.,.,.,.,.,.,CGI(27%);score=73,.,.
