In [1]:
import os
import sys
import pandas as pd
import numpy as np
import glob
import time
from scipy.sparse import csr_matrix
import anndata as an
import scanpy as sc
import pyranges as pr

# load the data

In [2]:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/pore_c/population_mESC.read_level.parquet"
df = pd.read_parquet(fpath)
df['value'] = 1
print(f"{df.shape=}")
df.head()

df.shape=(14877807, 13)


Unnamed: 0,read_name,read_start,read_end,length_on_read,chrom,ref_start,ref_end,mapping_quality,basename,local_position,global_position,order,value
0,00000b61-7794-4b29-9f89-2b74e7bbce3e,8,400,392,15,40952485,40952851,60,batch02,40952668,2053916568,4,1
1,00000b61-7794-4b29-9f89-2b74e7bbce3e,400,553,153,1,153345218,153345369,60,batch02,153345293,153345293,4,1
2,00000b61-7794-4b29-9f89-2b74e7bbce3e,611,759,148,16,13896976,13897125,36,batch02,13897050,2130934901,4,1
3,00000b61-7794-4b29-9f89-2b74e7bbce3e,810,1110,300,4,99871359,99871640,60,batch02,99871499,636526111,4,1
4,000047b3-2703-4687-9978-37722f5619da,0,1025,1025,4,31146588,31147172,60,batch02,31146880,567801492,4,1


# make the interval table

In [3]:
def get_intervals(bp, resolution):
  """
  Creates a list of non-overlapping intervals from 0 to bp, 
  incremented by the resolution, using NumPy.
  """
  edges = np.arange(0, bp + resolution, resolution)
  return np.column_stack((edges[:-1], edges[1:]))


def create_bin_table(chroms, resolution):
    """creates a bin table for a given assembly"""
    bin_table = []
    for _, row in chroms.iterrows():
        chrom, bp = row
        bins = get_intervals(bp, resolution)
        chr_df = pd.DataFrame(bins, columns=['start', 'end'])
        chr_df['chrom'] = chrom

        bin_table.append(chr_df)

    return pd.concat(bin_table, ignore_index=True)
    
def create_chromosome_intervals(fpath, base_resolution=10000):
  """
  Creates a dataframe of chromosome intervals with the specified base resolution.

  Args:
    fpath: Path to the CSV file containing chromosome sizes.
    base_resolution: The desired resolution for the intervals.

  Returns:
    A pandas DataFrame with columns 'chrom', 'start', 'end', 'bin', and 'chrom_bin'.
  """
  chrom = pd.read_csv(fpath)

  intervals = create_bin_table(chrom[['chrom', 'size']], base_resolution)
  intervals = intervals.reset_index(drop=False, names='bin')
  intervals = intervals[['chrom', 'start', 'end', 'bin']]
  intervals['chrom_bin'] = intervals.groupby('chrom')['bin'].cumcount()

  intervals['start'] = intervals['start'].astype(int)
  intervals['end'] = intervals['end'].astype(int)
  intervals['bin_name'] = "chr" + intervals['chrom'] + ":" + intervals['chrom_bin'].astype(str)
  print(f"{intervals.shape=}")
  return intervals

# Example usage:
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/chrom_sizes.csv"
base_resolution = 10000
intervals = create_chromosome_intervals(fpath, base_resolution=base_resolution)
intervals.head()

intervals.shape=(263206, 6)


Unnamed: 0,chrom,start,end,bin,chrom_bin,bin_name
0,1,0,10000,0,0,chr1:0
1,1,10000,20000,1,1,chr1:1
2,1,20000,30000,2,2,chr1:2
3,1,30000,40000,3,3,chr1:3
4,1,40000,50000,4,4,chr1:4


# merge the interval table

In [4]:
pyranges_columns = {
    'chrom' : 'Chromosome',
    'start' : 'Start',
    'end' : 'End',
    'ref_start' : 'Start',
    'ref_end' : 'End',
}

# rename columns
intervals = intervals.rename(columns=pyranges_columns)
df = df.rename(columns=pyranges_columns)

df_pr = pr.PyRanges(df)
intervals_pr = pr.PyRanges(intervals)

df = df_pr.join(
    intervals_pr,
    strandedness=None,
    how='left',
    report_overlap=True,
).df


result_columns = {
    'Chromosome' : 'chrom',
    'Start' : 'ref_start',
    'End' : 'ref_end',
    'Start_b' : 'bin_start',
    'End_b' : 'bin_end',
    'Overlap' : 'overlap',
}

df = df.rename(columns=result_columns)

# add index identifiers
df['read_index'] = pd.factorize(df['read_name'])[0]
df['bin_index'] = pd.factorize(df['bin'])[0]

print(f"{df.shape=}")
df.head()

df.shape=(15266109, 21)


Unnamed: 0,read_name,read_start,read_end,length_on_read,chrom,ref_start,ref_end,mapping_quality,basename,local_position,...,order,value,bin_start,bin_end,bin,chrom_bin,bin_name,overlap,read_index,bin_index
0,00000b61-7794-4b29-9f89-2b74e7bbce3e,400,553,153,1,153345218,153345369,60,batch02,153345293,...,4,1,153340000,153350000,15334,15334,chr1:15334,151,0,0
1,00034a2f-d701-4d8e-b9ac-f042759ea993,150,369,219,1,66952097,66952295,54,batch02,66952196,...,2,1,66950000,66960000,6695,6695,chr1:6695,198,1,1
2,00035947-c139-4b07-ac0e-5e607bfc0c79,141,301,160,1,23272831,23272980,60,batch02,23272905,...,4,1,23270000,23280000,2327,2327,chr1:2327,149,2,2
3,00044e92-a6d5-4488-95b4-a63e5871f029,0,470,470,1,188808706,188809117,60,batch02,188808911,...,9,1,188800000,188810000,18880,18880,chr1:18880,411,3,3
4,00044e92-a6d5-4488-95b4-a63e5871f029,470,690,220,1,188918706,188918925,60,batch02,188918815,...,9,1,188910000,188920000,18891,18891,chr1:18891,219,3,4


# Make X

In [5]:
data = df['value'].tolist()
row = df['bin_index'].values
col = df['read_index'].values

obs_names = df['bin_index'].unique()
var_names = df['read_index'].unique()

n = df['bin_index'].nunique()
m = df['read_index'].nunique()

X = csr_matrix((data, (row, col)), shape=(n, m))
print(f"{X.shape=}")

X.shape=(256188, 2756467)


In [6]:
df.columns

Index(['read_name', 'read_start', 'read_end', 'length_on_read', 'chrom',
       'ref_start', 'ref_end', 'mapping_quality', 'basename', 'local_position',
       'global_position', 'order', 'value', 'bin_start', 'bin_end', 'bin',
       'chrom_bin', 'bin_name', 'overlap', 'read_index', 'bin_index'],
      dtype='object')

# make obs

In [7]:
obs_columns = [
    'chrom',
    'bin_start',
    'bin_end',
    'bin',
    'bin_name',
    'bin_index',
    'chrom_bin',
]
obs = df[obs_columns].drop_duplicates()
print(f"{obs.shape=}")

# add some columns
obs['n_reads'] = X.sum(axis=1)

print(f"{obs.shape=}")
obs.head()

obs.shape=(256188, 7)
obs.shape=(256188, 8)


Unnamed: 0,chrom,bin_start,bin_end,bin,bin_name,bin_index,chrom_bin,n_reads
0,1,153340000,153350000,15334,chr1:15334,0,15334,77
1,1,66950000,66960000,6695,chr1:6695,1,6695,56
2,1,23270000,23280000,2327,chr1:2327,2,2327,54
3,1,188800000,188810000,18880,chr1:18880,3,18880,51
4,1,188910000,188920000,18891,chr1:18891,4,18891,60


# load and merge genes

In [8]:
# Load genes
fpath = "/scratch/indikar_root/indikar1/shared_data/higher_order/reference/gene_table.parquet"
gdf = pd.read_parquet(fpath)
print(f"{gdf.shape=}")

gdf_pr = pr.PyRanges(gdf)
obs_pr = pr.PyRanges(obs.rename(columns={
    'chrom' : 'Chromosome',
    'bin_start' : 'Start',
    'bin_end' : 'End',
}))

gdf = gdf_pr.join(
    obs_pr,
    strandedness=None,
    how='left',
    report_overlap=True,
    
).df.rename(columns={
    'Chromosome' : 'chrom', 
    'Start' : 'bin_start', 
    'End' : 'bin_end', 
    'Start_b' : 'gene_start', 
    'End_b' : 'gene_end', 
    'length' : 'gene_length',
    'Overlap' : 'overlap',
})


# drop unalignable genes
gdf = gdf[gdf['bin'] >= 0].reset_index()

keep_columns = [
    'gene_name', 
    'gene_id',
    'gene_biotype',
    'chrom',
    'gene_length',
    'is_tf',
    'gene_start',
    'gene_end',
    'bin',
    'bin_name',
    'chrom_bin',
    'overlap',    
] 

gdf = gdf[keep_columns]
gdf['n_bins_spanned'] = gdf.groupby('gene_name')['bin'].transform('nunique')
gdf['is_tf'] = gdf['is_tf'].astype(bool)
print(f"{gdf.shape=}")

gdf.head()

gdf.shape=(56655, 10)
gdf.shape=(185152, 13)


Unnamed: 0,gene_name,gene_id,gene_biotype,chrom,gene_length,is_tf,gene_start,gene_end,bin,bin_name,chrom_bin,overlap,n_bins_spanned
0,Gm38212,ENSMUSG00000104478,TEC,1,2756,False,108340000,108350000,10834,chr1:10834,10834,2756,1
1,Gm7449,ENSMUSG00000104385,processed_pseudogene,1,663,False,6980000,6990000,698,chr1:698,698,663,1
2,Gm37108,ENSMUSG00000102135,processed_pseudogene,1,7030,False,6980000,6990000,698,chr1:698,698,3218,2
3,Gm37108,ENSMUSG00000102135,processed_pseudogene,1,7030,False,6990000,7000000,699,chr1:699,699,3812,2
4,Gm37275,ENSMUSG00000103282,processed_pseudogene,1,30,False,6990000,7000000,699,chr1:699,699,18,2


In [9]:
gene_list = lambda x: ";".join(list(x))

# aggregate for obs
obs_genes = gdf.groupby('bin').agg(
    n_genes = ('gene_name', 'nunique'),
    n_tfs = ('is_tf', 'sum'),
    total_gen_bp = ('overlap', 'sum'),
    genes = ('gene_name', gene_list),   
).reset_index()
print(f"{obs_genes.shape=}")

# filter out bins not seen in pore-c data
obs_genes = obs_genes[obs_genes['bin'].isin(obs['bin'].values)]
print(f"{obs_genes.shape=}")

obs_genes.head()

obs_genes.shape=(149034, 5)
obs_genes.shape=(149034, 5)


Unnamed: 0,bin,n_genes,n_tfs,total_gen_bp,genes
0,314,1,0,1070,4933401J01Rik
1,317,1,0,110,Gm26206
2,327,1,0,3877,Xkr4
3,328,1,0,10000,Xkr4
4,329,1,0,10000,Xkr4


In [10]:
obs = pd.merge(
    obs,
    obs_genes,
    how='left',
    left_on='bin',
    right_on='bin',
)

obs['n_genes'] = obs['n_genes'].fillna(0.0).astype(int)
obs['n_tfs'] = obs['n_tfs'].fillna(0.0).astype(int)
obs['total_len_bp'] = obs['total_gen_bp'].fillna(0.0).astype(int)

# ensure proper sorting
obs = obs.set_index('bin_index')
obs = obs.reindex(obs_names)
obs = obs.reset_index()

obs = obs.set_index('bin_name')

obs.head()

Unnamed: 0_level_0,bin_index,chrom,bin_start,bin_end,bin,chrom_bin,n_reads,n_genes,n_tfs,total_gen_bp,genes,total_len_bp
bin_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
chr1:15334,0,1,153340000,153350000,15334,15334,77,1,0,10000.0,Dhx9,10000
chr1:6695,1,1,66950000,66960000,6695,6695,56,0,0,,,0
chr1:2327,2,1,23270000,23280000,2327,2327,54,1,0,4754.0,Gm29506,4754
chr1:18880,3,1,188800000,188810000,18880,18880,51,0,0,,,0
chr1:18891,4,1,188910000,188920000,18891,18891,60,0,0,,,0


# make var

In [11]:
df.columns

Index(['read_name', 'read_start', 'read_end', 'length_on_read', 'chrom',
       'ref_start', 'ref_end', 'mapping_quality', 'basename', 'local_position',
       'global_position', 'order', 'value', 'bin_start', 'bin_end', 'bin',
       'chrom_bin', 'bin_name', 'overlap', 'read_index', 'bin_index'],
      dtype='object')

In [12]:
var = df.copy()

var = var.groupby(['read_name', 'read_index']).agg(
    mean_mapq = ('mapping_quality', 'mean'),
    median_mapq = ('mapping_quality', 'median'),
    n_chromosomes = ('chrom', 'nunique'),
    order = ('order', 'first'),
    n_bins = ('bin', 'nunique'),
    read_length_bp = ('length_on_read', 'sum'),
).reset_index()

# ensure proper sorting
var = var.set_index('read_index')
var = var.reindex(var_names)
var = var.reset_index()
var = var.set_index('read_name')

var.head()

Unnamed: 0_level_0,read_index,mean_mapq,median_mapq,n_chromosomes,order,n_bins,read_length_bp
read_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
00000b61-7794-4b29-9f89-2b74e7bbce3e,0,54.0,60.0,4,4,4,993
00034a2f-d701-4d8e-b9ac-f042759ea993,1,45.0,45.0,2,2,2,381
00035947-c139-4b07-ac0e-5e607bfc0c79,2,35.5,41.0,4,4,4,1378
00044e92-a6d5-4488-95b4-a63e5871f029,3,56.0,60.0,3,9,5,2826
000734c0-eed1-4bd5-8543-c979e3ebdb9e,4,34.5,34.5,2,2,2,363


# Make AnnData

In [13]:
adata = an.AnnData(
    X=csr_matrix(X),
    obs=obs,
    var=var,
)

adata.uns['genes'] = gdf.copy()
adata.uns['intervals'] = intervals.copy()
adata.uns['base_resolution'] = base_resolution

adata.layers["H"] = csr_matrix(adata.X.copy())
sc.logging.print_memory_usage()
adata

Memory usage: current 9.21 GB, difference +9.21 GB


AnnData object with n_obs × n_vars = 256188 × 2756467
    obs: 'bin_index', 'chrom', 'bin_start', 'bin_end', 'bin', 'chrom_bin', 'n_reads', 'n_genes', 'n_tfs', 'total_gen_bp', 'genes', 'total_len_bp'
    var: 'read_index', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp'
    uns: 'genes', 'intervals', 'base_resolution'
    layers: 'H'

In [14]:
outpath = f"/scratch/indikar_root/indikar1/shared_data/higher_order/anndata/bulk_{int(base_resolution)}_raw.h5ad"
adata.write(outpath)
sc.logging.print_memory_usage()
adata

Memory usage: current 9.21 GB, difference +0.00 GB


AnnData object with n_obs × n_vars = 256188 × 2756467
    obs: 'bin_index', 'chrom', 'bin_start', 'bin_end', 'bin', 'chrom_bin', 'n_reads', 'n_genes', 'n_tfs', 'total_gen_bp', 'genes', 'total_len_bp'
    var: 'read_index', 'mean_mapq', 'median_mapq', 'n_chromosomes', 'order', 'n_bins', 'read_length_bp'
    uns: 'genes', 'intervals', 'base_resolution'
    layers: 'H'

In [15]:
break

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
break