In [1]:
import glob
from ALLCools.mcds import MCDS
import pandas as pd
import time
import pybedtools
import anndata
import numpy as np
from scipy.sparse import coo_matrix, csc_matrix, csr_matrix, vstack, save_npz

In [2]:
# --- Run configuration -------------------------------------------------------
# mc_type  : value passed to `.sel(mc_type=...)` when the dataset is opened.
# te_class : used to build the zarr glob, the repeat-annotation path, and the
#            feature-dimension / data-array names further down.
mc_type, te_class = "CGN", "LINE"

# NOTE(review): `inout` is not read by any live cell in this notebook.
inout = "all"

# Single cell type kept for interactive use; the per-cell-type loop below
# rebinds `ct` on every iteration.
ct = "DG_Glut"

In [3]:
# Reference inputs: chromosome sizes, gene annotation, and the repeat annotation
# for the TE class chosen in the config cell.
genomepath = "/ref/m3C/mm10.main.nochrM.nochrY.chrom.sizes"
gene_bed = pybedtools.BedTool("TE_use.gencode.vM22.annotation.gene.sorted.bed")
# NOTE(review): the chrom sizes (mm10) and gene annotation (vM22) are mouse, but
# this RepeatMasker file name says hg38 -- confirm the file really holds the
# intended assembly before relying on `te_bed`.
te_bed = pybedtools.BedTool(
    f"/ref/repeatmasker-with-id/hg38.repeatmasker.repClass-{te_class}.bed"
)

In [4]:
# Per-cell metadata table; the first CSV column becomes the index.
meta = pd.read_csv('/data/metadata/240104_m3C_META.csv', index_col=0)

# Rewrite the cell-type labels: every space becomes an underscore and every
# slash is removed (the `all_cts` names and output file names below use this
# sanitised form).
meta['AgingMajorType'] = meta['AgingMajorType'].map(
    lambda label: label.translate(str.maketrans(' ', '_', '/'))
)
meta.head()

Unnamed: 0_level_0,InputReadPairs,InputReadPairsBP,TrimmedReadPairs,R1WithAdapters,R1QualTrimBP,R1TrimmedReadsBP,R2WithAdapters,R2QualTrimBP,R2TrimmedReadsBP,UniqueMappedReads,...,DissectionRegion,Region,Replicate,l1,mc_m3c-co_cluster_l1,mc_m3c-co_cluster_l2,CellType,CellSubclass,AgingMajorType,Class
cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AMB_220628_18mo_3F_4E_1_P1-1-I3-A1,1710943,516615812,1710512,309,766343,216480647,874,649089,230238965,1870858,...,ACB-1+ACB-2,Nucleus_accumbens,rep2,13,23,0-0,STR D1 Gaba,STR D1 Gaba,STR_D1_Gaba,Gaba
AMB_220628_18mo_3F_4E_1_P1-1-I3-A13,1688475,510130057,1688130,316,725418,213815592,832,673498,227339515,1820523,...,ACB-1+ACB-2,Nucleus_accumbens,rep2,11,6,12-6,STR-PAL Chst9 Gaba,STR D1 Sema5a Gaba,STR_D1_Sema5a_Gaba,Gaba
AMB_220628_18mo_3F_4E_1_P1-1-I3-A14,2158376,651979882,2157941,456,998559,273186730,1132,869248,290536436,2374173,...,ACB-1+ACB-2,Nucleus_accumbens,rep2,8,4,2-2,Astro NN,Astro-TE NN,Astro-TE_NN,NN
AMB_220628_18mo_3F_4E_1_P1-1-I3-A2,1582810,478040623,1582446,283,708122,200329678,853,626267,213020525,1746308,...,ACB-1+ACB-2,Nucleus_accumbens,rep2,9,8,7-6,STR D2 Gaba,STR D2 Gaba,STR_D2_Gaba,Gaba
AMB_220628_18mo_3F_4E_1_P1-1-I3-B1,1766714,533610284,1766279,411,810795,223604054,1045,691737,237777442,1968711,...,ACB-1+ACB-2,Nucleus_accumbens,rep2,13,23,0-0,STR D1 Gaba,STR D1 Gaba,STR_D1_Gaba,Gaba


In [8]:
# Cell types to export, written in the sanitised `AgingMajorType` form
# (underscores instead of spaces, slashes removed).
all_cts = [
    'Oligo_NN',
    'STR_D1_Gaba',
    'Astro-TE_NN',
    'STR_D2_Gaba',
    'L6_CT_CTX_Glut',
    'VLMC_NN',
    'OPC_NN',
    'L23_IT_CTX_Glut',
    'CA1-ProS_Glut',
    'CEA-BST_Gaba',
    'MEA-COA_Glut',
    'Pvalb_Gaba',
    'L45_IT_CTX_Glut',
    'L6bCT_ENT_Glut',
    'STR-PAL_Chst9_Gaba',
    'L23_IT_PPP_Glut',
    'Astro-NT_NN',
    'L6_IT_CTX_Glut',
    'CA3_Glut',
    'Sst_Gaba',
    'MEA-BST_Gaba',
]

In [7]:
# Collect one zarr sub-store per sample for the selected TE class.
# glob returns matches in arbitrary (filesystem-dependent) order, so sort them:
# the order of these paths determines the order in which stores are handed to
# MCDS.open, and a stable order keeps reruns comparable.
all_mcds_paths = sorted(glob.glob(f'/data/female-amb/Famale.TE.zarr/*/{te_class}'))
if not all_mcds_paths:
    # Fail here with a readable message instead of later inside MCDS.open.
    raise FileNotFoundError(
        f"no '{te_class}' stores found under /data/female-amb/Famale.TE.zarr/*/"
    )
len(all_mcds_paths)

122

In [None]:
# Open all discovered stores as a single MCDS with 'cell' as the observation
# dimension, then keep only the methylation context picked in the config cell.
# (A `use_obs=use_cells` restriction was tried here and left disabled; cells are
# filtered later instead.)
mcds = MCDS.open(all_mcds_paths, obs_dim='cell').sel(mc_type=mc_type)

In [None]:
def load_overlap_te_ids(ct, te_id_by_coord):
    """Return the unique TE ids listed in the Hyper and Hypo overlap tables of one cell type.

    Reconstructed from the (commented-out) feature-selection cell of this
    notebook, which was the only place `te` was ever defined.

    Parameters
    ----------
    ct : str
        Sanitised cell-type label; used to locate
        `/data/female-amb/aDMR_TE_overlap/{Hyper,Hypo}/{ct}.TE.bed`.
    te_id_by_coord : dict
        Maps a 'chrom-start-end' key to a TE id.

    Returns
    -------
    numpy.ndarray
        Unique TE ids whose coordinate key was found in `te_id_by_coord`.
    """
    overlaps = pd.concat([
        pd.read_csv(f'/data/female-amb/aDMR_TE_overlap/{direction}/{ct}.TE.bed',
                    sep='\t', header=None, index_col=3)
        for direction in ('Hyper', 'Hypo')
    ])
    # Columns 7/8/9 are joined into the same 'chrom-start-end' key format as the
    # reference table -- presumably the TE interval of each overlap row; verify
    # against the overlap file layout.
    coord_keys = overlaps[7] + '-' + overlaps[8].astype(str) + '-' + overlaps[9].astype(str)
    # Keys missing from the reference map to NaN and are dropped.
    # NOTE(review): the original cell dropped rows with a NaN in *any* column;
    # only unmatched TE ids are dropped here.
    return coord_keys.map(te_id_by_coord).dropna().unique()


# Reference table used to translate coordinates into TE ids; loaded once because
# it does not depend on the cell type.
# NOTE(review): the file name says hg38 while the rest of the notebook uses
# mouse (mm10) references -- confirm this is the intended annotation.
te_ref = pd.read_csv(f"/ref/repeatmasker-with-id/hg38.repeatmasker.repClass-{te_class}.bed",
                     sep='\t', header=None)
te_id_by_coord = dict(zip(
    te_ref[0] + '-' + te_ref[1].astype(str) + '-' + te_ref[2].astype(str),
    te_ref[3],
))

# Loop-invariant lookups.
da_name = f'{te_class}_da'
all_cell_ids = mcds.get_index('cell')
all_te_ids = mcds.get_index(te_class)

for ct in all_cts:
    use_cells = meta[meta['AgingMajorType'] == ct].index
    cell_mask = all_cell_ids.isin(use_cells)
    if not cell_mask.any():
        # Nothing to export for this cell type; avoid writing an empty file.
        print(f"{ct} skipped: no matching cells in mcds")
        continue

    te_mask = all_te_ids.isin(load_overlap_te_ids(ct, te_id_by_coord))

    # A dict indexer lets the feature dimension follow `te_class` directly; the
    # previous if/elif chain fell through to `LTR=` for any class other than
    # LINE or SINE.
    tmp_mcds = mcds.sel({te_class: te_mask, 'cell': cell_mask})

    counts = tmp_mcds[da_name]
    # mc / cov per (cell, feature). Entries with cov == 0 become NaN or inf and
    # therefore compare False against the threshold below.
    mc_frac = (counts.sel(count_type='mc').values.astype(np.float16)
               / counts.sel(count_type='cov').values.astype(np.float16))

    # Binarise: 1 where the methylation fraction is <= 0.75, stored sparse.
    # obs/var must come from the *subset* dataset so that their lengths match
    # the matrix; using the full `mcds` index here mismatched the matrix shape.
    ads = anndata.AnnData(csr_matrix(mc_frac <= 0.75, dtype=np.uint8),
                          obs=tmp_mcds.get_index('cell').to_frame()[[]],
                          var=tmp_mcds.get_index(te_class).to_frame()[[]],
                          dtype=np.uint8)
    ads.write_h5ad(f"/home/qzeng_salk_edu/project/241115_te_clustering/adata/{ct}.{te_class}.downsampled.h5ad")
    print(f"{ct} done")

In [6]:
# te_ref = pd.read_csv(f"/ref/repeatmasker-with-id/hg38.repeatmasker.repClass-{te_class}.bed", sep='\t', header=None)
# # index as chr1-3188159-3188425
# te_ref.index = te_ref[0]+'-'+te_ref[1].astype(str)+'-'+te_ref[2].astype(str)

# # select features
# hyper_te =pd.read_csv(f'/data/female-amb/aDMR_TE_overlap/Hyper/{ct}.TE.bed', sep='\t', header=None, index_col=3)  
# hypo_te =pd.read_csv(f'/data/female-amb/aDMR_TE_overlap/Hypo/{ct}.TE.bed', sep='\t', header=None, index_col=3)
# # concat hyper and hypo
# te = pd.concat([hyper_te, hypo_te])
# # rebuild the te index from columns 7, 8 and 9, in the same chrom-start-end format as te_ref
# te.index = te[7]+'-'+te[8].astype(str)+'-'+te[9].astype(str)
# te['te_id'] = te_ref[3].to_dict()
# te = te.dropna()
# # drop duplicate rows
# te = te.drop_duplicates()
# te

In [35]:
#mcds = mcds[f'{te_class}_da'].load()  

In [9]:
# Methylation fraction (mc / cov) for everything currently held in `mcds`, i.e.
# without the per-cell-type / per-TE subsetting used in the export loop.
# Subset variant kept for reference:
# tmp = tmp_mcds[f'{te_class}_da'].sel(count_type='mc').values.astype(np.float16)/tmp_mcds[f'{te_class}_da'].sel(count_type='cov').values.astype(np.float16)

# The statement below was indented at top level, which is an "unexpected indent"
# error; it now starts at column 0.
tmp = mcds[f'{te_class}_da'].sel(count_type='mc').values.astype(np.float16)/mcds[f'{te_class}_da'].sel(count_type='cov').values.astype(np.float16)


In [9]:
# ads = anndata.AnnData(csr_matrix(tmp<=0.75, dtype=np.uint8), 
#                       obs=mcds.get_index('cell').to_frame()[[]],
#                       var=mcds.get_index(te_class).to_frame()[[]],
#                       dtype=np.uint8)

In [10]:
# ads.write_h5ad(f"adata/{ct}.{te_class}.downsampled.h5ad")
# ads

In [11]:
# del tmp

In [40]:
# mcds.add_mc_frac(
#     var_dim = te_class,
# normalize_per_cell=True,  # after calculating mC frac, per cell normalize the matrix
#     clip_norm_value=10  # clip outlier values above 10 to 10
# )


In [41]:
# adata = mcds.get_count_adata(da_name=f'{te_class}_da_frac',var_dim=te_class, 
#                                 loading_chunk=50000)


In [42]:
#adata.write_h5ad(f"{te_class}.norm_downsampled.h5ad")