In [None]:
import pandas as pd
import numpy as np
from wmb import *
from ALLCools.plot import *

import pathlib

In [None]:
# Load the per-cell annotation tables used by every section below.
# NOTE(review): `cemba` and `broad` come from `from wmb import *`; the returned
# objects are later indexed by column name and filtered with .sel(cell=...),
# so they are presumably xarray-style datasets with a 'cell' dimension — confirm.
mc_annot = cemba.get_mc_annot()
rna_annot = broad.get_tenx_annot()

In [None]:
def save_index(cells, file_name, downsample=100000, random_state=0):
    """Write cell IDs to a plain-text file, one ID per line.

    Parameters
    ----------
    cells : array-like
        Cell IDs (e.g. a pandas Index or a list); wrapped in a ``pd.Series``.
    file_name : str or path-like
        Destination file. It is written without a header and without the
        row index.
    downsample : int or None, default 100000
        Maximum number of IDs to write. When ``cells`` holds more entries,
        a random subset of this size is drawn. ``None`` disables
        downsampling, so every ID is written.
    random_state : int, default 0
        Seed forwarded to ``Series.sample`` so the subset is reproducible.
    """
    cells = pd.Series(cells)
    if downsample is not None and cells.size > downsample:
        cells = cells.sample(downsample, random_state=random_state)
    # index=False is the documented way to drop the row index; the previous
    # index=None only worked because None happens to be falsy.
    cells.to_csv(file_name, index=False, header=False)

In [None]:
# Cells from mc_annot whose 'MajorRegion' value equals 'CB' (displayed for inspection).
mc = mc_annot['MajorRegion'].to_pandas().loc[lambda region: region == 'CB']
mc

In [None]:
# Cells from rna_annot whose 'MajorRegion' value equals 'CB' (displayed for inspection).
rna = rna_annot['MajorRegion'].to_pandas().loc[lambda region: region == 'CB']
rna

## L1 - Neuron Non-Neuron

### prepare neuron

In [None]:
# Neurons: every cell whose L1 annotation is NOT in the label lists below.
mc_neurons = ~mc_annot['L1_annot'].isin(['ODC', 'OPC', 'ASC', 'MGC', 'CB', 'CBX', 'DG'])
rna_neurons = ~rna_annot['L1_annot'].isin(['ODC', 'ASC', 'ECPC', 'NonN', 'OPC', 'VLMC', 'CB', 'OLF1'])

# Create the output directory (and ../L1 itself) so the cell also works on a
# fresh checkout; without it the writes below fail when the folder is missing.
pathlib.Path('../L1/Neuron').mkdir(parents=True, exist_ok=True)

# downsample is far larger than the cell lists are expected to be, so every
# selected cell is presumably written — confirm against the dataset size.
save_index(mc_annot.sel(cell=mc_neurons).get_index('cell'),
           file_name='../L1/Neuron/mc_cells.txt',
           downsample=10000000,
           random_state=0)

save_index(rna_annot.sel(cell=rna_neurons).get_index('cell'),
           file_name='../L1/Neuron/rna_cells.txt',
           downsample=10000000,
           random_state=0)




### prepare non_neuron

In [None]:
# Non-neurons: every cell whose L1 annotation IS in the label lists below.
mc_non_neurons = mc_annot['L1_annot'].isin(['ODC', 'OPC', 'ASC', 'MGC', 'CB', 'CBX', 'DG'])
rna_non_neurons = rna_annot['L1_annot'].isin(['ODC', 'ASC', 'ECPC', 'NonN', 'OPC', 'VLMC', 'CB', 'OLF1'])

# parents=True also creates ../L1 when it does not exist yet, matching the
# mkdir calls used for the L2/L3/L4 output folders.
pathlib.Path('../L1/NonNeuron').mkdir(parents=True, exist_ok=True)

save_index(mc_annot.sel(cell=mc_non_neurons).get_index('cell'),
           file_name='../L1/NonNeuron/mc_cells.txt',
           downsample=10000000,
           random_state=0)

save_index(rna_annot.sel(cell=rna_non_neurons).get_index('cell'),
           file_name='../L1/NonNeuron/rna_cells.txt',
           downsample=10000000,
           random_state=0)



In [None]:
# Non-neuron sub-groups: output folder name -> L1_annot labels pooled into it.
# 'DG' is deliberately left without a sub-group on the mC side.
mc_non_neurons_dict = {
    'OPC': ['OPC'],
    'ODC': ['ODC'],
    'ASC': ['ASC'],
    'NonN': ['MGC'],
    'CB': ['CB', 'CBX'],
}

# 'OLF1' is deliberately left without a sub-group on the RNA side.
rna_non_neurons_dict = {
    'OPC': ['OPC'],
    'ODC': ['ODC'],
    'ASC': ['ASC'],
    'NonN': ['MGC', 'NonN', 'ECPC', 'VLMC'],
    'CB': ['CB'],
}

# Write one mc_cells.txt per sub-group. parents=True lets the cell run even
# when ../L1 has not been created yet.
for group, mt in mc_non_neurons_dict.items():
    pathlib.Path(f'../L1/{group}').mkdir(parents=True, exist_ok=True)
    mc_non_neurons = mc_annot['L1_annot'].isin(mt)

    save_index(mc_annot.sel(cell=mc_non_neurons).get_index('cell'),
               file_name=f'../L1/{group}/mc_cells.txt',
               downsample=10000000,
               random_state=0)

# Same export for the RNA cells, into the same per-group folders.
for group, mt in rna_non_neurons_dict.items():
    pathlib.Path(f'../L1/{group}').mkdir(parents=True, exist_ok=True)
    rna_non_neurons = rna_annot['L1_annot'].isin(mt)

    save_index(rna_annot.sel(cell=rna_non_neurons).get_index('cell'),
               file_name=f'../L1/{group}/rna_cells.txt',
               downsample=10000000,
               random_state=0)



In [None]:
# Fill in the 'Snakefile' template from the current directory and write the
# result to ../L1/Snakefile.
cwd = pathlib.Path().absolute()
mc_type = 'CGN'

# (placeholder, replacement) pairs, applied to the template text in this order.
replacements = [
    ('REPLACE_MC_TYPE', str(mc_type)),
    ('REPLACE_TEMPLATE_DIR', str(cwd)),
    ('REPLACE_DATASET', 'BROAD_TENX'),
    ('REPLACE_CLUSTER_COL', str('L1')),
    ('REPLACE_INTEGRATION_GROUP_KEY', str('L2')),
    ('REPLACE_INTEGRATION_PLOT_KEY', str('L1_annot')),
    # The list is inserted as its Python repr, e.g. "['L1_annot', ...]".
    ('REPLACE_CATEGORICAL_KEY', str(['L1_annot', 'L2', 'DissectionRegion'])),
]

with open('Snakefile') as f, open('../L1/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, value in replacements:
        snakefile_tmp = snakefile_tmp.replace(placeholder, value)
    out_f.write(snakefile_tmp)
    

### MajorRegion

In [None]:
# Output folder name -> 'MajorRegion' values exported into it, per modality.
mc_region_to_use = dict(OLF=['OLF'])
rna_region_to_use = dict(OLF=['OLF'])

In [None]:
# Write one mc_cells.txt per region group. parents=True also creates
# ../MajorRegion itself, so the cell works on a fresh checkout.
for group, mt in mc_region_to_use.items():
    pathlib.Path(f'../MajorRegion/{group}').mkdir(parents=True, exist_ok=True)
    mc_cells = mc_annot['MajorRegion'].isin(mt)

    save_index(mc_annot.sel(cell=mc_cells).get_index('cell'),
               file_name=f'../MajorRegion/{group}/mc_cells.txt',
               downsample=10000000,
               random_state=0)

# Same export for the RNA cells, into the same per-region folders.
for group, mt in rna_region_to_use.items():
    pathlib.Path(f'../MajorRegion/{group}').mkdir(parents=True, exist_ok=True)
    rna_cells = rna_annot['MajorRegion'].isin(mt)

    save_index(rna_annot.sel(cell=rna_cells).get_index('cell'),
               file_name=f'../MajorRegion/{group}/rna_cells.txt',
               downsample=10000000,
               random_state=0)

In [None]:
# Fill in the 'Snakefile' template from the current directory and write the
# result to ../MajorRegion/Snakefile.
cwd = pathlib.Path().absolute()
mc_type = 'CHN'

# (placeholder, replacement) pairs, applied to the template text in this order.
replacements = [
    ('REPLACE_MC_TYPE', str(mc_type)),
    ('REPLACE_TEMPLATE_DIR', str(cwd)),
    ('REPLACE_DATASET', 'BROAD_TENX'),
    ('REPLACE_CLUSTER_COL', str('L1')),
    ('REPLACE_INTEGRATION_GROUP_KEY', str('L2')),
    ('REPLACE_INTEGRATION_PLOT_KEY', str('L1_annot')),
    # The list is inserted as its Python repr, e.g. "['L1_annot', ...]".
    ('REPLACE_CATEGORICAL_KEY', str(['L1_annot', 'L2', 'DissectionRegion'])),
]

with open('Snakefile') as f, open('../MajorRegion/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, value in replacements:
        snakefile_tmp = snakefile_tmp.replace(placeholder, value)
    out_f.write(snakefile_tmp)
    

## L2

In [None]:
import joblib
import pathlib
import subprocess

In [None]:
# Read the integration-group tables written under ../L1/Neuron (first column
# is the index), then squeeze each one down to a Series.
mc_group_table = pd.read_csv('../L1/Neuron/mc_integration_group.csv.gz',
                             index_col=0)
rna_group_table = pd.read_csv('../L1/Neuron/rna_integration_group.csv.gz',
                              index_col=0)

mc_groups = mc_group_table.squeeze()
rna_groups = rna_group_table.squeeze()

In [None]:
# Inspect the raw integration-group values assigned to the mC cells.
mc_groups.values

In [None]:
# Show the mC cells whose group value is -1.
# NOTE(review): presumably -1 marks cells without an integration group — confirm.
mc_groups[mc_groups.values == -1]

In [None]:
# List the distinct integration-group values present among the mC cells.
mc_groups.unique()

In [None]:
# One ../L2/InteGroup<id> folder per integration group found in the RNA data,
# holding the mC and RNA cell IDs that belong to that group.
for group_id in rna_groups.unique():
    mc_cells = mc_groups.loc[mc_groups == group_id].index
    rna_cells = rna_groups.loc[rna_groups == group_id].index

    integration_group = f'InteGroup{group_id}'
    out_dir = pathlib.Path(f'../L2/{integration_group}')
    out_dir.mkdir(parents=True, exist_ok=True)

    save_index(mc_cells,
               f'../L2/{integration_group}/mc_cells.txt',
               downsample=10000000,
               random_state=0)
    save_index(rna_cells,
               f'../L2/{integration_group}/rna_cells.txt',
               downsample=10000000,
               random_state=0)

# Fill in the 'Snakefile' template and write the result to ../L2/Snakefile.
# NOTE(review): unlike the L1 and MajorRegion cells, REPLACE_MC_TYPE is not
# substituted here — confirm the template does not need it at this level.
cwd = pathlib.Path().absolute()
replacements = [
    ('REPLACE_TEMPLATE_DIR', str(cwd)),
    ('REPLACE_DATASET', 'BROAD_TENX'),
    ('REPLACE_CLUSTER_COL', str('L2')),
    ('REPLACE_INTEGRATION_GROUP_KEY', str('L3')),
    ('REPLACE_INTEGRATION_PLOT_KEY', str('L2')),
    ('REPLACE_CATEGORICAL_KEY', str(['L2', 'L3', 'DissectionRegion'])),
]
with open('Snakefile') as f, open('../L2/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, value in replacements:
        snakefile_tmp = snakefile_tmp.replace(placeholder, value)
    out_f.write(snakefile_tmp)

## L3

In [None]:
import joblib
import pathlib
import subprocess

In [None]:
# Names of the integration-group folders found under ../L2.
# path.name replaces splitting on '/', which only works with POSIX separators.
# Sorting makes the processing order reproducible, because glob order is
# arbitrary. is_dir() skips any stray file that happens to match the pattern.
L2_group = sorted(
    path.name
    for path in pathlib.Path('../L2').glob('InteGroup*')
    if path.is_dir()
)

In [None]:
# For each L2 folder, split its cells by integration group and write the
# per-group cell lists to ../L3/<l2_group>_<group id>/.
for l2_group in L2_group:
    mc_groups = pd.read_csv(f'../L2/{l2_group}/mc_integration_group.csv.gz',
                            index_col=0).squeeze()
    rna_groups = pd.read_csv(f'../L2/{l2_group}/rna_integration_group.csv.gz',
                             index_col=0).squeeze()

    for group_id in rna_groups.unique():
        mc_cells = mc_groups.loc[mc_groups == group_id].index
        rna_cells = rna_groups.loc[rna_groups == group_id].index

        integration_group = f'{l2_group}_{group_id}'

        # Groups with at most 150 cells in either modality are only printed
        # and get no output folder.
        if min(mc_cells.size, rna_cells.size) <= 150:
            print(integration_group)
            continue

        out_dir = pathlib.Path(f'../L3/{integration_group}')
        out_dir.mkdir(parents=True, exist_ok=True)

        # NOTE(review): downsample is 1000000 here but 10000000 in the L1/L2
        # exports — confirm the smaller cap is intended.
        save_index(mc_cells,
                   f'../L3/{integration_group}/mc_cells.txt',
                   downsample=1000000,
                   random_state=0)
        save_index(rna_cells,
                   f'../L3/{integration_group}/rna_cells.txt',
                   downsample=1000000,
                   random_state=0)


In [None]:
# Fill in the 'Snakefile' template from the current directory and write the
# result to ../L3/Snakefile.
cwd = pathlib.Path().absolute()

# (placeholder, replacement) pairs, applied to the template text in this order.
replacements = [
    ('REPLACE_TEMPLATE_DIR', str(cwd)),
    ('REPLACE_DATASET', 'BROAD_TENX'),
    ('REPLACE_CLUSTER_COL', str('L3')),
    ('REPLACE_INTEGRATION_GROUP_KEY', str('L3')),
    ('REPLACE_INTEGRATION_PLOT_KEY', str('L3')),
    ('REPLACE_CATEGORICAL_KEY', str(['L3', 'DissectionRegion'])),
]

with open('Snakefile') as f, open('../L3/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, value in replacements:
        snakefile_tmp = snakefile_tmp.replace(placeholder, value)
    out_f.write(snakefile_tmp)

## L4

In [None]:
# Names of the integration-group folders found under ../L2.
# path.name replaces splitting on '/', which only works with POSIX separators.
# Sorting makes the processing order reproducible, because glob order is
# arbitrary. is_dir() skips any stray file that happens to match the pattern.
L2_group = sorted(
    path.name
    for path in pathlib.Path('../L2').glob('InteGroup*')
    if path.is_dir()
)

In [None]:
# For each L2 folder, split its cells by integration group and write the
# per-group cell lists to ../L4/<l2_group>_<group id>/.
# NOTE(review): the inputs are read from ../L2, exactly as in the L3 export,
# so the L4 cell lists match the L3 ones — confirm this should not read ../L3.
for l2_group in L2_group:
    mc_groups = pd.read_csv(f'../L2/{l2_group}/mc_integration_group.csv.gz',
                            index_col=0).squeeze()
    rna_groups = pd.read_csv(f'../L2/{l2_group}/rna_integration_group.csv.gz',
                             index_col=0).squeeze()

    for group_id in rna_groups.unique():
        mc_cells = mc_groups.loc[mc_groups == group_id].index
        rna_cells = rna_groups.loc[rna_groups == group_id].index

        integration_group = f'{l2_group}_{group_id}'

        # Groups with at most 150 cells in either modality are only printed
        # and get no output folder.
        if min(mc_cells.size, rna_cells.size) <= 150:
            print(integration_group)
            continue

        out_dir = pathlib.Path(f'../L4/{integration_group}')
        out_dir.mkdir(parents=True, exist_ok=True)

        save_index(mc_cells,
                   f'../L4/{integration_group}/mc_cells.txt',
                   downsample=1000000,
                   random_state=0)
        save_index(rna_cells,
                   f'../L4/{integration_group}/rna_cells.txt',
                   downsample=1000000,
                   random_state=0)


In [None]:
# Fill in the 'Snakefile' template from the current directory and write the
# result to ../L4/Snakefile.
cwd = pathlib.Path().absolute()

# (placeholder, replacement) pairs, applied to the template text in this order.
# NOTE(review): the cluster column is 'L3' here while the group and plot keys
# are 'L4' — confirm this is intended.
replacements = [
    ('REPLACE_TEMPLATE_DIR', str(cwd)),
    ('REPLACE_DATASET', 'BROAD_TENX'),
    ('REPLACE_CLUSTER_COL', str('L3')),
    ('REPLACE_INTEGRATION_GROUP_KEY', str('L4')),
    ('REPLACE_INTEGRATION_PLOT_KEY', str('L4')),
    ('REPLACE_CATEGORICAL_KEY', str(['L3', 'L4','DissectionRegion'])),
]

with open('Snakefile') as f, open('../L4/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, value in replacements:
        snakefile_tmp = snakefile_tmp.replace(placeholder, value)
    out_f.write(snakefile_tmp)