In [None]:
import pandas as pd
import numpy as np
import pathlib
from wmb import *
from ALLCools.plot import *

In [None]:
# Load the annotation objects used below to select cells along their `cell` dimension.
# NOTE(review): `cemba` and `cemba_atac` are not defined in this notebook;
# presumably they are provided by the `from wmb import *` star import - confirm.
mc_annot = cemba.get_mc_annot()
atac_annot = cemba_atac.get_atac_annot()

In [None]:
def save_index(cells, file_name, downsample=100000, random_state=0):
    """Write cell identifiers to ``file_name``, one per line, optionally downsampled.

    Parameters
    ----------
    cells
        Anything ``pd.Series`` accepts (list, array, ``pd.Index``, ...).
    file_name
        Destination path handed to ``Series.to_csv``.
    downsample
        Maximum number of entries to write. When there are more entries than
        this, a random subset of exactly this size is drawn. ``None`` disables
        downsampling and writes everything.
    random_state
        Seed forwarded to ``Series.sample`` so the subset is reproducible.
    """
    cells = pd.Series(cells)
    if downsample is not None and cells.size > downsample:
        cells = cells.sample(downsample, random_state=random_state)
    # `index` is documented as a bool; False keeps the row labels out of the file.
    cells.to_csv(file_name, index=False, header=False)

## L1 - Neuron vs. Non-Neuron

In [None]:
# Boolean neuron masks: a cell counts as a neuron when its label is NOT in the
# exclusion list of its modality. Per the original note, the neuron set also
# does not include DG-GC and CB.
mc_excluded_l1_labels = ['ODC', 'OPC', 'ASC', 'MGC', 'CB', 'CBX', 'DG']
atac_excluded_l2_labels = [
    'VPIA', 'VLMC', 'MGL', 'PER', 'VEC', 'RGL', 'ASC', 'EPEN', 'BERG', 'OPC',
    'IOL', 'OGC', 'GRC', 'GRANGL',
]

mc_neurons = ~mc_annot['L1_annot'].isin(mc_excluded_l1_labels)
atac_neurons = ~atac_annot['L2_annot'].isin(atac_excluded_l2_labels)

In [None]:
# Export the neuron cell lists of both modalities into ../L1/Neuron.
# Both calls pass 10000000 as the downsample cap.
l1_neuron_dir = pathlib.Path('../L1/Neuron')
l1_neuron_dir.mkdir(parents=True, exist_ok=True)

mc_neuron_cells = mc_annot.sel(cell=mc_neurons).get_index('cell')
save_index(mc_neuron_cells,
           file_name='../L1/Neuron/mc_cells.txt',
           downsample=10000000,
           random_state=0)

atac_neuron_cells = atac_annot.sel(cell=atac_neurons).get_index('cell')
save_index(atac_neuron_cells,
           file_name='../L1/Neuron/atac_cells.txt',
           downsample=10000000,
           random_state=0)

In [None]:
# Render the Snakefile template found in the working directory and write the
# filled-in copy to ../L1/Snakefile.
cwd = pathlib.Path().absolute()

# placeholder -> replacement text; applied one after another in this order
l1_substitutions = {
    'REPLACE_TEMPLATE_DIR': str(cwd),
    'REPLACE_DATASET': 'CEMBA_ATAC',
    'REPLACE_CLUSTER_COL': 'L1',
    'REPLACE_INTEGRATION_GROUP_KEY': 'L2',
    'REPLACE_INTEGRATION_PLOT_KEY': 'L1_annot',
    # the list is inserted as its Python repr
    'REPLACE_CATEGORICAL_KEY': str(['L1_annot', 'L2', 'DissectionRegion']),
}

with open('Snakefile') as f, open('../L1/Snakefile', 'w') as out_f:
    snakefile_tmp = f.read()
    for placeholder, replacement in l1_substitutions.items():
        snakefile_tmp = snakefile_tmp.replace(placeholder, replacement)
    out_f.write(snakefile_tmp)

## Prepare L2

In [None]:
# Read the integration-group tables from ../L1/Neuron, one per modality.
# The first CSV column becomes the index. squeeze('columns') turns the single
# remaining column into a Series and, unlike a bare squeeze(), never collapses
# a one-row table down to a scalar.
mc_groups = pd.read_csv('../L1/Neuron/mc_integration_group.csv.gz',
                        index_col=0).squeeze('columns')
atac_groups = pd.read_csv('../L1/Neuron/atac_integration_group.csv.gz',
                          index_col=0).squeeze('columns')

In [None]:
# Display how many entries of the mc table fall into each integration group.
mc_groups.value_counts()

In [None]:
# Display how many entries of the atac table fall into each integration group.
atac_groups.value_counts()

In [None]:
# for L2 Neuron
# Walk over the integration groups present in the mc table. A group gets its
# own ../L2/Neuron/InteGroup<label> folder with both cell lists only when it
# has more than 150 cells in the mc table AND in the atac table; otherwise its
# name is just printed.
for group_label in mc_groups.unique():
    mc_mask = mc_groups == group_label
    atac_mask = atac_groups == group_label
    mc_cells = mc_groups[mc_mask].index
    atac_cells = atac_groups[atac_mask].index

    group_name = f'InteGroup{group_label}'

    if not (mc_cells.size > 150 and atac_cells.size > 150):
        print(group_name)
        continue

    out_dir = pathlib.Path(f'../L2/Neuron/{group_name}')
    out_dir.mkdir(parents=True, exist_ok=True)
    save_index(mc_cells,
               f'../L2/Neuron/{group_name}/mc_cells.txt',
               downsample=1000000,
               random_state=0)
    save_index(atac_cells,
               f'../L2/Neuron/{group_name}/atac_cells.txt',
               downsample=1000000,
               random_state=0)

    
# Render the Snakefile template found in the working directory and write the
# filled-in copy to ../L2/Neuron/Snakefile.
cwd = pathlib.Path().absolute()

l2_snakefile = pathlib.Path('../L2/Neuron/Snakefile')
# ../L2/Neuron is otherwise only created when at least one integration group
# passes the size filter; create it here so writing cannot fail on a missing
# folder.
l2_snakefile.parent.mkdir(parents=True, exist_ok=True)

# placeholder -> replacement text; applied one after another in this order
l2_substitutions = {
    'REPLACE_TEMPLATE_DIR': str(cwd),
    'REPLACE_DATASET': 'CEMBA_ATAC',
    'REPLACE_CLUSTER_COL': 'L2',
    'REPLACE_INTEGRATION_GROUP_KEY': 'L3',
    'REPLACE_INTEGRATION_PLOT_KEY': 'L2',
    # the list is inserted as its Python repr
    'REPLACE_CATEGORICAL_KEY': str(['L2', 'L3', 'DissectionRegion']),
}

# Read the template completely before opening the output, so a failed read
# cannot leave a truncated Snakefile behind.
with open('Snakefile') as f:
    snakefile_tmp = f.read()
for placeholder, replacement in l2_substitutions.items():
    snakefile_tmp = snakefile_tmp.replace(placeholder, replacement)
with open(l2_snakefile, 'w') as out_f:
    out_f.write(snakefile_tmp)

### Prepare L4

In [None]:
import joblib
import pathlib
import subprocess

In [None]:
# Names of the integration-group folders found under ../L2/Neuron.
# Path.name is used instead of splitting the string on '/', which depends on
# the platform's path separator. Matches that are not directories are skipped,
# and the names are sorted so the processing order does not depend on the
# order in which the filesystem lists them.
L2_group = sorted(
    path.name
    for path in pathlib.Path('../L2/Neuron').glob('InteGroup*')
    if path.is_dir()
)

In [None]:
# Display the collected L2 integration-group folder names.
L2_group

In [None]:
# For every L2 integration-group folder, read its two integration-group tables
# and split the cells further. A sub-group gets its own
# ../L4/Neuron/<l2_group>_<label> folder with both cell lists only when it has
# more than 150 cells in the mc table AND in the atac table; otherwise its name
# is just printed.
# Note: this rebinds the notebook-level `mc_groups` / `atac_groups` names.
for l2_group in L2_group:
    # squeeze('columns') always yields a Series for a single-column table,
    # whereas a bare squeeze() would collapse a one-row table to a scalar.
    mc_groups = pd.read_csv(
        f'../L2/Neuron/{l2_group}/mc_integration_group.csv.gz',
        index_col=0).squeeze('columns')
    atac_groups = pd.read_csv(
        f'../L2/Neuron/{l2_group}/atac_integration_group.csv.gz',
        index_col=0).squeeze('columns')

    for group_label in atac_groups.unique():
        mc_cells = mc_groups[mc_groups == group_label].index
        atac_cells = atac_groups[atac_groups == group_label].index

        # Build the output name separately instead of rebinding the loop variable.
        group_name = f'{l2_group}_{group_label}'
        if mc_cells.size <= 150 or atac_cells.size <= 150:
            print(group_name)
            continue

        out_dir = pathlib.Path(f'../L4/Neuron/{group_name}')
        out_dir.mkdir(exist_ok=True, parents=True)

        save_index(mc_cells,
                   out_dir / 'mc_cells.txt',
                   downsample=1000000,
                   random_state=0)
        save_index(atac_cells,
                   out_dir / 'atac_cells.txt',
                   downsample=1000000,
                   random_state=0)


In [None]:
# Render the Snakefile template found in the working directory and write the
# filled-in copy to ../L4/Neuron/Snakefile.
cwd = pathlib.Path().absolute()

l4_snakefile = pathlib.Path('../L4/Neuron/Snakefile')
# ../L4/Neuron is otherwise only created when at least one integration group
# passes the size filter; create it here so writing cannot fail on a missing
# folder.
l4_snakefile.parent.mkdir(parents=True, exist_ok=True)

# placeholder -> replacement text; applied one after another in this order
l4_substitutions = {
    'REPLACE_TEMPLATE_DIR': str(cwd),
    'REPLACE_DATASET': 'CEMBA_ATAC',
    'REPLACE_CLUSTER_COL': 'L4',
    'REPLACE_INTEGRATION_GROUP_KEY': 'L4',
    'REPLACE_INTEGRATION_PLOT_KEY': 'L4',
    # the list is inserted as its Python repr
    'REPLACE_CATEGORICAL_KEY': str(['L3', 'L4', 'DissectionRegion']),
}

# Read the template completely before opening the output, so a failed read
# cannot leave a truncated Snakefile behind.
with open('Snakefile') as f:
    snakefile_tmp = f.read()
for placeholder, replacement in l4_substitutions.items():
    snakefile_tmp = snakefile_tmp.replace(placeholder, replacement)
with open(l4_snakefile, 'w') as out_f:
    out_f.write(snakefile_tmp)