In [6]:
import os

# Turn off HDF5 file locking for this process. This is set through the
# environment, so it needs to run before any HDF5-backed file is opened.
os.environ.update({"HDF5_USE_FILE_LOCKING": "FALSE"})

In [2]:
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE


In [3]:
# Scanpy session configuration (expects `sc` from the dependency-import cell).
sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)
# Set maximum number of jobs for Scanpy.
# The setting is spelled `n_jobs`; the previous `sc.settings.njobs` is not a
# Scanpy setting, so the limit was never actually applied.
sc.settings.n_jobs = 30

-----
anndata     0.8.0
scanpy      1.9.2
-----
MulticoreTSNE       NA
PIL                 9.4.0
asttokens           NA
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
cffi                1.15.1
cloudpickle         2.2.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
cytoolz             0.12.1
dask                2023.2.0
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
hypergeom_ufunc     NA
invgauss_ufunc      NA
ipykernel           6.21.2
jedi                0.18.2
jinja2              3.1.2
joblib              1.2.0
kiwisolver          1.4.4
llvmlite            0.39.1
loompy              3.0.7
markupsafe          2.1.2
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
nbinom_ufunc        NA
ncf_ufunc           NA
nct_ufunc           NA
ncx2_ufunc          NA
numba               0.56.4
numexpr         

In [4]:
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

import seaborn as sns

# Scratch folder. The "~" is stored literally: nothing here expands it, so
# callers must apply os.path.expanduser() before using it as a real path.
DATA_FOLDER = '~/tmp'

# Folder holding the gene-based ranking databases, and the glob pattern
# selecting every "*.genes_vs_motifs.rankings.feather" file inside it.
# NOTE(review): these are machine-specific absolute paths.
DATABASE_FOLDER = (
    "/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/databases"
    "/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based"
)
DATABASES_GLOB = os.path.join(
    DATABASE_FOLDER,
    "*.genes_vs_motifs.rankings.feather",
)

# Motif annotation table (motif2tf).
MOTIF_ANNOTATIONS_FNAME = (
    "/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/motif2tf"
    "/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl"
)

# Transcription-factor name list (hg38).
HG_TFS_FNAME = (
    "/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/tf_lists"
    "/allTFs_hg38.txt"
)




In [7]:
# Per-sample pipeline relating treatment covariates to regulon activity:
#   1. load the per-cell metadata and the pickled AUCell matrix of the sample,
#   2. build covariate columns (morphogens, basal-media indicators) plus their
#      pairwise and three-way products,
#   3. append those covariates as extra columns of the activity matrix and run
#      GRNBoost2 with the covariates as the candidate regulators (`tf_names`),
#   4. for each seed, write adjacencies, modules and a flat module summary.
#
# `modules_from_adjacencies` is expected to be in scope from the pySCENIC
# import cell; everything else this cell needs is imported right here, once,
# instead of being re-imported on every loop iteration.
import os
import pickle
import re
import warnings

import pandas as pd
from arboreto.algo import grnboost2
from distributed import LocalCluster, Client

# Adding many columns one at a time triggers pandas' fragmentation warning.
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

# Output location. Created up front so that a missing directory is not first
# discovered after a multi-minute GRNBoost2 run.
out_dir = 'pyscenic_morphogens_interact/consensus_0'
os.makedirs(out_dir, exist_ok=True)

# Metadata columns that are never used as covariates (QC metrics, barcodes,
# clustering labels, embeddings, and the raw columns that are re-encoded below).
non_variable_columns = ['sample', 'species', 'gene_count', 'tscp_count', 'mread_count',
   'bc1_well', 'bc2_well', 'bc3_well', 'bc1_wind', 'bc2_wind', 'bc3_wind',
   'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
   'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt',
   'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
   'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes', 'percent_mito',
   'n_counts', 'outlier', 'mt_outlier', 'doublet_score',
   'predicted_doublet', 'leiden_4', 'leiden_10', 'merged_clusters_from_10',
   'plateID', 'parse_id', 'AP_axis', 'DV_axis', 'Basal_media',
   'final_clustering', 'CycA','tSNE_1', 'tSNE_2', 'umap_1', 'umap_2']

# Base covariates from which interaction terms are built. 'M_RA' used to be
# listed twice; the duplicate only recomputed already-existing columns, so
# dropping it leaves the resulting column names, order and values unchanged.
variables = ['M_FGF8','M_RA','M_XAV','M_CHIR','M_BMP4','M_SHH','M_CycA',
             'N2B27_2Si','NIM','N2B27_SB_CHIR']

for sample in ['iGlut_post','iGABA_pre','iGABA_post','iGlut_pre']:
    print(sample)

    # load metadata
    meta = pd.read_csv("scanpy/"+sample+"_dr_clustered_raw_merged_meta.tsv",sep="\t",index_col=0)
    # expose CycA under the same 'M_' prefix as the other morphogen columns
    meta['M_CycA'] = meta['CycA']

    # load regulon activities
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles produced by this project.
    with open("pyscenic/regulons/consensus_0/aucell_"+sample+".p",'rb') as handle:
        auc_mtx = pickle.load(handle)

    # only keep regulons that were detected in the sample
    regulons_sample = [x for x in auc_mtx.columns if sample in x]
    auc_mtx = auc_mtx[regulons_sample].copy()

    # rename columns: strip the "<sample>--" prefix
    auc_mtx.columns = [re.sub(sample+"--","",x) for x in auc_mtx.columns]

    # one-hot encode the basal medium; the columns stay all-zero when the
    # metadata has no 'Basal_media' column
    meta['N2B27_2Si'] = 0
    meta['NIM'] = 0
    meta['N2B27_SB_CHIR'] = 0
    if 'Basal_media' in meta.columns:
        meta.loc[meta['Basal_media']=='N2B27_2Si','N2B27_2Si'] = 1
        meta.loc[meta['Basal_media']=='NIM','NIM'] = 1
        meta.loc[meta['Basal_media']=='N2B27_SB_CHIR','N2B27_SB_CHIR'] = 1

    # Interaction terms as element-wise products. Every ordered pair/triple is
    # generated on purpose (same as before); columns that carry identical
    # values are collapsed further down by drop_duplicates, which keeps the
    # first occurrence in column order.
    for morph1 in variables:
        for morph2 in variables:
            if morph1!=morph2:
                meta[morph1+':'+morph2] = meta[morph1]*meta[morph2]
                for morph3 in variables:
                    if morph3 not in [morph1,morph2]:
                        meta[morph1+':'+morph2+':'+morph3] = meta[morph1]*meta[morph2]*meta[morph3]

    meta_var_cols = [x for x in meta.columns if x not in non_variable_columns]
    meta_var = meta[meta_var_cols].copy()

    # drop covariates whose column sum is not positive (e.g. all-zero columns)
    a = meta_var.sum()
    a = a[a>0]

    meta_var = meta_var[a.index]

    # collapse covariates that have identical values across all cells
    meta_var_uniq = meta_var.T.drop_duplicates().T

    # add the covariates to the activity matrix, aligned on the matrix's index
    morphogens = meta_var_uniq.columns
    for x in morphogens:
        auc_mtx[x] = meta_var_uniq.loc[auc_mtx.index,x]

    # the covariates take the role of the "TFs" (candidate regulators)
    tf_names = list(morphogens)

    # run GRN inference once per seed
    for seed in [42,25,7]:
        if __name__ != '__main__':
            # Not the main program (e.g. imported, or re-executed by a spawned
            # worker process): do not start a cluster. Previously the code
            # below the guard still ran and pickled an undefined or stale
            # `adjacencies`.
            continue

        # Custom LocalCluster/Client, held in context managers so the workers
        # are shut down even when grnboost2 raises (client first, then cluster).
        with LocalCluster(n_workers=30,
                          threads_per_worker=2,
                          memory_limit=8e9) as local_cluster:
            with Client(local_cluster) as custom_client:
                adjacencies = grnboost2(expression_data=auc_mtx,
                                        tf_names=tf_names,
                                        client_or_address=custom_client,
                                        seed=seed,
                                        verbose=True)

        # NOTE: the "_adjacenies_" spelling is kept deliberately so existing
        # files and downstream readers keep matching.
        with open(os.path.join(out_dir, sample+'_adjacenies_'+str(seed)+'.p'), "wb") as f:
            pickle.dump(adjacencies, f)

        # min_genes=0 so that no module is discarded for being small
        modules = list(modules_from_adjacencies(adjacencies, auc_mtx,min_genes=0))
        with open(os.path.join(out_dir, sample+'_modules_'+str(seed)+'.p'), "wb") as f:
            pickle.dump(modules, f)

        # flatten the modules into one row per (module, gene) with its weight
        module_summary = pd.DataFrame([
            dict(morph=re.sub("Regulon for ","",module.name),
                 gene=gene,
                 w=module[gene],
                 context=module.context)
            for module in modules
            for gene in module.genes
        ])
        module_summary.to_csv(os.path.join(out_dir, sample+"_module_summary_"+str(seed)+".tsv"),sep="\t")



iGlut_post
preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 15:40:55,280 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 15:40:56,004 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 15:48:25,780 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 15:48:26,497 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 15:55:55,748 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 15:55:56,473 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


iGABA_pre
preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 16:05:13,152 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 16:05:14,020 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 16:13:53,366 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 16:13:54,246 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 16:22:47,064 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 16:22:47,947 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


iGABA_post
preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished


2024-04-18 16:26:42,037 - distributed.worker - ERROR - Failed to communicate with scheduler during heartbeat.
Traceback (most recent call last):
  File "/links/groups/treutlein/USERS/jjans/anaconda3/envs/pyscenic/lib/python3.10/site-packages/distributed/comm/tcp.py", line 225, in read
    frames_nbytes = await stream.read_bytes(fmt_size)
tornado.iostream.StreamClosedError: Stream is closed

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/links/groups/treutlein/USERS/jjans/anaconda3/envs/pyscenic/lib/python3.10/site-packages/distributed/worker.py", line 1216, in heartbeat
    response = await retry_operation(
  File "/links/groups/treutlein/USERS/jjans/anaconda3/envs/pyscenic/lib/python3.10/site-packages/distributed/utils_comm.py", line 419, in retry_operation
    return await retry(
  File "/links/groups/treutlein/USERS/jjans/anaconda3/envs/pyscenic/lib/python3.10/site-packages/distributed/utils_comm.py", line 404, in re

preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 16:30:42,210 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 16:30:42,754 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 16:34:20,365 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 16:34:20,876 - pyscenic.utils - INFO - Creating modules.
To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df = adjacencies.groupby(by=COLUMN_NAME_TARGET).apply(


iGlut_pre
preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 17:05:05,115 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 17:05:07,050 - pyscenic.utils - INFO - Creating modules.


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 17:34:06,410 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 17:34:08,733 - pyscenic.utils - INFO - Creating modules.


preparing dask client
parsing input
creating dask graph
30 partitions
computing dask graph
not shutting down client, client was created externally
finished



2024-04-18 18:03:31,418 - pyscenic.utils - INFO - Calculating Pearson correlations.

	Dropout masking is currently set to [False].

2024-04-18 18:03:33,715 - pyscenic.utils - INFO - Creating modules.
