In [1]:
import os
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

In [2]:
# import dependencies
import os
import numpy as np
import pandas as pd
import scanpy as sc
import loompy as lp
from MulticoreTSNE import MulticoreTSNE as TSNE


In [3]:
from resource import getrlimit, setrlimit, RLIMIT_NPROC

In [4]:
RLIMIT_NPROC

6

In [5]:
import resource
resource.setrlimit(resource.RLIMIT_NPROC, (16384, 16384))

In [6]:
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_versions()
sc.set_figure_params(dpi=150, fontsize=10, dpi_save=600)
# Set maximum number of jobs for Scanpy.
sc.settings.njobs = 30

-----
anndata     0.8.0
scanpy      1.9.2
-----
MulticoreTSNE       NA
PIL                 9.4.0
asttokens           NA
backcall            0.2.0
beta_ufunc          NA
binom_ufunc         NA
cffi                1.15.1
cloudpickle         2.2.1
colorama            0.4.6
comm                0.1.2
cycler              0.10.0
cython_runtime      NA
cytoolz             0.12.1
dask                2023.2.0
dateutil            2.8.2
debugpy             1.6.6
decorator           5.1.1
executing           1.2.0
h5py                3.8.0
hypergeom_ufunc     NA
invgauss_ufunc      NA
ipykernel           6.21.2
jedi                0.18.2
jinja2              3.1.2
joblib              1.2.0
kiwisolver          1.4.4
llvmlite            0.39.1
loompy              3.0.7
markupsafe          2.1.2
matplotlib          3.7.0
mpl_toolkits        NA
natsort             8.2.0
nbinom_ufunc        NA
ncf_ufunc           NA
nct_ufunc           NA
ncx2_ufunc          NA
numba               0.56.4
numexpr         

In [7]:
sample = 'iGlut_pre'
adata = sc.read_h5ad("scanpy/"+sample+"_dr_clustered_raw_merged.h5ad")
meta = pd.read_csv("scanpy/"+sample+"_dr_clustered_raw_merged_meta.tsv",sep="\t",index_col=0)

In [8]:
import seaborn as sns
import matplotlib.pyplot as plt

In [9]:
import os
import glob
import pickle
import pandas as pd
import numpy as np

from dask.diagnostics import ProgressBar

from arboreto.utils import load_tf_names
from arboreto.algo import grnboost2

from ctxcore.rnkdb import FeatherRankingDatabase as RankingDatabase
from pyscenic.utils import modules_from_adjacencies, load_motifs
from pyscenic.prune import prune2df, df2regulons
from pyscenic.aucell import aucell

import seaborn as sns

DATA_FOLDER="~/tmp"
DATABASE_FOLDER = "/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/databases/homo_sapiens/hg38/refseq_r80/mc_v10_clust/gene_based"
DATABASES_GLOB = os.path.join(DATABASE_FOLDER, "*.genes_vs_motifs.rankings.feather")
MOTIF_ANNOTATIONS_FNAME = '/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/motif2tf/motifs-v10nr_clust-nr.hgnc-m0.001-o0.0.tbl'
HG_TFS_FNAME = '/home/jjanssens/jjans/resources/resources.aertslab.org/cistarget/tf_lists/allTFs_hg38.txt'




In [10]:
#load TF names
tf_names = load_tf_names(HG_TFS_FNAME)

In [11]:
#load databases
db_fnames = glob.glob(DATABASES_GLOB)
def name(fname):
    return os.path.splitext(os.path.basename(fname))[0]
dbs = [RankingDatabase(fname=fname, name=name(fname)) for fname in db_fnames]
dbs

[FeatherRankingDatabase(name="hg38_10kbp_up_10kbp_down_full_tx_v10_clust.genes_vs_motifs.rankings"),
 FeatherRankingDatabase(name="hg38_500bp_up_100bp_down_full_tx_v10_clust.genes_vs_motifs.rankings")]

In [12]:
adata.obs['CellID'] = adata.obs.index

In [13]:
meta.columns

Index(['sample', 'species', 'gene_count', 'tscp_count', 'mread_count',
       'bc1_well', 'bc2_well', 'bc3_well', 'bc1_wind', 'bc2_wind', 'bc3_wind',
       'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'pct_counts_in_top_20_genes', 'total_counts_mt',
       'log1p_total_counts_mt', 'pct_counts_mt', 'total_counts_ribo',
       'log1p_total_counts_ribo', 'pct_counts_ribo', 'n_genes', 'percent_mito',
       'n_counts', 'outlier', 'mt_outlier', 'doublet_score',
       'predicted_doublet', 'leiden_4', 'leiden_10', 'merged_clusters_from_10',
       'plateID', 'parse_id', 'AP_axis', 'DV_axis', 'Basal_media',
       'final_clustering', 'CycA', 'M_XAV', 'M_CHIR', 'M_RA', 'M_FGF8',
       'M_BMP4', 'M_SHH', 'M_PM', 'tSNE_1', 'tSNE_2', 'umap_1', 'umap_2'],
      dtype='object')

In [14]:
!mkdir pyscenic/{sample}

mkdir: cannot create directory 'pyscenic/iGlut_pre': File exists


In [None]:
for i in range(1,21):
    import random
    random.seed(i)

    subset_cells = list(meta.groupby("final_clustering").sample(n=100,replace=True,random_state=i).index)
    subset_cells = list(set(subset_cells))

    adata_subset = adata[adata.obs['CellID'].isin(subset_cells)].copy()

    dgem = pd.DataFrame.sparse.from_spmatrix(adata_subset.X)
    dgem.index = adata_subset.obs.index
    dgem.columns = adata_subset.var_names

    tf_names = [x for x in tf_names if x in dgem.columns]


    from threadpoolctl import threadpool_info

    import pandas as pd
    from arboreto.utils import load_tf_names
    from arboreto.algo import grnboost2
    from distributed import LocalCluster, Client

    if __name__ == '__main__':
        # create custom LocalCluster and Client instances
        local_cluster = LocalCluster(n_workers=30,
                                     threads_per_worker=2,
                                     memory_limit=8e9)
        custom_client = Client(local_cluster)

        # load the data

        # run GRN inference multiple times
        adjacencies = grnboost2(expression_data=dgem,
                                tf_names=tf_names,
                                client_or_address=custom_client,
                                seed=i,
                                verbose=True)

        # close the Client and LocalCluster after use
        custom_client.close()
        local_cluster.close()

    with open('pyscenic/'+sample+'/adjacenies_'+str(i)+'.p', "wb") as f:
        pickle.dump(adjacencies, f)

    modules = list(modules_from_adjacencies(adjacencies, dgem))

    with open('pyscenic/'+sample+'/modules_'+str(i)+'.p', "wb") as f:
        pickle.dump(modules, f)

    # Calculate a list of enriched motifs and the corresponding target genes for all modules.
    with ProgressBar():
        df = prune2df(dbs, modules, MOTIF_ANNOTATIONS_FNAME)


    # Save the enriched motifs and the discovered regulons to disk.
    df.to_csv('pyscenic/'+sample+'/table_enriched_motifs_'+str(i)+'.csv')
    with open('pyscenic/'+sample+'/table_enriched_motifs_'+str(i)+'.p', "wb") as f:
        pickle.dump(df, f)

    # Create regulons from this table of enriched motifs.
    regulons = df2regulons(df)

    with open('pyscenic/'+sample+'/regulons_'+str(i)+'.p', "wb") as f:
        pickle.dump(regulons, f)


In [82]:
len(regulons)

323

In [85]:
auc_mtx = aucell(dgem, regulons, num_workers=15)


In [86]:
with open('pyscenic/auc_mtx.p', "wb") as f:
    pickle.dump(auc_mtx, f)

In [91]:
dgem_all = pd.DataFrame.sparse.from_spmatrix(adata.X)
dgem_all.index = adata.obs.index
dgem_all.columns = adata.var_names

In [92]:
auc_mtx_all = aucell(dgem_all, regulons, num_workers=15)
with open('pyscenic/auc_mtx_all.p', "wb") as f:
    pickle.dump(auc_mtx_all, f)

In [94]:
from pyscenic.rss import regulon_specificity_scores
from pyscenic.plotting import plot_rss
import matplotlib.pyplot as plt
import seaborn as sns
from pyscenic.binarization import binarize

In [104]:
meta = meta.loc[auc_mtx_all.index]

In [105]:
rss_louvain = regulon_specificity_scores( auc_mtx_all, meta['leiden_10'] )

In [112]:
rss_louvain.to_csv("pyscenic/rss.tsv",sep="\t")