In [None]:
import anndata as ad
import pandas as pd
import numpy as np
import scanpy as sc
import os

In [None]:
def validate_input_arguments (arguments_list):
    """Validate the analysis arguments and fill in defaults for optional ones.

    The dictionary is updated in place and also returned.

    Parameters
    ----------
    arguments_list : dict
        Expected keys: "out_path", "celltype", "condition" (required) and
        "organism", "comparison_list", "logfc", "pval", "reg" (optional; a
        missing key is treated like a value of None).

    Returns
    -------
    dict
        The same dictionary with
        * "out_path" guaranteed to end with "/",
        * "organism" defaulting to "human",
        * "comparison_list" defaulting to NaN,
        * "logfc" defaulting to 0.0 and "pval" to 0.05,
        * "reg" resolved to a regulon table (loaded from CSV when a path
          string was given; the "source" column is then renamed to "tf").

    Raises
    ------
    ValueError
        If a required argument is missing/empty, or if the regulon table lacks
        one of the columns "tf", "target" or "weight".
    """
    # Collect every missing required argument so the caller sees all problems
    # at once instead of an unrelated TypeError/KeyError further downstream.
    problems = []
    if not arguments_list.get("out_path"):
        problems.append("Please provide an output path")
    if arguments_list.get("celltype") is None:
        problems.append("Please provide the name of the metadata field containing cell type annotations")
    if arguments_list.get("condition") is None:
        problems.append("Please provide the name of the metadata field containing condition annotations")
    if problems:
        raise ValueError("\n".join(problems))

    if not arguments_list["out_path"].endswith("/"):
        arguments_list["out_path"] = arguments_list["out_path"] + "/"

    # Defaults for the optional settings.
    defaults = {
        "organism": "human",
        "comparison_list": np.nan,
        "logfc": 0.0,
        "pval": 0.05,
    }
    for key, default in defaults.items():
        if arguments_list.get(key) is None:
            arguments_list[key] = default

    regulon = arguments_list.get("reg")
    if regulon is None:
        # load_dorothea_regulon is not defined in the visible part of this file.
        regulon = load_dorothea_regulon(arguments_list["organism"])
    elif isinstance(regulon, str):
        regulon = pd.read_csv(regulon, index_col=0).rename(columns={"source": "tf"})
    arguments_list["reg"] = regulon

    # Every one of the three columns must be present. The previous condition
    # (`not "tf" in reg and "target" in reg and ...`) only negated the first test.
    if not all(column in regulon for column in ("tf", "target", "weight")):
        raise ValueError("Not all necessary columns found in regulon table! Please make sure that the regulon has the columns source, target and weight!")

    return arguments_list



In [None]:
def AverageExpression(anndataobject, celltype = None, out_file = "average.csv"):
    """Write the mean expression per cell-type group to a CSV file.

    Parameters
    ----------
    anndataobject
        AnnData-like object; `.var.index`, `.obs[celltype]` and `.X` are read.
    celltype : str
        Name of the `.obs` column used to group the cells.
    out_file : str, optional
        Destination of the averaged table (genes as rows, groups as columns).
        Defaults to "average.csv" in the working directory.

    Returns
    -------
    pandas.DataFrame
        The per-cell expression table (cells x genes) indexed by the group
        label of each cell.
        NOTE(review): this is the un-averaged table; the averages are only
        written to `out_file`. Confirm whether callers actually want the
        averaged frame before changing the return value.

    Raises
    ------
    ValueError
        If `celltype` is None.
    """
    if celltype is None:
        raise ValueError("Please provide the name of the metadata field containing cell type annotations")

    gene_ids = anndataobject.var.index.values
    expression = anndataobject[:, gene_ids].X
    # Sparse matrices must be densified; a dense array has no toarray() and is
    # used as it is.
    if hasattr(expression, "toarray"):
        expression = expression.toarray()

    obs = pd.DataFrame(expression, columns=gene_ids, index=anndataobject.obs[celltype])
    # observed=False keeps categories without any cells in the result.
    average_obs = obs.groupby(level=0, observed=False).mean()
    average_obs.T.to_csv(out_file)

    return obs

#https://github.com/scverse/scanpy/issues/336
#https://github.com/scverse/scanpy/issues/181

In [76]:
# TODO: results are not computed per cluster yet; cluster, p-value etc. still need to be added to the CSV (check against the specific-marker CSV from the LR2TF test run in R).

def get_significant_tfs_single(anndataobject, condition, out_path, pval, logfc):
    """Scale the expression matrix, compute a per-gene variability table and save it.

    Parameters
    ----------
    anndataobject
        AnnData object; a scaled copy of `.X` is stored in
        `.layers["tf_activities"]`.
    condition : str
        Condition name; used as the sub-directory and as part of the CSV name.
    out_path : str
        Directory under which the `condition` sub-directory is created.
    pval, logfc
        Currently unused; kept for interface compatibility.

    Returns
    -------
    pandas.DataFrame
        Result of `sc.pp.highly_variable_genes(..., inplace=False)`, also
        written to `<out_path>/<condition>/all_specificmarker_<condition>.csv`.
        NOTE(review): this is a placeholder for the R call
        FindAllMarkers(seuratobject, only.pos = TRUE, min.pct = 0,
        logfc.threshold = 0, verbose = FALSE); it is not a per-cluster marker
        test.

    Raises
    ------
    ValueError
        If `condition` is None.
    """
    if condition is None:
        raise ValueError("A condition name is required to build the result directory")

    single_result_path = os.path.join(out_path, condition)
    # makedirs also creates missing parent directories and tolerates reruns.
    os.makedirs(single_result_path, exist_ok=True)

    # copy=True: with the default copy=False a dense X may be scaled in place,
    # which would silently replace the original expression values.
    anndataobject.layers["tf_activities"] = sc.pp.scale(anndataobject.X, copy=True)
    # Alternatives considered: sc.pp.normalize_total(anndataobject) / sc.pp.log1p(anndataobject)

    anndataobject_markers = sc.pp.highly_variable_genes(
        anndataobject, layer="tf_activities", flavor="seurat", inplace=False
    )

    anndataobject_markers.to_csv(
        os.path.join(single_result_path, "all_specificmarker_" + condition + ".csv")
    )

    return anndataobject_markers

In [75]:
# Scratch check of the marker step on the LR2TF test object.
anndataobject = ad.read_h5ad("LR2TF_test_run/anndata_object.h5ad")
# copy=True so that a dense X is not scaled in place; only the layer holds the scaled values.
anndataobject.layers["tf_activities"] = sc.pp.scale(anndataobject.X, copy=True)
anndataobject_markers = sc.pp.highly_variable_genes(anndataobject, layer="tf_activities", flavor="seurat", inplace=False)
anndataobject_markers


Unnamed: 0,means,dispersions,mean_bin,dispersions_norm,highly_variable
FO538757.2,1.768308e+00,5.280036,"(1.606, 2.409]",-1.112706,False
RP11-206L10.9,1.267639e+01,18.885371,"(12.045, 12.848]",0.978449,False
SAMD11,1.000000e-12,,"(-0.0161, 0.803]",,False
NOC2L,4.953634e+00,9.905836,"(4.818, 5.621]",-1.543635,False
HES4,4.098957e+00,9.240305,"(4.015, 4.818]",-0.772313,False
...,...,...,...,...,...
SMYD5,1.000000e-12,,"(-0.0161, 0.803]",,False
CCDC85C,1.000000e-12,,"(-0.0161, 0.803]",,False
CPNE2,1.000000e-12,,"(-0.0161, 0.803]",,False
TMEM206,1.000000e-12,,"(-0.0161, 0.803]",,False


In [None]:
#test if subsetting object and using sc.pp.calculate_qc_metrics yields same results (yes it does)

#anndataobject = ad.read_h5ad("LR2TF_test_run/anndata_object.h5ad")
#anndataobject.obs["new_annotation"]
#neural = anndataobject[anndataobject.obs.new_annotation == "Neural"]
#megakaryocyte = anndataobject[anndataobject.obs.new_annotation == "Neural"]
#msc = anndataobject[anndataobject.obs.new_annotation == "Neural"]
#fibroblast = anndataobject[anndataobject.obs.new_annotation == "Neural"]
#myeloid = anndataobject[anndataobject.obs.new_annotation == "Neural"]

#test_neural = sc.pp.calculate_qc_metrics(neural, inplace=True)
#neural.var.to_csv("test_neural")

In [31]:
#ignore extra tfs from decoupler while writing script 

def _comparison_is_missing(comparison_list):
    """Return True when no comparison list was supplied (None or a scalar NaN)."""
    if comparison_list is None:
        return True
    # np.isnan must only see scalars; it fails on lists of strings.
    return isinstance(comparison_list, (float, np.floating)) and bool(np.isnan(comparison_list))


def tf_activity_analysis (anndataobject, tf_activities = None, arguments_list = None):
    """Run the single-condition TF activity analysis.

    Parameters
    ----------
    anndataobject : AnnData or str
        AnnData object, or path to an .h5ad file that is read first.
    tf_activities
        Currently unused (the TF activity import step is skipped for now).
    arguments_list : dict
        Analysis arguments; checked and completed by
        `validate_input_arguments`.

    Returns
    -------
    AnnData
        The input object with the added `.obs` columns "condition",
        "cell_type" and "comparison_list", `.uns["tf_annotation"]`, and
        `.layers["Average_Expression"]`.

    Raises
    ------
    ValueError
        If `arguments_list` is None.
    NotImplementedError
        If comparisons are requested and more than one condition is present;
        the multi-condition analysis has not been ported yet.
    """
    if arguments_list is None:
        raise ValueError("arguments_list is required")

    if isinstance(anndataobject, str):
        anndataobject = ad.read_h5ad(anndataobject)

    arguments_list = validate_input_arguments(arguments_list)

    # Create the output directory and TF_results/ together, also when the
    # output directory already exists from an earlier run.
    tf_path = arguments_list["out_path"] + "TF_results/"
    os.makedirs(tf_path, exist_ok=True)

    # TODO: TF activities part skipped; ignore extra TF data from decoupler?

    obs = anndataobject.obs
    condition_key = arguments_list["condition"]
    celltype_key = arguments_list["celltype"]

    # Copy the annotation columns when the given names are metadata fields.
    # Writing the field name itself would overwrite real annotations whenever
    # the field is already called "condition" / "cell_type".
    if condition_key in obs.columns:
        n_conditions = obs[condition_key].nunique()
        obs["condition"] = obs[condition_key]
    else:
        # No such field: treat the value as the label of the only condition.
        n_conditions = 1
        obs["condition"] = condition_key

    if celltype_key in obs.columns:
        obs["cell_type"] = obs[celltype_key]
    else:
        obs["cell_type"] = celltype_key

    comparison_list = arguments_list["comparison_list"]
    has_comparisons = (not _comparison_is_missing(comparison_list)) and len(comparison_list) > 0

    if has_comparisons and n_conditions < 2:
        has_comparisons = False
        print("Only one condition was found in the data, although a list of comparisons was provided. The analyses are performed only for the present condition!")

    if has_comparisons:
        raise NotImplementedError("Analysis of multiple conditions is not implemented yet")

    # From here on: single-condition analysis.
    arguments_list["comparison_list"] = np.nan
    obs["comparison_list"] = np.nan

    anndataobject.uns["tf_annotation"] = pd.DataFrame({"result_list" : [],
    "gene_expression_list" : [],
    "CTR_cluster_list" : [],
    "intranet_cluster_list" : []})

    # TODO: split the object by condition (anndataobject_list); skipped for now.
    sub_object = anndataobject

    # NOTE(review): the original author marked this as not matching the
    # results of the R AverageExpression.
    sub_object.layers["Average_Expression"] = AverageExpression(sub_object, celltype = arguments_list["celltype"])

    # TODO: pass a name parameter and collect tf_activity_scores into
    # result_list / gene_expression_list / CTR_cluster_list
    # (generate_CrossTalkeR_input for organism == "human").
    tf_activity_scores = get_significant_tfs_single(sub_object, arguments_list["condition"], tf_path, pval = arguments_list["pval"], logfc = arguments_list["logfc"])

    return sub_object

In [77]:
# Run the analysis on the LR2TF test object; optional settings are passed as None.
sub_object = tf_activity_analysis(
    anndataobject="LR2TF_test_run/anndata_object.h5ad",
    arguments_list={
        "out_path": "folder",
        "celltype": "new_annotation",
        "condition": "na",
        "organism": None,
        "comparison_list": None,
        "logfc": None,
        "pval": None,
        "reg": "human_dorothea_reg.csv",
    },
)