In [6]:
import os
import sys
import gseapy
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict 

In [7]:
# Setting global variables
# Directory holding the gene-set (.gmt) files. annotate_clusters() builds the
# gene-set path as gmt_database + species + ".gmt" with plain string
# concatenation, so the trailing slash here is required.
# NOTE(review): this is an absolute, machine-specific path; anyone running the
# notebook elsewhere has to edit it first.
gmt_database = "/Users/sbunga/PycharmProjects/INDRA/scrna/test-project/annotation_script/gmt_database/"


def set_wd(outdir):
    """Make ``outdir`` the working directory, creating it when needed.

    Also ensures the ``rnk`` and ``label_data`` sub-directories exist inside
    it. The function is idempotent: calling it for a directory that already
    exists (for example when the notebook is re-run) still changes into that
    directory instead of silently leaving the process where it was.

    Args:
        outdir: Path of the output directory. Missing parent directories are
            created as well.

    Raises:
        OSError: If the directory cannot be created or entered.
    """
    os.makedirs(outdir, exist_ok=True)
    os.chdir(outdir)
    # Paths are relative to the new working directory on purpose: the other
    # helpers write to "rnk/..." and "label_data/..." relative to the cwd.
    for subdir in ('rnk', 'label_data'):
        os.makedirs(subdir, exist_ok=True)
    print("Working directory set to: "+os.getcwd())

        
def to_df(seurat_data_raw, delim):
    """Read a delimited text table into a DataFrame.

    The first row of the input is used as the column header and ``delim`` is
    passed to pandas as the field separator.
    """
    return pd.read_csv(seurat_data_raw, header=0, sep=delim)

        
def to_csv(genes, pvals, cluster):
    """Write a two-column, tab-separated rank file for one cluster.

    The file is written to ``rnk/cluster_<cluster>.rnk`` relative to the
    current working directory, without a header row or index column. The
    first column holds ``genes`` and the second holds ``pvals``.
    """
    out_path = "rnk/cluster_{}.rnk".format(str(cluster))
    table = pd.DataFrame({"Gene": genes, "p_vals": pvals})
    table.to_csv(out_path, sep="\t", header=False, index=False)


def make_rank(seurat_df):
    """Write one rank file per cluster of a Seurat marker table.

    Rows are grouped by the ``cluster`` column. For every cluster the values
    of the ``gene`` and ``avg_logFC`` columns are handed to ``to_csv``, which
    writes ``rnk/cluster_<cluster>.rnk``. Row order inside a cluster follows
    the input, and clusters are processed in order of first appearance.

    Only the ``seurat_df`` argument is read (no module-level frame), no file
    is written for clusters that do not occur in the table, and rows of a
    cluster are collected together even when they are not contiguous. An
    empty table therefore produces no files.

    Args:
        seurat_df: DataFrame with the columns ``cluster``, ``gene`` and
            ``avg_logFC``.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    required = ("cluster", "gene", "avg_logFC")
    missing = [col for col in required if col not in seurat_df.columns]
    if missing:
        raise KeyError("seurat_df is missing required column(s): "
                       + ", ".join(missing))

    # sort=False keeps the clusters in the order they appear in the table.
    for cluster, rows in seurat_df.groupby("cluster", sort=False):
        to_csv(rows["gene"].tolist(), rows["avg_logFC"].tolist(), cluster)

    
def annotate_clusters(nclusters, species):
    """Label clusters ``0 .. nclusters - 1`` with their top prerank term.

    For each cluster, ``gseapy.prerank`` is run on ``rnk/cluster_<i>.rnk``
    against the gene-set file ``gmt_database + species + ".gmt"``, with
    ``label_data/<i>_folder`` as the output directory. The label is the
    ``Term`` value in the first row of
    ``label_data/<i>_folder/gseapy.prerank.gene_sets.report.csv``.

    The run is best effort: a failing prerank call is reported on stderr and
    the loop continues. Whatever report file is on disk afterwards is used,
    including one left over from an earlier run. A cluster whose report is
    missing or has no rows gets the label ``"unknown<i>"``.

    Args:
        nclusters: Number of clusters to annotate.
        species: Base name of the ``.gmt`` file inside ``gmt_database``.

    Returns:
        A list of ``nclusters`` label strings, ordered by cluster index.
    """
    gene_sets = gmt_database + species + ".gmt"
    labels = []
    for each_cluster in range(nclusters):
        out_folder = 'label_data/' + str(each_cluster) + '_folder'
        report_path = out_folder + '/gseapy.prerank.gene_sets.report.csv'
        try:
            gseapy.prerank("rnk/cluster_" + str(each_cluster) + ".rnk",
                           gene_sets,
                           out_folder)
        except Exception as err:
            # Keep going with the remaining clusters, but make the failure
            # visible. Catching Exception (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop the run.
            print("prerank failed for cluster " + str(each_cluster)
                  + ": " + repr(err), file=sys.stderr)

        label = "unknown" + str(each_cluster)
        if os.path.isfile(report_path):
            report = pd.read_csv(report_path, header=0)
            # A report with a header but no rows has no top term to take.
            if not report.empty:
                label = str(report['Term'].iloc[0])
        labels.append(label)
    return labels

In [9]:
if __name__ == "__main__":
    # Inputs and outputs of this run all live under one project folder.
    project_dir = ("/Users/sbunga/PycharmProjects/INDRA/scrna/"
                   "test-project/annotation_script/")
    marker_csv = "streamlined_code_results_marker_files_2Groups_sampleMarkers_dim35_r2.5.csv"

    # 1. switch to the output folder, 2. load the marker table,
    # 3. write one rank file per cluster, 4. label the clusters.
    set_wd(project_dir + "test-outputs")
    df = to_df(project_dir + marker_csv, delim=",")
    make_rank(df)
    top = annotate_clusters(20, "mouse")
    

In [10]:
top

['Basal_Myoepithelial',
 'MHC_class_II',
 'ly_Endothelial_cells_FANTOM_2',
 'Astrocytes_FANTOM_2',
 'Luminal_1_1',
 'Astrocytes_FANTOM_3',
 'Astrocytes_FANTOM_2',
 'Astrocytes_FANTOM_1',
 'Keratinocytes_HPCA_3',
 'Astrocytes_FANTOM_1',
 'Astrocytes_FANTOM_2',
 'Astrocytes_FANTOM_1',
 'Endothelial_cells_BLUEPRINT_3',
 'Macrophages_M1_FANTOM_2',
 'Keratinocytes_FANTOM_1',
 'Keratinocytes_HPCA_3',
 'CD8+_T-cells',
 'Keratinocytes_FANTOM_3',
 'MHC_class_II',
 'Astrocytes_FANTOM_3']