# Load libraries and Network results

In [52]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 25 10:40:42 2020

@author: mwall
"""
import os
import sys
import glob
import numpy as np
import pandas as pd
import miner as miner
from scipy.stats import zscore
import miner_py3_kk as miner
from tqdm.notebook import tqdm, trange
import time

# change the working directory
#os.chdir('/Users/serdarturkaslan/Documents/GitHub/GbmMINER/data/MINER_MicroLowessRNATMM.08.24.2020/')
# output directory
output_dir = "/Volumes/omics4tb2/SYGNAL/GBM-Serdar/XCures"
# Path to the miner directory
input_path = "/Volumes/omics4tb2/SYGNAL/GBM-Serdar/MINER_MicroLowessRNATMM.08.24.2020"

# Function to load regulons

In [35]:
def loadRegulons(disease_relevant=True,disease_relevant_regulons_file="regulonDfGbmMicroRNASigCoxAndStatSig.csv"):
    
    # Load regulon Modules
    regulonModules = miner.read_json(os.path.join(input_path,"regulons.json"))
    print("Total number of regulons: " + str(len(regulonModules)))

    # load regulon data frame
    # All Regulons
    regulonDf = pd.read_csv(os.path.join(input_path, "regulonDf.csv"), header = 0)
    regulonDf = list(regulonDf['Regulon_ID'].drop_duplicates())
    regulonDf = [str(i) for i in regulonDf]
    

    # Disease relevant regulons
    regulonDfMicroGbmLatest = pd.read_csv(os.path.join(input_path,disease_relevant_regulons_file), header = 0)
    regulonDfMicroGbmLatest = list(regulonDfMicroGbmLatest['Regulon_ID'].drop_duplicates())
    regulonDfMicroGbmLatest = [str(i) for i in regulonDfMicroGbmLatest]
    regulonModulesFiltered = dict((k, regulonModules[k]) for k in regulonDfMicroGbmLatest if k in regulonModules)
    print("Filtered number of regulons: " + str(len(regulonModulesFiltered)))
    
    if disease_relevant == True:
        #regulonModules = dict((k, regulonModules[k]) for k in regulonDfMicroGbmLatest if k in regulonModules)
        regulonModules = regulonModulesFiltered 
        print("Returned %s filtered regulons" %(str(len(regulonModules))))
        return(regulonModules)
    
    else:
        regulonModules = dict((k, regulonModules[k]) for k in regulonDf if k in regulonModules)
        print("Returned %s Total regulons" %(str(len(regulonModules))))
        return(regulonModules)


# Function to load Programs 

In [36]:
def loadPrograms(disease_relevant=True,disease_relevant_programs_file="transcriptional_programsmiRNAAndSig.json"):
    
    # Load transcriptional programs
    # All Programs
    transcriptional_programs = miner.read_json(os.path.join(input_path,'transcriptional_programs.json'))
    print("Total # of programs: ", str(len(transcriptional_programs)))
    
    transcriptional_programs_filtered = miner.read_json(os.path.join(input_path,disease_relevant_programs_file))
    print("Filtered # of programs: ", str(len(transcriptional_programs_filtered)))
    
    if disease_relevant == True:
            transcriptional_programs = transcriptional_programs_filtered
            print("Returned %s filtered Programs" %(str(len(transcriptional_programs_filtered))))
    else:
        transcriptional_programs = transcriptional_programs
        print("Returned %s Total Programs" %(str(len(transcriptional_programs))))

    program_list = [transcriptional_programs[str(key)] for key in range(len(transcriptional_programs.keys()))]
    
    return(program_list)

# Function to calculate Program activity

In [37]:
def calculateProgramActivity(program_list,regulons,expressionData,outputFile):
    
    # select reference dictionary for downstream analysis (pr_genes, revisedClusters, coexpressionModules, or regulonModules)
    referenceDictionary = createPrGenesDictionary(program_list,regulons)

    # create a background matrix used for statistical hypothesis testing
    bkgd = miner.backgroundDf(expressionData)

    # for each cluster, give samples that show high coherent cluster activity
    overExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=2,p=0.05)

    # for each clus|ter, give samples that show low coherent cluster activity
    underExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=0,p=0.05)

    # convert overExpressedMembers dictionary to binary matrix
    overExpressedProgramsMatrix = miner.membershipToIncidence(overExpressedMembers,expressionData)

    # convert underExpressedMembers dictionary to binary matrix
    underExpressedProgramsMatrix = miner.membershipToIncidence(underExpressedMembers,expressionData)

    # Create program matrix with values of {-1,0,1}
    dfr_programs = overExpressedProgramsMatrix-underExpressedProgramsMatrix
    
    # Write program activity into a outFile
    dfr_programs.to_csv(outputFile)
    
    return(dfr_programs)

# Function to create dictionary of propgram genes

In [38]:
# Create dictionary of program genes
# make dictionary of program keys with gene lists as elements
def createPrGenesDictionary(program_list, regulons):
    pr_genes_expanded = {}
    for i in range(len(program_list)):
        rgns = program_list[i]
        genes = []
        for r in rgns:
            if r in regulons:
                genes.append(regulons[r])
        if len(genes) != 0:
            genes = list(set(np.hstack(genes)))
            pr_genes_expanded[i] = genes

    unique_genes = set(x for y in pr_genes_expanded.values() for x in y)
        
    print("Total number of genes: " + str(len(unique_genes)))
    return(pr_genes_expanded)


# Load data folders

In [39]:
# Folders for patient data
patientDataFolders = glob.glob('/Volumes/omics4tb2/SYGNAL/XCures/processed_results/*')
patientDataFolders = list(filter(lambda x:'TL-' in x, patientDataFolders))
print('Total Patients to process: %s' %(len(patientDataFolders)))

Total Patients to process: 12


# Load Protein coding genes for filtering

In [40]:
# load protein coding genes list from ensembl for filtering
ensemblProteinCodingGenes = pd.read_csv(os.path.join(input_path,"homo_sapiens_protein_coding_export.txt"), sep="\t", index_col=None, header = 0)
print("Protein Coding Genes")
#print(ensemblProteinCodingGenes)

uniqueProteins = list(ensemblProteinCodingGenes['Gene stable ID'].drop_duplicates())
print("Loaded " + str(len(uniqueProteins)) + " protein-coding genes\n")

Protein Coding Genes
Loaded 22492 protein-coding genes



# Load Regulons

In [41]:
# load disease relevant regulons
regulonModulesDisease = loadRegulons(disease_relevant=True)

#load all regulons
regulonModulesAll = loadRegulons(disease_relevant=False)

Total number of regulons: 3764
Filtered number of regulons: 505
Returned 505 filtered regulons
Total number of regulons: 3764
Filtered number of regulons: 505
Returned 3764 Total regulons


# Load Programs

In [42]:
# load disease relevant programs
program_list_disease = loadPrograms(disease_relevant=True)

# load all programs
program_list_all = loadPrograms(disease_relevant=False)

Total # of programs:  178
Filtered # of programs:  33
Returned 33 filtered Programs
Total # of programs:  178
Filtered # of programs:  33
Returned 178 Total Programs


# Loop through each patient to calculate Regulon & Program activity

In [54]:
# Patient analysis loop
allSummary = pd.DataFrame()
for folder in tqdm(patientDataFolders[1:2]):
    time.sleep(0.01)
    patientID = folder.split('/')[-1] #get patientID
    print(patientID)
    patientDataFile = os.path.join(folder,"RNA",patientID + ".genes.results") # get RNASeq results
    
    # Check if patient data file exists
    if os.path.exists('%s' %(patientDataFile)):
        # create a new column with patient name
        newColName = patientID + "_zscore"
        
        # Read expression data
        rawExpressionData = pd.read_csv(patientDataFile, sep="\t", index_col=None, header = 0)
        print(rawExpressionData.head)
        
        # seperate ensembl gene ids and symbols
        rawExpressionData[['GeneID','gene_symbol']] = pd.DataFrame(rawExpressionData).gene_id.str.split("_",expand=True)
        #print(rawExpressionData)
        
        # filter for protein coding genes
        rawExpressionDataFilt = rawExpressionData[rawExpressionData.GeneID.isin(uniqueProteins)].copy()
        #rawExpressionDataFilt = rawExpressionData.loc[rawExpressionData.GeneID in uniqueProteins]
        rawExpressionDataFilt.head()
        print("Filtered raw expression")
        print(rawExpressionDataFilt)
        
        # zscore patients expression data
        rawExpressionDataFilt[[newColName]] = rawExpressionDataFilt[['TPM']].apply(zscore)
        rawExpressionDataFilt
        #expressionData = pd.read_csv("abundanceXCuresRNATPMTL1961DB85ZScoredLatest.csv", sep=",", index_col=0, header = 0)
        expressionData = rawExpressionDataFilt[[patientID + "_zscore"]]
        expressionData.index = rawExpressionDataFilt['GeneID']
        #print(expressionData)
        
        print("Processed: " + patientDataFile)
        referenceDictionary = regulonModulesDisease
        # create a background matrix used for statistical hypothesis testing
        bkgd = miner.backgroundDf(expressionData)
        
        # for each cluster, give samples that show high coherent cluster activity
        overExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=2,p=0.05)
        
        # for each clus|ter, give samples that show low coherent cluster activity
        underExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=0,p=0.05)
        
        # convert overExpressedMembers dictionary to binary matrix
        overExpressedMembersMatrix = miner.membershipToIncidence(overExpressedMembers,expressionData)
        overExpressedMembersMatrix.index = np.array(overExpressedMembersMatrix.index).astype(str)
        
        # convert underExpressedMembers dictionary to binary matrix
        underExpressedMembersMatrix = miner.membershipToIncidence(underExpressedMembers,expressionData)
        underExpressedMembersMatrix.index = np.array(underExpressedMembersMatrix.index).astype(str)

        # Create program matrix with values of {-1,0,1}
        rr = overExpressedMembersMatrix-underExpressedMembersMatrix
        #print(dfr_regulons)
        # 
  
        # calculate disease relevant regulon activity 
        #rr = miner.generateRegulonActivity(regulonModulesDisease,expressionData, p=0.05)
        #print(rr)
        
        # calculate all regulon activity 
        aa = miner.generateRegulonActivity(regulonModulesAll,expressionData, p=0.05)
        #print(rr)
        
        # calculate disease relevant program activity
        program_activity_disease = calculateProgramActivity(program_list_disease,regulonModulesDisease,expressionData,outputFile= os.path.join(output_dir,"network_activities", patientID) + "_disease_rel_program_activity.csv")
    
        # calculate all program activity
        program_activity_all = calculateProgramActivity(program_list_all,regulonModulesAll,expressionData,outputFile= os.path.join(output_dir,"network_activities", patientID) + "_all_program_activity.csv")
    
        # write disease relevant regulon activity results to filr
        rr.to_csv(os.path.join(output_dir,"network_activities", patientID) + "_disease_rel_regulon_activity.csv")
        
        # write all regulon activity results to file
        aa.to_csv(os.path.join(output_dir,"network_activities", patientID) + "_all_regulon_activity.csv")
        
        # calculate regulon activity stats for disease relevant regulons
        overActiveRegulonCount = len(rr[rr[newColName] == 1])
        underActiveRegulonCount = len(rr[rr[newColName] == "-1"])
        neutralRegulonCount = len(rr[rr[newColName] == 0])
        
        resSummary = {"Over" : [overActiveRegulonCount],
                              "Under" : [underActiveRegulonCount],
                              "Neutral" : [neutralRegulonCount],
                      "Type" : "Disease Relevant Regulon"
                     }
        resSummary = pd.DataFrame(resSummary)
        resSummary = resSummary.rename(index={0: patientID})
        
        # calculate regulon activity stats for all regulons
        overActiveRegulonCount2 = len(aa[aa[newColName] == 1])
        underActiveRegulonCount2 = len(aa[aa[newColName] == -1])
        neutralRegulonCount2 = len(aa[aa[newColName] == 0])
        
        resSummary2 = {"Over" : [overActiveRegulonCount2],
                              "Under" : [underActiveRegulonCount2],
                              "Neutral" : [neutralRegulonCount2],
                      "Type" : "All Regulon"
                     }
        resSummary2 = pd.DataFrame(resSummary2)
        resSummary2 = resSummary2.rename(index={0: patientID})
        
        
        allSummary = allSummary.append(resSummary)
        allSummary = allSummary.append(resSummary2)
        
        #print(resSummary)
        
        #dfr_regulons = miner.generateRegulonActivity(referenceDictionary,expressionData, p=0.05, returnBkgd="no")
        
#         overActiveRegulonCount = len(dfr_regulons[dfr_regulons[patientID+"_zscore"] == 1])
#         underActiveRegulonCount = len(dfr_regulons[dfr_regulons[patientID+"_zscore"] == -1])
#         neutralRegulonCount = len(dfr_regulons[dfr_regulons[patientID+"_zscore"] == 0])
        
#         resSummary = {"Over" : [overActiveRegulonCount],
#                       "Under" : [underActiveRegulonCount],
#                       "Neutral" : [neutralRegulonCount]}
#         resSummary = pd.DataFrame(resSummary)
#         resSummary = resSummary.rename(index={0: str(patientID)})
#         allSummary = allSummary.append(resSummary)
        #print(resSummary)

        #print("ActiveactiveRegulonCount:", activeRegulonCount)
        #sys.exit()

        #dfr_regulons.to_csv("Regulon_Activity_" + patientID + ".csv")
        
    else:
        print("Data file doesnt exist: " + patientDataFile) 
print(allSummary)
        

  0%|          | 0/1 [00:00<?, ?it/s]

TL-19-87E3E1
<bound method NDFrame.head of                           gene_id  \
0          ENSG00000000003_TSPAN6   
1            ENSG00000000005_TNMD   
2            ENSG00000000419_DPM1   
3           ENSG00000000457_SCYL3   
4        ENSG00000000460_C1orf112   
...                           ...   
60671    ENSG00000288584_Z97205.3   
60672  ENSG00000288585_AC010184.1   
60673  ENSG00000288586_AL357874.3   
60674  ENSG00000288587_AL645933.5   
60675  ENSG00000288588_AC116317.2   

                                        transcript_id(s)   length  \
0      ENST00000373020_TSPAN6-201,ENST00000494424_TSP...  2547.10   
1      ENST00000373031_TNMD-201,ENST00000485971_TNMD-202  1205.00   
2      ENST00000371582_DPM1-201,ENST00000371584_DPM1-...  1075.97   
3      ENST00000367770_SCYL3-201,ENST00000367771_SCYL...  4091.68   
4      ENST00000286031_C1orf112-201,ENST00000359326_C...  2103.71   
...                                                  ...      ...   
60671                       E