# Load libraries and Network results

In [1]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Sep 25 10:40:42 2020

@author: mwall
"""
import os
import sys
import glob
import numpy as np
import pandas as pd
import miner as miner
from scipy.stats import zscore
import miner_py3_kk as miner
from tqdm.notebook import tqdm, trange
import time

# Work from the MINER results directory; relative reads and writes below
# (and in later cells) resolve against it.
os.chdir('/Users/serdarturkaslan/Documents/GitHub/GbmMINER/data/MINER_MicroLowessRNATMM.08.24.2020/')
input_path = os.getcwd()

# Full regulon dictionary as stored in regulons.json
regulonModules = miner.read_json(os.path.join(input_path,"regulons.json"))

# Unique regulon IDs listed in the significance table, as strings so they can
# be looked up as keys of regulonModules
regulonDfMicroGbmLatest = [
    str(regulon_id)
    for regulon_id in pd.read_csv(
        "regulonDfGbmMicroRNASigCoxAndStatSig.csv", header = 0
    )['Regulon_ID'].drop_duplicates()
]

# Keep only the listed regulons that are actually present in the dictionary
regulonModules = {
    regulon_id: regulonModules[regulon_id]
    for regulon_id in regulonDfMicroGbmLatest
    if regulon_id in regulonModules
}

# The filtered dictionary is the reference used by the activity calculations
referenceDictionary = regulonModules

# Load data folders

In [None]:
# Patient result folders: every entry under processed_results whose path
# contains 'TL-'
patientDataFolders = [
    candidate
    for candidate in glob.glob('/Volumes/omics4tb2/SYGNAL/XCures/processed_results/*')
    if 'TL-' in candidate
]
print('Total Patients to process: %s' % len(patientDataFolders))

# Load protein-coding genes for filtering

In [79]:
# Ensembl export of protein-coding genes (tab-separated, header in first row)
proteinCodingExport = os.path.join(input_path,"homo_sapiens_protein_coding_export.txt")
ensemblProteinCodingGenes = pd.read_csv(proteinCodingExport, sep="\t", index_col=None, header = 0)
print("Protein Coding Genes")
print(ensemblProteinCodingGenes)

# The export has one row per transcript, so collapse to distinct gene IDs
uniqueProteins = ensemblProteinCodingGenes['Gene stable ID'].drop_duplicates().tolist()
print(len(uniqueProteins))

Protein Coding Genes
         Gene stable ID Transcript stable ID Gene name       Gene type  \
0       ENSG00000284967      ENST00000647178     FDFT1  protein_coding   
1       ENSG00000284967      ENST00000647069     FDFT1  protein_coding   
2       ENSG00000079459      ENST00000538689     FDFT1  protein_coding   
3       ENSG00000079459      ENST00000615631     FDFT1  protein_coding   
4       ENSG00000079459      ENST00000618539     FDFT1  protein_coding   
...                 ...                  ...       ...             ...   
251334  ENSG00000189171      ENST00000440685   S100A13  protein_coding   
251335  ENSG00000189171      ENST00000476133   S100A13  protein_coding   
251336  ENSG00000189171      ENST00000339556   S100A13  protein_coding   
251337  ENSG00000189171      ENST00000392622   S100A13  protein_coding   
251338  ENSG00000189171      ENST00000392623   S100A13  protein_coding   

       Gene Synonym  
0              DGPT  
1              DGPT  
2              DGPT  
3 

                target_id  length  eff_length  est_counts  tpm
0       ENST00000631435.1      12     6.16667         0.0  0.0
1       ENST00000434970.2       9     4.20000         0.0  0.0
2       ENST00000448914.1      13     6.28571         0.0  0.0
3       ENST00000415118.1       8     3.20000         0.0  0.0
4       ENST00000632684.1      12     6.16667         0.0  0.0
...                   ...     ...         ...         ...  ...
188748  ENST00000639790.1    1370  1135.59000         0.0  0.0
188749  ENST00000639660.1     284    90.97150         0.0  0.0
188750  ENST00000643577.1     105    18.34970         0.0  0.0
188751  ENST00000646356.1     900   665.58700         0.0  0.0
188752  ENST00000645792.1     930   695.58700         0.0  0.0

[188753 rows x 5 columns]


# Loop through each patient to calculate regulon activity

In [86]:


# Patient analysis loop
#
# For each patient folder:
#   1. read <folder>/RNA/<patientID>.genes.results,
#   2. keep only genes whose Ensembl ID is in uniqueProteins,
#   3. z-score the TPM column within the patient,
#   4. run miner.generateRegulonActivity against referenceDictionary (p=0.05),
#   5. write the result to Regulon_Activity_<patientID>.csv in the working directory,
#   6. count regulons scored 1 (Over), -1 (Under) and 0 (Neutral).
# The per-patient counts end up in allSummary (one row per patient).
#
# Rows are collected in lists and the DataFrame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on every
# iteration.
summaryRows = []    # one {"Over", "Under", "Neutral"} dict per processed patient
summaryIndex = []   # patient IDs, parallel to summaryRows

for folder in tqdm(patientDataFolders):
    patientID = folder.split('/')[-1] #get patientID
    print(patientID)
    patientDataFile = os.path.join(folder,"RNA",patientID + ".genes.results") # get RNASeq results

    # Skip patients that have no RNA results file
    if not os.path.exists(patientDataFile):
        print("Data file doesnt exist: " + patientDataFile)
        continue

    # Name of the per-patient z-score column
    newColName = patientID + "_zscore"

    # Read expression data
    rawExpressionData = pd.read_csv(patientDataFile, sep="\t", index_col=None, header = 0)

    # Separate Ensembl gene IDs and symbols. gene_id looks like
    # "<ensembl id>_<symbol>"; split on the first underscore only, so a symbol
    # that itself contains "_" cannot produce a third column and break the
    # two-column assignment.
    rawExpressionData[['GeneID','gene_symbol']] = rawExpressionData.gene_id.str.split("_", n=1, expand=True)

    # Filter for protein coding genes (copy so the new column is added to an
    # independent frame rather than a slice)
    rawExpressionDataFilt = rawExpressionData[rawExpressionData.GeneID.isin(uniqueProteins)].copy()

    # z-score the patient's expression data
    rawExpressionDataFilt[[newColName]] = rawExpressionDataFilt[['TPM']].apply(zscore)

    # Single-column frame of z-scores indexed by Ensembl gene ID
    expressionData = rawExpressionDataFilt[[newColName]].copy()
    expressionData.index = rawExpressionDataFilt['GeneID']

    print("Processed: " + patientDataFile)

    # calculate regulon activity
    rr = miner.generateRegulonActivity(referenceDictionary,expressionData, p=0.05)

    # write regulon activity results to file
    rr.to_csv("Regulon_Activity_" + patientID + ".csv")

    # calculate regulon activity stats
    activity = rr[newColName]
    summaryRows.append({"Over" : int((activity == 1).sum()),
                        "Under" : int((activity == -1).sum()),
                        "Neutral" : int((activity == 0).sum())})
    summaryIndex.append(patientID)

# An empty frame when nothing was processed, matching the original's initial value
allSummary = pd.DataFrame(summaryRows, index=summaryIndex) if summaryRows else pd.DataFrame()
print(allSummary)
        

  0%|          | 0/12 [00:00<?, ?it/s]

TL-19-61DB85
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-19-61DB85/RNA/TL-19-61DB85.genes.results
done!
done!
TL-19-87E3E1
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-19-87E3E1/RNA/TL-19-87E3E1.genes.results
done!
done!
TL-20-0B6792
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-0B6792/RNA/TL-20-0B6792.genes.results
done!
done!
TL-20-24CC77
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-24CC77/RNA/TL-20-24CC77.genes.results
done!
done!
TL-20-495B84
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-495B84/RNA/TL-20-495B84.genes.results
done!
done!
TL-20-920D6D
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-920D6D/RNA/TL-20-920D6D.genes.results
done!
done!
TL-20-C7EDD0
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-C7EDD0/RNA/TL-20-C7EDD0.genes.results
done!
done!
TL-20-E2D448
Processed: /Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-20-E2

In [85]:
import miner_py3_kk as miner

# Scratch cell: regulon activity for one patient from a kallisto abundance
# table, followed by Over/Under/Neutral counts.
# NOTE(review): the output recorded for this cell ends in
# "TypeError: object of type 'numpy.float64' has no len()", and the Series it
# printed (indexed by target_id, named tpm) does not match the DataFrame this
# code reads, so the cell was probably edited after it last ran - confirm
# before relying on it.
#expressionData = pd.read_csv("abundanceXCuresRNATPMTL1961DB85ZScoredLatest.csv", sep=",", index_col=0, header = 0)
# Read with index_col=None, so rows carry a default integer index rather than
# an identifier column.
# NOTE(review): the other cells pass a z-scored frame indexed by GeneID to
# generateRegulonActivity; this frame is neither z-scored nor re-indexed - TODO confirm intent.
expressionData = pd.read_csv("/Volumes/omics4tb2/SYGNAL/XCures/TL-20-0B6792/RNA/results_kallisto/abundance.tsv", sep="\t", index_col=None, header = 0)

print(expressionData)
#miner.generateRegulonActivity()
# p=0.025 here, whereas the patient loop uses p=0.05
rr = miner.generateRegulonActivity(referenceDictionary,expressionData, p=0.025)
print(rr)

# Count regulons scored 1 / -1 / 0 in the 'TL1961DB85' column.
# NOTE(review): no column of that name is created in this cell, and the file
# read above is for TL-20-0B6792 - presumably left over from the commented-out
# CSV input; verify.
overActiveRegulonCount = len(rr[rr['TL1961DB85'] == 1])
underActiveRegulonCount = len(rr[rr['TL1961DB85'] == -1])
neutralRegulonCount = len(rr[rr['TL1961DB85'] == 0])
# One-row summary table labelled with the sample name
resSummary = {"Over" : [overActiveRegulonCount],
                      "Under" : [underActiveRegulonCount],
                      "Neutral" : [neutralRegulonCount]}
resSummary = pd.DataFrame(resSummary)
resSummary = resSummary.rename(index={0: 'TL1961DB85'})
print(resSummary)

target_id
ENST00000631435.1    0.0
ENST00000434970.2    0.0
ENST00000448914.1    0.0
ENST00000415118.1    0.0
ENST00000632684.1    0.0
                    ... 
ENST00000639790.1    0.0
ENST00000639660.1    0.0
ENST00000643577.1    0.0
ENST00000646356.1    0.0
ENST00000645792.1    0.0
Name: tpm, Length: 188753, dtype: float64


TypeError: object of type 'numpy.float64' has no len()

In [28]:
import miner_py3_kk as miner
from scipy.stats import zscore


# Single-patient check: z-score TPM over every gene in the results file (no
# protein-coding filter here), compute regulon activity at p=0.025 and count
# regulons scored 1 (Over), -1 (Under) and 0 (Neutral).
rawExpressionData = pd.read_csv("/Volumes/omics4tb2/SYGNAL/XCures/processed_results/TL-19-87E3E1/RNA/TL-19-87E3E1.genes.results", sep="\t", index_col=None, header = 0)
print(rawExpressionData)

# z-score the patient's expression data
rawExpressionData[["zscore"]] = rawExpressionData[['TPM']].apply(zscore)

# Separate Ensembl gene IDs and symbols. gene_id looks like
# "<ensembl id>_<symbol>"; split on the first underscore only, so a symbol
# that itself contains "_" cannot produce a third column and break the
# two-column assignment.
rawExpressionData[['GeneID','gene_symbol']] = rawExpressionData.gene_id.str.split("_", n=1, expand=True)
print(len(set(rawExpressionData['GeneID'])))

# Single-column frame of z-scores indexed by Ensembl gene ID; copied so that
# re-indexing does not touch rawExpressionData
expressionData = rawExpressionData[["zscore"]].copy()
expressionData.index = rawExpressionData['GeneID']

# calculate regulon activity
rr = miner.generateRegulonActivity(referenceDictionary,expressionData, p=0.025)
print(rr)

# One-row summary labelled 'zscore', the name of the only activity column
activity = rr['zscore']
resSummary = pd.DataFrame({"Over" : [int((activity == 1).sum())],
                           "Under" : [int((activity == -1).sum())],
                           "Neutral" : [int((activity == 0).sum())]},
                          index=['zscore'])
print(resSummary)

                          gene_id  \
0          ENSG00000000003_TSPAN6   
1            ENSG00000000005_TNMD   
2            ENSG00000000419_DPM1   
3           ENSG00000000457_SCYL3   
4        ENSG00000000460_C1orf112   
...                           ...   
60671    ENSG00000288584_Z97205.3   
60672  ENSG00000288585_AC010184.1   
60673  ENSG00000288586_AL357874.3   
60674  ENSG00000288587_AL645933.5   
60675  ENSG00000288588_AC116317.2   

                                        transcript_id(s)   length  \
0      ENST00000373020_TSPAN6-201,ENST00000494424_TSP...  2547.10   
1      ENST00000373031_TNMD-201,ENST00000485971_TNMD-202  1205.00   
2      ENST00000371582_DPM1-201,ENST00000371584_DPM1-...  1075.97   
3      ENST00000367770_SCYL3-201,ENST00000367771_SCYL...  4091.68   
4      ENST00000286031_C1orf112-201,ENST00000359326_C...  2103.71   
...                                                  ...      ...   
60671                       ENST00000655888_Z97205.3-201  2595.00   
606

In [120]:

# Background matrix used for statistical hypothesis testing
bkgd = miner.backgroundDf(expressionData)

# Per-cluster membership dictionaries: samples showing coherently high
# (label=2) and coherently low (label=0) cluster activity
overExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=2,p=0.05)
underExpressedMembers = miner.biclusterMembershipDictionary(referenceDictionary,bkgd,label=0,p=0.05)

# Convert both membership dictionaries to binary matrices
overExpressedMembersMatrix = miner.membershipToIncidence(overExpressedMembers,expressionData)
underExpressedMembersMatrix = miner.membershipToIncidence(underExpressedMembers,expressionData)

# Use string row labels on both matrices
overExpressedMembersMatrix.index = np.array(overExpressedMembersMatrix.index).astype(str)
underExpressedMembersMatrix.index = np.array(underExpressedMembersMatrix.index).astype(str)

# Program matrix with values of {-1,0,1}: over-expressed minus under-expressed
dfr_regulons = overExpressedMembersMatrix.sub(underExpressedMembersMatrix)
print(dfr_regulons)

dfr_regulons.to_csv("SigRegulonsActivityabundanceXCuresRNATPMTL1961DB85ZScoredLatest.csv")

done!
done!
      TL1961DB85
0            0.0
4            0.0
6            1.0
14           0.0
23           0.0
...          ...
3675         0.0
3703         0.0
3707         0.0
3727         1.0
3732         0.0

[505 rows x 1 columns]
