In [None]:
import pandas as pd
import sys,numpy
import sklearn,sklearn.preprocessing
import matplotlib,matplotlib.pyplot
import os
# Raw string literal: the Windows path contains backslashes followed by
# letters that are not valid escape sequences, which triggers a
# DeprecationWarning/SyntaxWarning in a normal string literal. The resulting
# path value is unchanged.
os.chdir(r'E:\ISB work stuff\Post-Doc_BaligaLab_GBMSYGNALProject')
import miner_py3_kk as miner
import importlib
# Reload the module so that edits made to miner_py3_kk during the session are
# picked up. importlib.reload replaces imp.reload: the `imp` module is
# deprecated and was removed in Python 3.12.
importlib.reload(miner)

# Global matplotlib defaults (font family and sizes) for every figure.
matplotlib.rcParams.update({'font.size':18,'font.family':'Arial','xtick.labelsize':14,'ytick.labelsize':14})
# Type 42 (TrueType) fonts in PDF output instead of the default Type 3.
matplotlib.rcParams['pdf.fonttype']=42


def expressionQuantileNormalization(adata):

    '''
    Quantile-normalize expression values across samples.

    Within each sample the genes are ranked by value. Every gene then
    receives the mean, taken over all samples, of the values found at its
    rank (the basic procedure described in the Wikipedia article on quantile
    normalization). Tied values are NOT averaged: ties are ranked in gene
    order, so equal inputs within a sample may map to different outputs.

    Parameters
    ----------
    adata : dict
        Nested dictionary, adata[sample][gene] -> numeric value. The gene
        list is taken from the first sample; every other sample must contain
        those genes.

    Returns
    -------
    dict
        New nested dictionary, normalized[sample][gene] -> normalized value
        (numpy floating scalar). adata is left unmodified. An empty adata
        yields an empty dictionary.

    Raises
    ------
    KeyError
        If a sample lacks one of the genes of the first sample.
    '''

    samples=list(adata)
    if not samples:
        # nothing to normalize; the gene list cannot be derived either
        return {}
    genes=list(adata[samples[0]])

    # one row per sample, one column per gene, in a fixed gene order
    matrix=numpy.array([[adata[sample][gene] for gene in genes] for sample in samples])

    # for each rank k: mean over samples of the k-th smallest value
    averages=numpy.mean(numpy.sort(matrix,axis=1),axis=0)

    # incorporate new values based on position
    normalized={}
    for sample,row in zip(samples,matrix):
        # argsort of the argsort gives the rank of every gene; the stable
        # sort makes the ranking of tied values well defined (gene order)
        ranks=numpy.argsort(numpy.argsort(row,kind='stable'))
        normalized[sample]=dict(zip(genes,averages[ranks]))

    return normalized

def expressionScaling(adata):

    '''
    Linearly rescale all values to the range between 0 and 1.

    A single global minimum and maximum, taken over every sample and gene,
    is used, so relative differences between samples are preserved.

    Parameters
    ----------
    adata : dict
        Nested dictionary, adata[sample][gene] -> numeric value. The gene
        list is taken from the first sample.

    Returns
    -------
    dict
        New nested dictionary with (value-min)/(max-min) for every entry.
        adata is left unmodified. If all values are equal (zero range),
        every entry is set to 0.0 instead of dividing by zero. Empty input
        (no samples, or no genes) yields correspondingly empty dictionaries.

    Side effects
    ------------
    Prints 'after', followed by the min and max of the scaled values, as a
    sanity check (skipped for empty input).
    '''

    samples=list(adata)
    if not samples:
        return {}
    genes=list(adata[samples[0]])
    if not genes:
        return {sample:{} for sample in samples}

    # find global min and max
    low=min(adata[sample][gene] for sample in samples for gene in genes)
    high=max(adata[sample][gene] for sample in samples for gene in genes)
    span=high-low

    # create a new dictionary
    if span==0:
        # constant data: there is no range to stretch, map everything to 0.0
        scaled={sample:{gene:0.0 for gene in genes} for sample in samples}
    else:
        scaled={sample:{gene:(adata[sample][gene]-low)/span for gene in genes} for sample in samples}

    # report the resulting range as a sanity check
    newValues=[scaled[sample][gene] for sample in samples for gene in genes]
    print('after',min(newValues),max(newValues))

    return scaled
                
def histogrammer(theData):

    '''
    Build a normalized histogram of theData.

    The number of equal-width bins is int(sqrt(len(theData))), computed
    with numpy.histogram.

    Parameters
    ----------
    theData : sequence of numbers (list or 1-D array)

    Returns
    -------
    x : list
        Center of every bin.
    y : list
        Fraction of the observations falling in every bin (sums to 1).

    Empty input returns two empty lists instead of asking numpy for zero
    bins, which raises ValueError.
    '''

    if len(theData)==0:
        return [],[]

    counts,edges=numpy.histogram(theData,bins=int(numpy.sqrt(len(theData))))

    # bins are equal width, so each center is its left edge plus half a bin;
    # the last edge is only a right boundary and has no center of its own
    halfBin=(edges[1]-edges[0])/2.
    x=list(edges[:-1]+halfBin)

    # convert counts into relative frequencies
    y=list(counts/float(counts.sum()))

    return x,y

###
### MAIN
###

### 0. user defined variables
### 1. read data
### 2. normalizations
### 3. plot original and normalized distributions
### 4. store data

# 0. user defined variables
# expression data file (tab-separated text)
# NOTE(review): the variable names say "array", but the file name points to
# CCLE RNA-seq gene counts (TMM) -- confirm which data set is intended.
arrayDataFile='CCLE_RNAseq_genes_counts_20180929OnlyIDs_TMM.txt'

# RNA-seq, normalized counts
#rnaseqDataFile='GBM.uncv2.mRNAseq_RSEM_all.txt'

# results folder
#resultsFolder='E:\ISB work stuff\Post-Doc_BaligaLab_GBMSYGNALProject/GBM_TCGA/results/normalization/'

# 1. read data
# announce the step before the read starts rather than after it has finished
print('Reading data...')
# the first column becomes the row index, the first line the column names
arrayexpdf = pd.read_csv(arrayDataFile, index_col = 0, header = 0, sep = "\t")
'''
arrayExpression={}; rnaseqExpression={}

# 1.1. read array data
arrayGenes=[]
with open(arrayDataFile,'r') as f:

    headerLine=f.readline()
    v=headerLine.split('\t')
    sampleIDs=v[1:]
    sampleIDs[-1]=sampleIDs[-1].replace('\n','')

    # dealing with patient IDs from TCGA https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
    patientIDs=['-'.join(element.split('-')[:3]) for element in sampleIDs]
    
    next(f)

    for line in f:
        v=line.split('\t')
        geneName=v[0]

        if geneName not in arrayGenes:
            arrayGenes.append(geneName)
        l1 = v[1:]
        print(l1)
        rowExpression=[float(element) for element in l1]

        # create dictionary
        for i in range(len(patientIDs)):
            if patientIDs[i] not in arrayExpression.keys():
                arrayExpression[patientIDs[i]]={}

            arrayExpression[patientIDs[i]][geneName]=rowExpression[i]

# 1.2. read RNAseq data
rnaseqGenes=[]
with open(rnaseqDataFile,'r') as f:

    headerLine=f.readline()
    v=headerLine.split('\t')
    sampleIDs=v[1:]
    sampleIDs[-1]=sampleIDs[-1].replace('\n','')

    # dealing with patient IDs from TCGA https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
    patientIDs=['-'.join(element.split('-')[:3]) for element in sampleIDs]

    for line in f:
        v=line.split('\t')
        geneName=v[0].split('|')[0]
        if geneName not in rnaseqGenes:
            rnaseqGenes.append(geneName)
        rowExpression=[float(element) for element in v[1:]]
        log2plusOne=[element for element in rowExpression]
        
        # create dictionary
        for i in range(len(patientIDs)):
            if patientIDs[i] not in rnaseqExpression.keys():
                rnaseqExpression[patientIDs[i]]={}

            rnaseqExpression[patientIDs[i]][geneName]=log2plusOne[i]


sampleNames=list(arrayExpression.keys())
sampleNames.sort()
geneNames=list(arrayExpression[sampleNames[0]].keys())
geneNames.sort()

fileName=resultsFolder+'MA.data.csv'
with open(fileName,'w') as f:
    header='geneID,'+','.join(sampleNames)
    f.write(header)
    f.write('\n')
    
    for geneName in geneNames:
        f.write(geneName)

        for sampleName in sampleNames:
            f.write(',{}'.format(arrayExpression[sampleName][geneName]))
        
        f.write('\n')

# store RS
sampleNames=list(rnaseqExpression.keys())
sampleNames.sort()
geneNames=list(rnaseqExpression[sampleNames[0]].keys())
geneNames.sort()

fileName=resultsFolder+'RS.data.csv'
with open(fileName,'w') as f:
    header='geneID,'+','.join(sampleNames)
    f.write(header)
    f.write('\n')
    
    for geneName in geneNames:
        f.write(geneName)

        for sampleName in sampleNames:
            f.write(',{}'.format(rnaseqExpression[sampleName][geneName]))
        
        f.write('\n')

# 1.3. find common intersect
#commonGenes=list(set(arrayGenes) & set(rnaseqGenes))
#commonGenes.sort()

# remove genes not common in array
genes2remove=[]
oneSample=list(arrayExpression.keys())[0]

for gene in arrayExpression[oneSample]:
    if gene not in commonGenes:
        genes2remove.append(gene)
for sample in arrayExpression:
    for gene in genes2remove:
        del arrayExpression[sample][gene]

# remove genes not common in array
genes2remove=[]
oneSample=list(rnaseqExpression.keys())[0]

for gene in rnaseqExpression[oneSample]:
    if gene not in commonGenes:
        genes2remove.append(gene)
for sample in rnaseqExpression:
    for gene in genes2remove:
        del rnaseqExpression[sample][gene]
'''
# 2. normalizations
# NOTE(review): the message announces quantile normalization, but the only
# transformation still active further down in this script is miner.zscore;
# the calls to expressionQuantileNormalization are commented out or disabled.
# Confirm which step is intended before relying on this message.
print('Quantile normalization...')

# Disabled inputs of the former merged microarray / RNA-seq workflow; they
# depend on arrayExpression, rnaseqExpression and commonGenes, which are only
# built inside the disabled (triple-quoted) reading section.
#MAsamples=list(arrayExpression.keys())[:]
#RSsamples=list(rnaseqExpression.keys())[:]
#geneNames=commonGenes[:]
'''
# 2.1 merge two data sets
fullSet={}
for sample in MAsamples:
    fullSet[sample]={}
    for gene in geneNames:
        fullSet[sample][gene]=arrayExpression[sample][gene]
for sample in RSsamples:
    fullSet[sample]={}
    for gene in geneNames:
        fullSet[sample][gene]=rnaseqExpression[sample][gene]
      
# 2.2. transform array data into matrix
allValues=[]
for sample in MAsamples:
    values=[]
    for gene in geneNames:
        value=arrayExpression[sample][gene]
        values.append(value)
    allValues.append(values)
x=numpy.array(allValues)
arrayExpressionNum=numpy.transpose(x)
'''
# 2.3. run quantile normalization
# (disabled: fullSet is only built inside the disabled merge section)
#QNF=expressionQuantileNormalization(fullSet)

# Disabled: conversion of the dictionaries into data frames and zero filtering.
#arrayexpdf = pd.DataFrame.from_dict(arrayExpression)

#arrayexpdf = miner.zerofilter(arrayexpdf)     

#rnaseqexpdf = pd.DataFrame.from_dict(rnaseqExpression)

#rnaseqexpdf = miner.zerofilter(rnaseqexpdf)        
# 3.1. store original data sets
# store MA
#arrayExpression = arrayexpdf.to_dict()

#rnaseqExpression = rnaseqexpdf.to_dict()

#QNMA=expressionQuantileNormalization(arrayExpression)

# Active pipeline: take the frame read from arrayDataFile as is.
# Plain assignment, no copy: expdf and arrayexpdf name the same object until
# expdf is rebound below.
expdf = arrayexpdf
# Optional preprocessing steps, currently switched off.
#expdf = expdf.fillna(0)
#expdf = miner.removeNullRows(expdf)
#expdf = miner.entropycheck(expdf)
#expdf,conversionTable=miner.identifierConversion(expdf)
# presumably z-scores the expression values; the exact behavior (axis, NaN
# handling) is defined in miner_py3_kk -- TODO confirm
expdf=miner.zscore(expdf)
# Write the result as comma-separated text into the current working directory.
# NOTE(review): the name reuses the stem of the input file and does not
# indicate that the content went through miner.zscore.
expdf.to_csv("CCLE_RNAseq_genes_counts_20180929OnlyIDs_TMM.csv")
#QNMA = expdf.to_dict()

'''
QNRS=expressionQuantileNormalization(rnaseqExpression)
fileName=resultsFolder+'QN.RS.nolog.csv'
sampleNames=list(QNRS.keys())
sampleNames.sort()
geneNames=list(QNRS[sampleNames[0]].keys())
geneNames.sort()
with open(fileName,'w') as f:
    header='geneID,'+','.join(sampleNames)
    f.write(header)
    f.write('\n')
    
    for geneName in geneNames:
        f.write(geneName)

        for sampleName in sampleNames:
            f.write(',{}'.format(QNRS[sampleName][geneName]))
        
        f.write('\n')
        
rnaseqDataFile = resultsFolder+'QN.RS.nolog.csv'
with open(rnaseqDataFile,'r') as f:

    headerLine=f.readline()
    v=headerLine.split(',')
    sampleIDs=v[1:]
    sampleIDs[-1]=sampleIDs[-1].replace('\n','')

    # dealing with patient IDs from TCGA https://docs.gdc.cancer.gov/Encyclopedia/pages/TCGA_Barcode/
    patientIDs=['-'.join(element.split('-')[:3]) for element in sampleIDs]

    for line in f:
        v=line.split(',')
        geneName=v[0]
        if geneName not in rnaseqGenes:
            rnaseqGenes.append(geneName)
        rowExpression=[float(element) for element in v[1:]]
        log2plusOne=[numpy.log2(element+1) for element in rowExpression]
        
        # create dictionary
        for i in range(len(patientIDs)):
            if patientIDs[i] not in QNRS.keys():
                QNRS[patientIDs[i]]={}

            QNRS[patientIDs[i]][geneName]=log2plusOne[i]
expdf = pd.DataFrame.from_dict(QNRS)
expdf = miner.removeNullRows(expdf)
expdf = miner.entropycheck(expdf)
#expdf,conversionTable=miner.identifierConversion(expdf)
expdf=miner.zscore(expdf)
QNRS = expdf.to_dict()

# 3.2. store QN data sets
# store MA
sampleNames=list(QNMA.keys())
sampleNames.sort()
geneNames=list(QNMA[sampleNames[0]].keys())
geneNames.sort()

fileName=resultsFolder+'QN.MA.data_gbmQuantile.csv'
with open(fileName,'w') as f:
    header='geneID,'+','.join(sampleNames)
    f.write(header)
    f.write('\n')
    
    for geneName in geneNames:
        f.write(geneName)

        for sampleName in sampleNames:
            f.write(',{}'.format(QNMA[sampleName][geneName]))
        
        f.write('\n')

# store RS
sampleNames=list(QNRS.keys())
sampleNames.sort()
geneNames=list(QNRS[sampleNames[0]].keys())
geneNames.sort()

fileName=resultsFolder+'QN.RS.data_gbm.csv'
with open(fileName,'w') as f:
    header='geneID,'+','.join(sampleNames)
    f.write(header)
    f.write('\n')
    
    for geneName in geneNames:
        f.write(geneName)

        for sampleName in sampleNames:
            f.write(',{}'.format(QNRS[sampleName][geneName]))
        
        f.write('\n')
#QNMAscaled=expressionScaling(QNMA)



# 2.4. run sklearn quantile normalization
#sklearn.preprocessing.quantile_transform(arrayExpressionNum,axis=0,output_distribution='normal')

print('\t ... completed.')
'''

In [7]:
# Notebook cell: list the rows of expdf that contain at least one missing value.
#print(df)
# boolean frame of the same shape as expdf, True where a value is missing
is_NaN = expdf.isnull()
# boolean Series, True for every row with at least one missing value
row_has_NaN = is_NaN.any(axis=1)
# keep only the flagged rows
rows_with_NaN = expdf[row_has_NaN]
# np is not used in this cell; the import only makes the alias available
import numpy as np
print(rows_with_NaN)

          TCGA-02-004  TCGA-02-0015  TCGA-02-0016  TCGA-02-0023  TCGA-02-0025  \
CFHR5       -0.242125      0.043250     -0.116375      0.087625      0.342125   
OR2K2        0.212000      0.139000      0.271667      0.379000     -0.195667   
ADAM5P      -0.170000      0.251667     -0.063333     -0.129333      0.365667   
OR9G1        0.427000      0.115500      0.230000      0.226000      0.389000   
C7orf33      0.816333      0.154333     -0.382667      0.136333     -0.475000   
UNQ6490      0.180667      0.080333     -0.247000      0.126333      0.392667   
LAMB4        1.285200      0.930400      1.701600      1.425400      0.049200   
REM2         0.019667     -0.014667      0.189667      0.189333      0.204667   
DEFB125      0.501000      0.395000      0.314000      0.349667      0.080000   
ABCG8       -0.924000      0.955667     -0.495667     -0.344333      0.307000   
CCDC60       0.030000      0.306333     -0.015333      0.183667     -0.204333   
MAS1         0.186333      0

In [3]:
expdf

Unnamed: 0,TCGA-02-0001,TCGA-02-0003,TCGA-02-0004,TCGA-02-0006,TCGA-02-0007,TCGA-02-0009,TCGA-02-0010,TCGA-02-0011,TCGA-02-0014,TCGA-02-0015,...,TCGA-76-6662,TCGA-76-6663,TCGA-76-6664,TCGA-81-5910,TCGA-81-5911,TCGA-87-5896,Unnamed: 576,Unnamed: 577,mirbase_id,ensembl_gene_id
ENSG00000194717,8.627512,8.640768,9.414982,10.483634,8.300629,9.244229,8.790284,9.415445,8.591853,11.283106,...,10.308698,9.645604,11.058158,10.303829,7.426558,9.035385,0,0,hsa-let-7a-1,ENSG00000199165
ENSG00000198972,12.242824,11.715277,11.645951,11.872541,11.882706,11.754717,11.616951,11.685209,11.784784,11.944973,...,12.052566,12.465598,11.353487,12.626537,12.200711,12.243836,0,0,hsa-let-7a-2,ENSG00000198975
ENSG00000198973,5.870195,5.764104,5.880249,5.769884,5.841605,5.766450,5.776267,5.790933,5.753731,5.832266,...,5.683515,5.713833,5.710940,5.730792,5.737265,5.816615,0,0,hsa-let-7a-3,ENSG00000283990
ENSG00000198976,5.877118,5.802407,5.837366,5.836940,5.994112,5.831648,5.811708,5.768384,5.782225,5.804736,...,5.711966,5.729880,5.697743,5.731177,5.967034,5.727032,0,0,hsa-let-7b,ENSG00000284520
ENSG00000198984,7.018414,7.113557,7.208128,7.131805,7.041300,6.978891,7.025941,6.724357,7.745404,7.038871,...,7.553740,6.937253,7.072139,6.868288,7.415690,7.005611,0,0,hsa-let-7c,ENSG00000199030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284499,5.845362,5.761823,5.717612,5.741150,5.762501,5.819229,5.779589,5.710597,5.730760,5.737878,...,5.674369,5.663512,5.671731,5.684239,5.702371,5.785394,0,0,hsa-mir-216b,ENSG00000211520
ENSG00000284520,14.493463,14.172742,13.421203,13.128622,12.236448,13.959411,13.664432,14.181989,14.407434,14.138218,...,14.330163,14.640407,14.243628,14.292654,14.505749,13.910095,0,0,hsa-mir-217,ENSG00000207548
ENSG00000284544,6.622736,7.104946,6.393158,6.982664,6.891096,6.384455,6.241855,6.672664,6.557907,6.504859,...,6.098895,6.991944,6.612716,5.983828,6.245802,6.343171,0,0,hsa-mir-218-1,ENSG00000207732
ENSG00000284567,11.635816,9.238128,10.088741,11.072867,9.469223,9.912153,9.533664,9.521077,8.972873,8.905918,...,11.660237,9.909842,10.998589,10.161575,9.603745,9.748362,0,0,hsa-mir-218-2,ENSG00000207739
