## Script to use after delete_na_values.R

Takes the datasets' m-value matrices and converts them into a more readable NumPy format. Creates beta-value matrices and arrays of diagnoses for the samples (tumour or normal). Results are saved in pandas format. Parallel programming methods were used to speed up the calculations.

In [9]:
# Cancer type codes to process; each code selects one '<code>_m_from_rda' input file
# and is prefixed with 'TCGA-' to name the saved output files.
cancer_types = ['BLCA', 'BRCA', 'COAD', 'ESCA', 'HNSC', 'KIRC', 'KIRP', 'LIHC', 'LUAD', 'LUSC', 'PRAD', 'THCA', 'UCEC']


In [10]:
import os
import glob
import errno
import re
import numpy as np
import pandas as pd
import csv
import multiprocessing as mp
import time

In [11]:
# Paths that are used to save or read files.

# Make sure the current working directory is data_preprocessing/ before running:
# every path below is relative to it.

save_path = 'dataset/'
barcodes_part = 'barcodes/'
diagnoses_part = 'diagnoses/'
beta_values_part = 'beta_values/'
m_values_part = 'm_values/'
pandas_part = 'pandas/'
csv_part = '.csv'

# Create the whole output tree. makedirs(exist_ok=True) is used instead of a single
# "does 'dataset' exist?" check so that a partially existing tree (for example
# 'dataset/' present but 'dataset/pandas/m_values/' missing) is completed rather
# than silently left broken, which would make the later to_csv() calls fail.
for _part in (barcodes_part, diagnoses_part, beta_values_part, m_values_part):
    os.makedirs(save_path + pandas_part + _part, exist_ok=True)

In [12]:
# Derives a binary diagnosis from a TCGA barcode.
# Returns 1 for a tumour sample, 0 for a normal sample and None for anything else.
def get_diagnosis(barcode):
    sample_field = barcode.split('-')[3]  # fourth dash-separated field of the barcode
    tens_digit = int(sample_field[0])     # tens digit of the two-digit sample code

    # tens digit 0 -> codes 00-09 (tumour), tens digit 1 -> codes 10-19 (normal);
    # every other digit has no entry, so .get() yields None
    return {0: 1, 1: 0}.get(tens_digit)

# Builds an ndarray holding the diagnosis of every TCGA barcode in `arr`,
# in the same order as the barcodes were given.
def diagnoses(arr):
    return np.array([get_diagnosis(barcode) for barcode in arr])

In [13]:
# Converts beta values into m values: m = log2(beta / (1 - beta)).
def beta2m(beta):
    odds = beta / (1 - beta)
    return np.log2(odds)

# Converts m values into beta values: beta = 2**m / (2**m + 1).
def m2beta(m):
    numerator = np.power(2, m)
    denominator = np.power(2, m) + 1
    return numerator / denominator

# Converts a single (scalar) m value into a beta value without producing NaN
# for extreme inputs:
#   +inf -> 1.0 and -inf -> 0.0
#   a finite m so large that 2**m overflows -> 1.0
# Use it instead of m2beta when the beta values matrix must not contain NaN caused
# by infinite (or overflowing) m values. A NaN input still yields NaN.
def m2beta_without_nan(m):
    if np.isinf(m):
        return 1.0 if m > 0.0 else 0.0

    # The overflow of 2**m is expected for very large m and is handled just below,
    # so the corresponding floating point warning is silenced here.
    with np.errstate(over='ignore'):
        pow2 = np.power(2, m)

    if np.isinf(pow2):
        # 2**m overflowed although m is finite; inf / (inf + 1) would be NaN,
        # while the mathematical limit of the expression is 1.
        return 1.0

    return pow2 / (pow2 + 1)

In [14]:
#### m-value matrix to beta matrix ####
# Element-wise version of m2beta_without_nan that can be applied to a whole matrix:
#     NEW_MATRIX = np_m2beta_without_nan(MATRIX)
# otypes is given explicitly because np.vectorize otherwise has to call the function
# on the first element to find the output type, which raises ValueError for a
# matrix with no elements.
# (NaN-unsafe alternative: np_m2beta = np.vectorize(m2beta))
np_m2beta_without_nan = np.vectorize(m2beta_without_nan, otypes=[float])

In [15]:
delay = 10


# Prints the module name, the parent process id and the current process id,
# each one followed by blank line(s).
def process_info():
    print('Module:{}\n'.format(__name__))
    print('Parent Process id:{}\n'.format(os.getppid()))
    print('Process id:{}\n\n'.format(os.getpid()))

# Own version of the parallel_matrices function, written to be easier to follow.
# It also works with the output of DataProcessingRda.Rmd, whose format is slightly
# different from what parallel_matrices is used to (the row names are the probes).
# `file` needs to be a *_m_from_rda file.
#
# For one input file it saves four tab-separated tables (barcodes, diagnoses,
# m values, beta values) under save_path + pandas_part and returns nothing.
def my_version(file):
    # cancer type code: the file's base name up to its first '.' and first '_'
    base_name = file.split('/')[-1]
    name = 'TCGA-' + base_name.split('.')[0].split('_')[0]
    print('PROJECT: ' + name)

    # the unnamed first column of the csv holds the probe names
    data = pd.read_csv(file).rename(columns={"Unnamed: 0": "probe"})

    # every remaining column name is a sample barcode
    barcodes = list(data.columns)
    barcodes.remove('probe')
    print("Got barcodes")

    my_diagnoses = diagnoses(barcodes)
    print("Got diagnoses")

    m_values = np.array(data.drop(columns="probe"))
    print("Got m values")

    # element-wise conversion of the m value matrix into beta values
    beta_matrix = np_m2beta_without_nan(m_values)
    print("Got betas")

    # build all tables first, then write them; both matrices are indexed by probe
    probes = data["probe"]
    tables = (
        (barcodes_part, pd.DataFrame(barcodes)),
        (diagnoses_part, pd.DataFrame(my_diagnoses)),
        (m_values_part, pd.DataFrame(m_values, index=probes)),
        (beta_values_part, pd.DataFrame(beta_matrix, index=probes)),
    )
    out_root = save_path + pandas_part
    for part, frame in tables:
        frame.to_csv(out_root + part + name + csv_part, sep='\t')
    print(name + ' pandas versions saved!')
  
    
# Alternative file selections, kept for reference:
# files = [root_path + 'GDCdata/from_rda/TCGA-'+ cancer_type + '_m_from_rda' for cancer_type in ["BRCA", "HNSC", "THCA", "PRAD"]]
# files = [root_path + 'GDCdata/from_rda/TCGA-'+ cancer_type + '_m_from_rda' for cancer_type in ["COAD", "ESCA", "KIRC", "BLCA"]]
files = [cancer_type + '_m_from_rda' for cancer_type in cancer_types]


# Process the files in parallel, one task per file. The with-block guarantees that
# the worker processes are shut down afterwards (also when a task raises), instead
# of leaving them alive for the rest of the session.
with mp.Pool(processes=mp.cpu_count()) as pool:
    # map() blocks until every file has been processed, so leaving the with-block
    # cannot cut off unfinished work. The result list is wrapped in an outer list
    # to keep the shape `results` has always had.
    results = [pool.map(my_version, files)]

PROJECT: BLCA
PROJECT: BRCA
Got barcodes
Got diagnoses
Got m values
Got barcodes
Got diagnoses
Got m values
Got betas
BLCA pandas versions saved!
Got betas
BRCA pandas versions saved!
