In [1]:
import numpy as np
import pandas as pd
import pyreadr

In [2]:
#Load the samples by gene, clinical, pathway enrichment, cell type enrichment data frame
#train_part1_df = pd.read_csv("../Data/Revised_Training_Set_with_Expr_Clin_PA_CTS_P1.csv.gz",sep="\t",low_memory=False)
#train_part2_df = pd.read_csv("../Data/Revised_Training_Set_with_Expr_Clin_PA_CTS_P2.csv.gz",sep="\t",low_memory=False)
#test_df = pd.read_csv("../Data/Revised_Test_Set_with_Expr_Clin_PA_CTS.csv.gz",sep="\t",low_memory=False)
train_df = pd.read_csv("../Data/Revised_Training_Set_with_Onco_Expr_Clin_PA_CTS.csv",sep="\t",low_memory=False)
test_df = pd.read_csv("../Data/Revised_Test_Set_with_Onco_Expr_Clin_PA_CTS.csv",sep="\t",low_memory=False)
#train_part1_df.head()
print(train_df.shape)
print(test_df.shape)

(337, 824)
(183, 824)


In [3]:
#Load the mutation information
out = pyreadr.read_r("../Data/Train_Test_Mutation_Matrices.Rdata")
train_mut_df = out["train_mut_mat"]
test_mut_df = out["test_mut_mat"]
train_mut_var_df = out["train_mut_var_mat"]
test_mut_var_df = out["test_mut_var_mat"]
print(train_mut_df.shape)
print(test_mut_df.shape)
train_mut_df.columns

(460, 374)
(211, 374)


Index(['DNMT3A', 'TET2', 'NRAS', 'TP53', 'RUNX1', 'IDH2', 'FLT3', 'SRSF2',
       'WT1', 'IDH1',
       ...
       'ARID1B', 'CRY1', 'ZBTB47', 'WDR43', 'TRPM4', 'MYT1', 'FAM155A',
       'ZNF687', 'SF3A1', 'dbgap_rnaseq_sample'],
      dtype='object', length=374)

In [4]:
#Get the column names and useful columns
all_columns = list(train_df.columns)

#Sample ids
sample_names = all_columns[0]

#Gene names
gene_names = all_columns[1:653]

#Clinical traits with T-sne
clin_traits = all_columns[653:750]
clin_trait_of_use = ['Tsne1','Tsne2','consensus_sex','ageAtDiagnosis','diseaseStageAtSpecimenCollection','vitalStatus',
                     'overallSurvival', '%.Blasts.in.BM', '%.Blasts.in.PB', '%.Eosinophils.in.PB', '%.Lymphocytes.in.PB', 
                     '%.Monocytes.in.PB', '%.Neutrophils.in.PB','ALT', 'AST', 'albumin', 'creatinine', 
                     'hematocrit', 'hemoglobin','plateletCount','wbcCount']

#A description of the min max values
train_df[clin_trait_of_use].describe()

#Get the information about pathways
pathway_names = all_columns[750:804]

#Get the information about celltypes and modules
cts_names = all_columns[804:824]

#Print all columns of interest
all_cols_of_interest = [sample_names]+gene_names+clin_trait_of_use+pathway_names+cts_names
print(all_cols_of_interest)

['dbgap_rnaseq_sample', 'LASP1', 'HOXA11', 'CREBBP', 'ETV1', 'GAS7', 'CD79B', 'BTK', 'BRCA1', 'WAS', 'WWTR1', 'CD74', 'BIRC3', 'FAS', 'BCLAF1', 'ANK1', 'RABEP1', 'ZCCHC8', 'CUL3', 'FLT4', 'CDH1', 'TNC', 'CTNNA1', 'PREX2', 'TPR', 'GOPC', 'TNFRSF17', 'SNX29', 'ELN', 'ARID1B', 'HERPUD1', 'POLQ', 'PIK3CB', 'THRAP3', 'KMT2C', 'PRDM1', 'POLD1', 'CASP8', 'PMS1', 'NTHL1', 'ERBB3', 'SPEN', 'MAP2K4', 'SMARCD1', 'GOLGA5', 'FGFR2', 'KLF6', 'RHOA', 'CBFB', 'FGFR3', 'TFE3', 'BCL3', 'CLTCL1', 'FSTL3', 'PABPC1', 'TCF3', 'PRKACA', 'TFRC', 'AFF4', 'SMC1A', 'FCGR2B', 'TP63', 'SDHA', 'SMARCE1', 'KDM5A', 'IGF2BP2', 'MAP3K13', 'PICALM', 'EED', 'ARHGEF10L', 'MLH1', 'NT5C2', 'NFKB2', 'FGFR1', 'ARAF', 'N4BP2', 'HOXA9', 'MLLT10', 'PCM1', 'CIC', 'DNM2', 'KEAP1', 'HSP90AA1', 'PTPRC', 'ERC1', 'XPO1', 'PALB2', 'KAT6A', 'CYLD', 'FAT1', 'NCOA1', 'BCORL1', 'ATRX', 'MECOM', 'EPS15', 'BAX', 'GNAS', 'CNOT3', 'GNA11', 'FUS', 'CD209', 'FH', 'ESR1', 'EZR', 'MSH2', 'MAP3K1', 'HSP90AB1', 'JAK2', 'ABL1', 'LZTR1', 'SMARCB1', 'M

In [5]:
len(all_cols_of_interest)

748

In [6]:
#Make the big combined training and test dataframe
#big_train_df = pd.concat([train_part1_df,train_part2_df],axis=0)
big_train_df = train_df
big_train_df = pd.DataFrame(big_train_df[all_cols_of_interest])
big_test_df = pd.DataFrame(test_df[all_cols_of_interest])

#Join the training dataframe with mutation information
big_train_df = pd.merge(big_train_df,train_mut_df,on='dbgap_rnaseq_sample')
big_train_df = pd.merge(big_train_df, train_mut_var_df, on="dbgap_rnaseq_sample")

big_test_df = pd.merge(big_test_df, test_mut_df, on="dbgap_rnaseq_sample")
big_test_df = pd.merge(big_test_df, test_mut_var_df, on="dbgap_rnaseq_sample")
print(big_train_df.shape)
print(big_test_df.shape)
sum(big_train_df.columns==big_test_df.columns)

(337, 1131)
(183, 1131)


1131

In [9]:
#Write the data frames as pickle files
big_train_df.to_pickle("../Data/Training_Set_Mod.pkl", compression="zip")
big_test_df.to_pickle("../Data/Test_Set_Mod.pkl",compression="zip")
big_train_df.to_csv("../Data/Training_Set_Mod.csv",index=None,sep="\t")
big_test_df.to_csv("../Data/Test_Set_Mod.csv",index=None,sep="\t")