In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

# Variance cutoff passed to VarianceSelect when filtering the mRNA features.
THRESHOLD = 7
# Tab-separated expression table that the loading cell reads with pd.read_csv.
MRNA_PATH = './data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv'

def VarianceSelect(data, t):
	"""Filter the columns of ``data`` with sklearn's ``VarianceThreshold``.

	Args:
		data: Table handed unchanged to ``VarianceThreshold.fit_transform``.
		t: Value forwarded as the selector's ``threshold`` argument.

	Returns:
		A ``(selected, support)`` pair: the transformed data containing only
		the retained features, and the integer positions of those features
		as reported by ``get_support(indices=True)``.
	"""
	# Author's notes, presumably "threshold -> number of retained features":
	# 6 -> 2181, 7 -> 1579, 8 -> 1176, 9 -> 880
	variance_filter = VarianceThreshold(threshold=t)
	kept_values = variance_filter.fit_transform(data)
	kept_positions = variance_filter.get_support(indices=True)
	return kept_values, kept_positions

In [2]:
# Read mRNA data
# The file is read twice: header=None/nrows=1 returns the first line as a plain
# data row (the raw header cells), while the full read uses it as column labels.
mrna_samples = pd.read_csv(MRNA_PATH, sep='\t', header=None, nrows=1)
data_mrna = pd.read_csv(MRNA_PATH, sep='\t')

In [4]:
# Take the raw header row, skip its first cell, and keep only the first
# 12 characters of every remaining cell (assumed to be the patient-level
# part of the sample ID, e.g. 'TCGA-OR-A5J1' — confirm against the data).
header_cells = mrna_samples.values.tolist()[0]
mrna_samples = [cell[:12] for cell in header_cells[1:]]
data_mrna.columns = ['sample'] + mrna_samples
# Missing values become 0.0, the table is transposed so each former column is
# a row, and the row labelled 'sample' (the former first column) is removed.
data_mrna = data_mrna.fillna(0.0).T.drop(index='sample')

In [6]:
# Read patient IDs from the preprocessed clinical data
data_clin = pd.read_csv('./preprocessed_data/Pc_clinical_emb.csv', header=None)
# Column 0 of the headerless file, flattened into a plain Python list.
clinical_samples = [record[0] for record in data_clin[[0]].values.tolist()]
clin_samples = clinical_samples

In [7]:
# Remove the rows with same patient ID: only the first row per ID is kept.
# The previous version also called reset_index(drop=True) and discarded the
# result, which did nothing; masking on the index needs no reset/set round trip.
first_per_patient = ~data_mrna.index.duplicated(keep='first')
# rename_axis keeps the index named 'index', as the old set_index('index') left it.
data_mrna = data_mrna[first_per_patient].rename_axis('index')

In [8]:
# Variance threshold
# Only the filtered values are kept; the retained-column positions returned as
# the second element are not used in this cell.
res = VarianceSelect(data_mrna, THRESHOLD)[0]
# The new frame gets default integer column labels, which later cells rely on.
mrna_df = pd.DataFrame(res)

In [9]:
# min-max normalization
scaler = MinMaxScaler()
mrna_0_1 = scaler.fit_transform(mrna_df)
# Put the patient IDs back as row labels, then expose them as an 'index' column.
scaled = pd.DataFrame(mrna_0_1)
scaled.index = data_mrna.index
mrna_f_df = scaled.reset_index()

In [15]:
# Create all zero vector
# One-row template: placeholder ID 'xx' in the 'index' column, 0.0 in every
# feature column. It is built from the column layout alone, so it does not
# depend on a specific patient being present. Copying the row of
# 'TCGA-OR-A5J1' produced an EMPTY template whenever that ID was missing, and
# every patient without mRNA data was then silently dropped downstream.
feature_columns = [col for col in mrna_f_df.columns if col != 'index']
sample_row = pd.DataFrame(
	[['xx'] + [0.0] * len(feature_columns)],
	columns=['index'] + feature_columns,
)

In [16]:
# Fill missing patients with zero vectors
# Produces one row per entry of clin_samples, in that order: the patient's
# normalized mRNA row when one exists, otherwise an all-zero row.
# A single reindex replaces the old loop, which re-concatenated the growing
# frame for every patient (quadratic) and left `mrna` undefined when
# clin_samples was empty. reindex raises if the patient IDs in mrna_f_df are
# not unique, so a skipped de-duplication step fails loudly instead of
# emitting extra rows.
mrna = (
	mrna_f_df.set_index('index')
	.reindex(clin_samples, fill_value=0.0)
	.reset_index()
)

In [None]:
# Save data
# Move the patient IDs into the index exactly once. The guard makes the cell
# safe to re-run: an unguarded second set_index('index') raises KeyError
# because the column no longer exists.
if 'index' in mrna.columns:
	mrna = mrna.set_index('index')
# index=False / header=False: only the feature values are written, in row order.
mrna.to_csv(f'./preprocessed_data/PC_mRNA_threshold_{THRESHOLD}.csv', index=False, header=False)

*One Cancer (OV only)*

In [36]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler

# Variance cutoff passed to VarianceSelect for this section's feature filtering.
THRESHOLD = 1000000
# Tab-separated expression table that the loading cell reads with pd.read_csv.
MRNA_PATH = './data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv'

def VarianceSelect(data, t):
	"""Filter the columns of ``data`` with sklearn's ``VarianceThreshold``.

	Args:
		data: Table handed unchanged to ``VarianceThreshold.fit_transform``.
		t: Value forwarded as the selector's ``threshold`` argument.

	Returns:
		A ``(selected, support)`` pair: the transformed data containing only
		the retained features, and the integer positions of those features
		as reported by ``get_support(indices=True)``.
	"""
	# Author's notes, presumably "threshold -> number of retained features":
	# 6 -> 2181, 7 -> 1579, 8 -> 1176, 9 -> 880
	variance_filter = VarianceThreshold(threshold=t)
	kept_values = variance_filter.fit_transform(data)
	kept_positions = variance_filter.get_support(indices=True)
	return kept_values, kept_positions

In [2]:
# Read mRNA data
# The file is read twice: header=None/nrows=1 returns the first line as a plain
# data row (the raw header cells), while the full read uses it as column labels.
mrna_samples = pd.read_csv(MRNA_PATH, sep='\t', header=None, nrows=1)
data_mrna = pd.read_csv(MRNA_PATH, sep='\t')

In [3]:
# Take the raw header row, skip its first cell, and keep only the first
# 12 characters of every remaining cell (assumed to be the patient-level
# part of the sample ID — confirm against the data).
header_cells = mrna_samples.values.tolist()[0]
mrna_samples = [cell[:12] for cell in header_cells[1:]]
data_mrna.columns = ['sample'] + mrna_samples
# Missing values become 0.0, the table is transposed so each former column is
# a row, and the row labelled 'sample' (the former first column) is removed.
data_mrna = data_mrna.fillna(0.0).T.drop(index='sample')

In [5]:
# Read patient IDs from the preprocessed clinical data
data_clin = pd.read_csv('./preprocessed_data/Pc_clinical_emb_OV.csv', header=None)
# Column 0 of the headerless file, flattened into a plain Python list.
clinical_samples = [record[0] for record in data_clin[[0]].values.tolist()]
clin_samples = clinical_samples

In [6]:
# Keep only the expression rows whose ID appears in the clinical patient list.
in_clinical = data_mrna.index.isin(clin_samples)
data_mrna = data_mrna.loc[in_clinical]

In [8]:
# Remove the rows with same patient ID: only the first row per ID is kept.
# The previous version also called reset_index(drop=True) and discarded the
# result, which did nothing; masking on the index needs no reset/set round trip
# and avoids in-place edits on the filtered frame.
first_per_patient = ~data_mrna.index.duplicated(keep='first')
# rename_axis keeps the index named 'index', as the old set_index('index') left it.
data_mrna = data_mrna[first_per_patient].rename_axis('index')

In [37]:
# Variance threshold
# Only the filtered values are kept; the retained-column positions returned as
# the second element are not used in this cell.
res = VarianceSelect(data_mrna, THRESHOLD)[0]
# The new frame gets default integer column labels, which later cells rely on.
mrna_df = pd.DataFrame(res)

In [39]:
# min-max normalization
scaler = MinMaxScaler()
mrna_0_1 = scaler.fit_transform(mrna_df)
# Put the patient IDs back as row labels, then expose them as an 'index' column.
scaled = pd.DataFrame(mrna_0_1)
scaled.index = data_mrna.index
mrna_f_df = scaled.reset_index()

In [40]:
# Create all zero vector
# Borrow the row(s) carrying the last patient ID purely as a shape template,
# then overwrite the ID with the placeholder 'xx' and every feature with 0.0.
template_id = mrna_f_df["index"].iloc[-1]
a = mrna_f_df[mrna_f_df["index"] == template_id]
sample_row = a.copy()
sample_row['index'] = 'xx'
# Feature columns are addressed by their integer labels 0 .. n_features-1.
n_features = mrna_f_df.shape[1] - 1
for i in range(n_features):
	sample_row[i] = 0.0

In [41]:
# Fill missing patients with zero vectors
# Produces one row per entry of clin_samples, in that order: the patient's
# normalized mRNA row when one exists, otherwise an all-zero row.
# A single reindex replaces the old loop, which scanned the whole ID column
# once per patient, duplicated its first-iteration branch, and failed in
# pd.concat when clin_samples was empty. reindex raises if the patient IDs in
# mrna_f_df are not unique, so a skipped de-duplication step fails loudly.
mrna = (
	mrna_f_df.set_index('index')
	.reindex(clin_samples, fill_value=0.0)
	.reset_index()
)

In [42]:
# Save data
# Move the patient IDs into the index exactly once. The guard makes the cell
# safe to re-run: an unguarded second set_index('index') raises KeyError
# because the column no longer exists.
if 'index' in mrna.columns:
	mrna = mrna.set_index('index')
# index=False / header=False: only the feature values are written, in row order.
mrna.to_csv(f'./preprocessed_data/PC_mRNA_threshold_{THRESHOLD}_OV.csv', index=False, header=False)

In [44]:
# Share of rows whose feature values sum to zero (this includes the zero-filled
# placeholder rows added for patients without mRNA data).
rows_summing_to_zero = mrna.sum(axis=1) == 0
rows_summing_to_zero.sum() / len(mrna)

0.47766323024054985

In [46]:
# Absolute number of rows whose feature values sum to zero.
rows_summing_to_zero = mrna.sum(axis=1) == 0
rows_summing_to_zero.sum()

278

In [47]:
# Display the final table; the rendering below is truncated to the first and
# last rows/columns.
mrna

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2383,2384,2385,2386,2387,2388,2389,2390,2391,2392
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-04-1331,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-04-1332,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-04-1335,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-04-1336,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-04-1337,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-61-2614,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-OY-A56P,0.213014,0.357381,0.255301,0.152582,0.051222,0.527868,0.000689,0.068088,0.258847,0.132632,...,0.145072,0.179657,0.314936,0.244281,0.022899,0.774496,0.000000,0.283460,0.022007,0.333089
TCGA-OY-A56Q,0.246702,0.554533,0.503709,0.057821,0.135104,0.777455,0.000645,0.027992,0.158425,0.214579,...,0.146218,0.291041,0.190795,0.181718,0.018237,0.281478,0.038351,0.083189,0.109968,0.365141
TCGA-VG-A8LO,0.024922,0.388284,0.263917,0.081070,0.182644,0.551366,0.111981,0.156348,0.297488,0.313149,...,0.406218,0.151918,0.268371,0.258006,0.049414,0.437811,0.200111,0.175881,0.161512,0.252769
