In [1]:
""" Clinical data preprocess"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder

CLINICAL_PATH = './data/TCGA-CDR-SupplementalTableS1.xlsx'

# Read clinical data
data = pd.read_excel(CLINICAL_PATH)

In [2]:
# Select the used features
data_used = data[['bcr_patient_barcode', 'type', 'gender', 'race', 'histological_type',
					'age_at_initial_pathologic_diagnosis', 'OS', 'OS.time']]
data_used.columns = ['id', 'cancer_type', 'gender', 'race', 'histological_type', 'age', 'event', 'event_time']


idx = data_used[data_used[['event', 'event_time']].isnull().T.any()].index
data_used.drop(labels=idx, inplace=True)
data_used.loc[data_used['race'] == '[Not Evaluated]', 'race'] = 'Na'
data_used.loc[data_used['race'] == '[Unknown]', 'race'] = 'Na'
data_used.loc[data_used['race'] == '[Not Available]', 'race'] = 'Na'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_used.drop(labels=idx, inplace=True)


In [13]:
# categorical data and continuous data
data_id = data_used[['id']]
data_cate = data_used[['cancer_type', 'gender', 'race', 'histological_type']]
data_num = data_used[['age']]
target = data_used[['event', 'event_time']]

In [15]:
# Convert label to category type
data_cate['histological_type'] = data_cate['histological_type'].astype('str')
for col in data_cate.columns:
	data_cate[col] = LabelEncoder().fit_transform(data_cate[col])

for col in data_cate.columns:
	data_cate[col] = data_cate[col].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cate['histological_type'] = data_cate['histological_type'].astype('str')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cate[col] = LabelEncoder().fit_transform(data_cate[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_cate[col] = data_cate[col].astype('category')


In [22]:
# Use 0 to fill NaN in age column
idx_num = data_num[data_num.isnull().T.any()].index
data_num.loc[idx_num, 'age'] = 0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_num.loc[idx_num, 'age'] = 0


In [25]:
# Define the embedding size
embedded_cols = {n: len(col.cat.categories) for n,col in data_cate.items()}
embedding_sizes = [(n_categories, min(50, (n_categories+1)//2)) for _,n_categories in embedded_cols.items()]
print(embedding_sizes)

[(33, 17), (2, 1), (6, 3), (145, 50)]


In [35]:
# concate the categorical data and continuous data
df = pd.concat([data_id, data_cate], axis=1)
df = pd.concat([df, data_num], axis=1)
df = pd.concat([df, target], axis=1)

df.to_csv('./preprocessed_data/Pc_clinical_emb.csv', header=False, index=False)

In [6]:
import numpy as np


# np.sum((data_used['race'] == '[Not Evaluated]', 'race'))
np.sum((data_used['race'] == '[Not Evaluated]').to_numpy())

0

*OV Cancer*

In [2]:
import pandas as pd

df = pd.read_csv("./preprocessed_data/Pc_clinical_emb.csv", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6,7
0,TCGA-OR-A5J1,0,1,5,3,58.0,1.0,1355.0
1,TCGA-OR-A5J2,0,0,5,3,44.0,1.0,1677.0
2,TCGA-OR-A5J3,0,0,5,3,23.0,0.0,2091.0
3,TCGA-OR-A5J4,0,0,5,3,23.0,1.0,423.0
4,TCGA-OR-A5J5,0,1,5,3,30.0,1.0,365.0
...,...,...,...,...,...,...,...,...
11089,TCGA-YZ-A980,32,1,5,114,75.0,0.0,1862.0
11090,TCGA-YZ-A982,32,0,5,113,79.0,0.0,495.0
11091,TCGA-YZ-A983,32,0,5,20,51.0,0.0,798.0
11092,TCGA-YZ-A984,32,0,5,114,50.0,1.0,1396.0


In [7]:
df.loc[df[1] == 19].to_csv('./preprocessed_data/Pc_clinical_emb_OV.csv', header=False, index=False)

In [6]:
cnv_df = pd.read_csv("./data/Gistic2_CopyNumber_Gistic2_all_data_by_genes", sep="\t")
cnv_df

Unnamed: 0,Gene Symbol,TCGA-04-1331-01,TCGA-04-1332-01,TCGA-04-1335-01,TCGA-04-1336-01,TCGA-04-1337-01,TCGA-04-1338-01,TCGA-04-1341-01,TCGA-04-1342-01,TCGA-04-1343-01,...,TCGA-72-4235-01,TCGA-72-4236-01,TCGA-72-4237-01,TCGA-72-4238-01,TCGA-72-4240-01,TCGA-72-4241-01,TCGA-OY-A56P-01,TCGA-OY-A56Q-01,TCGA-VG-A8LO-01,TCGA-WR-A838-01
0,ACAP3,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
1,ACTRT2,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
2,AGRN,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
3,ANKRD65,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
4,ATAD3A,-0.703,0.080,-0.807,0.101,0.021,-0.999,-0.421,0.089,0.279,...,-0.076,0.054,0.801,0.04,0.058,0.339,0.813,0.000,0.137,-0.176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24771,IL9R|ENSG00000124334.12,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24772,SPRY3|ENSG00000168939.6,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24773,VAMP7|ENSG00000124333.10,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
24774,WASH6P|ENSG00000182484.10,-0.368,-0.256,0.673,-0.336,0.062,-0.082,-0.589,0.136,0.187,...,-0.081,-0.216,-0.143,-0.21,-0.349,-0.100,-0.080,-0.195,-0.675,0.065
