In [1]:
import pandas as pd
import pickle

# Directory holding the TCGA metadata files read by this notebook.
base_data_dir = "/secure/shared_data/tcga_path_reports/TCGA_Metadata/"

# Cancer type for each patient.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load this file from a trusted location.
# The context manager closes the file handle as soon as the object is loaded.
with open(base_data_dir+'TCGA_cancer_types_binary.p','rb') as cancer_type_file:
    cancer_type_df = pickle.load(cancer_type_file)

# Patient tables for the T, N and M categories.
t14_df = pd.read_csv(base_data_dir+'TCGA_T14_patients.csv')
n03_df = pd.read_csv(base_data_dir+'TCGA_N03_patients.csv')
m01_df = pd.read_csv(base_data_dir+'TCGA_M01_patients.csv')

# 1. T Category

In [2]:
""" this cells creates a dataframe containing two columns: type (cancer types) and patient_filename
"""
# Keep only the cancer-type rows whose patient id appears in t14_df
# (case_submitter_id is the patient id), and expose that id under the
# 'case_submitter_id' key used for the join.
t14_patient_df = (
    cancer_type_df.loc[cancer_type_df['patient'].isin(t14_df['case_submitter_id'].tolist())]
    .reset_index(drop=True)
    .assign(case_submitter_id=lambda frame: frame['patient'])
)

t14_merged_df = t14_patient_df.merge(t14_df, on='case_submitter_id')[['type','patient_filename','ajcc_pathologic_t','case_submitter_id']]
# Rows labelled T0 or TX are excluded; the index is renumbered afterwards.
t14_merged_df = t14_merged_df[
    ~t14_merged_df['ajcc_pathologic_t'].isin(['T0', 'TX'])
].reset_index(drop=True)

# Two-column view: cancer type and report file name.
t14_meta = t14_merged_df[["type", "patient_filename"]]
t14_meta

Unnamed: 0,type,patient_filename
0,ACC,TCGA-OR-A5J1.8866FD87-4F6F-4D7E-B99A-7DD427ED3BB3
1,ACC,TCGA-OR-A5J2.345C34EE-F031-41A4-B955-BA054F2A3CE1
2,ACC,TCGA-OR-A5J3.83F53610-4F40-425B-9B69-BBB94234224F
3,ACC,TCGA-OR-A5J4.39E300DC-C01F-46BC-AE4C-D834458F1FF7
4,ACC,TCGA-OR-A5J5.BB63299C-B7F7-4F08-998D-C30E56CD6065
...,...,...
6882,UVM,TCGA-YZ-A980.A0ADAAFB-31A4-468A-9543-F4DFCCBCF0DC
6883,UVM,TCGA-YZ-A982.6D118D76-61C0-4643-A525-53E3313C149D
6884,UVM,TCGA-YZ-A983.04B1F253-2351-4BA6-A289-A22BC1E19C35
6885,UVM,TCGA-YZ-A984.ECC9EDD8-CE51-49B8-9B3A-52D69920080F


In [3]:
""" select patients who are BRCA and LUAD
"""
# Independent copies of the BRCA and LUAD rows of the T-category metadata.
brca_t14_patients = t14_meta.loc[t14_meta["type"] == "BRCA"].copy()
luad_t14_patients = t14_meta.loc[t14_meta["type"] == "LUAD"].copy()

# Report (rows, columns) for each subset, BRCA first.
print(brca_t14_patients.shape)
print(luad_t14_patients.shape)

(1031, 2)
(486, 2)


## 1.1 Select BRCA/LUAD patients in T category

In [11]:
# Load the T14 training and testing tables.
t14_data_path = "/secure/shared_data/tcga_path_reports/t14_data/"
t14_training_pd = pd.read_csv(t14_data_path+"Target_Data_T14.csv")
t14_testing_pd = pd.read_csv(t14_data_path+"Target_Data_T14_test.csv")

# Inner-join each split with the BRCA patients on patient_filename, so only
# BRCA rows remain and the cancer-type column is attached.
t14_brca_training, t14_brca_testing = (
    split.merge(brca_t14_patients, on="patient_filename")
    for split in (t14_training_pd, t14_testing_pd)
)
print("BRCA training pd: {} and testing pd: {}".format(t14_brca_training.shape, t14_brca_testing.shape))

# Same join for the LUAD patients.
t14_luad_training, t14_luad_testing = (
    split.merge(luad_t14_patients, on="patient_filename")
    for split in (t14_training_pd, t14_testing_pd)
)
print("LUAD training pd: {} and testing pd: {}".format(t14_luad_training.shape, t14_luad_testing.shape))

BRCA training pd: (885, 4) and testing pd: (146, 4)
LUAD training pd: (409, 4) and testing pd: (77, 4)


In [16]:
# Save the four T14 subsets next to the source tables. to_csv is called with
# its defaults, so the row index is written as the first column.
for csv_name, subset in (
    ("BRCA_T14_training.csv", t14_brca_training),
    ("BRCA_T14_testing.csv", t14_brca_testing),
    ("LUAD_T14_training.csv", t14_luad_training),
    ("LUAD_T14_testing.csv", t14_luad_testing),
):
    subset.to_csv(t14_data_path+csv_name)

## 1.2 Analyze ZS-COT Results for BRCA/LUAD Patients

In [4]:
# ZS-COT results file for the T category.
t14_ZSCOT_path = "/secure/shared_data/tnm/t14_res/med42-t0.7-tp0.95-nrs1.csv"
t14_ZSCOT_df = pd.read_csv(t14_ZSCOT_path)
print("All Test:", t14_ZSCOT_df.shape)

# Inner join on patient_filename keeps only the results of BRCA patients.
t14_ZSCOT_brca = t14_ZSCOT_df.merge(
    brca_t14_patients,
    how="inner",
    on="patient_filename",
)
print("BRCA:", t14_ZSCOT_brca.shape)

# Same restriction for LUAD patients.
t14_ZSCOT_luad = t14_ZSCOT_df.merge(
    luad_t14_patients,
    how="inner",
    on="patient_filename",
)
print("LUAD:", t14_ZSCOT_luad.shape)


All Test: (1034, 7)
BRCA: (146, 8)
LUAD: (77, 8)


# 2. N Category

In [7]:
# Keep only the cancer-type rows whose patient id appears in n03_df, and expose
# that id under the 'case_submitter_id' key used for the join.
n03_patient_df = (
    cancer_type_df.loc[cancer_type_df['patient'].isin(n03_df['case_submitter_id'].tolist())]
    .reset_index(drop=True)
    .assign(case_submitter_id=lambda frame: frame['patient'])
)

n03_merged_df = n03_patient_df.merge(n03_df, on='case_submitter_id')[['type','patient_filename','ajcc_pathologic_n','case_submitter_id']]
# Rows carrying any of these N labels are excluded; the index is renumbered afterwards.
n03_merged_df = n03_merged_df[
    ~n03_merged_df['ajcc_pathologic_n'].isin(['NX', 'N0 (i+)', 'N0 (i-)', 'N0 (mol+)'])
].reset_index(drop=True)

# Map each remaining N label to an integer: exactly 'N0' -> 0, otherwise the
# first of 1, 2, 3 that occurs in the label. Unrecognised labels are printed
# and NOT appended, so list_n can end up shorter than n03_merged_df.
list_n = []
for a_value in n03_merged_df['ajcc_pathologic_n'].tolist():
    if a_value == 'N0':
        n_code = 0
    else:
        n_code = next((code for code in (1, 2, 3) if str(code) in a_value), None)
    if n_code is None:
        print(a_value) #Check for any unrecognized terms
        continue
    list_n.append(n_code)

# Two-column view: cancer type and report file name.
n03_meta = n03_merged_df[["type", "patient_filename"]]
n03_meta

Unnamed: 0,type,patient_filename
0,ACC,TCGA-OR-A5J1.8866FD87-4F6F-4D7E-B99A-7DD427ED3BB3
1,ACC,TCGA-OR-A5J2.345C34EE-F031-41A4-B955-BA054F2A3CE1
2,ACC,TCGA-OR-A5J3.83F53610-4F40-425B-9B69-BBB94234224F
3,ACC,TCGA-OR-A5J4.39E300DC-C01F-46BC-AE4C-D834458F1FF7
4,ACC,TCGA-OR-A5J5.BB63299C-B7F7-4F08-998D-C30E56CD6065
...,...,...
5673,UVM,TCGA-WC-A882.BCFE115C-4472-4221-86B3-75B122F9B425
5674,UVM,TCGA-WC-A883.FDE93561-B0B0-495F-8BBD-932EA3B7D02D
5675,UVM,TCGA-WC-A884.21C6D225-052E-4E12-A639-A1B671E22CE4
5676,UVM,TCGA-WC-A885.1D5AEA80-D8C7-444F-BFB4-1B3822C8D17B


In [8]:
""" select patients who are BRCA and LUAD
"""
# Independent copies of the BRCA and LUAD rows of the N-category metadata.
brca_n03_patients = n03_meta.loc[n03_meta["type"] == "BRCA"].copy()
luad_n03_patients = n03_meta.loc[n03_meta["type"] == "LUAD"].copy()

# Report (rows, columns) for each subset, BRCA first.
print(brca_n03_patients.shape)
print(luad_n03_patients.shape)

(800, 2)
(477, 2)


## 2.1 Select BRCA/LUAD patients in N category

In [10]:
# Load the N03 training and testing tables.
n03_data_path = "/secure/shared_data/tcga_path_reports/n03_data/"
n03_training_pd = pd.read_csv(n03_data_path+"Target_Data_N03.csv")
n03_testing_pd = pd.read_csv(n03_data_path+"Target_Data_N03_test.csv")

# Inner-join each split with the BRCA patients on patient_filename, so only
# BRCA rows remain and the cancer-type column is attached.
n03_brca_training, n03_brca_testing = (
    split.merge(brca_n03_patients, on="patient_filename")
    for split in (n03_training_pd, n03_testing_pd)
)
print("BRCA training pd: {} and testing pd: {}".format(n03_brca_training.shape, n03_brca_testing.shape))

# Same join for the LUAD patients.
n03_luad_training, n03_luad_testing = (
    split.merge(luad_n03_patients, on="patient_filename")
    for split in (n03_training_pd, n03_testing_pd)
)
print("LUAD training pd: {} and testing pd: {}".format(n03_luad_training.shape, n03_luad_testing.shape))

BRCA training pd: (669, 4) and testing pd: (131, 4)
LUAD training pd: (395, 4) and testing pd: (82, 4)


In [17]:
# Save the four N03 subsets next to the source tables. to_csv is called with
# its defaults, so the row index is written as the first column.
for csv_name, subset in (
    ("BRCA_N03_training.csv", n03_brca_training),
    ("BRCA_N03_testing.csv", n03_brca_testing),
    ("LUAD_N03_training.csv", n03_luad_training),
    ("LUAD_N03_testing.csv", n03_luad_testing),
):
    subset.to_csv(n03_data_path+csv_name)

## 2.2 Analyze ZS-COT Results for BRCA/LUAD Patients

In [7]:
# ZS-COT results file for the N category.
n03_ZSCOT_path = "/secure/shared_data/tnm/n03_res/med42-t0.7-tp0.95-nrs1.csv"
n03_ZSCOT_df = pd.read_csv(n03_ZSCOT_path)
print("All Test:", n03_ZSCOT_df.shape)

# Inner join on patient_filename keeps only the results of BRCA patients.
n03_ZSCOT_brca = n03_ZSCOT_df.merge(
    brca_n03_patients,
    how="inner",
    on="patient_filename",
)
print("BRCA:", n03_ZSCOT_brca.shape)

# Same restriction for LUAD patients.
n03_ZSCOT_luad = n03_ZSCOT_df.merge(
    luad_n03_patients,
    how="inner",
    on="patient_filename",
)
print("LUAD:", n03_ZSCOT_luad.shape)

All Test: (852, 7)
BRCA: (131, 8)
LUAD: (82, 8)


# 3. M Category

In [13]:
# Keep only the cancer-type rows whose patient id appears in m01_df, and expose
# that id under the 'case_submitter_id' key used for the join.
m01_patient_df = (
    cancer_type_df.loc[cancer_type_df['patient'].isin(m01_df['case_submitter_id'].tolist())]
    .reset_index(drop=True)
    .assign(case_submitter_id=lambda frame: frame['patient'])
)

m01_merged_df = m01_patient_df.merge(m01_df, on='case_submitter_id')[['type','patient_filename','ajcc_pathologic_m','case_submitter_id']]
# Rows labelled MX are excluded; the index is renumbered afterwards.
m01_merged_df = m01_merged_df[
    ~m01_merged_df['ajcc_pathologic_m'].isin(['MX'])
].reset_index(drop=True)

# Map each remaining M label to an integer: 0 if the label contains '0',
# otherwise 1 if it contains '1'. Unrecognised labels are printed and NOT
# appended, so list_m can end up shorter than m01_merged_df.
list_m = []
for a_value in m01_merged_df['ajcc_pathologic_m'].tolist():
    m_code = next((code for code in (0, 1) if str(code) in a_value), None)
    if m_code is None:
        print(a_value) #Check for any unrecognized terms
        continue
    list_m.append(m_code)

# Two-column view: cancer type and report file name.
m01_meta = m01_merged_df[["type", "patient_filename"]]
m01_meta

Unnamed: 0,type,patient_filename
0,BLCA,TCGA-2F-A9KO.FA1D30C7-E486-48DD-989F-E774B42EA1B1
1,BLCA,TCGA-2F-A9KQ.F35113C2-F4CC-43EC-8C52-B71B357DDA46
2,BLCA,TCGA-2F-A9KR.EC17D988-194A-4C20-9BF8-D7CC75D9DF55
3,BLCA,TCGA-2F-A9KT.EC566A42-E711-458F-8082-1C113AECDF60
4,BLCA,TCGA-4Z-AA7M.C14C42D1-6359-4625-90E1-B3E43646DB64
...,...,...
4603,UVM,TCGA-WC-A883.FDE93561-B0B0-495F-8BBD-932EA3B7D02D
4604,UVM,TCGA-WC-A884.21C6D225-052E-4E12-A639-A1B671E22CE4
4605,UVM,TCGA-WC-A885.1D5AEA80-D8C7-444F-BFB4-1B3822C8D17B
4606,UVM,TCGA-WC-A888.513B512E-E9DF-4628-BD36-CADBF3725D9A


In [14]:
""" select patients who are BRCA and LUAD
"""
# Independent copies of the BRCA and LUAD rows of the M-category metadata.
brca_m01_patients = m01_meta.loc[m01_meta["type"] == "BRCA"].copy()
luad_m01_patients = m01_meta.loc[m01_meta["type"] == "LUAD"].copy()

# Report (rows, columns) for each subset, BRCA first.
print(brca_m01_patients.shape)
print(luad_m01_patients.shape)

(869, 2)
(353, 2)


## 3.1 Select BRCA/LUAD patients in M category

In [15]:
m01_data_path = "/secure/shared_data/tcga_path_reports/m01_data/"
m01_training_pd = pd.read_csv(m01_data_path+"Target_Data_M01.csv")
m01_testing_pd = pd.read_csv(m01_data_path+"Target_Data_M01_test.csv")

m01_brca_training = m01_training_pd.merge(brca_m01_patients, on="patient_filename")
m01_brca_testing = m01_testing_pd.merge(brca_m01_patients, on="patient_filename")
print("BRCA training pd: {} and testing pd: {}".format(m01_brca_training.shape, m01_brca_testing.shape))

m01_luad_training = m01_training_pd.merge(luad_m01_patients, on="patient_filename")
m01_luad_testing = m01_testing_pd.merge(luad_m01_patients, on="patient_filename")
print("LUAD training pd: {} and testing pd: {}".format(m01_luad_training.shape, m01_luad_testing.shape))

BRCA training pd: (731, 4) and testing pd: (138, 4)
LUAD training pd: (290, 4) and testing pd: (63, 4)


In [None]:
# Save the four M01 subsets next to the source tables. to_csv is called with
# its defaults, so the row index is written as the first column.
for csv_name, subset in (
    ("BRCA_M01_training.csv", m01_brca_training),
    ("BRCA_M01_testing.csv", m01_brca_testing),
    ("LUAD_M01_training.csv", m01_luad_training),
    ("LUAD_M01_testing.csv", m01_luad_testing),
):
    subset.to_csv(m01_data_path+csv_name)

## 3.2 Analyze ZS-COT Results for BRCA/LUAD Patients

In [10]:
# load the ZS-COT results
tmp1 = pd.read_csv("/secure/shared_data/tnm/m01_res/med42-t0.7-tp0.95-nrs5_batch1.csv")
tmp2 = pd.read_csv("/secure/shared_data/tnm/m01_res/med42-t0.7-tp0.95-nrs5_batch2.csv")
m01_med42_cot = pd.concat([tmp1, tmp2])
print("All Test:", m01_med42_cot.shape)

# only select ZS-COT results for patients who have BRCA
m01_ZSCOT_brca = m01_med42_cot.merge(brca_m01_patients, on="patient_filename")
print("BRCA:", m01_ZSCOT_brca.shape)

# only select ZS-COT results for patients who have LUAD
m01_ZSCOT_luad = m01_med42_cot.merge(luad_m01_patients, on="patient_filename")
print("LUAD:", m01_ZSCOT_luad.shape)

All Test: (692, 17)
BRCA: (138, 18)
LUAD: (63, 18)
