In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Locations of the TCGA-BRCA input files, relative to this notebook.

# Clinical (patient) records, read below as JSON.
CLIN_PATH = '../data/clinical.project-TCGA-BRCA.2016-09-30T12_32_20.666404.json'

# Sample-to-sample distance matrix, read below as tab-separated text.
DIST_PATH = '../data/dd5_5_for_TCGA.txt'

# Gene-level RNA-seq data (RSEM normalized counts, per the file name).
GENOME_PATH = '../data/BRCA.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes_normalized__data.data.txt'

#### Clinical data of patients

In [3]:
# Load the raw clinical export; nested fields (demographic, diagnoses,
# exposures) are still packed inside single columns at this point.
big_df = pd.read_json(path_or_buf=CLIN_PATH)

# Replace the 'diagnoses' and 'exposures' lists by their first element (a
# dictionary), so they can later be expanded into DataFrame columns.
def dictify(df):
    """Unwrap the ``diagnoses`` and ``exposures`` lists of one clinical record.

    Parameters
    ----------
    df : pandas.Series
        A single row of the clinical table (the function is applied with
        ``axis=1``; the parameter name is kept for compatibility).
        ``df['diagnoses']`` and ``df['exposures']`` must be non-empty and
        indexable with ``[0]``.

    Returns
    -------
    pandas.Series
        A copy of the row in which both fields hold their element 0.
        Any further elements of the lists are discarded.

    Raises
    ------
    IndexError
        If one of the two lists is empty.
    """
    # Work on a copy: functions passed to DataFrame.apply must not mutate
    # the object they are given (pandas does not support that).
    row = df.copy()
    row['diagnoses'] = df['diagnoses'][0]
    row['exposures'] = df['exposures'][0]
    return row

# Keep only records without missing fields, then unwrap the nested lists row by row.
clin_complete = big_df.dropna()
big_df = clin_complete.apply(dictify, axis=1)

In [4]:
# Expand each dictionary column into its own set of columns.
demographic_cols = big_df['demographic'].apply(pd.Series)
diagnoses_cols = big_df['diagnoses'].apply(pd.Series)
exposures_cols = big_df['exposures'].apply(pd.Series)

# Join everything back together on the row index, starting from case_id.
# The suffixes are only applied to column names that clash between the two
# sides of a join (e.g. submitter_id_demo / submitter_id_diag); names that
# do not clash are kept as they are.
df_clin = pd.DataFrame(big_df['case_id'])
df_clin = df_clin.join(demographic_cols, how='outer', lsuffix='_case_id', rsuffix='_demo')
df_clin = df_clin.join(diagnoses_cols, how='outer', lsuffix='_demo', rsuffix='_diag')
df_clin = df_clin.join(exposures_cols, how='outer', lsuffix='_diag', rsuffix='_expo')

In [5]:
# Discard first the rows, then the columns, that contain no data at all.
df_clin = df_clin.dropna(axis=0, how='all').dropna(axis=1, how='all')

In [6]:
# Preview the first rows of the flattened clinical table.
df_clin.head()

Unnamed: 0,case_id,demographic_id,ethnicity,gender,race,submitter_id_demo,updated_datetime_demo,year_of_birth,year_of_death,age_at_diagnosis,...,site_of_resection_or_biopsy,submitter_id_diag,tissue_or_organ_of_origin,tumor_grade,tumor_stage,updated_datetime_diag,vital_status,exposure_id,submitter_id,updated_datetime
0,b205c89f-af62-4186-acad-ed23d243fa98,55ae5a3b-e509-5fcd-98c9-c5007b448890,not hispanic or latino,female,white,TCGA-A2-A0YL_demographic,2016-09-02T19:00:43.039678-05:00,1962.0,,17702.0,...,c50.9,TCGA-A2-A0YL_diagnosis,c50.9,not reported,stage iiia,2016-09-02T19:00:43.039678-05:00,alive,23b6a296-e149-5801-aa20-e144ef683abe,TCGA-A2-A0YL_exposure,2016-09-02T19:00:43.039678-05:00
1,70f34c5c-3671-44c8-9469-99f9786efec1,498e27fb-2201-524a-bdd1-86f4e53dc54b,not hispanic or latino,female,white,TCGA-D8-A1XR_demographic,2016-09-02T19:07:03.114741-05:00,1954.0,,20488.0,...,c50.9,TCGA-D8-A1XR_diagnosis,c50.9,not reported,stage iib,2016-09-02T19:07:03.114741-05:00,alive,6dfa11dc-4266-57c9-b746-02a6c7bb9d23,TCGA-D8-A1XR_exposure,2016-09-02T19:07:03.114741-05:00
2,8986a141-eae7-4157-b695-02cc6fc3b071,8faa57d4-1199-53ce-a0ce-24d358841153,not hispanic or latino,female,white,TCGA-BH-A1ET_demographic,2016-09-02T18:57:52.265307-05:00,1944.0,2005.0,20425.0,...,c50.9,TCGA-BH-A1ET_diagnosis,c50.9,not reported,stage i,2016-09-02T18:57:52.265307-05:00,dead,e72e6b02-58be-539a-af7f-f2714f14899a,TCGA-BH-A1ET_exposure,2016-09-02T18:57:52.265307-05:00
3,7317e605-93ba-49a9-8743-20613b02767a,4394b44c-b51d-5b96-b62b-db7f49007b43,not hispanic or latino,female,white,TCGA-E2-A1LE_demographic,2016-09-02T19:09:17.163541-05:00,1937.0,2010.0,26239.0,...,c50.9,TCGA-E2-A1LE_diagnosis,c50.9,not reported,stage iiic,2016-09-02T19:09:17.163541-05:00,dead,4c456a08-ffda-507f-a3e3-d273e5c49864,TCGA-E2-A1LE_exposure,2016-09-02T19:09:17.163541-05:00
4,7e1673f8-5758-4963-8804-d5e39f06205b,eebe9198-74c8-5996-bf88-c56d5fd6832f,not hispanic or latino,female,asian,TCGA-C8-A12V_demographic,2016-09-02T19:02:30.147198-05:00,1955.0,,20346.0,...,c50.9,TCGA-C8-A12V_diagnosis,c50.9,not reported,stage iia,2016-09-02T19:02:30.147198-05:00,alive,dd5fbf53-64a4-5bc0-8ca7-ae05abbcbbac,TCGA-C8-A12V_exposure,2016-09-02T19:02:30.147198-05:00


#### Distances matrix

In [7]:
# Load the tab-separated distance matrix into a DataFrame.
# NOTE(review): the preview below shows the sample labels as both row and
# column labels, i.e. the first column is used as the index - confirm.
df_distances = pd.read_csv(filepath_or_buffer=DIST_PATH, sep="\t")

In [8]:
# Preview the first rows of the distance matrix.
df_distances.head()

Unnamed: 0,TCGA.3C.AAAU.0.Dis,TCGA.3C.AALI.0.Dis,TCGA.3C.AALJ.0.Dis,TCGA.3C.AALK.0.Dis,TCGA.4H.AAAK.0.Dis,TCGA.5L.AAT0.0.Dis,TCGA.5L.AAT1.0.Dis,TCGA.5T.A9QA.0.Dis,TCGA.A1.A0SB.0.Dis,TCGA.A1.A0SD.0.Dis,...,TCGA.UL.AAZ6.0.Dis,TCGA.UU.A93S.0.Dis,TCGA.V7.A7HQ.0.Dis,TCGA.W8.A86G.0.Dis,TCGA.WT.AB41.0.Dis,TCGA.WT.AB44.0.Dis,TCGA.XX.A899.0.Dis,TCGA.XX.A89A.0.Dis,TCGA.Z7.A8R5.0.Dis,TCGA.Z7.A8R6.0.Dis
TCGA.3C.AAAU.0.Dis,0.0,0.12088,0.105801,0.10954,0.118014,0.11602,0.138326,0.139012,0.16487,0.109789,...,0.130849,0.177643,0.202318,0.118387,0.20481,0.17297,0.131472,0.147361,0.160945,0.102312
TCGA.3C.AALI.0.Dis,0.12088,0.0,0.086734,0.086734,0.10524,0.102997,0.095894,0.142688,0.183251,0.1144,...,0.117577,0.143124,0.172721,0.125615,0.158141,0.134712,0.116518,0.119447,0.129105,0.101502
TCGA.3C.AALJ.0.Dis,0.105801,0.086734,0.0,0.072653,0.091719,0.082123,0.096455,0.141006,0.181756,0.102686,...,0.108979,0.147735,0.151162,0.113216,0.138139,0.120631,0.100941,0.103558,0.112343,0.083557
TCGA.3C.AALK.0.Dis,0.10954,0.086734,0.072653,0.0,0.04075,0.040376,0.062932,0.144121,0.129292,0.05396,...,0.117827,0.158452,0.146738,0.06287,0.165742,0.102312,0.060814,0.0734,0.08013,0.070721
TCGA.4H.AAAK.0.Dis,0.118014,0.10524,0.091719,0.04075,0.0,0.037074,0.065612,0.152471,0.113901,0.054209,...,0.14356,0.180884,0.172347,0.0653,0.1878,0.112343,0.059381,0.069163,0.097015,0.078198


#### Genome data

In [38]:
# Creating DataFrame from the genome data.
# Lines are split on tabs only (not on every whitespace character), so the
# two-word header field "Hybridization REF" stays in one piece and the text
# file no longer has to be edited by hand. The field is renamed to
# "Hybridization_REF" here; a file that was already edited still loads.
# NOTE(review): assumes the file is tab-delimited - confirm against the data.
with open(GENOME_PATH) as f:
    # Blank lines are skipped; empty fields are kept so columns stay aligned.
    row = [line.rstrip("\r\n").split("\t") for line in f if line.strip()]

# Normalize the name of the first header field for the cells further down.
genome_header = ["Hybridization_REF" if name == "Hybridization REF" else name
                 for name in row[0]]
df_genome = pd.DataFrame(row[1:], columns=genome_header)

In [39]:
df_genome.head(n=3)
# Row 0 carries no measurements: it only repeats the sub-header of the file
# ("gene_id" / "normalized_count"), as the preview shows.

Unnamed: 0,Hybridization_REF,TCGA-3C-AAAU-01A-11R-A41B-07,TCGA-3C-AALI-01A-11R-A41B-07,TCGA-3C-AALJ-01A-31R-A41B-07,TCGA-3C-AALK-01A-11R-A41B-07,TCGA-4H-AAAK-01A-12R-A41B-07,TCGA-5L-AAT0-01A-12R-A41B-07,TCGA-5L-AAT1-01A-12R-A41B-07,TCGA-5T-A9QA-01A-11R-A41B-07,TCGA-A1-A0SB-01A-11R-A144-07,...,TCGA-UL-AAZ6-01A-11R-A41B-07,TCGA-UU-A93S-01A-21R-A41B-07,TCGA-V7-A7HQ-01A-11R-A33J-07,TCGA-W8-A86G-01A-21R-A36F-07,TCGA-WT-AB41-01A-11R-A41B-07,TCGA-WT-AB44-01A-11R-A41B-07,TCGA-XX-A899-01A-11R-A36F-07,TCGA-XX-A89A-01A-11R-A36F-07,TCGA-Z7-A8R5-01A-42R-A41B-07,TCGA-Z7-A8R6-01A-11R-A41B-07
0,gene_id,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,...,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count,normalized_count
1,?|100130426,0.0000,0.0000,0.9066,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2,?|100133144,16.3644,9.2659,11.6228,12.0894,6.8468,3.9889,0.0000,1.4644,15.3396,...,0.3992,4.3126,0.0000,5.5624,0.0000,0.0000,14.3858,22.3240,2.2638,6.8865


#### Removing patients that are not present in all three datasets

In [40]:
# Each patient is identified by a 12-character label of the form "TCGA-XX-XXXX".
# The distance matrix is the file with the fewest patients, so its labels are
# used as the reference against which the other datasets are matched.

ref_distance = []
for label in df_distances.columns.tolist():
    # e.g. "TCGA.3C.AAAU.0.Dis" -> "TCGA-3C-AAAU"
    ref_distance.append(label.replace(".", "-")[0:12])

n_ref_distance = len(ref_distance)
print("Number of references: " + str(n_ref_distance))

Number of references: 1093


In [41]:
# Patient references of the clinical data: the first 12 characters of the
# demographic submitter id (e.g. "TCGA-A2-A0YL_demographic" -> "TCGA-A2-A0YL").
ref_clinical = df_clin['submitter_id_demo'].astype(str).str[0:12].tolist()
print("Number of references: " + str(len(ref_clinical)))

# References that have clinical data but no entry in the distance matrix;
# these are the ones that need to be removed.
missing_from_distance = set(ref_clinical) - set(ref_distance)
ref_clinical_to_remove = list(missing_from_distance)
print(missing_from_distance)

Number of references: 1097
set(['TCGA-AR-A0U1', 'TCGA-C8-A9FZ', 'TCGA-AC-A5EI', 'TCGA-A7-A0DC'])


In [42]:
# Patient references of the genome data. Column 0 ("Hybridization_REF") is
# the gene label column, not a sample, so it is left out.
genome_sample_ids = df_genome.columns.tolist()[1:]
ref_genome = [sample_id[0:12] for sample_id in genome_sample_ids]
print("Number of references: " + str(len(ref_genome)))

Number of references: 1212


In [43]:
# Sample columns whose type code (characters 13-14 of the column name) is
# "01", i.e. primary solid tumor (TP).
ref_genome_tumor = []
for sample_id in df_genome.columns.tolist()[1:]:
    if sample_id[13:15] == "01":
        ref_genome_tumor.append(sample_id)
len(ref_genome_tumor)

1093

In [44]:
# Sample columns whose type code (characters 13-14 of the column name) is
# "06", i.e. metastatic (TM).
ref_genome_tumor_06 = list(
    filter(lambda sample_id: sample_id[13:15] == "06",
           df_genome.columns.tolist()[1:])
)
print(len(ref_genome_tumor_06))

7


In [45]:
# Sample columns whose type code (characters 13-14 of the column name) is
# "11", i.e. solid tissue normal (NT) - no tumor.
normal_tissue_code = "11"
genome_columns = df_genome.columns.tolist()[1:]
ref_genome_no_tumor = [col for col in genome_columns
                       if col[13:15] == normal_tissue_code]
print(len(ref_genome_no_tumor))

112


## New genome DataFrame

In [67]:
# Only the primary solid tumor samples (type code "01") are of interest, so
# the columns of the other sample types (11 and 06) are left out.
tumor_sample_columns = []
for column in df_genome.columns.tolist()[1:]:
    if column[13:15] == "01":
        tumor_sample_columns.append(column)

selected_columns = ["Hybridization_REF"] + tumor_sample_columns
# .iloc[1:] skips row 0, which only holds the "gene_id"/"normalized_count"
# sub-header; .copy() detaches the result from df_genome.
df_genome_new = df_genome[selected_columns].iloc[1:].copy()

In [68]:
# Renumber the rows from 0 (row 0 was dropped above); the old index is discarded.
df_genome_new = df_genome_new.reset_index(drop=True)

In [69]:
# Split every "Hybridization_REF" value of the form "<gene>|<gene_id>"
# (e.g. "?|100130426") into its two parts, one new column for each.
list_gene = []
list_gene_id = []
for reference in df_genome_new["Hybridization_REF"].tolist():
    parts = reference.split("|")
    list_gene.append(parts[0])
    list_gene_id.append(parts[1])

df_genome_new["gene"] = list_gene
df_genome_new["gene_id"] = list_gene_id

In [70]:
# Move the last two columns to the front and drop the original combined
# "Hybridization_REF" column.
# NOTE(review): the last two columns are expected to be "gene" and "gene_id",
# which holds only when the previous cell has just been run - confirm.
cols = df_genome_new.columns.tolist()
reordered = [cols[-2], cols[-1]] + cols[:-2]
df_genome_new = df_genome_new[reordered].drop("Hybridization_REF", axis=1)

In [71]:
# Preview the reshaped genome table (gene, gene_id, then one column per tumor sample).
df_genome_new.head()

Unnamed: 0,gene,gene_id,TCGA-3C-AAAU-01A-11R-A41B-07,TCGA-3C-AALI-01A-11R-A41B-07,TCGA-3C-AALJ-01A-31R-A41B-07,TCGA-3C-AALK-01A-11R-A41B-07,TCGA-4H-AAAK-01A-12R-A41B-07,TCGA-5L-AAT0-01A-12R-A41B-07,TCGA-5L-AAT1-01A-12R-A41B-07,TCGA-5T-A9QA-01A-11R-A41B-07,...,TCGA-UL-AAZ6-01A-11R-A41B-07,TCGA-UU-A93S-01A-21R-A41B-07,TCGA-V7-A7HQ-01A-11R-A33J-07,TCGA-W8-A86G-01A-21R-A36F-07,TCGA-WT-AB41-01A-11R-A41B-07,TCGA-WT-AB44-01A-11R-A41B-07,TCGA-XX-A899-01A-11R-A36F-07,TCGA-XX-A89A-01A-11R-A36F-07,TCGA-Z7-A8R5-01A-42R-A41B-07,TCGA-Z7-A8R6-01A-11R-A41B-07
0,?,100130426,0.0,0.0,0.9066,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,?,100133144,16.3644,9.2659,11.6228,12.0894,6.8468,3.9889,0.0,1.4644,...,0.3992,4.3126,0.0,5.5624,0.0,0.0,14.3858,22.324,2.2638,6.8865
2,?,100134869,12.9316,17.379,9.2294,11.0799,14.4298,13.609,10.5949,8.9958,...,14.372,10.8828,3.0792,14.3711,6.3091,3.258,21.4409,27.2744,7.2933,24.7795
3,?,10357,52.1503,69.7553,154.2974,143.8643,84.2128,114.2572,115.9984,107.5628,...,135.6241,136.1288,29.9974,128.3151,53.6278,42.2643,137.7756,64.1427,85.0461,167.5511
4,?,10431,408.076,563.8934,1360.8341,865.5358,766.383,807.7431,1108.3945,1420.5021,...,1570.1445,2886.3965,1721.8816,697.6744,1245.2681,1877.418,652.7559,722.7208,1140.2801,1003.5668


In [88]:
# Save the tumor-only genome table to the working directory, without the row index.
df_genome_new.to_csv(
    path_or_buf="genome_data.csv",
    sep=",",
    index=False,
)

## New clinical DataFrame

In [73]:
# Add a "reference" column holding only the 12-character patient label
# taken from the start of the demographic submitter id.
df_clin["reference"] = df_clin["submitter_id_demo"].astype(str).str[0:12]

In [74]:
# Clinical references to keep: every reference that is not in the removal
# list, i.e. that also appears in the distance matrix.
excluded_references = set(ref_clinical_to_remove)
ref_clinical_tumor = []
for reference in ref_clinical:
    if reference not in excluded_references:
        ref_clinical_tumor.append(reference)

In [75]:
# Keep the clinical rows whose reference is in the retained list and renumber them from 0.
keep_mask = df_clin["reference"].isin(ref_clinical_tumor)
df_clin_new = df_clin[keep_mask].reset_index(drop=True)

In [77]:
# Preview the filtered clinical table, now including the "reference" column.
df_clin_new.head()

Unnamed: 0,case_id,demographic_id,ethnicity,gender,race,submitter_id_demo,updated_datetime_demo,year_of_birth,year_of_death,age_at_diagnosis,...,submitter_id_diag,tissue_or_organ_of_origin,tumor_grade,tumor_stage,updated_datetime_diag,vital_status,exposure_id,submitter_id,updated_datetime,reference
0,b205c89f-af62-4186-acad-ed23d243fa98,55ae5a3b-e509-5fcd-98c9-c5007b448890,not hispanic or latino,female,white,TCGA-A2-A0YL_demographic,2016-09-02T19:00:43.039678-05:00,1962.0,,17702.0,...,TCGA-A2-A0YL_diagnosis,c50.9,not reported,stage iiia,2016-09-02T19:00:43.039678-05:00,alive,23b6a296-e149-5801-aa20-e144ef683abe,TCGA-A2-A0YL_exposure,2016-09-02T19:00:43.039678-05:00,TCGA-A2-A0YL
1,70f34c5c-3671-44c8-9469-99f9786efec1,498e27fb-2201-524a-bdd1-86f4e53dc54b,not hispanic or latino,female,white,TCGA-D8-A1XR_demographic,2016-09-02T19:07:03.114741-05:00,1954.0,,20488.0,...,TCGA-D8-A1XR_diagnosis,c50.9,not reported,stage iib,2016-09-02T19:07:03.114741-05:00,alive,6dfa11dc-4266-57c9-b746-02a6c7bb9d23,TCGA-D8-A1XR_exposure,2016-09-02T19:07:03.114741-05:00,TCGA-D8-A1XR
2,8986a141-eae7-4157-b695-02cc6fc3b071,8faa57d4-1199-53ce-a0ce-24d358841153,not hispanic or latino,female,white,TCGA-BH-A1ET_demographic,2016-09-02T18:57:52.265307-05:00,1944.0,2005.0,20425.0,...,TCGA-BH-A1ET_diagnosis,c50.9,not reported,stage i,2016-09-02T18:57:52.265307-05:00,dead,e72e6b02-58be-539a-af7f-f2714f14899a,TCGA-BH-A1ET_exposure,2016-09-02T18:57:52.265307-05:00,TCGA-BH-A1ET
3,7317e605-93ba-49a9-8743-20613b02767a,4394b44c-b51d-5b96-b62b-db7f49007b43,not hispanic or latino,female,white,TCGA-E2-A1LE_demographic,2016-09-02T19:09:17.163541-05:00,1937.0,2010.0,26239.0,...,TCGA-E2-A1LE_diagnosis,c50.9,not reported,stage iiic,2016-09-02T19:09:17.163541-05:00,dead,4c456a08-ffda-507f-a3e3-d273e5c49864,TCGA-E2-A1LE_exposure,2016-09-02T19:09:17.163541-05:00,TCGA-E2-A1LE
4,7e1673f8-5758-4963-8804-d5e39f06205b,eebe9198-74c8-5996-bf88-c56d5fd6832f,not hispanic or latino,female,asian,TCGA-C8-A12V_demographic,2016-09-02T19:02:30.147198-05:00,1955.0,,20346.0,...,TCGA-C8-A12V_diagnosis,c50.9,not reported,stage iia,2016-09-02T19:02:30.147198-05:00,alive,dd5fbf53-64a4-5bc0-8ca7-ae05abbcbbac,TCGA-C8-A12V_exposure,2016-09-02T19:02:30.147198-05:00,TCGA-C8-A12V


In [78]:
# Save the filtered clinical table to the working directory, without the row index.
df_clin_new.to_csv(
    path_or_buf="clinical_data.csv",
    sep=",",
    index=False,
)

## New distances matrix DataFrame

In [79]:
# Work on an independent copy so the matrix as loaded stays untouched.
df_distances_new = df_distances.copy(deep=True)

In [80]:
# Map every original label of the distance matrix (e.g. "TCGA.3C.AAAU.0.Dis")
# to its 12-character patient reference ("TCGA-3C-AAAU").
# Each label is converted exactly once; the converted list is no longer
# rebuilt for every column.
columns_dict = {}
for key in df_distances.columns.tolist():
    columns_dict[key] = key.replace(".", "-")[0:12]

In [83]:
# Use the same "TCGA-XX-XXXX" label format as the other datasets for both
# the column headers and the row index of the distance matrix.
df_distances_new = df_distances_new.rename(
    columns=columns_dict,
    index=columns_dict,
)

In [84]:
# Preview the distance matrix with the renamed row and column labels.
df_distances_new.head()

Unnamed: 0,TCGA-3C-AAAU,TCGA-3C-AALI,TCGA-3C-AALJ,TCGA-3C-AALK,TCGA-4H-AAAK,TCGA-5L-AAT0,TCGA-5L-AAT1,TCGA-5T-A9QA,TCGA-A1-A0SB,TCGA-A1-A0SD,...,TCGA-UL-AAZ6,TCGA-UU-A93S,TCGA-V7-A7HQ,TCGA-W8-A86G,TCGA-WT-AB41,TCGA-WT-AB44,TCGA-XX-A899,TCGA-XX-A89A,TCGA-Z7-A8R5,TCGA-Z7-A8R6
TCGA-3C-AAAU,0.0,0.12088,0.105801,0.10954,0.118014,0.11602,0.138326,0.139012,0.16487,0.109789,...,0.130849,0.177643,0.202318,0.118387,0.20481,0.17297,0.131472,0.147361,0.160945,0.102312
TCGA-3C-AALI,0.12088,0.0,0.086734,0.086734,0.10524,0.102997,0.095894,0.142688,0.183251,0.1144,...,0.117577,0.143124,0.172721,0.125615,0.158141,0.134712,0.116518,0.119447,0.129105,0.101502
TCGA-3C-AALJ,0.105801,0.086734,0.0,0.072653,0.091719,0.082123,0.096455,0.141006,0.181756,0.102686,...,0.108979,0.147735,0.151162,0.113216,0.138139,0.120631,0.100941,0.103558,0.112343,0.083557
TCGA-3C-AALK,0.10954,0.086734,0.072653,0.0,0.04075,0.040376,0.062932,0.144121,0.129292,0.05396,...,0.117827,0.158452,0.146738,0.06287,0.165742,0.102312,0.060814,0.0734,0.08013,0.070721
TCGA-4H-AAAK,0.118014,0.10524,0.091719,0.04075,0.0,0.037074,0.065612,0.152471,0.113901,0.054209,...,0.14356,0.180884,0.172347,0.0653,0.1878,0.112343,0.059381,0.069163,0.097015,0.078198


In [86]:
# Save the relabelled distance matrix; the index is written because it holds the row labels.
df_distances_new.to_csv(
    path_or_buf="distances_matrix.csv",
    sep=",",
    index=True,
)