In [1]:
import pandas as pd
import pybedtools
import pickle

In [2]:
# Load the METABRIC copy-number segment table (tab-separated) and store the
# segment end coordinate ('loc.end') as an integer column.
metabric_cna = pd.read_csv('CNA_METABRIC.tsv', sep='\t').astype({'loc.end': int})

# Preview the first rows, then report the (rows, columns) shape.
display(metabric_cna.head(), metabric_cna.shape)

Unnamed: 0,SampleID,chrom,loc.start,loc.end,num.mark,seg.mean
0,MB-0135,1,51599,3070800,725,0.059265
1,MB-0135,1,3079347,20956319,11269,-0.363605
2,MB-0135,1,20956825,20958696,2,-2.123863
3,MB-0135,1,20962780,25465716,2649,-0.363605
4,MB-0135,1,25468469,25534089,30,0.059265


(1901383, 6)

In [3]:
# Load the TCGA somatic copy-number segment table (tab-separated).
tcga_somatic_cna = pd.read_csv('CNA_TCGA_somatic.tsv', sep='\t')

# Keep only rows whose fourth '-'-separated barcode field starts with '01'
# (presumably the TCGA primary-tumour sample type -- TODO confirm).
primary_mask = tcga_somatic_cna['Sample'].apply(lambda barcode: barcode.split('-')[3].startswith('01'))

# .copy() detaches the filtered rows from the raw table so that the column
# assignment below writes to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
tcga_somatic_cna = tcga_somatic_cna.loc[primary_mask].copy()

# Shorten every barcode to its first four fields (e.g. 'TCGA-3C-AAAU-01A').
tcga_somatic_cna['Sample'] = tcga_somatic_cna['Sample'].apply(lambda barcode: '-'.join(barcode.split('-')[:4]))

# Preview the first rows, then report the (rows, columns) shape.
display(tcga_somatic_cna.head())
display(tcga_somatic_cna.shape)

Unnamed: 0,Sample,Chromosome,Start,End,Num_Probes,Segment_Mean
67,TCGA-3C-AAAU-01A,1,3208470,63242091,33372,0.1791
68,TCGA-3C-AAAU-01A,1,63244080,63244691,3,-0.8257
69,TCGA-3C-AAAU-01A,1,63245456,85405184,13598,0.2994
70,TCGA-3C-AAAU-01A,1,85408001,85650724,146,0.6498
71,TCGA-3C-AAAU-01A,1,85653866,148157157,20622,0.2781


(259412, 6)

In [4]:
# Load the TCGA "all" copy-number segment table (tab-separated).
tcga_all_cna = pd.read_csv('CNA_TCGA_all.tsv', sep='\t')

# Keep only rows whose fourth '-'-separated barcode field starts with '01'
# (presumably the TCGA primary-tumour sample type -- TODO confirm).
primary_mask = tcga_all_cna['Sample'].apply(lambda barcode: barcode.split('-')[3].startswith('01'))

# .copy() detaches the filtered rows from the raw table so that the column
# assignment below writes to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
tcga_all_cna = tcga_all_cna.loc[primary_mask].copy()

# Shorten every barcode to its first four fields (e.g. 'TCGA-3C-AAAU-01A').
tcga_all_cna['Sample'] = tcga_all_cna['Sample'].apply(lambda barcode: '-'.join(barcode.split('-')[:4]))

# Preview the first rows, then report the (rows, columns) shape.
display(tcga_all_cna.head())
display(tcga_all_cna.shape)

Unnamed: 0,Sample,Chromosome,Start,End,Num_Probes,Segment_Mean
465,TCGA-3C-AAAU-01A,1,51598,1500664,226,0.0814
466,TCGA-3C-AAAU-01A,1,1617778,1662463,17,-0.4123
467,TCGA-3C-AAAU-01A,1,1677447,16022502,8176,0.1465
468,TCGA-3C-AAAU-01A,1,16026084,16026512,6,-2.1915
469,TCGA-3C-AAAU-01A,1,16026788,16027597,4,-0.8068


(674714, 6)

In [5]:
# Load the METABRIC clinical annotation table (tab-separated).
metabric_clinical = pd.read_csv('clinical_METABRIC.tsv', sep='\t')

# Restrict to samples annotated ER.Expr == '+' and Her2.Expr == '-'.
er_pos_her2_neg = (metabric_clinical['ER.Expr'] == '+') & (metabric_clinical['Her2.Expr'] == '-')
metabric_clinical = metabric_clinical.loc[er_pos_her2_neg]

# Keep only the selected iC10 labels. Series.isin is the vectorised membership
# test and always yields a boolean mask, even when no rows remain.
metabric_clinical = metabric_clinical[metabric_clinical['iC10'].isin(['3', '4ER+', '7', '8', '1', '2', '6', '9'])]

# Only the sample identifier is needed downstream, as the merge key.
metabric_clinical = metabric_clinical[['METABRIC.ID']]

# Preview the first rows, then report the (rows, columns) shape.
display(metabric_clinical.head())
display(metabric_clinical.shape)

Unnamed: 0,METABRIC.ID
0,MB-0002
1,MB-0005
2,MB-0006
3,MB-0010
4,MB-0014


(1360, 1)

In [6]:
# Load the TCGA clinical annotation table (tab-separated).
tcga_clinical = pd.read_csv('clinical_TCGA.tsv', sep='\t')

# Keep only rows whose fourth '-'-separated barcode field starts with '01'
# (presumably the TCGA primary-tumour sample type -- TODO confirm).
primary_mask = tcga_clinical['ID'].apply(lambda barcode: barcode.split('-')[3].startswith('01'))

# .copy() detaches the filtered rows from the raw table so that the column
# assignment below writes to an independent frame instead of triggering
# pandas' SettingWithCopyWarning.
tcga_clinical = tcga_clinical.loc[primary_mask].copy()

# Shorten every barcode to its first four fields (e.g. 'TCGA-AN-A0XW-01A').
tcga_clinical['ID'] = tcga_clinical['ID'].apply(lambda barcode: '-'.join(barcode.split('-')[:4]))

# Restrict to samples annotated ER == 1 and HER2 == 0.
tcga_clinical = tcga_clinical.loc[(tcga_clinical['ER'] == 1) & (tcga_clinical['HER2'] == 0)]

# Keep only the selected ic10 labels. Series.isin is the vectorised membership
# test and always yields a boolean mask, even when no rows remain.
tcga_clinical = tcga_clinical[tcga_clinical['ic10'].isin([3, 4, 7, 8, 1, 2, 6, 9])]

# Only the sample identifier is needed downstream, as the merge key.
tcga_clinical = tcga_clinical[['ID']]

# Preview and shape before de-duplication.
display(tcga_clinical.head())
display(tcga_clinical.shape)

# Drop repeated identifiers so the later merges cannot multiply segment rows.
# Rebinding the name (instead of inplace=True) avoids mutating a derived slice.
tcga_clinical = tcga_clinical.drop_duplicates()
display(tcga_clinical.shape)

Unnamed: 0,ID
0,TCGA-AN-A0XW-01A
1,TCGA-BH-A0DX-01A
3,TCGA-A2-A0CT-01A
4,TCGA-BH-A1F8-01A
5,TCGA-A8-A06Z-01A


(732, 1)

(729, 1)

In [7]:
# Inner-join the METABRIC segments onto the selected clinical sample IDs, then
# discard the now-redundant key column contributed by the clinical table.
# NOTE(review): the clinical IDs are not de-duplicated before this join --
# confirm 'METABRIC.ID' is unique, otherwise segment rows would be repeated.
metabric_joined = pd.merge(
    metabric_cna,
    metabric_clinical,
    how='inner',
    left_on='SampleID',
    right_on="METABRIC.ID",
).drop(columns='METABRIC.ID')

# Positional rename to the shared segment-column vocabulary.
metabric_joined.columns = ['sample', 'chrom', 'start', 'end', 'probes', 'cna']

# Preview the first rows, then report the (rows, columns) shape.
display(metabric_joined.head(), metabric_joined.shape)

Unnamed: 0,sample,chrom,start,end,probes,cna
0,MB-0135,1,51599,3070800,725,0.059265
1,MB-0135,1,3079347,20956319,11269,-0.363605
2,MB-0135,1,20956825,20958696,2,-2.123863
3,MB-0135,1,20962780,25465716,2649,-0.363605
4,MB-0135,1,25468469,25534089,30,0.059265


(1244072, 6)

In [8]:
# Inner-join the TCGA somatic segments onto the selected clinical sample IDs,
# then discard the now-redundant key column contributed by the clinical table.
tcga_somatic_joined = pd.merge(
    tcga_somatic_cna,
    tcga_clinical,
    how='inner',
    left_on='Sample',
    right_on="ID",
).drop(columns='ID')

# Positional rename to the shared segment-column vocabulary.
tcga_somatic_joined.columns = ['sample', 'chrom', 'start', 'end', 'probes', 'cna']

# Preview the first rows, then report the (rows, columns) shape.
display(tcga_somatic_joined.head(), tcga_somatic_joined.shape)

Unnamed: 0,sample,chrom,start,end,probes,cna
0,TCGA-3C-AAAU-01A,1,3208470,63242091,33372,0.1791
1,TCGA-3C-AAAU-01A,1,63244080,63244691,3,-0.8257
2,TCGA-3C-AAAU-01A,1,63245456,85405184,13598,0.2994
3,TCGA-3C-AAAU-01A,1,85408001,85650724,146,0.6498
4,TCGA-3C-AAAU-01A,1,85653866,148157157,20622,0.2781


(141630, 6)

In [9]:
# Inner-join the TCGA "all" segments onto the selected clinical sample IDs,
# then discard the now-redundant key column contributed by the clinical table.
tcga_all_joined = pd.merge(
    tcga_all_cna,
    tcga_clinical,
    how='inner',
    left_on='Sample',
    right_on="ID",
).drop(columns='ID')

# Positional rename to the shared segment-column vocabulary.
tcga_all_joined.columns = ['sample', 'chrom', 'start', 'end', 'probes', 'cna']

# Preview the first rows, then report the (rows, columns) shape.
display(tcga_all_joined.head(), tcga_all_joined.shape)

Unnamed: 0,sample,chrom,start,end,probes,cna
0,TCGA-3C-AAAU-01A,1,51598,1500664,226,0.0814
1,TCGA-3C-AAAU-01A,1,1617778,1662463,17,-0.4123
2,TCGA-3C-AAAU-01A,1,1677447,16022502,8176,0.1465
3,TCGA-3C-AAAU-01A,1,16026084,16026512,6,-2.1915
4,TCGA-3C-AAAU-01A,1,16026788,16027597,4,-0.8068


(409568, 6)

In [10]:
# Stack the METABRIC segments on top of the TCGA somatic segments and renumber
# the rows 0..n-1. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
somatic_merged = pd.concat([metabric_joined, tcga_somatic_joined], ignore_index=True)

# Preview the first rows, then report the (rows, columns) shape.
display(somatic_merged.head())
display(somatic_merged.shape)

Unnamed: 0,sample,chrom,start,end,probes,cna
0,MB-0135,1,51599,3070800,725,0.059265
1,MB-0135,1,3079347,20956319,11269,-0.363605
2,MB-0135,1,20956825,20958696,2,-2.123863
3,MB-0135,1,20962780,25465716,2649,-0.363605
4,MB-0135,1,25468469,25534089,30,0.059265


(1385702, 6)

In [11]:
# Stack the METABRIC segments on top of the TCGA "all" segments and renumber
# the rows 0..n-1. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
all_merged = pd.concat([metabric_joined, tcga_all_joined], ignore_index=True)

# Preview the first rows, then report the (rows, columns) shape.
display(all_merged.head())
display(all_merged.shape)

Unnamed: 0,sample,chrom,start,end,probes,cna
0,MB-0135,1,51599,3070800,725,0.059265
1,MB-0135,1,3079347,20956319,11269,-0.363605
2,MB-0135,1,20956825,20958696,2,-2.123863
3,MB-0135,1,20962780,25465716,2649,-0.363605
4,MB-0135,1,25468469,25534089,30,0.059265


(1653640, 6)

In [12]:
# Write every segment table to a tab-separated .seg file, omitting the
# DataFrame index. Files are written in the order listed.
seg_outputs = [
    ('metabric.seg', metabric_joined),
    ('tcga_somatic.seg', tcga_somatic_joined),
    ('tcga_all.seg', tcga_all_joined),
    ('merged_somatic.seg', somatic_merged),
    ('merged_all.seg', all_merged),
]
for seg_path, seg_frame in seg_outputs:
    seg_frame.to_csv(seg_path, sep='\t', index=False)