In [1]:
import pandas as pd
import numpy as np
import csv
import os

# Filter Samples in Cancer Variant Datasets

This notebook uses the triple-negative breast cancer variant datasets published by Staaf et al., "Whole-genome sequencing of triple-negative breast cancers in a population-based clinical study" (*Nature Medicine*, 2019).

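Because the full datasets are large, we extract the rows belonging to a few samples of interest and write them out as separate, smaller files (one file per variant type and sample).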

In [2]:
# Locations of the input variant tables and of the per-sample output folder.
# All of them live under the same dataset directory.
_variant_dir = os.path.join('data', 'cancer-variant')

path_rea = os.path.join(_variant_dir, 'rearrangement.tsv')  # structural rearrangements
path_sub = os.path.join(_variant_dir, 'substitution.tsv')   # substitutions
path_ind = os.path.join(_variant_dir, 'indel.tsv')          # insertions / deletions
o_path = os.path.join(_variant_dir, 'output')               # destination for filtered files

In [42]:
# Load the three tab-separated variant tables.
#
# `sep` is passed by keyword: positional arguments after the file path were
# deprecated in pandas 1.x and are rejected with a TypeError from pandas 2.0.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so a column cannot end up holding a mix of ints
# and strings. NOTE(review): added on the assumption that the warnings this
# cell used to print were mixed-type DtypeWarnings; it uses more memory while
# parsing -- switch to an explicit `dtype=` mapping if that becomes a problem.
rea = pd.read_csv(path_rea, sep='\t', low_memory=False)
sub = pd.read_csv(path_sub, sep='\t', low_memory=False)
ind = pd.read_csv(path_ind, sep='\t', low_memory=False)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [43]:
# Sample IDs whose variants are exported to their own files further down.
sample_of_interest = ['PD35957a', 'PD36000a', 'PD31042a', 'PD35930a']

In [44]:
# Preview the first five rows of the rearrangement table.
rea.head(n=5)

Unnamed: 0,chr1,start1,end1,chr2,start2,end2,id/name,brass_score,strand1,strand2,...,gene2,gene_id2,transcript_id2,strand2.1,phase2,region2,region_number2,total_region_count2,first/last2,fusion_flag
0,1,37923244,37923245,1,37941652,37941653,1,17,+,+,...,ZC3H12A,ZC3H12A,ENST00000373087,1,2,intron,2,6,_,0
1,1,226320683,226320686,6,43549502,43549505,2,8,-,+,...,POLH,POLH,ENST00000372236,1,_,5UTRintron,1,11,_,0
2,10,3816588,3816589,10,3855075,3855076,66,13,-,-,...,_,_,_,_,_,_,_,_,_,0
3,10,47412183,47412184,10,47413650,47413651,67,5,-,+,...,_,_,_,_,_,_,_,_,_,0
4,10,60816926,60816929,10,60837207,60837210,69,74,-,+,...,_,_,_,_,_,_,_,_,_,0


In [45]:
def add_position_diff(variants):
    """Return a copy of ``variants`` with ``ChromDiff`` and ``PosDiff`` columns.

    ``ChromDiff`` is 1 when a row has the same ``Chrom`` value as the row
    directly above it and 0 otherwise (so it is 0 on the first row).
    ``PosDiff`` is the difference between a row's ``Pos`` and the previous
    row's ``Pos``, multiplied by ``ChromDiff`` so that it is zeroed wherever
    the chromosome changes; it stays NaN on the first row.

    Parameters
    ----------
    variants : pandas.DataFrame
        Table with ``Chrom`` and ``Pos`` columns. Differences follow the
        existing row order.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    annotated = variants.copy()

    # True where the chromosome differs from the previous row (incl. row 0,
    # whose shifted value is missing).
    chrom_changed = annotated['Chrom'].shift() != annotated['Chrom']
    annotated['ChromDiff'] = (~chrom_changed).astype('int64')

    # Multiplying (rather than masking) keeps the first row NaN.
    annotated['PosDiff'] = annotated['Pos'].diff() * annotated['ChromDiff']

    # NOTE(review): rows are compared in file order without looking at the
    # Sample column, so if one sample ends and the next begins on the same
    # chromosome the PosDiff at that boundary spans two samples -- confirm
    # whether that can occur in these files.
    return annotated


# Annotate both tables here. The indel preview and the exported indel files
# rely on ChromDiff/PosDiff being present on `ind` as well, so computing them
# only for `sub` would leave `ind` without these columns on a fresh kernel.
sub = add_position_diff(sub)
ind = add_position_diff(ind)

sub.head()

Unnamed: 0,AnalysisProc,Sample,Normal,VariantID,Chrom,Pos,Ref,Alt,Qual,Filter,...,FLG-SR,FLG-CR,FLG-PH,FLG-HSD,FLG-GI,FLG-VUM,FLG-SE,FLG-MNP,ChromDiff,PosDiff
0,4316782,PD31028a,PD31028b,27d49146-3158-11e7-8c2b-087e828a5427,1,1346901,A,T,.,PASS,...,1,1,1,1,1,1,1,1,0,
1,4316782,PD31028a,PD31028b,27d4c9b8-3158-11e7-8c2b-087e828a5427,1,2209891,G,A,.,PASS,...,1,1,1,1,1,1,1,1,1,862990.0
2,4316782,PD31028a,PD31028b,27d4cec2-3158-11e7-8c2b-087e828a5427,1,2269113,C,T,.,PASS,...,1,1,1,1,1,1,1,1,1,59222.0
3,4316782,PD31028a,PD31028b,27d5c0fc-3158-11e7-8c2b-087e828a5427,1,2692127,A,G,.,PASS,...,1,1,1,1,1,1,1,1,1,423014.0
4,4316782,PD31028a,PD31028b,27d5c264-3158-11e7-8c2b-087e828a5427,1,2717398,G,C,.,PASS,...,1,1,1,1,1,1,1,1,1,25271.0


In [38]:
# Preview the first five rows of the indel table.
ind.head(n=5)

Unnamed: 0,AnalysisProc,Sample,Normal,VariantID,Chrom,Pos,Ref,Alt,Qual,Filter,...,FLG-F010,FLG-F012,FLG-F018,FLG-F015,FLG-F016,1000g2015aug_all,ExAC_ALL,avsnp147,ChromDiff,PosDiff
0,.,PD31028a,PD31028b,a306aae4-30cb-11e7-863c-f13f8c1b7381,1,5302163,TC,T,540,PASS,...,1,1,1,1,1,.,.,.,0,
1,.,PD31028a,PD31028b,35192464-30d3-11e7-863c-f13f8c1b7381,1,6171900,G,GGTAT,360,PASS,...,1,1,1,1,1,.,.,.,1,869737.0
2,.,PD31028a,PD31028b,3957e434-30d3-11e7-863c-f13f8c1b7381,1,6708224,C,CT,360,PASS,...,1,1,1,1,1,.,.,.,1,536324.0
3,.,PD31028a,PD31028b,d242660e-30cb-11e7-863c-f13f8c1b7381,1,10752948,CAATGAA,C,720,PASS,...,1,1,1,1,1,.,.,.,1,4044724.0
4,.,PD31028a,PD31028b,1662b2ee-30cc-11e7-863c-f13f8c1b7381,1,17532948,GC,G,537,PASS,...,1,1,1,1,1,.,.,.,1,6780000.0


In [46]:
# Write one CSV per (variant type, sample of interest) into `o_path`.

# to_csv does not create missing parent directories, so make sure the output
# folder exists before the first write.
os.makedirs(o_path, exist_ok=True)

# (file-name prefix, full table, name of the column holding the sample ID).
# The rearrangement table is filtered on a lower-case 'sample' column, the
# other two on 'Sample'.
tables_to_export = [
    ('rearrangement', rea, 'sample'),
    ('substitution', sub, 'Sample'),
    ('indel', ind, 'Sample'),
]

for sample in sample_of_interest:
    for prefix, table, sample_column in tables_to_export:
        sample_rows = table[table[sample_column] == sample]
        out_file = os.path.join(o_path, prefix + '.' + sample + '.csv')
        sample_rows.to_csv(out_file, index=False)