### TASK: Extract accepted mutations from a VCF file based on positions in corresponding BED file.

In [2]:
import allel
print("allel version",allel.__version__)
import pandas
print("pandas version", pandas.__version__)

allel version 1.2.1
pandas version 1.0.1


## Import Data
### Import VCF file

In [3]:
vcf_df = allel.vcf_to_dataframe('data/mutect_immediate.vcf', fields='*', alt_number=1)
vcf_df.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,DB,MQ0,SOMATIC,VT,FILTER_PASS,FILTER_REJECT,numalt,altlen,is_snp
0,chr1,802571,.,C,T,,False,-1,True,SNP,True,False,1,0,True
1,chr1,5164391,.,T,C,,False,-1,False,,False,True,1,0,True
2,chr1,5164400,.,A,C,,False,-1,False,,False,True,1,0,True
3,chr1,5164424,.,T,C,,False,-1,False,,False,True,1,0,True
4,chr1,5164432,.,T,C,,False,-1,False,,False,True,1,0,True


### Filter based on FILTER tag as True

In [4]:
vcf_pass = vcf_df[vcf_df.FILTER_PASS == True]
vcf_pass.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,DB,MQ0,SOMATIC,VT,FILTER_PASS,FILTER_REJECT,numalt,altlen,is_snp
0,chr1,802571,.,C,T,,False,-1,True,SNP,True,False,1,0,True
9,chr1,38529840,.,G,C,,False,-1,True,SNP,True,False,1,0,True
10,chr1,38529844,.,G,T,,False,-1,True,SNP,True,False,1,0,True
29,chr1,51993005,.,A,G,,False,-1,True,SNP,True,False,1,0,True
30,chr1,51993024,.,T,G,,False,-1,True,SNP,True,False,1,0,True


### Import BED file

In [5]:
bed_data = pandas.read_csv('data/truseq.bed', sep='\t', names=["chr", "start", "end"])
bed_data.head()

Unnamed: 0,chr,start,end
0,chr1,43815005,43815137
1,chr1,115256525,115256653
2,chr1,115258727,115258855
3,chr10,43609073,43609201
4,chr10,43609929,43610049


## Filter mutations based on position macthing with BED file

In [6]:
new_vcf = pandas.DataFrame()
for index, mutation in vcf_pass.iterrows():
    for index, ranges in bed_data.iterrows():
        if mutation['CHROM'] == ranges['chr'] and mutation['POS'] > ranges['start'] and mutation['POS'] < ranges['end']:
           new_vcf = new_vcf.append(mutation)
           new_vcf = new_vcf[vcf_pass.columns]
           #print(mutation)
           #new_vcf = new_vcf.reindex(mutation.columns, axis=1)
new_vcf.head()

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,DB,MQ0,SOMATIC,VT,FILTER_PASS,FILTER_REJECT,numalt,altlen,is_snp
241,chr2,209113192.0,.,G,A,,0.0,-1.0,1.0,SNP,1.0,0.0,1.0,0.0,1.0
411,chr2,212812156.0,.,T,C,,0.0,-1.0,1.0,SNP,1.0,0.0,1.0,0.0,1.0
441,chr3,10183794.0,.,G,T,,0.0,-1.0,1.0,SNP,1.0,0.0,1.0,0.0,1.0
594,chr3,178921639.0,.,C,A,,0.0,-1.0,1.0,SNP,1.0,0.0,1.0,0.0,1.0
755,chr4,1807894.0,.,G,A,,0.0,-1.0,1.0,SNP,1.0,0.0,1.0,0.0,1.0


## Number of Mutations

In [7]:
print("Total Number of Mutations before filtering:", len(vcf_df.index))
print("Total Number of Mutations after pass tag filtering:", len(vcf_pass.index))
print("Total Number of Accepted Mutations:", len(new_vcf.index))

Total Number of Mutations before filtering: 4775
Total Number of Mutations after pass tag filtering: 141
Total Number of Accepted Mutations: 23


## Export results into Excel

In [8]:
new_vcf.to_excel("accepted_mutations.xlsx",index=False)