In [1]:
import numpy as np
import pandas as pd

In [3]:
# Read the GAVIN-annotated VCF and sort its lines into three buckets:
#   column_lines - the tab-separated names from the '#CHROM' header line
#   info         - every '##INFO' header line (leading '#' removed)
#   sample_lines - every data row whose first character is a digit
column_lines = []
info = []
sample_lines = []
# The context manager closes the file even if parsing raises; the original
# cell left the handle open for the lifetime of the kernel.
with open('data/gavin_input-gavin.vcf', 'r') as vcf_reader:
    for line in vcf_reader:
        if line.startswith('#CHROM'):
            column_lines = line.strip('\n').strip('#').split('\t')
        elif line.startswith('##INFO'):
            info.append(line.strip('\n').strip('#'))
        elif line[0].isdigit():
            # NOTE(review): rows whose first character is not a digit
            # (e.g. a chromosome named X/Y/MT) are skipped by this test -
            # confirm that this is intended for the input file.
            sample_lines.append(line.strip('\n'))

# parse the column names
# Drop the last '#CHROM' column; its content is expanded into the columns
# derived from the '##INFO' lines below.
column_lines = column_lines[:-1]

# One column name per '##INFO' line except the last: the text between the
# computed start offset and the first comma of the line.
# NOTE(review): str.find returns -1 when the pattern is absent. If the header
# lines have the usual VCF form 'INFO=<ID=NAME,...', then 'INFO=ID=' is never
# found and the start offset evaluates to -1 + 8 + 2 = 9, which happens to
# equal len('INFO=<ID='). The slice is then correct only by that coincidence -
# confirm against the input file before changing the offsets.
info_contents = [
    line[line.find('INFO=ID=') + len('INFO=ID=') + 2:line.find(',')]
    for line in info[:-1]
]

# The last '##INFO' line carries a '|'-separated list of annotation sub-field
# names, which become the remaining columns.
# NOTE(review): the '+2' skips two extra characters after the marker and the
# '-3' removes the last three characters of the line. The first name produced
# in this notebook is 't_Allele', so the offsets may be clipping the first
# field name - TODO confirm against the header text.
annotation_header = info[-1]
info_last = annotation_header[
    annotation_header.find("annotations: '") + len("annotations: '") + 2:-3
].split('|')

columns = column_lines + info_contents + info_last
columns = [col.strip(' ') for col in columns]

In [4]:
# Display the assembled column names: the '#CHROM' header fields (minus the
# last one), then one name per '##INFO' line, then the annotation sub-fields.
columns

['CHROM',
 'POS',
 'ID',
 'REF',
 'ALT',
 'QUAL',
 'FILTER',
 'CADD',
 'CADD_SCALED',
 'EXAC_AF',
 'EXAC_AC_HOM',
 'EXAC_AC_HET',
 't_Allele',
 'Gene_Name',
 'Annotation',
 'Putative_impact',
 'Gene_ID',
 'Feature_type',
 'Feature_ID',
 'Transcript_biotype',
 'Rank_total',
 'HGVS_c',
 'HGVS_p',
 'cDNA_position',
 'CDS_position',
 'Protein_position',
 'Distance_to_feature',
 'Errors',
 'Classification',
 'Confidence',
 'Reason']

In [5]:
def cut_str(string,begin,end):
    '''
    Cherry-pick the text that sits between two markers of a string.

    The result starts right after the first occurrence of `begin` and stops
    right before the next occurrence of `end`.

    Parameters
    ----------
    string : str
        Text to search, e.g. a VCF INFO field such as 'CADD=1.88;EXAC_AF=0.1;'.
    begin : str
        Marker preceding the wanted value, e.g. 'CADD='.
    end : str
        Marker following the wanted value, e.g. ';'.

    Returns
    -------
    str
        The text between the markers; '' when `begin` does not occur in
        `string`; everything after `begin` when `end` is empty or does not
        occur after `begin`.
    '''
    begin_ind = string.find(begin)
    if begin_ind == -1:
        return ''
    cut = string[begin_ind + len(begin):]
    if not end:
        return cut
    end_ind = cut.find(end)
    # A missing terminator means the value runs to the end of the string.
    # The previous arithmetic (find() + len(end) - 1) turned the -1 sentinel
    # into a slice bound that dropped the last character, and it leaked part
    # of the delimiter into the result when `end` was longer than one
    # character.
    if end_ind == -1:
        return cut
    return cut[:end_ind]

In [6]:
# NOTE: disabled earlier draft of the sample parser, superseded by the
# "parse the samples" cell that follows. Two things to be aware of before
# re-enabling it: when the line has no comma, `var` is a plain string, so
# `for gene in var` walks it character by character; and `var_gene` /
# `var_gene_infos` are only assigned inside the 'MYO5B' branch.
# samples = []
# for line in sample_lines:
#     var_info = line.split('\t')[:-1]
#     if ',' in line:
#         var = line.split('\t')[-1].split(',')
#     else:
#         var=line.split('\t')[-1]
#     for gene in var:
#         if 'MYO5B' in gene:
#             var_gene = []
#             var_gene_infos = gene.split(';')[-1].split('|')
#             var_gene.append(cut_str(gene,'CADD=',';'))
#             var_gene.append(cut_str(gene,'CADD_SCALED=',';'))
#             var_gene.append(cut_str(gene,'EXAC_AF=',';'))
#             var_gene.append(cut_str(gene,'EXAC_AC_HOM=',';'))
#             var_gene.append(cut_str(gene,'EXAC_AC_HET=',';'))
#     samples.append(var_info+var_gene+var_gene_infos)
# for sample in samples:
#     assert len(sample)==31

In [7]:
# parse the samples
# Each data row becomes one flat record:
#   the leading tab-separated fields (everything but the last field)
#   + five values pulled out of the last field with cut_str
#   + the '|'-separated annotation sub-fields of the MYO5B entry.
samples = []
for line in sample_lines:
    line = line.strip(' ')
    fields = line.split('\t')
    var_info = fields[:-1]
    info_field = fields[-1]

    # Reset per row. Previously these names kept the values of the previous
    # row whenever a comma-containing line had no 'MYO5B' entry, so that row
    # silently received another variant's annotation (or a NameError was
    # raised if it happened on the very first row).
    myo_info = None
    var_gene_infos = None

    if ',' in line:
        # Comma-separated entries: keep the one mentioning MYO5B (the last
        # match wins, as before); its annotation is the last ';' field.
        genes = info_field.split(',')
        for gene in genes:
            if 'MYO5B' in gene:
                myo_info = gene
                var_gene_infos = myo_info.split(';')[-1].split('|')
        if myo_info is None:
            raise ValueError(
                "no 'MYO5B' entry found for variant "
                + ':'.join(var_info[:2])
            )
    else:
        # Single entry: the annotation is the second-to-last ';' field.
        myo_info = info_field
        var_gene_infos = myo_info.split(';')[-2].split('|')

    var_gene = [
        cut_str(myo_info, key, ';')
        for key in ('CADD=', 'CADD_SCALED=', 'EXAC_AF=',
                    'EXAC_AC_HOM=', 'EXAC_AC_HET=')
    ]
    samples.append(var_info + var_gene + var_gene_infos)

# Every record must have 31 values, one per entry of `columns`, otherwise the
# DataFrame construction in the next cell cannot line the values up.
for sample in samples:
    assert len(sample)==31

In [8]:
# Tabulate the parsed records: one row per variant, one column per header name.
datatable = pd.DataFrame(data=samples, columns=columns)

In [9]:
# Preview the table with empty strings shown as NaN.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE: DataFrame.replace returns a new frame; the result is only displayed
# here, so `datatable` itself still holds the empty strings afterwards.
datatable.replace('', np.nan)

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,CADD,CADD_SCALED,EXAC_AF,...,HGVS_c,HGVS_p,cDNA_position,CDS_position,Protein_position,Distance_to_feature,Errors,Classification,Confidence,Reason
0,14,23882063,.,C,T,.,.,1.880579,15.47,,...,c.5808G>A,p.Ter1936Ter,5939/6055,5808/5808,1936/1935,,,Benign,calibrated,Variant CADD score of 15.47 is less than 15.62...
1,14,23882064,.,T,C,.,.,1.823769,15.13,,...,c.5807A>G,p.Ter1936Trpext*?,5938/6055,5807/5808,1936/1935,,,Benign,calibrated,Variant CADD score of 15.13 is less than 15.62...
2,14,23882082,.,T,C,.,.,4.4282580000000005,24.2,,...,c.5791-2A>G,,,,,,,Pathogenic,calibrated,Variant CADD score of 24.2 is greater than 22....
3,14,23882975,.,C,T,.,.,3.5186269999999995,23.1,8.237E-6,...,c.5783G>A,p.Gly1928Asp,5914/6055,5783/5808,1928/1935,,,Pathogenic,calibrated,Variant CADD score of 23.1 is greater than 22....
4,14,23882976,.,C,A,.,.,5.615639,26.6,,...,c.5782G>T,p.Gly1928Cys,5913/6055,5782/5808,1928/1935,,,Pathogenic,calibrated,Variant CADD score of 26.6 is greater than 22....
5,14,23882984,.,C,T,.,.,6.322065,29.2,8.237E-6,...,c.5774G>A,p.Arg1925His,5905/6055,5774/5808,1925/1935,,,Pathogenic,calibrated,Variant CADD score of 29.2 is greater than 22....
6,14,23882985,.,G,A,.,.,8.175613,35.0,8.237E-6,...,c.5773C>T,p.Arg1925Cys,5904/6055,5773/5808,1925/1935,,,Pathogenic,calibrated,Variant CADD score of 35.0 is greater than 22....
7,14,23882993,.,G,A,.,.,3.586021,23.2,8.236E-6,...,c.5765C>T,p.Ala1922Val,5896/6055,5765/5808,1922/1935,,,Pathogenic,calibrated,Variant CADD score of 23.2 is greater than 22....
8,14,23882997,.,G,A,.,.,6.560517999999999,31.0,8.237E-6,...,c.5761C>T,p.Arg1921Trp,5892/6055,5761/5808,1921/1935,,,Pathogenic,calibrated,Variant CADD score of 31.0 is greater than 22....
9,14,23883009,.,C,A,.,.,7.421627,34.0,,...,c.5749G>T,p.Val1917Phe,5880/6055,5749/5808,1917/1935,,,Pathogenic,calibrated,Variant CADD score of 34.0 is greater than 22....


In [10]:
# Convert the two CADD score columns from text to floating point.
for score_col in ('CADD', 'CADD_SCALED'):
    datatable[score_col] = datatable[score_col].astype('float')
# The remaining conversions are kept disabled, exactly as in the original cell.
# datatable['EXAC_AF'] = datatable['EXAC_AF'].astype('float')
# datatable['EXAC_AC_HOM'] = datatable['EXAC_AC_HOM'].astype('float')
# datatable['Distance_to_feature'] = datatable['Distance_to_feature'].astype('int')
# datatable['Errors'] = datatable['Errors'].astype('int')

In [11]:
# Write the table to disk as tab-separated text, with a header row and
# without the row index.
datatable.to_csv(
    'data/gavin_res.csv',
    sep='\t',
    header=True,
    index=False,
)

In [12]:
# Show the converted CADD column.
datatable['CADD']

0      1.880579
1      1.823769
2      4.428258
3      3.518627
4      5.615639
5      6.322065
6      8.175613
7      3.586021
8      6.560518
9      7.421627
10     3.300168
11     7.559069
12     6.566591
13     7.710235
14     7.938670
15     8.266119
16     7.408993
17     6.666103
18     8.245451
19     5.867249
20     6.925272
21     2.354094
22     7.939234
23     8.357856
24     5.127199
25     6.718649
26     6.177906
27     6.721912
28     6.701846
29     1.823764
         ...   
820    3.871197
821    4.486533
822    5.209410
823    2.130051
824    4.304563
825    3.383733
826    2.694014
827    6.136413
828    2.922024
829    1.040996
830    1.760142
831    5.563407
832    2.010589
833    4.707939
834    4.868483
835    3.990305
836   -1.547127
837    2.169108
838    3.204917
839    6.421265
840    6.160776
841    4.774503
842    5.255416
843    7.612844
844    4.636164
845    7.042627
846    5.493546
847    0.958323
848    0.824727
849    0.607012
Name: CADD, Length: 850,