In [1]:
from collections import OrderedDict
from collections import Counter

# Column names of a VCF data line, in file order. parse() stores the first
# seven under these names and expands the INFO and FORMAT columns into
# individual keys.
VCF_HEADER = [
    'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER',
    'INFO', 'FORMAT',
]

def lines(filename):
    """Yield one parsed record per data line of a VCF file.

    Lines starting with '#' (meta and header lines) are skipped. Blank or
    whitespace-only lines are skipped as well; they would otherwise reach
    ``parse`` and fail there with an IndexError on the missing columns.

    Args:
        filename: Path of the VCF file to read.

    Yields:
        The value returned by ``parse`` for each remaining line.
    """
    with open(filename) as fh:
        for line in fh:
            if line.startswith('#') or not line.strip():
                continue
            yield parse(line)

# Parse one VCF data line into an ordered dictionary, expanding each INFO and FORMAT entry into its own key/value pair
def parse(line):
    """Parse one tab-separated VCF data line into an ``OrderedDict``.

    The first seven columns are stored under their ``VCF_HEADER`` names.
    Every ``key=value`` entry of the INFO column becomes its own key; an entry
    without ``=`` (a flag) is stored under ``INFO<n>``, where ``n`` is its
    1-based position within the INFO column. FORMAT keys are paired with the
    values of the first sample column only. Every value passes through
    ``get_value``.

    NOTE: everything lands in one flat dictionary, so a FORMAT key with the
    same name as an INFO key (or a fixed column) replaces the earlier value.

    Args:
        line: A single non-header line of a VCF file.

    Returns:
        OrderedDict mapping column / INFO / FORMAT names to their values.

    Raises:
        IndexError: If the line has fewer than eight tab-separated columns.
    """
    result = OrderedDict()
    fields = line.rstrip().split('\t')

    for i, col in enumerate(VCF_HEADER[:7]):
        result[col] = get_value(fields[i])

    for i, info in enumerate(fields[7].split(';'), 1):
        # Split on the first '=' only: a value may itself contain '=', and an
        # unbounded split would misfile such an entry as a flag.
        key, sep, value = info.partition('=')
        if not sep:
            key = 'INFO{}'.format(i)
            value = info
        result[key] = get_value(value)

    # The FORMAT and sample columns are optional (sites-only files omit them).
    if len(fields) > 8:
        formats_key = fields[8].split(':')
        formats_value = fields[9].split(':') if len(fields) > 9 else []
        for i, key in enumerate(formats_key):
            # Trailing sample sub-fields may be left out; treat them as missing.
            raw = formats_value[i] if i < len(formats_value) else ''
            result[key] = get_value(raw)

    return result

# Return None when the value is missing ('', '.' or 'NA'); split a comma-separated value into a list
def get_value(value):
    """Normalise one raw field value.

    Args:
        value: The raw text of a field.

    Returns:
        None when ``value`` is falsy or one of the placeholders '', '.' or
        'NA'; a list of the comma-separated parts when it contains a comma;
        otherwise ``value`` unchanged.
    """
    is_missing = not value or value in ('', '.', 'NA')
    if is_missing:
        return None
    return value.split(',') if ',' in value else value

# Return a copy of dictionary d without the given keys
def without_keys(d, keys):
    """Return a new plain dict holding the entries of ``d`` whose key is not in ``keys``.

    Args:
        d: Source mapping; it is not modified.
        keys: Container of keys to leave out.

    Returns:
        A dict with the remaining key/value pairs.
    """
    remaining = {}
    for name in d:
        if name in keys:
            continue
        remaining[name] = d[name]
    return remaining

In [2]:
# Combine the VCF data: keep every site (same CHROM and POS) reported by both
# FreeBayes and VarScan, merged into a single record per site.
fixed_columns = VCF_HEADER[:7]

# Index the VarScan records by site once, instead of re-opening and re-parsing
# the whole VarScan file for every FreeBayes record. setdefault keeps the first
# record seen for a site, matching the former first-match-then-break behaviour.
varscan_by_site = {}
for vs in lines('varscan_raw.vcf'):
    varscan_by_site.setdefault((vs['CHROM'], vs['POS']), vs)

new_vcf_list = []
for fb in lines('freebayes_raw.vcf'):
    match = varscan_by_site.get((fb['CHROM'], fb['POS']))
    if match is None:
        continue

    # Work on a copy: the indexed record may be matched again by another
    # FreeBayes record at the same site and must stay unmodified.
    vs = OrderedDict(match)

    # INFO/FORMAT keys present in both records (fixed columns excluded), taken
    # in FreeBayes key order so the merged column order is identical on every
    # run; iterating a set of strings gives no such guarantee.
    common_header = [key for key in fb if key in vs and key not in fixed_columns]

    # Prefix the shared keys with the tool name so neither value is lost.
    # A renamed key moves to the end of its record.
    for header in common_header:
        fb["freebayes_" + header] = fb.pop(header)
        vs["varscan_" + header] = vs.pop(header)

    # Keep only the fields VarScan adds; the fixed columns come from FreeBayes.
    new_vs = without_keys(vs, fixed_columns)

    # Merge: FreeBayes fields, then the VarScan-only fields, then the tool tag.
    new_vcf = OrderedDict(fb)
    new_vcf.update(new_vs)
    new_vcf['TOOL'] = 'both'
    new_vcf_list.append(new_vcf)

In [3]:
# Overview of one merged record. The list is empty when the two call sets share
# no site, so guard the lookup instead of failing with an IndexError.
if new_vcf_list:
    print(new_vcf_list[0])
else:
    print('No variant site is shared by the two VCF files.')

OrderedDict([('CHROM', 'chr2'), ('POS', '29415792'), ('ID', None), ('REF', 'G'), ('ALT', 'A'), ('QUAL', '1101.66'), ('FILTER', None), ('AB', '0.494737'), ('ABP', '3.03316'), ('AC', '1'), ('AF', '0.5'), ('AN', '2'), ('AO', '47'), ('CIGAR', '1X'), ('DPB', '95'), ('DPRA', '0'), ('EPP', '27.4509'), ('EPPR', '33.5919'), ('GTI', '0'), ('LEN', '1'), ('MEANALT', '1'), ('MQM', '60'), ('MQMR', '60'), ('NS', '1'), ('NUMALT', '1'), ('ODDS', '231.64'), ('PAIRED', '0.978723'), ('PAIREDR', '0.979167'), ('PAO', '0'), ('PQA', '0'), ('PQR', '0'), ('PRO', '0'), ('QA', '1807'), ('QR', '1719'), ('RO', '48'), ('RPL', '12'), ('RPP', '27.4509'), ('RPPR', '33.5919'), ('RPR', '35'), ('RUN', '1'), ('SAF', '47'), ('SAP', '105.07'), ('SAR', '0'), ('SRF', '48'), ('SRP', '107.241'), ('SRR', '0'), ('TYPE', 'snp'), ('DPR', ['95', '47']), ('GL', ['-134.266', '0', '-126.33']), ('freebayes_DP', '95'), ('freebayes_GT', '0/1'), ('ADP', '89'), ('WT', '0'), ('HET', '1'), ('HOM', '0'), ('NC', '0'), ('GQ', '163'), ('SDP', '93'

In [4]:
# Export into a CSV file
import csv

# Header = every key that occurs in any merged record, in first-seen order.
# Records may not all share the same keys (INFO flags, for instance, are stored
# under position-dependent INFO<n> names), and DictWriter raises ValueError for
# a row holding a key that is missing from the header. Keys a row lacks are
# written as empty cells.
keys = []
seen_keys = set()
for record in new_vcf_list:
    for key in record:
        if key not in seen_keys:
            seen_keys.add(key)
            keys.append(key)

# NOTE: multi-valued fields are lists (see get_value) and are written as their
# str() representation, e.g. "['95', '47']".
# newline='' leaves line-ending handling to the csv module, as its
# documentation asks; without it extra blank rows can appear on Windows.
with open('merged_VCF.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, keys)
    writer.writeheader()
    writer.writerows(new_vcf_list)