# In [3]:
def load_reference_vcf(reference_vcf_file):
    """
    Load the reference VCF file and create a dictionary with variants.

    Header lines (starting with '#') and blank lines are skipped.

    Args:
        reference_vcf_file: Path to the tab-separated reference VCF file.

    Returns:
        A dictionary mapping each variant position (the integer in the second
        column) to the complete, unmodified text line of that record,
        including its trailing newline when the file had one.

    Raises:
        IndexError: If a non-blank data line has fewer than two columns.
        ValueError: If the position column is not an integer.

    Note:
        Records are keyed by position only; the first (chromosome) column is
        ignored, so when several records share a position the last one read
        replaces the earlier ones.
    """
    variants_dict = {}
    with open(reference_vcf_file, 'r') as vcf_file:
        for line in vcf_file:
            if line.startswith('#'):
                continue  # Skip header lines
            record = line.strip()
            if not record:
                # A blank line (commonly a trailing one) has no position
                # column; indexing it would raise IndexError.
                continue
            # Only the first two columns are needed, so stop splitting early.
            fields = record.split('\t', 2)
            position = int(fields[1])
            variants_dict[position] = line
    return variants_dict

def annotate_vcf(reference_vcf_file, input_vcf_file, output_vcf_file):
    """
    Annotate the input VCF file using the reference VCF file and save the annotated VCF to the output file.

    Header lines (starting with '#') and blank lines are copied to the output
    unchanged. For every other line, the integer in the second column is
    looked up in the reference; on a match the whole reference record is
    appended to the input line after a tab, otherwise the input line is
    written unchanged.

    Args:
        reference_vcf_file: Path to the reference VCF file.
        input_vcf_file: Path to the VCF file to annotate.
        output_vcf_file: Path the annotated VCF is written to (overwritten).

    Raises:
        IndexError: If a non-blank data line has fewer than two columns.
        ValueError: If the position column is not an integer.

    Note:
        Matching is by position only; the chromosome column is not compared.
    """
    variants_dict = load_reference_vcf(reference_vcf_file)
    with open(input_vcf_file, 'r') as input_file, open(output_vcf_file, 'w') as output_file:
        for line in input_file:
            if line.startswith('#') or not line.strip():
                # Header and blank lines have no position column to look up;
                # pass them through untouched.
                output_file.write(line)
                continue
            # Only the first two whitespace-separated columns are needed.
            fields = line.split(None, 2)
            position = int(fields[1])
            variant_record = variants_dict.get(position)
            if variant_record is None:
                output_file.write(line)
                continue
            # The reference record is stored with its original line ending;
            # trim it so the output gets exactly one newline per record
            # instead of a spurious empty line after each annotated record.
            annotated_line = line.strip() + '\t' + variant_record.rstrip('\r\n')
            output_file.write(annotated_line + '\n')

# Paths to the input VCF, the output VCF, and the reference VCF.
# The reference is a subset of the file from
# https://www.ncbi.nlm.nih.gov/snp/docs/products/vcf/redesign/
input_vcf_file = 'input.vcf'
output_vcf_file = 'output.vcf'
reference_vcf_file = 'dbsnp.vcf'

# Run the annotation. The RS= code in the output can then be looked up at
# https://www.ncbi.nlm.nih.gov/snp/ to find the associated pathology.
annotate_vcf(
    reference_vcf_file=reference_vcf_file,
    input_vcf_file=input_vcf_file,
    output_vcf_file=output_vcf_file,
)
