In [1]:
import argparse
import io
import os

import pandas as pd
from Bio import SeqIO
from Bio.Seq import MutableSeq, Seq
from Bio.SeqRecord import SeqRecord

In [2]:
# Relative paths to the two inputs: the chrM reference FASTA and the
# tab-separated table of haplotype mutations.
chrM_file = "../../input_files/chrM.fa"
haplotypes_file = "../../input_files/haplotype_mutations.vcf"

# Reference record that the strain-specific genomes are derived from.
ref_seq = SeqIO.read(chrM_file, "fasta")

# Haplotype table; later cells use its STRAIN, START, REF, ALT and
# VARIANT_TYPE columns.
haplotypes = pd.read_csv(haplotypes_file, sep="\t")

In [3]:
# Reduce the haplotype table to the rows and columns needed downstream:
#   1. keep only SNVs,
#   2. keep the first row for each (STRAIN, START) pair,
#   3. keep only the strain, position, and allele columns.
haplotypes = (
    haplotypes
    .loc[lambda table: table["VARIANT_TYPE"] == "SNV"]
    .drop_duplicates(subset=["STRAIN", "START"], keep="first")
    .loc[:, ["STRAIN", "START", "REF", "ALT"]]
)

# Strain names in order of first appearance in the filtered table.
strains = list(haplotypes["STRAIN"].unique())

In [4]:
# Build the haplotype dictionary: strain -> list of [START, REF, ALT] lists,
# one inner list per haplotype site recorded for that strain.
# Coordinates are in 0-index.
haplotype_dict = {
    strain_name: haplotypes.loc[
        haplotypes["STRAIN"] == strain_name, ["START", "REF", "ALT"]
    ].values.tolist()
    for strain_name in strains
}

In [5]:
def get_ref_genome(strain, haplotype_dict):
    """Return the reference sequence with one strain's haplotype alleles applied.

    Starts from a copy of the module-level ``ref_seq`` record's sequence and,
    for every entry listed for ``strain``, overwrites the base at index
    ``entry[0]`` (START) with ``entry[2]`` (ALT). ``ref_seq`` itself is left
    unchanged.

    Parameters
    ----------
    strain
        Key into ``haplotype_dict`` identifying the strain.
    haplotype_dict : dict
        Maps each strain to a list of ``[START, REF, ALT]`` lists.

    Returns
    -------
    Bio.Seq.Seq
        The mutated sequence as an immutable ``Seq``.

    Raises
    ------
    KeyError
        If ``strain`` is not a key of ``haplotype_dict``.
    """
    # Seq objects are immutable, so edits are made on a MutableSeq copy.
    # Seq.tomutable() / MutableSeq.toseq() were deprecated and then removed
    # from Biopython; building through str() works on old and new releases.
    strain_ref_genome = MutableSeq(str(ref_seq.seq))

    # Apply each haplotype to the mm10 reference sequence.
    # NOTE(review): START is used directly as a Python index, i.e. it is
    # treated as 0-based, and REF (entry[1]) is not compared against the
    # reference base - confirm the input coordinates really are 0-based.
    for entry in haplotype_dict[strain]:
        site, allele = entry[0], entry[2]
        strain_ref_genome[site] = allele

    # Hand back an immutable Seq so the full Seq API is available to callers.
    return Seq(str(strain_ref_genome))

In [6]:
# Build the NZB chrM reference from its SNV haplotypes.
# Note: this reference genome is missing the two insertions that occur in the NUMT.
NZB_chrM_ref = get_ref_genome(strain="NZB", haplotype_dict=haplotype_dict)

In [7]:
# Wrap the NZB sequence in a SeqRecord so it carries an id, name, and description.
NZB_record = SeqRecord(
    NZB_chrM_ref,
    id="NZB_chrM",
    name="NZB_chrM",
    description="chrM with NZB SNP haplotypes",
)

In [8]:
# Write the NZB record out as a FASTA file.
output_file_path = "../files/NZB_chrM.fa"

# Create the output directory if it is missing, so the write does not fail
# with FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Last expression of the cell: SeqIO.write returns the number of records written.
SeqIO.write(NZB_record, output_file_path, "fasta")

1