In [None]:
import numpy as np
import pandas as pd
import allel
import re

def load_vcf(vcf_path, metadata):
    """
    Load a VCF, keep only the samples listed in the metadata, and drop indels.

    Parameters
    ----------
    vcf_path : str
        Path to the VCF file, handed to `allel.read_vcf`.
    metadata : pandas.DataFrame
        Sample metadata with a `sampleID` column. Only VCF samples whose name
        appears in that column are retained.

    Returns
    -------
    tuple
        `(geno, pos, contig, samples, ref, alt)` where `geno` is an
        `allel.GenotypeArray` restricted to the retained samples and to
        non-indel variants, `pos` / `contig` / `ref` / `alt` are the matching
        per-variant arrays, and `samples` holds the retained sample names.

    Raises
    ------
    ValueError
        If `allel.read_vcf` returns None (it found no variants to load).
    """
    # load vcf and get genotypes and positions
    vcf = allel.read_vcf(vcf_path, fields='*')
    if vcf is None:
        # Fail with a clear message instead of "'NoneType' is not subscriptable"
        raise ValueError(f'No variants could be read from {vcf_path}')

    samples = vcf['samples']
    # keep only samples in qcpass metadata
    sample_mask = np.isin(samples, metadata.sampleID)
    # keep only variants that are not flagged as indels
    snp_mask = ~vcf['variants/INDEL']

    geno = allel.GenotypeArray(vcf['calldata/GT'])
    geno = geno.compress(sample_mask, axis=1)
    geno = geno.compress(snp_mask, axis=0)

    pos = vcf['variants/POS'][snp_mask]
    contig = vcf['variants/CHROM'][snp_mask]
    ref = vcf['variants/REF'][snp_mask]
    alt = vcf['variants/ALT'][snp_mask]

    return geno, pos, contig, samples[sample_mask], ref, alt

In [None]:
# Input locations, given relative to this notebook.
# NOTE(review): presumably these are defaults that a workflow overrides at run time — confirm.
# Tab-separated sample metadata (read with sep='\t' further down)
metadata_path = '../../../results/config/metadata.qcpass.tsv'
# Table of the marker SNPs used to define the kdr haplotype backgrounds (also read with sep='\t')
kdr_marker_snps_path = '../../../resources/ag-vampir/Kdr_marker_SNPs.csv'
# VCF holding the genotype calls
vcf_path = "../../../results/vcfs/targets/ampseq-vigg-01.annot.vcf"
# Comma-separated metadata column names; split into a list below and used to group the summary tables
cohort_cols = 'location,taxon'
# Root directory under which the results/kdr-origins output tables are written
wkdir = "../../.."

### Load in the table of SNPs that we use to define haplotypes

In [None]:
# Turn the comma-separated cohort column string into a list. The isinstance guard keeps
# this cell safe to re-run: once cohort_cols is a list it has no .split(), so an
# unguarded second run would raise AttributeError.
if isinstance(cohort_cols, str):
    cohort_cols = cohort_cols.split(",")

# Read the marker SNP table; the column at position 1 becomes the index.
hap_def = pd.read_csv(kdr_marker_snps_path, sep = '\t', index_col = 1)
# Strip everything up to and including the last ':' from each index label and
# keep the remainder as an integer position.
hap_def['variant_pos'] = hap_def.index.str.replace('.*:', '', regex = True).astype('int')
hap_def

### The functions for making the kdr background calls

In [None]:
# Determine kdr F origin for a genotype
def _F_kdr_origin_gen(genotypes, clean = True):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995F mutations
    if pd.isnull(genotypes['kdr-995F']):
        kdr_F_origins = 'F:unknown'
    elif genotypes['kdr-995F'] == 'AA':
        kdr_F_origins = 'F:wt_hom'
    elif genotypes['kdr-995F'] == 'AT':
        kdr_F_origins = 'F:het'
    elif genotypes['kdr-995F'] == 'TT':
        kdr_F_origins = 'F:hom'
    else:
        print(f'Unexpected kdr F genotype. {sample_name} {genotypes["kdr-995F"]}')
        kdr_F_origins = 'Fail. Unexpected kdr F genotype'
    # If the individual has Fkdr, find out its origins
    # For F homozygotes
    if kdr_F_origins == 'F:hom':
        if pd.isnull(genotypes['Def-F1']):
            kdr_F_origins = f'{kdr_F_origins},F1?'
        elif genotypes['Def-F1'] == 'AA':
            kdr_F_origins = f'{kdr_F_origins},F1_hom'
        elif genotypes['Def-F1'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F1_het'
        #
        if pd.isnull(genotypes['Def-F2']):
            kdr_F_origins = f'{kdr_F_origins},F2?'
        elif genotypes['Def-F2'] == 'AA':
            kdr_F_origins = f'{kdr_F_origins},F2_hom'
        elif genotypes['Def-F2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F2_het'
        #
        if pd.isnull(genotypes['Def-F3F4-2']):
            kdr_F_origins = f'{kdr_F_origins},F3F4?'
        elif genotypes['Def-F3F4-2'] == 'TT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_hom'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'{kdr_F_origins},F3_hom'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het,F4_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_hom'
        elif genotypes['Def-F3F4-2'] == 'AT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_het'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F3F4, but homozygote for F3.'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_het'
        #
        if pd.isnull(genotypes['Def-F5-2']):
            kdr_F_origins = f'{kdr_F_origins},F5?'
        elif genotypes['Def-F5-2'] == 'GG':
            kdr_F_origins = f'{kdr_F_origins},F5_hom'
        elif genotypes['Def-F5-2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F5_het'
    # for F heterozygotes
    elif kdr_F_origins == 'F:het':
        if pd.isnull(genotypes['Def-F1']):
            kdr_F_origins = f'{kdr_F_origins},F1?'
        elif genotypes['Def-F1'] == 'AA':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F1.'
        elif genotypes['Def-F1'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F1_het'
        #
        if pd.isnull(genotypes['Def-F2']):
            kdr_F_origins = f'{kdr_F_origins},F2?'
        elif genotypes['Def-F2'] == 'AA':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F2.'
        elif genotypes['Def-F2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F2_het'
        #
        if pd.isnull(genotypes['Def-F3F4-2']):
            kdr_F_origins = f'{kdr_F_origins},F3F4?'
        elif genotypes['Def-F3F4-2'] == 'TT':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F3F4.'
        elif genotypes['Def-F3F4-2'] == 'AT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_het'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr and F3F4, but homozygote for F3.'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_het'
        #
        if pd.isnull(genotypes['Def-F5-2']):
            kdr_F_origins = f'{kdr_F_origins},F5?'
        elif genotypes['Def-F5-2'] == 'GG':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F5.'
        elif genotypes['Def-F5-2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F5_het'
    if clean:
        return(_kdr_gen_cleanup(kdr_F_origins))
    else:
        return(kdr_F_origins)


# Determine kdr S origin for a genotype
def _S_kdr_origin_gen(genotypes, clean = True, alternate_S4S5 = False):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995S mutations
    if pd.isnull(genotypes['kdr-995S']):
        kdr_S_origins = 'S:unknown'
    elif genotypes['kdr-995S'] == 'TT':
        kdr_S_origins = 'S:wt_hom'
    elif genotypes['kdr-995S'] == 'CT':
        kdr_S_origins = 'S:het'
    elif genotypes['kdr-995S'] == 'CC':
        kdr_S_origins = 'S:hom'
    else:
        print(f'Unexpected kdr S genotype. {sample_name} {genotypes["kdr-995S"]}')
        kdr_S_origins = 'Fail. Unexpected kdr S genotype'
    # If the individual has Skdr, find out its origins
    # For S homozygotes
    if kdr_S_origins == 'S:hom':
        if pd.isnull(genotypes['Def-S1-3']):
            kdr_S_origins = f'{kdr_S_origins},S1?'
        elif genotypes['Def-S1-3'] == 'CC':
            kdr_S_origins = f'{kdr_S_origins},S1_hom'
        elif genotypes['Def-S1-3'] == 'CT':
            kdr_S_origins = f'{kdr_S_origins},S1_het'
        #
        if pd.isnull(genotypes['Def-S2S4']):
            kdr_S_origins = f'{kdr_S_origins},S2S4?'
        elif genotypes['Def-S2S4'] == 'TT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_hom'
            elif genotypes['Def-S2-4'] == 'AA':
                kdr_S_origins = f'{kdr_S_origins},S2_hom'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het,S4_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_hom'
        elif genotypes['Def-S2S4'] == 'CT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_het'
            elif genotypes['Def-S2-4'] == 'AA':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S2S4, but homozygote for S2.'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_het'
        #
        if pd.isnull(genotypes['Def-S3']):
            kdr_S_origins = f'{kdr_S_origins},S3?'
        elif genotypes['Def-S3'] == 'GG':
            kdr_S_origins = f'{kdr_S_origins},S3_hom'
        elif genotypes['Def-S3'] == 'GT':
            kdr_S_origins = f'{kdr_S_origins},S3_het'
        # 
        if alternate_S4S5:
            if pd.isnull(genotypes['Def-S4S5-2']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5-2'] == 'TT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_hom'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'{kdr_S_origins},S5_hom'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het,S4_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_hom'
            elif genotypes['Def-S4S5-2'] == 'GT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
        else :
            if pd.isnull(genotypes['Def-S4S5']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5'] == 'CC':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_hom'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'{kdr_S_origins},S5_hom'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S4_het,S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_hom'
            elif genotypes['Def-S4S5'] == 'CT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
    # for S heterozygotes
    elif kdr_S_origins == 'S:het':
        if pd.isnull(genotypes['Def-S1-3']):
            kdr_S_origins = f'{kdr_S_origins},S1?'
        elif genotypes['Def-S1-3'] == 'CC':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S1.'
        elif genotypes['Def-S1-3'] == 'CT':
            kdr_S_origins = f'{kdr_S_origins},S1_het'
        #
        if pd.isnull(genotypes['Def-S2S4']):
            kdr_S_origins = f'{kdr_S_origins},S2S4?'
        elif genotypes['Def-S2S4'] == 'TT':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S2S4.'
        elif genotypes['Def-S2S4'] == 'CT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_het'
            elif genotypes['Def-S2-4'] == 'AA':
                ksr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S2S4, but homozygote for S2.'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_het'
        #
        if pd.isnull(genotypes['Def-S3']):
            kdr_S_origins = f'{kdr_S_origins},S3?'
        elif genotypes['Def-S3'] == 'GG':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S3.'
        elif genotypes['Def-S3'] == 'GT':
            kdr_S_origins = f'{kdr_S_origins},S3_het'
        # 
        if alternate_S4S5:
            if pd.isnull(genotypes['Def-S4S5_2']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5-2'] == 'TT':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S4S5.'
            elif genotypes['Def-S4S5-2'] == 'GT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
        else :
            if pd.isnull(genotypes['Def-S4S5']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5'] == 'CC':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S4S5.'
            elif genotypes['Def-S4S5'] == 'CT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    return(f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S4S5, but homozygote for S5.')
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
    if clean:
        return(_kdr_gen_cleanup(kdr_S_origins))
    else:
        return(kdr_S_origins)

def _402_kdr_origin_gen(genotypes, clean = True):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995F mutations
    if pd.isnull(genotypes['kdr-402L']):
        kdr_402_origins = '402:unknown'
    elif genotypes['kdr-402L'] == 'GG':
        kdr_402_origins = '402:wt_hom'
    elif genotypes['kdr-402L'] == 'CG':
        kdr_402_origins = '402:het,402LC_het'
    elif genotypes['kdr-402L'] == 'GT':
        kdr_402_origins = '402:het,402LT_het'
    elif genotypes['kdr-402L'] == 'CT':
        kdr_402_origins = '402:hom,402LC_het,402LT_het'
    elif genotypes['kdr-402L'] == 'CC':
        kdr_402_origins = '402:hom,402LC_hom'
    elif genotypes['kdr-402L'] == 'TT':
        kdr_402_origins = '402:hom,402LT_hom'
    else:
        print(f'Unexpected kdr 402 genotype. {sample_name} {genotypes["kdr-402L"]}')
        kdr_402_origins = 'Fail. Unexpected kdr 402 genotype'
        
    if clean:
        return(_kdr_gen_cleanup(kdr_402_origins))
    else:
        return(kdr_402_origins)

# The initial output of the kdr_origin function can be a little messy, since it outputs all of the 
# information that it could or couldn't obtain. This function tidies it up a bit. 
def _kdr_gen_cleanup(kdr_origin_str):
    if re.search('Fail', kdr_origin_str):
        return('?,?')
    if re.search('wt', kdr_origin_str):
        return('wt,wt')
    kdr_type = re.findall('.*(?=:)', kdr_origin_str)[0]
    outcomes = kdr_origin_str.split(',')
    origins = np.unique(outcomes[1:])
    established_origins = [o for o in origins if not re.search('\?', o)]
    # Remove "_het" and "_hom" text
    established_origins = [re.sub('_het', '', o) for o in established_origins]
    # Now double up each _hom entry. this is cludgy, but coudn't find a more elegant way
    for i in range(len(established_origins)):
        o = established_origins[i]
        if re.search('_hom', o):
            o = re.sub('_hom', '', o)
            established_origins[i] = o
            established_origins.append(o)
    if re.search('hom', outcomes[0]):
        if len(established_origins) == 1:
            return(f'{kdr_type},{established_origins[0]}')
        elif len(established_origins) == 2:
            return(','.join(established_origins))
        else:
            return(f'{kdr_type},{kdr_type}')
    if re.search('het', outcomes[0]):
        if len(established_origins) == 1:
            return(f'wt,{established_origins[0]}')
        else:
            return(f'wt,{kdr_type}')
    else:
        return('?,?')


# Single function to call both the F and S origins for a given haplotype. This function determines
# from the look of the genotype table whether it represents genotypes or haplotypes, and calls 
# the appropriate function. 
def kdr_origin(genotypes, alternate_S4S5 = False, clean = True, include_402 = None):
    """
    Build a one-row table of kdr origin calls for a single sample.

    Parameters
    ----------
    genotypes : Series-like
        One sample's genotype strings, indexed by marker name.
    alternate_S4S5 : bool
        Forwarded to _S_kdr_origin_gen.
    clean : bool
        Forwarded to each per-mutation caller.
    include_402 : bool or None
        Whether to add the 402 call. None (the default) means "add it if
        'kdr-402L' is among the markers in `genotypes`".

    Returns
    -------
    pandas.DataFrame
        One row, indexed by the sample name, with columns 'kdr_F_origin' and
        'kdr_S_origin', plus 'kdr_402_origin' when the 402 call is included.
    """
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    if include_402 is None:
        include_402 = 'kdr-402L' in genotypes.index

    # Build the columns once; the 402 call is the only optional one
    calls = {
        'kdr_F_origin': [_F_kdr_origin_gen(genotypes, clean)],
        'kdr_S_origin': [_S_kdr_origin_gen(genotypes, clean, alternate_S4S5)],
    }
    if include_402:
        calls['kdr_402_origin'] = [_402_kdr_origin_gen(genotypes, clean)]
    return pd.DataFrame(calls, index = [sample_name])

def get_single_gen_call(x):
    """
    Combine the per-mutation origin calls of one sample into a single call.

    `x` is one row of origin calls. If it has a 'kdr_402_origin' entry the
    F, S and 402 calls are combined; otherwise only the F and S calls are.
    """
    combine = _get_single_gen_call_with_402 if 'kdr_402_origin' in x.index else _get_single_gen_call_no_402
    return combine(x)
    
def _get_single_gen_call_no_402(x): 
    if 'sample_name' in x.index:
        sample_name = x['sample_name']
    else:
        sample_name = x.name
    joined_calls = np.array(x['kdr_F_origin'].split(',') + x['kdr_S_origin'].split(','))
    # There should be at least two 'wt' calls
    if np.sum(joined_calls == 'wt') < 2:
        print(f'Too many different mutant haplotype backgrounds in sample {sample_name}')
        return('?,?')
    # Otherwise, drop two wildtype calls
    else:
        which_drop = np.where(joined_calls == 'wt')[0][:2]
        return(','.join(np.delete(joined_calls, which_drop)))

def _get_single_gen_call_with_402(x): 
    if 'sample_name' in x.index:
        sample_name = x['sample_name']
    else:
        sample_name = x.name
    joined_calls = np.array(x['kdr_F_origin'].split(',') + 
                            x['kdr_S_origin'].split(',') +
                            x['kdr_402_origin'].split(',')
    )
    # There should be at least four 'wt' calls
    if np.sum(np.isin(joined_calls,  ['wt', '?'])) < 4:
        print(f'Too many different mutant haplotype backgrounds in sample {sample_name}')
        return('?,?')
    # Otherwise, drop four wildtype calls
    else:
        which_drop = np.concatenate([
            np.where(joined_calls == 'wt')[0],
            np.where(joined_calls == '?')[0]
        ])[:4]
        return(','.join(np.delete(joined_calls, which_drop)))

### Load metadata

In [None]:
# Read the tab-separated sample metadata; the first column becomes the index.
metadata = pd.read_csv(metadata_path, sep = '\t', index_col = 0)
# Label samples without a taxon as 'UNKN'. The filled column is assigned back
# explicitly: calling fillna(inplace=True) on `metadata.taxon` is chained
# assignment, which triggers a warning in pandas 2.2 and leaves `metadata`
# unchanged when Copy-on-Write is enabled.
metadata['taxon'] = metadata['taxon'].fillna('UNKN')
metadata

### Get the genotype calls

### Filter the SNPs to just the ones useful for kdr origin analysis

In [None]:
# Load genotypes and per-variant arrays for the samples listed in the metadata (indels already removed by load_vcf)
geno, pos, contig, samples, ref, alt = load_vcf(vcf_path, metadata=metadata)

# Keep only variants on contig '2L' whose position is one of the marker SNP positions in hap_def
which_snps = (contig == '2L') & np.isin(pos, hap_def['variant_pos']) 

# Apply the same variant mask to the genotype calls and to every per-variant array so they stay aligned.
# NOTE: pos, alt and ref are overwritten here, so from this point on they describe the marker SNPs only.
snp_calls = geno[which_snps, :, :]
pos = pos[which_snps]
alt = alt[which_snps, :]
ref = ref[which_snps]

### Convert genotype calls to nucleotides

In [None]:
# Combine ref and alt into a single matrix with one row per SNP, and add a column of '?'
# at the end, so that any genotype call of -1 (missing) picks up the '?' character
# through negative indexing.
snp_alleles = np.concatenate([np.reshape(ref, (len(ref), 1)), 
                              alt,
                              np.full((len(ref), 1), '?')], 
                             axis = 1)

# Convert numeric calls to nucleotides, and sort each pair of nucleotides alphabetically
# (so, eg, the genotype 'TA' becomes 'AT').
# The first index is a column of row numbers shaped (n_snps, 1, 1), so each call in
# snp_calls is looked up in the allele row of its own SNP.
snp_genotypes_3d = snp_alleles[
    np.array(np.arange(snp_alleles.shape[0])).reshape(snp_alleles.shape[0], 1, 1), 
    snp_calls
]
# Join the alleles along the last axis into one string per SNP and sample.
# NOTE(review): a fully missing call becomes the string '??' rather than a null value,
# so the pd.isnull checks in the origin-calling functions will not treat it as missing — confirm this is intended.
snp_genotypes = np.apply_along_axis(lambda x: ''.join(np.sort(x)), 2, snp_genotypes_3d)

# Store results in a data frame with one row per sample and one column per marker SNP.
# hap_def is re-indexed by position (in place) so that the SNP names can be looked up from pos.
hap_def.index = hap_def['variant_pos']
gen_df = pd.DataFrame(
    np.transpose(snp_genotypes), 
    index = samples,
    columns = hap_def.loc[pos, 'SNP name']
)
gen_df

### Obtain kdr origin calls

In [None]:

# One single-row table of origin calls per sample, stacked into one table
per_sample_calls = [kdr_origin(sample_genotypes) for _, sample_genotypes in gen_df.iterrows()]
kdr_origins = pd.concat(per_sample_calls)
# Combine the per-mutation calls of each sample into a single two-haplotype call
kdr_origins['kdr_origin'] = kdr_origins.apply(get_single_gen_call, axis = 1)

kdr_origins

### Merge kdr origins with metadata and write to file. 

In [None]:
# Attach the metadata of each sample to its origin calls (samples matched on sampleID)
metadata_by_sample = metadata.set_index("sampleID")
kdr_origins_df = kdr_origins.merge(metadata_by_sample, left_index = True, right_index = True)
# Written tab-separated
kdr_origins_df.to_csv(f'{wkdir}/results/kdr-origins/kdr_origins.csv', sep = '\t')
kdr_origins_df

### Create a table where each row is a haplotype instead of a genotype, and write it to file. These are genotype-based calls, so the order of the two haplotypes within each sample is random: the table can be used for, say, mapping, but not for haplotype clustering.

In [None]:

# Each sample's call holds two comma-separated haplotype calls: chain all of them into
# one flat list, and repeat every sample ID twice so the index lines up with that list
all_hap_calls = ','.join(kdr_origins['kdr_origin'].to_list()).split(',')
hap_sample_ids = np.repeat(kdr_origins.index, 2)
kdr_genhap_origins_df = pd.DataFrame({'kdr_origin': all_hap_calls}, index = hap_sample_ids)
# Attach the metadata of the sample each haplotype came from
kdr_genhap_origins_df = kdr_genhap_origins_df.merge(
    metadata.set_index("sampleID"), left_index = True, right_index = True
)
kdr_genhap_origins_df.to_csv(f'{wkdir}/results/kdr-origins/kdr_genhap_origins.csv', sep = '\t')
kdr_genhap_origins_df

### Summarise results by taxon and location

In [None]:
# Columns needed for the summary: the cohort grouping columns plus the haplotype call
cols_keep = cohort_cols + ['kdr_origin']
# Count the number of occurrences of each haplotype call in each population: one row per
# combination of the cohort columns, one column per call, 0 where a call was never seen.
# NOTE(review): no `values` argument is passed, and cols_keep holds only the index and
# column keys, so the counts come from applying `len` to each group — confirm this
# behaves as intended on the installed pandas version.
pop_origin_counts = kdr_genhap_origins_df[cols_keep].pivot_table(columns='kdr_origin', 
                                                                    index=cohort_cols,
                                                                    aggfunc=len
                                                                   ).fillna(0).astype(int)

def signif(x, n_figs):
    """
    Round x to n_figs significant figures (to nearest, element-wise).

    Each value is divided by the power of ten given by its order of magnitude,
    rounded to n_figs - 1 decimals, and scaled back. Magnitudes are floored at
    1e-200 before the logarithm is taken, so that zeros do not produce -inf.
    """
    magnitude = np.abs(x).clip(1e-200)
    scale = 10 ** np.floor(np.log10(magnitude))
    return np.round(x / scale, n_figs - 1) * scale

# "?" calls carry no origin information, so they are left out of both tables
if '?' in pop_origin_counts.columns:
    pop_origin_counts = pop_origin_counts.drop(columns = '?')

# Number of haplotypes with a known call in each population
row_totals = pop_origin_counts.sum(axis = 1)
# Frequency of each call among the known calls, rounded to 2 significant figures
pop_origin_freqs = signif(pop_origin_counts.div(row_totals, axis = 0), 2)

print('Counts of origins:')
display(pop_origin_counts)
print('\n\nFrequencies of known origins:')
display(pop_origin_freqs)

### That's all folks