### Import packages

In [1]:
import numpy as np
import pandas as pd
import allel
import re

In [None]:
# Run configuration. Paths are relative to the notebook's working directory.
metadata_path = "../../../config/metadata.tsv"  # tab-separated sample metadata, loaded further down
vcf_path = "vcf"                                # input handed to allel.read_vcf
cohort_column = "location"                      # metadata column used (with taxon) to group the summary
wkdir = "../.."                                 # NOTE(review): not referenced in the cells shown here

### Load in the table of SNPs that we use to define haplotypes

In [2]:
# Read the marker SNP table (tab-separated, despite the .csv extension), indexed
# by its second column. Each index label looks like "2L:2422651"; stripping
# everything up to the last colon leaves the integer position.
hap_def = (
    pd.read_csv('Kdr_marker_SNPs.csv', sep = '\t', index_col = 1)
    .assign(variant_pos = lambda tbl: tbl.index.str.replace('.*:', '', regex = True).astype('int'))
)
hap_def

Unnamed: 0_level_0,SNP name,Allele1,Allele1 interpretation,Allele2,Allele2 interpretation,variant_pos
SNP Position,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2L:2422651,kdr-995S,T,Not KdrS,C,KdrS,2422651
2L:2422652,kdr-995F,A,Not KdrF,T,KdrF,2422652
2L:2391228,kdr-402L,G,Not 402L,C or T,V402L,2391228
2L:2403941,Def-S4S5,T,Not S4S5,C,kdr haplotype cluster S4 or S5,2403941
2L:2414062,Def-F3F4-2,A,Not F3F4,T,kdr haplotype cluster F3 or F4,2414062
2L:2399997,Def-F3,G,Not F3,C,kdr haplotype cluster F3,2399997
2L:2425052,Def-F2,G,Not F2,A,kdr haplotype cluster F2,2425052
2L:2380982,Def-S1-3,T,Not S1,C,kdr haplotype cluster S1,2380982
2L:2408677,Def-S5,A,Not S5,C,kdr haplotype cluster S5,2408677
2L:2381706,Def-F5-2,A,Not F5,G,kdr haplotype cluster F5,2381706


### The functions for making the kdr background calls

In [3]:
# Determine kdr F origin for a genotype
def _F_kdr_origin_gen(genotypes, clean = True):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995F mutations
    if pd.isnull(genotypes['kdr-995F']):
        kdr_F_origins = 'F:unknown'
    elif genotypes['kdr-995F'] == 'AA':
        kdr_F_origins = 'F:wt_hom'
    elif genotypes['kdr-995F'] == 'AT':
        kdr_F_origins = 'F:het'
    elif genotypes['kdr-995F'] == 'TT':
        kdr_F_origins = 'F:hom'
    else:
        print(f'Unexpected kdr F genotype. {sample_name} {genotypes["kdr-995F"]}')
        kdr_F_origins = 'Fail. Unexpected kdr F genotype'
    # If the individual has Fkdr, find out its origins
    # For F homozygotes
    if kdr_F_origins == 'F:hom':
        if pd.isnull(genotypes['Def-F1']):
            kdr_F_origins = f'{kdr_F_origins},F1?'
        elif genotypes['Def-F1'] == 'AA':
            kdr_F_origins = f'{kdr_F_origins},F1_hom'
        elif genotypes['Def-F1'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F1_het'
        #
        if pd.isnull(genotypes['Def-F2']):
            kdr_F_origins = f'{kdr_F_origins},F2?'
        elif genotypes['Def-F2'] == 'AA':
            kdr_F_origins = f'{kdr_F_origins},F2_hom'
        elif genotypes['Def-F2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F2_het'
        #
        if pd.isnull(genotypes['Def-F3F4-2']):
            kdr_F_origins = f'{kdr_F_origins},F3F4?'
        elif genotypes['Def-F3F4-2'] == 'TT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_hom'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'{kdr_F_origins},F3_hom'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het,F4_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_hom'
        elif genotypes['Def-F3F4-2'] == 'AT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_het'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F3F4, but homozygote for F3.'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_het'
        #
        if pd.isnull(genotypes['Def-F5-2']):
            kdr_F_origins = f'{kdr_F_origins},F5?'
        elif genotypes['Def-F5-2'] == 'GG':
            kdr_F_origins = f'{kdr_F_origins},F5_hom'
        elif genotypes['Def-F5-2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F5_het'
    # for F heterozygotes
    elif kdr_F_origins == 'F:het':
        if pd.isnull(genotypes['Def-F1']):
            kdr_F_origins = f'{kdr_F_origins},F1?'
        elif genotypes['Def-F1'] == 'AA':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F1.'
        elif genotypes['Def-F1'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F1_het'
        #
        if pd.isnull(genotypes['Def-F2']):
            kdr_F_origins = f'{kdr_F_origins},F2?'
        elif genotypes['Def-F2'] == 'AA':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F2.'
        elif genotypes['Def-F2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F2_het'
        #
        if pd.isnull(genotypes['Def-F3F4-2']):
            kdr_F_origins = f'{kdr_F_origins},F3F4?'
        elif genotypes['Def-F3F4-2'] == 'TT':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F3F4.'
        elif genotypes['Def-F3F4-2'] == 'AT':
            if pd.isnull(genotypes['Def-F3']):
                kdr_F_origins = f'{kdr_F_origins},(F3F4)_het'
            elif genotypes['Def-F3'] == 'CC':
                kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr and F3F4, but homozygote for F3.'
            elif genotypes['Def-F3'] == 'CG':
                kdr_F_origins = f'{kdr_F_origins},F3_het'
            elif genotypes['Def-F3'] == 'GG':
                kdr_F_origins = f'{kdr_F_origins},F4_het'
        #
        if pd.isnull(genotypes['Def-F5-2']):
            kdr_F_origins = f'{kdr_F_origins},F5?'
        elif genotypes['Def-F5-2'] == 'GG':
            kdr_F_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for F kdr, but homozygote for F5.'
        elif genotypes['Def-F5-2'] == 'AG':
            kdr_F_origins = f'{kdr_F_origins},F5_het'
    if clean:
        return(_kdr_gen_cleanup(kdr_F_origins))
    else:
        return(kdr_F_origins)


# Determine kdr S origin for a genotype
def _S_kdr_origin_gen(genotypes, clean = True, alternate_S4S5 = False):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995S mutations
    if pd.isnull(genotypes['kdr-995S']):
        kdr_S_origins = 'S:unknown'
    elif genotypes['kdr-995S'] == 'TT':
        kdr_S_origins = 'S:wt_hom'
    elif genotypes['kdr-995S'] == 'CT':
        kdr_S_origins = 'S:het'
    elif genotypes['kdr-995S'] == 'CC':
        kdr_S_origins = 'S:hom'
    else:
        print(f'Unexpected kdr S genotype. {sample_name} {genotypes["kdr-995S"]}')
        kdr_S_origins = 'Fail. Unexpected kdr S genotype'
    # If the individual has Skdr, find out its origins
    # For S homozygotes
    if kdr_S_origins == 'S:hom':
        if pd.isnull(genotypes['Def-S1-3']):
            kdr_S_origins = f'{kdr_S_origins},S1?'
        elif genotypes['Def-S1-3'] == 'CC':
            kdr_S_origins = f'{kdr_S_origins},S1_hom'
        elif genotypes['Def-S1-3'] == 'CT':
            kdr_S_origins = f'{kdr_S_origins},S1_het'
        #
        if pd.isnull(genotypes['Def-S2S4']):
            kdr_S_origins = f'{kdr_S_origins},S2S4?'
        elif genotypes['Def-S2S4'] == 'TT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_hom'
            elif genotypes['Def-S2-4'] == 'AA':
                kdr_S_origins = f'{kdr_S_origins},S2_hom'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het,S4_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_hom'
        elif genotypes['Def-S2S4'] == 'CT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_het'
            elif genotypes['Def-S2-4'] == 'AA':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S2S4, but homozygote for S2.'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_het'
        #
        if pd.isnull(genotypes['Def-S3']):
            kdr_S_origins = f'{kdr_S_origins},S3?'
        elif genotypes['Def-S3'] == 'GG':
            kdr_S_origins = f'{kdr_S_origins},S3_hom'
        elif genotypes['Def-S3'] == 'GT':
            kdr_S_origins = f'{kdr_S_origins},S3_het'
        # 
        if alternate_S4S5:
            if pd.isnull(genotypes['Def-S4S5-2']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5-2'] == 'TT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_hom'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'{kdr_S_origins},S5_hom'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het,S4_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_hom'
            elif genotypes['Def-S4S5-2'] == 'GT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
        else :
            if pd.isnull(genotypes['Def-S4S5']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5'] == 'CC':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_hom'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'{kdr_S_origins},S5_hom'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S4_het,S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_hom'
            elif genotypes['Def-S4S5'] == 'CT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
    # for S heterozygotes
    elif kdr_S_origins == 'S:het':
        if pd.isnull(genotypes['Def-S1-3']):
            kdr_S_origins = f'{kdr_S_origins},S1?'
        elif genotypes['Def-S1-3'] == 'CC':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S1.'
        elif genotypes['Def-S1-3'] == 'CT':
            kdr_S_origins = f'{kdr_S_origins},S1_het'
        #
        if pd.isnull(genotypes['Def-S2S4']):
            kdr_S_origins = f'{kdr_S_origins},S2S4?'
        elif genotypes['Def-S2S4'] == 'TT':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S2S4.'
        elif genotypes['Def-S2S4'] == 'CT':
            if pd.isnull(genotypes['Def-S2-4']):
                kdr_S_origins = f'{kdr_S_origins},(S2S4)_het'
            elif genotypes['Def-S2-4'] == 'AA':
                ksr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S2S4, but homozygote for S2.'
            elif genotypes['Def-S2-4'] == 'AT':
                kdr_S_origins = f'{kdr_S_origins},S2_het'
            elif genotypes['Def-S2-4'] == 'TT':
                kdr_S_origins = f'{kdr_S_origins},S4_het'
        #
        if pd.isnull(genotypes['Def-S3']):
            kdr_S_origins = f'{kdr_S_origins},S3?'
        elif genotypes['Def-S3'] == 'GG':
            kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S3.'
        elif genotypes['Def-S3'] == 'GT':
            kdr_S_origins = f'{kdr_S_origins},S3_het'
        # 
        if alternate_S4S5:
            if pd.isnull(genotypes['Def-S4S5_2']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5-2'] == 'TT':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S4S5.'
            elif genotypes['Def-S4S5-2'] == 'GT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S4S5, but homozygote for S5.'
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
        else :
            if pd.isnull(genotypes['Def-S4S5']):
                kdr_S_origins = f'{kdr_S_origins},S4S5?'
            elif genotypes['Def-S4S5'] == 'CC':
                kdr_S_origins = f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr, but homozygote for S4S5.'
            elif genotypes['Def-S4S5'] == 'CT':
                if pd.isnull(genotypes['Def-S5']):
                    kdr_S_origins = f'{kdr_S_origins},(S4S5)_het'
                elif genotypes['Def-S5'] == 'CC':
                    return(f'Fail. Genotypes suggest that sample {sample_name} is heterozygote for S kdr and S4S5, but homozygote for S5.')
                elif genotypes['Def-S5'] == 'AC':
                    kdr_S_origins = f'{kdr_S_origins},S5_het'
                elif genotypes['Def-S5'] == 'AA':
                    kdr_S_origins = f'{kdr_S_origins},S4_het'
    if clean:
        return(_kdr_gen_cleanup(kdr_S_origins))
    else:
        return(kdr_S_origins)

def _402_kdr_origin_gen(genotypes, clean = True):
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Check for the 995F mutations
    if pd.isnull(genotypes['kdr-402L']):
        kdr_402_origins = '402:unknown'
    elif genotypes['kdr-402L'] == 'GG':
        kdr_402_origins = '402:wt_hom'
    elif genotypes['kdr-402L'] == 'CG':
        kdr_402_origins = '402:het,402LC_het'
    elif genotypes['kdr-402L'] == 'GT':
        kdr_402_origins = '402:het,402LT_het'
    elif genotypes['kdr-402L'] == 'CT':
        kdr_402_origins = '402:hom,402LC_het,402LT_het'
    elif genotypes['kdr-402L'] == 'CC':
        kdr_402_origins = '402:hom,402LC_hom'
    elif genotypes['kdr-402L'] == 'TT':
        kdr_402_origins = '402:hom,402LT_hom'
    else:
        print(f'Unexpected kdr 402 genotype. {sample_name} {genotypes["kdr-402L"]}')
        kdr_402_origins = 'Fail. Unexpected kdr 402 genotype'
        
    if clean:
        return(_kdr_gen_cleanup(kdr_402_origins))
    else:
        return(kdr_402_origins)

# The initial output of the kdr_origin function can be a little messy, since it outputs all of the 
# information that it could or couldn't obtain. This function tidies it up a bit. 
def _kdr_gen_cleanup(kdr_origin_str):
    if re.search('Fail', kdr_origin_str):
        return('?,?')
    if re.search('wt', kdr_origin_str):
        return('wt,wt')
    kdr_type = re.findall('.*(?=:)', kdr_origin_str)[0]
    outcomes = kdr_origin_str.split(',')
    origins = np.unique(outcomes[1:])
    established_origins = [o for o in origins if not re.search('\?', o)]
    # Remove "_het" and "_hom" text
    established_origins = [re.sub('_het', '', o) for o in established_origins]
    # Now double up each _hom entry. this is cludgy, but coudn't find a more elegant way
    for i in range(len(established_origins)):
        o = established_origins[i]
        if re.search('_hom', o):
            o = re.sub('_hom', '', o)
            established_origins[i] = o
            established_origins.append(o)
    if re.search('hom', outcomes[0]):
        if len(established_origins) == 1:
            return(f'{kdr_type},{established_origins[0]}')
        elif len(established_origins) == 2:
            return(','.join(established_origins))
        else:
            return(f'{kdr_type},{kdr_type}')
    if re.search('het', outcomes[0]):
        if len(established_origins) == 1:
            return(f'wt,{established_origins[0]}')
        else:
            return(f'wt,{kdr_type}')
    else:
        return('?,?')


# Single entry point that makes the F and S (and optionally 402) origin calls
# for one sample.
def kdr_origin(genotypes, alternate_S4S5 = False, clean = True, include_402 = None):
    """Build a one-row table of kdr origin calls for a single sample.

    Parameters
    ----------
    genotypes : pandas.Series
        Genotype strings for one sample, keyed by SNP name.
    alternate_S4S5 : bool
        Forwarded to _S_kdr_origin_gen.
    clean : bool
        Forwarded to each calling function.
    include_402 : bool or None
        Whether to add the 402 call. None (the default) means "include it if
        'kdr-402L' is among the genotype keys".

    Returns
    -------
    pandas.DataFrame
        One row indexed by the sample name (the 'sample_name' entry if present,
        otherwise the Series name), with columns 'kdr_F_origin',
        'kdr_S_origin' and, when requested, 'kdr_402_origin'.
    """
    if 'sample_name' in genotypes.index:
        sample_name = genotypes['sample_name']
    else:
        sample_name = genotypes.name
    # Identity test rather than `== None`, which is unreliable for objects
    # that overload equality
    if include_402 is None:
        include_402 = 'kdr-402L' in genotypes.index
    calls = {
        'kdr_F_origin': [_F_kdr_origin_gen(genotypes, clean)],
        'kdr_S_origin': [_S_kdr_origin_gen(genotypes, clean, alternate_S4S5)],
    }
    if include_402:
        calls['kdr_402_origin'] = [_402_kdr_origin_gen(genotypes, clean)]
    return pd.DataFrame(calls, index = [sample_name])

# Combine the cleaned per-mutation calls of one sample (one row of the table
# built from kdr_origin) into a single two-entry call.
def get_single_gen_call(x):
    """Dispatch on whether the row carries a 'kdr_402_origin' entry.

    x is one row (a pandas Series) holding 'kdr_F_origin' and 'kdr_S_origin',
    optionally with 'kdr_402_origin'. Returns the combined call as a string.
    """
    has_402 = 'kdr_402_origin' in x.index
    combine = _get_single_gen_call_with_402 if has_402 else _get_single_gen_call_no_402
    return combine(x)
    
def _get_single_gen_call_no_402(x): 
    if 'sample_name' in x.index:
        sample_name = x['sample_name']
    else:
        sample_name = x.name
    joined_calls = np.array(x['kdr_F_origin'].split(',') + x['kdr_S_origin'].split(','))
    # There should be at least two 'wt' calls
    if np.sum(joined_calls == 'wt') < 2:
        print(f'Too many different mutant haplotype backgrounds in sample {sample_name}')
        return('?,?')
    # Otherwise, drop two wildtype calls
    else:
        which_drop = np.where(joined_calls == 'wt')[0][:2]
        return(','.join(np.delete(joined_calls, which_drop)))

def _get_single_gen_call_with_402(x): 
    if 'sample_name' in x.index:
        sample_name = x['sample_name']
    else:
        sample_name = x.name
    joined_calls = np.array(x['kdr_F_origin'].split(',') + 
                            x['kdr_S_origin'].split(',') +
                            x['kdr_402_origin'].split(',')
    )
    # There should be at least four 'wt' calls
    if np.sum(np.isin(joined_calls,  ['wt', '?'])) < 4:
        print(f'Too many different mutant haplotype backgrounds in sample {sample_name}')
        return('?,?')
    # Otherwise, drop four wildtype calls
    else:
        which_drop = np.concatenate([
            np.where(joined_calls == 'wt')[0],
            np.where(joined_calls == '?')[0]
        ])[:4]
        return(','.join(np.delete(joined_calls, which_drop)))

### Load metadata

In [4]:
# Sample metadata: tab-separated file whose first column becomes the index.
# That index is matched against the sample names when merging further down.
metadata = pd.read_csv(metadata_path, sep = '\t', index_col = 0)

### Get the genotype calls

In [5]:
# Parse the VCF into a dict of arrays. The cells below use its 'samples',
# 'variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT' and
# 'calldata/GT' entries.
vcf_snp_calls_dict = allel.read_vcf(vcf_path)
vcf_snp_calls_dict



{'samples': array(['GH_01', 'GH_02', 'GH_03', 'GH_04', 'GH_05', 'GH_06', 'GH_07',
        'GH_08', 'GH_09', 'GH_10', 'GH_11', 'GH_12', 'GH_13', 'GH_14',
        'GH_15', 'GH_16', 'GH_17', 'GH_18', 'GH_19', 'GH_20', 'GH_21',
        'GH_22', 'GH_23', 'GH_24', 'GH_25', 'GH_26', 'GH_27', 'GH_28',
        'GH_29', 'GH_30', 'GH_31', 'GH_32', 'GH_33', 'GH_34', 'GH_35',
        'GH_36', 'GH_37', 'GH_38', 'GH_39', 'GH_40_low_vol', 'GH_41',
        'GH_42', 'GH_43', 'GH_44', 'GH_45', 'GH_46', 'GH_47', 'GH_48',
        'GH_49', 'GH_50', 'GH_51', 'GH_52', 'GH_53', 'GH_54', 'GH_55',
        'GH_56', 'GH_57', 'GH_58', 'GH_59', 'GH_60', 'GH_61', 'GH_62',
        'GH_63', 'GH_64', 'GH_65_low_vol', 'GH_66', 'GH_67', 'GH_68',
        'GH_69', 'GH_70', 'GH_71', 'GH_72', 'GH_73', 'GH_74', 'GH_75',
        'GH_76', 'GH_77', 'GH_78', 'GH_79', 'GH_80_low_vol', 'GH_81',
        'GH_82', 'GH_83', 'GH_84', 'GH_85', 'GH_86', 'GH_87', 'GH_88',
        'GH_89', 'GH_90', 'GH_91', 'GH_92', 'GH_93', 'GH_94', 'GH_95'

### Filter the SNPs to just the ones useful for kdr origin analysis

In [6]:
# Keep only variants on contig 2L whose position is one of the marker SNPs.
on_2L = vcf_snp_calls_dict['variants/CHROM'] == '2L'
at_marker_position = np.isin(vcf_snp_calls_dict['variants/POS'], hap_def['variant_pos'])
which_snps = on_2L & at_marker_position

# Apply the same mask along the variant axis of every array so they stay aligned.
snp_calls = vcf_snp_calls_dict['calldata/GT'][which_snps]
pos = vcf_snp_calls_dict['variants/POS'][which_snps]
alt = vcf_snp_calls_dict['variants/ALT'][which_snps]
ref = vcf_snp_calls_dict['variants/REF'][which_snps]
snp_calls

array([[[ 0,  0],
        [ 0,  0],
        [-1, -1],
        ...,
        [ 0,  0],
        [-1, -1],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [-1, -1],
        ...,
        [ 0,  0],
        [-1, -1],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [-1, -1],
        ...,
        [ 0,  0],
        [-1, -1],
        [-1, -1]],

       ...,

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ...,
        [ 0,  0],
        [ 0,  0],
        [-1, -1]],

       [[ 0,  0],
        [ 0,  0],
        [ 0,  0],
        ...,
        [ 1,  1],
        [ 0,  0],
        [ 0,  0]],

       [[ 1,  1],
        [ 1,  1],
        [ 0,  1],
        ...,
        [ 0,  0],
        [ 0,  1],
        [-1, -1]]], dtype=int8)

### Convert genotype calls to nucleotides

In [7]:
# Per-variant allele lookup table: column 0 is REF, then the ALT alleles, and a
# final '?' column so that a missing call (-1) indexes the '?' placeholder.
n_variants = len(ref)
snp_alleles = np.concatenate(
    [ref.reshape(n_variants, 1), alt, np.full((n_variants, 1), '?')],
    axis = 1
)

# Translate numeric calls into nucleotides. The variant index is broadcast
# against the calls array so each call is looked up in its own variant's row.
variant_idx = np.arange(n_variants).reshape(n_variants, 1, 1)
snp_genotypes_3d = snp_alleles[variant_idx, snp_calls]
# Join each allele pair alphabetically sorted (so, e.g., 'TA' becomes 'AT')
snp_genotypes = np.apply_along_axis(
    lambda allele_pair: ''.join(np.sort(allele_pair)), 2, snp_genotypes_3d
)

# Samples as rows, marker SNPs as columns. The marker table is re-indexed by
# position so SNP names can be looked up in the same order as the VCF variants.
hap_def.index = hap_def['variant_pos']
gen_df = pd.DataFrame(
    snp_genotypes.T,
    index = vcf_snp_calls_dict['samples'],
    columns = hap_def.loc[pos, 'SNP name']
)
gen_df

SNP name,Def-S1-3,Def-F5-2,Def-S4S5-2,Def-F3,Def-S4S5,Def-S2S4,Def-S5,Def-F3F4-2,kdr-995S,kdr-995F,Def-F2,Def-S2-4,Def-S3,Def-F1
GH_01,TT,AA,GG,GG,TT,CC,AA,AA,TT,TT,GG,TT,TT,AA
GH_02,TT,AA,GG,GG,TT,CC,AA,AA,TT,TT,GG,TT,TT,AA
GH_03,??,??,??,??,TT,??,??,??,??,??,??,TT,TT,AG
GH_04,TT,AA,GG,GG,TT,CC,AA,AA,TT,TT,GG,TT,TT,AA
GH_05,TT,AA,GG,GG,TT,CC,AA,AA,TT,TT,GG,TT,TT,AA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Siaya_Delta_Alive_130,CT,AA,GG,GG,TT,CC,AA,AA,CT,AT,AG,TT,TT,GG
Siaya_Delta_Alive_131,TT,AA,GG,GG,TT,CC,AA,AA,TT,TT,AA,TT,TT,GG
ML_17,TT,AA,GG,??,TT,CC,AA,AA,CC,AA,AA,TT,GG,GG
ML_18,??,??,??,GG,TT,CC,??,AA,??,??,??,TT,TT,AG


### Obtain kdr origin calls

In [8]:

# One single-row table of calls per sample, stacked into one table...
per_sample_calls = [kdr_origin(sample_genotypes) for _, sample_genotypes in gen_df.iterrows()]
kdr_origins = pd.concat(per_sample_calls)
# ...then one combined call per sample, derived from the per-mutation calls.
kdr_origins['kdr_origin'] = kdr_origins.apply(get_single_gen_call, axis = 1)

kdr_origins

Unexpected kdr F genotype. GH_03 ??
Unexpected kdr S genotype. GH_03 ??
Unexpected kdr F genotype. GH_08 ??
Unexpected kdr S genotype. GH_08 ??
Unexpected kdr F genotype. GH_09 ??
Unexpected kdr S genotype. GH_09 ??
Unexpected kdr F genotype. GH_24 ??
Unexpected kdr S genotype. GH_24 ??
Unexpected kdr F genotype. GH_36 ??
Unexpected kdr S genotype. GH_36 ??
Unexpected kdr F genotype. GH_48 ??
Unexpected kdr S genotype. GH_48 ??
Unexpected kdr F genotype. GH_60 ??
Unexpected kdr S genotype. GH_60 ??
Unexpected kdr F genotype. GH_84 ??
Unexpected kdr S genotype. GH_84 ??
Unexpected kdr F genotype. GH_89 ??
Unexpected kdr S genotype. GH_89 ??
Unexpected kdr F genotype. GH_92 ??
Unexpected kdr S genotype. GH_92 ??
Unexpected kdr F genotype. GH_Negative ??
Unexpected kdr S genotype. GH_Negative ??
Unexpected kdr F genotype. GM_08 ??
Unexpected kdr S genotype. GM_08 ??
Unexpected kdr F genotype. GM_21 ??
Unexpected kdr S genotype. GM_21 ??
Unexpected kdr F genotype. GM_27 ??
Unexpected kdr S

Unnamed: 0,kdr_F_origin,kdr_S_origin,kdr_origin
GH_01,"F1,F1","wt,wt","F1,F1"
GH_02,"F1,F1","wt,wt","F1,F1"
GH_03,"?,?","?,?","?,?"
GH_04,"F1,F1","wt,wt","F1,F1"
GH_05,"F1,F1","wt,wt","F1,F1"
...,...,...,...
Siaya_Delta_Alive_130,"wt,F2","wt,S1","F2,S1"
Siaya_Delta_Alive_131,"F2,F2","wt,wt","F2,F2"
ML_17,"wt,wt","S3,S3","S3,S3"
ML_18,"?,?","?,?","?,?"


### Merge kdr origins with metadata and write to file. 

In [9]:
# Join the calls to the metadata on sample name (the index on both sides),
# keeping only samples present in both, and save as a tab-separated file.
kdr_origins_df = kdr_origins.merge(metadata, left_index = True, right_index = True)
kdr_origins_df.to_csv('kdr_origins.csv', sep = '\t')
kdr_origins_df

Unnamed: 0,kdr_F_origin,kdr_S_origin,kdr_origin,taxon,location,country,plate,well_letter,well_number,latitude,longitude
GH_01,"F1,F1","wt,wt","F1,F1",,Ghana,Ghana,3,A,1,,
GH_02,"F1,F1","wt,wt","F1,F1",,Ghana,Ghana,3,A,2,,
GH_03,"?,?","?,?","?,?",,Ghana,Ghana,3,A,3,,
GH_04,"F1,F1","wt,wt","F1,F1",,Ghana,Ghana,3,A,4,,
GH_05,"F1,F1","wt,wt","F1,F1",,Ghana,Ghana,3,A,5,,
...,...,...,...,...,...,...,...,...,...,...,...
Siaya_Delta_Alive_130,"wt,F2","wt,S1","F2,S1",gambiae,Siaya,Kenya,7,H,8,,
Siaya_Delta_Alive_131,"F2,F2","wt,wt","F2,F2",gambiae,Siaya,Kenya,7,H,9,,
ML_17,"wt,wt","S3,S3","S3,S3",,ml,ml,7,H,10,,
ML_18,"?,?","?,?","?,?",,ml,ml,7,H,11,,


### Create a table where each row is a haplotype instead of a genotype, and write it to file. Because these are genotype-based calls, the order of the two haplotypes within each sample is arbitrary: the table can be used for, say, mapping, but not for haplotype clustering.

In [10]:

# Split every 'a,b' call into its two entries; each sample name is repeated
# twice in the index so that both rows pick up that sample's metadata.
haplotype_calls = [
    hap_call
    for sample_call in kdr_origins['kdr_origin']
    for hap_call in sample_call.split(',')
]
kdr_genhap_origins_df = pd.DataFrame(
    {'kdr_origin': haplotype_calls},
    index = np.repeat(kdr_origins.index, 2)
).merge(metadata, left_index = True, right_index = True)
kdr_genhap_origins_df.to_csv('kdr_genhap_origins.csv', sep = '\t')
kdr_genhap_origins_df

Unnamed: 0,kdr_origin,taxon,location,country,plate,well_letter,well_number,latitude,longitude
Calvin_01,wt,,UNKN,UNKN,14,A,1,,
Calvin_01,wt,,UNKN,UNKN,14,A,1,,
Calvin_02,wt,,UNKN,UNKN,14,A,2,,
Calvin_02,wt,,UNKN,UNKN,14,A,2,,
Calvin_03,wt,,UNKN,UNKN,14,A,3,,
...,...,...,...,...,...,...,...,...,...
random_funestus,F,funestus,random,random,7,E,12,,
random_stephensi,?,stephensi,random,random,7,F,12,,
random_stephensi,?,stephensi,random,random,7,F,12,,
random_stephensi2,?,stephensi2,random,random,7,G,12,,


### Summarise results by taxon and location

In [11]:
# Count the occurrences of each haplotype origin in each taxon / cohort group.
# pivot_table needs a `values` column to aggregate. Rows are only counted
# (aggfunc = len), so any column not already used for the index or the columns
# would do; 'plate' was picked.
pop_origin_counts = (
    kdr_genhap_origins_df
    .pivot_table(
        values = 'plate',
        index = ['taxon', cohort_column],
        columns = 'kdr_origin',
        aggfunc = len
    )
    .fillna(0)
    .astype(int)
)

# Round a number (or array / data frame of numbers) to n_figs significant figures
def signif(x, n_figs):
    """Round x to n_figs significant figures.

    Each value is scaled by the power of ten given by its order of magnitude,
    rounded to n_figs - 1 decimals, and scaled back. Magnitudes are clipped at
    1e-200 before taking the logarithm, so zeros come back as zero.
    """
    order_of_magnitude = np.floor(np.log10(np.abs(x).clip(1e-200)))
    scale = 10 ** order_of_magnitude
    return np.round(x / scale, n_figs - 1) * scale

# Counts restricted to known origins. errors = 'ignore' keeps this cell working
# when no sample has an unknown ("?") call, in which case there is no such
# column to drop.
known_origin_counts = pop_origin_counts.drop(columns = '?', errors = 'ignore')

# Row totals over the known origins only
row_totals = known_origin_counts.sum(axis = 1)

# Origin frequencies among known calls, rounded to 2 significant figures.
# Groups with no known calls divide by zero and show up as NaN.
pop_origin_freqs = signif(known_origin_counts.div(row_totals, axis = 0), 2)
print('Counts of origins:')
display(pop_origin_counts)
print('\n\nFrequencies of known origins:')
display(pop_origin_freqs)


Counts of origins:


Unnamed: 0_level_0,kdr_origin,?,F,F1,F2,S,S1,S3,wt
taxon,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
arabiensis,random,0,0,0,0,0,0,0,2
coluzzii,random,0,0,2,0,0,0,0,0
coluzzii,VK7,4,1,185,0,0,0,0,0
funestus,random,0,2,0,0,0,0,0,0
gambiae,Siaya,94,8,0,283,21,103,2,13
stephensi,random,2,0,0,0,0,0,0,0
stephensi2,random,2,0,0,0,0,0,0,0




Frequencies of known origins:


Unnamed: 0_level_0,kdr_origin,F,F1,F2,S,S1,S3,wt
taxon,location,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
arabiensis,random,0.0,0.0,0.0,0.0,0.0,0.0,1.0
coluzzii,random,0.0,1.0,0.0,0.0,0.0,0.0,0.0
coluzzii,VK7,0.0054,0.99,0.0,0.0,0.0,0.0,0.0
funestus,random,1.0,0.0,0.0,0.0,0.0,0.0,0.0
gambiae,Siaya,0.019,0.0,0.66,0.049,0.24,0.0047,0.03
stephensi,random,,,,,,,
stephensi2,random,,,,,,,


### That's all folks