In [1]:
import allel
import malariagen_data
import numpy as np
import pandas as pd
import locusPocus

### Integrating the mvncall phasing into the biallelic haplotypes
A function which -

- returns an allel.HaplotypeArray or Haplotype Xarray for the region
- returns a pd.DataFrame with the same data
- haplotypes ordered by ag3.haplotypes() order
- snps ordered by position

In [2]:
dataset = 'Coeae1f'

cohorts = [
    # Ag1000G phase 3 sample sets in Ag3.0
    "AG1000G-GH", 
    'AG1000G-ML-A',
     'AG1000G-BF-A',
     'AG1000G-BF-B',
     'AG1000G-GN-A',
     'AG1000G-GN-B',
    'AG1000G-TZ',
    # Amenta-Etego sample sets in Ag3.3
    # GAARDIAN sample set in Ag3.4
    '1244-VO-GH-YAWSON-VMF00149',
    # GAARD Ghana sample set in Ag3.2
     "1244-VO-GH-YAWSON-VMF00051",
     '1245-VO-CI-CONSTANT-VMF00054',
     '1253-VO-TG-DJOGBENOU-VMF00052',
     '1237-VO-BJ-DJOGBENOU-VMF00050'
]


contig = '2L'
start = 28_520_000
end = 28_570_000

In [3]:
ag3 = malariagen_data.Ag3(pre=True)

Load VCF

In [4]:
def load_multiallelic_haplotypes(cohorts, path_to_multi_vcf, contig, start, end, non_synon_only=True, remove_invariant=True):
    """
    Load and integrate mvncalls into haplotype data
    """
    
    vcf = allel.read_vcf(path_to_multi_vcf, 
                     fields=['samples', 'variants/ALT', 'variants/CHROM', 'calldata/GT', 'variants/POS', 'variants/QUAL', 'variants/REF'])
    
    print("Phased mvncall data has shape: ", vcf['calldata/GT'].shape)
    
    haps = ag3.haplotypes(region=f"{contig}:{start}-{end}", sample_sets=cohorts, analysis='gamb_colu_arab')
    
    geno_bial = allel.GenotypeArray(haps['call_genotype'])
    pos_bial = allel.SortedIndex(haps['variant_position'].values)    
    geno_multi = allel.GenotypeArray(vcf['calldata/GT'])
    pos_multi = allel.SortedIndex(vcf['variants/POS'])   
    
    assert (vcf['samples'] == haps['sample_id'].values).all(), "VCF and haps sample order do not match"
    
    multi_df = pd.DataFrame(geno_multi.to_haplotypes()).set_index(pos_multi.values)
    bial_df = pd.DataFrame(geno_bial.to_haplotypes()).set_index(pos_bial.values)
    haps_df = pd.concat([multi_df, bial_df], axis=0)
    haps_df = haps_df.sort_index()
    
    pos = allel.SortedIndex(np.sort(np.concatenate([pos_multi.values, pos_bial.values])))
    
    if non_synon_only:       
        print("Returning only non-synonymous SNPs")
        transcripts = ag3.geneset().query("type == 'exon' & contig == @contig & start > @start & end < @end")['Parent'].unique()
        
        snp_freq_dfs = []
        for transcript_id in transcripts:
            snp_allele_freqs_df = ag3.snp_allele_frequencies(
                transcript=transcript_id, 
                cohorts="admin1_year", 
                sample_sets=cohorts, 
                drop_invariant=False,
            )
            snp_freq_dfs.append(snp_allele_freqs_df)
        snp_freq_df = pd.concat(snp_freq_dfs).query("effect == 'NON_SYNONYMOUS_CODING' & max_af > 0.05")
        aa_df = snp_freq_df.reset_index()[['position', 'aa_change']]
        aa_df = aa_df.groupby('position').agg({'aa_change': '_'.join}).reset_index()
        pos_bool, aa_bool = pos.locate_intersection(aa_df['position'])
        pos = pos[pos_bool]
        haps_df = haps_df[pos_bool]
        haps_df = haps_df.set_index(aa_df[aa_bool]['aa_change'], append=True)
        if remove_invariant:
            invariant_cols = haps_df.nunique(axis=1) <= 1
            print(f"Removing {invariant_cols.sum()} invariant SNPs")
            haps_df = haps_df.loc[~invariant_cols, :]
            print(f"There are {haps_df.shape[1]} haplotypes and {haps_df.shape[0]} segregating haplotype calls")
    
    return(allel.HaplotypeArray(haps_df.values), haps_df, pos)

In [5]:
path_to_multi_vcf = "../../results/phasing/coeae1f.phasedMulti.vcf"

### getting appropriate aa changes for multialleles

- We have biallelic haplotype arrays
- We have a multiallelic haplotype array 

- We have to recode the biallelic arrays in order to find the appropriate aa change
- We also need to split multialleles onto different rows, with appropriate aa change 
- Then join them all  

In [6]:
vcf = allel.read_vcf("../../results/phasing/coeae1f.phasedMulti.vcf", fields=['samples', 'variants/ALT', 'variants/CHROM', 'calldata/GT', 'variants/POS', 'variants/QUAL', 'variants/REF'])
haps = ag3.haplotypes(region=f"{contig}:{start}-{end}", sample_sets=cohorts, analysis='gamb_colu_arab')

  chunks = [d[0] for d in it]


In [7]:
geno_bial = allel.GenotypeArray(haps['call_genotype'])
pos_bial = allel.SortedIndex(haps['variant_position'].values)    
geno_multi = allel.GenotypeArray(vcf['calldata/GT'])
pos_multi = allel.SortedIndex(vcf['variants/POS'])   

assert (vcf['samples'] == haps['sample_id'].values).all(), "VCF and haps sample order do not match"

multi_df = pd.DataFrame(geno_multi.to_haplotypes()).set_index(pos_multi.values)
bial_df = pd.DataFrame(geno_bial.to_haplotypes()).set_index(pos_bial.values)

In [10]:
alleles = pd.concat([multi_df, multi_df]).apply(lambda x: pd.unique(x), axis=1).reset_index()

In [63]:
#alleles = multi_df.apply(lambda x: pd.unique(x), axis=1)

def split_multialleles(multi_df):
    #alleles = multi_df.apply(lambda x: pd.unique(x), axis=1)
    df2 = multi_df.copy()
    df2.index = [28_545_700]
    alleles = pd.concat([multi_df, df2]).apply(lambda x: pd.unique(x), axis=1)
    
    df_list = []
    for idx, multi_allele in multi_df.iterrows():
        if len(alleles.loc[idx]) == 2:
            print(f"only two alleles = {idx}")
            return(df_list.append(multi_allele.to_frame()))
        
        multi_allele1 = multi_allele.copy()
        multi_allele2 = multi_allele.copy()

        multi_allele2[multi_allele2 == alleles[idx][1]] = 0
        multi_allele1[multi_allele1 == alleles[idx][2]] = 0 

        df_list.append(pd.concat([multi_allele1, multi_allele2], axis=1).T)

    return(pd.concat(df_list, axis=1))
                

In [64]:
multi_split_df = split_multialleles(multi_df)

In [156]:
hap_array = allel.HaplotypeArray(bial_df.values)
hap_positions = allel.SortedIndex(bial_df.index.values)

In [246]:
transcripts = ag3.geneset().query("type == 'exon' & contig == @contig & start > @start & end < @end")['Parent'].unique()

snp_freq_dfs = []
for transcript_id in transcripts:
    snp_allele_freqs_df = ag3.snp_effects(
        transcript=transcript_id, 
    )
    snp_freq_dfs.append(snp_allele_freqs_df)

df_effects = pd.concat(snp_freq_dfs)

We first remap the alleles

In [226]:
hap_array = allel.HaplotypeArray(bial_df.values)
hap_positions = allel.SortedIndex(bial_df.index.values)

haps_remapped, remap_pos = locusPocus.remap_haplo_alleles(hap_array, 
                                        hap_positions, 
                                        transcript=f"{contig}:{start}-{end}", 
                                        sample_set=cohorts, 
                                        metaquery=None)

bial_df = pd.DataFrame(haps_remapped).set_index(remap_pos.values)

After remapping, we need to join this to our split multiallelic sites

In [272]:
haps_df = pd.concat([bial_df, multi_split_df], axis=0).sort_index()

Load df_effects dataframe

In [273]:
myalt = 28546251
haps_df.loc[myalt]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4852,4853,4854,4855,4856,4857,4858,4859,4860,4861
28546251,0,0,0,0,2,2,0,0,0,0,...,2,2,0,2,0,0,0,0,0,0
28546251,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [274]:
seg_bool = haps_df.apply(lambda x: len(np.unique(x)) > 1, axis=1)
haps_df = haps_df[seg_bool]

pos = allel.SortedIndex(haps_df.index.values)
hap_pos_bool, df_effects_bool = pos.locate_intersection(df_effects['position'])
df_effects = df_effects[df_effects_bool]
hap_remap_positions = pos[hap_pos_bool]
haps_df = haps_df[hap_pos_bool]

In [276]:
aa_change_list = []
non_synon_bool= []
for pos, row in haps_df.iterrows():
    if pos == 28546251:
        print("yay")
    df_eff = df_effects.query("position == @pos")
    alleles = np.unique(row)
    alt = alleles[1]-1
    
    eff = df_eff['effect'].to_numpy()[alt]
    if eff != 'NON_SYNONYMOUS_CODING':
        aa_change = ""
        non_synon_bool.append(False)
    elif eff == 'NON_SYNONYMOUS_CODING':
        aa_change = df_eff['aa_change'].to_numpy()[alt]
        non_synon_bool.append(True)

    aa_change_list.append(aa_change)

yay
yay


There are still intronic SNPs with no aa change

In [277]:
haps_df.loc[:, 'aa_change'] = aa_change_list

haps_df = haps_df.set_index("aa_change", append=True)

haps_df = haps_df[non_synon_bool]

In [300]:
haps_df.loc[28545767]

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,4852,4853,4854,4855,4856,4857,4858,4859,4860,4861
aa_change,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
E477V,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [286]:
haps_df

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3,4,5,6,7,8,9,...,4852,4853,4854,4855,4856,4857,4858,4859,4860,4861
Unnamed: 0_level_1,aa_change,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
28520016,E122A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28520022,L124S,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28520028,H126R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28520039,S130R,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28520069,V140I,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28569873,S922F,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28569877,P921S,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28569879,R920H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
28569883,N919H,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [307]:
multi_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4852,4853,4854,4855,4856,4857,4858,4859,4860,4861
28546251,0,0,0,0,2,2,0,0,0,0,...,2,2,0,2,0,0,0,0,0,0


In [310]:
vcf = allel.read_vcf(path_to_multi_vcf, fields=['samples', 'variants/ALT', 'variants/CHROM', 'calldata/GT', 'variants/POS', 'variants/QUAL', 'variants/REF'])
haps = ag3.haplotypes(region=f"{contig}:{start}-{end}", sample_sets=cohorts, analysis="gamb_colu_arab")

print("Genotypes loaded")
haps_bial = allel.GenotypeArray(haps['call_genotype']).to_haplotypes()
pos_bial = allel.SortedIndex(haps['variant_position'].values)    
haps_multi = allel.GenotypeArray(vcf['calldata/GT']).to_haplotypes()
pos_multi = allel.SortedIndex(vcf['variants/POS'])   

assert (vcf['samples'] == haps['sample_id'].values).all(), "VCF and haps sample order do not match"

multi_df = pd.DataFrame(haps_multi).set_index(pos_multi.values)

Genotypes loaded


In [315]:
split_multialleles(multi_df).loc[28545677]

0    [0, 2, 1]
Name: 28545677, dtype: object

In [None]:
multi

In [383]:
alleles = multi_df.apply(lambda x: pd.unique(x), axis=1).to_frame()

df_list = []
for idx, multi_allele in multi_df.iterrows():
    if len(alleles.loc[idx][0]) == 2:
        print(f"only two alleles = {idx}")
        df_list.append(multi_allele.to_frame().T)
    else:
        multi_allele1 = multi_allele.copy()
        multi_allele2 = multi_allele.copy()

        multi_allele2[multi_allele2 == alleles.loc[idx][0][1]] = 0
        multi_allele1[multi_allele1 == alleles.loc[idx][0][2]] = 0 
        df_list.append(pd.concat([multi_allele1, multi_allele2], axis=1).T)
        

only two alleles = 28545862
only two alleles = 28545945
only two alleles = 28546024
only two alleles = 28546029
only two alleles = 28546185
only two alleles = 28546335
only two alleles = 28546340
only two alleles = 28546376
only two alleles = 28546460
only two alleles = 28546587
only two alleles = 28546588
only two alleles = 28546589
only two alleles = 28546634
only two alleles = 28546656
only two alleles = 28546660
only two alleles = 28546810
only two alleles = 28547166
only two alleles = 28547522
only two alleles = 28547729
only two alleles = 28547744
only two alleles = 28547759
only two alleles = 28547780


In [394]:
def split_multialleles(multi_df):
    alleles = multi_df.apply(lambda x: pd.unique(x), axis=1).to_frame()
    
    df_list = []
    for idx, multi_allele in multi_df.iterrows():
        if len(alleles.loc[idx][0]) == 2:
            print(f"only two alleles = {idx}")
            df_list.append(multi_allele.to_frame().T)
        else:
            multi_allele1 = multi_allele.copy()
            multi_allele2 = multi_allele.copy()

            multi_allele2[multi_allele2 == alleles.loc[idx][0][1]] = 0
            multi_allele1[multi_allele1 == alleles.loc[idx][0][2]] = 0 

            df_list.append(pd.concat([multi_allele1, multi_allele2], axis=1).T)

    return(pd.concat(df_list, axis=0))


def load_multiallelic_haplotypes(path_to_multi_vcf, sample_sets ,contig, start, end, analysis='gamb_colu_arab', metaquery=None):
    vcf = allel.read_vcf(path_to_multi_vcf, fields=['samples', 'variants/ALT', 'variants/CHROM', 'calldata/GT', 'variants/POS', 'variants/QUAL', 'variants/REF'])
    haps = ag3.haplotypes(region=f"{contig}:{start}-{end}", sample_sets=sample_sets, analysis=analysis)
    
    print("Genotypes loaded")
    haps_bial = allel.GenotypeArray(haps['call_genotype']).to_haplotypes()
    pos_bial = allel.SortedIndex(haps['variant_position'].values)    
    haps_multi = allel.GenotypeArray(vcf['calldata/GT']).to_haplotypes()
    pos_multi = allel.SortedIndex(vcf['variants/POS'])   

    assert (vcf['samples'] == haps['sample_id'].values).all(), "VCF and haps sample order do not match"

    multi_df = pd.DataFrame(haps_multi).set_index(pos_multi.values)
    multi_split_df = split_multialleles(multi_df)
    
    
    ### load eff dataframe
    print("loading df_effects for region")
    transcripts = ag3.geneset().query("type == 'exon' & contig == @contig & start > @start & end < @end")['Parent'].unique()
    snp_freq_dfs = []
    for transcript_id in transcripts:
        snp_allele_freqs_df = ag3.snp_effects(
            transcript=transcript_id, 
        )
        snp_freq_dfs.append(snp_allele_freqs_df)

    df_effects = pd.concat(snp_freq_dfs)
    
    ## remap haps
    print("remapping biallelic haplotypes")
    haps_remapped, remap_pos = locusPocus.remap_haplo_alleles(haps_bial, 
                                            pos_bial, 
                                            transcript=f"{contig}:{start}-{end}", 
                                            sample_set=cohorts, 
                                            metaquery=metaquery)
    bial_df = pd.DataFrame(haps_remapped).set_index(remap_pos.values)
    haps_df = pd.concat([bial_df, multi_split_df], axis=0).sort_index()
    
    print("filtering to seg sites")
    seg_bool = haps_df.apply(lambda x: len(np.unique(x)) > 1, axis=1)
    haps_df = haps_df[seg_bool]
    
    print("intersecting with df_effects")
    ### intersecting df_effects
    pos = allel.SortedIndex(haps_df.index.values)
    hap_pos_bool, df_effects_bool = pos.locate_intersection(df_effects['position'])
    df_effects = df_effects[df_effects_bool]
    hap_remap_positions = pos[hap_pos_bool]
    haps_df = haps_df[hap_pos_bool]
    
    print("extracting aa change info for each SNP")
    ### get relevant aa change for each snp 
    aa_change_list = []
    non_synon_bool= []
    for pos, row in haps_df.iterrows():
        df_eff = df_effects.query("position == @pos")
        alleles = np.unique(row)
        alt = alleles[1]-1

        eff = df_eff['effect'].to_numpy()[alt]
        if eff != 'NON_SYNONYMOUS_CODING':
            aa_change = ""
            non_synon_bool.append(False)
        elif eff == 'NON_SYNONYMOUS_CODING':
            aa_change = df_eff['aa_change'].to_numpy()[alt]
            non_synon_bool.append(True)
        aa_change_list.append(aa_change)
        
    haps_df.loc[:, 'aa_change'] = aa_change_list
    haps_df = haps_df.set_index("aa_change", append=True)
    return(haps_df, non_synon_bool)

In [395]:
haps_df, non_synon_bool = load_multiallelic_haplotypes(path_to_multi_vcf="../../results/phasing/coeae1f.phasedMulti.vcf",
                                                      sample_sets=cohorts, contig=contig, start=start, end=end)

Genotypes loaded
only two alleles = 28545862
only two alleles = 28545945
only two alleles = 28546024
only two alleles = 28546029
only two alleles = 28546185
only two alleles = 28546335
only two alleles = 28546340
only two alleles = 28546376
only two alleles = 28546460
only two alleles = 28546587
only two alleles = 28546588
only two alleles = 28546589
only two alleles = 28546634
only two alleles = 28546656
only two alleles = 28546660
only two alleles = 28546810
only two alleles = 28547166
only two alleles = 28547522
only two alleles = 28547729
only two alleles = 28547744
only two alleles = 28547759
only two alleles = 28547780
loading df_effects for region
remapping biallelic haplotypes
filtering to seg sites
intersecting with df_effects
extracting aa change info for each SNP
