In [1]:
%run ../scripts/tools.py

In [2]:
### AIMS ###
# --- analysis parameters (everything a reader might tune) ---
dataset = "Ag_Busia"               # used to build the per-chromosome VCF file names
chroms = ['2L', '2R']              # chromosomes to scan for AIMs
ploidy = 10                        # forwarded to get_numbers_dict() and readAndFilterVcf()
qualflt = 30                       # QUAL threshold handed to readAndFilterVcf()
missingprop = 0.5 # snakemake.params['missingprop']

# --- sample sheet, ordered by species with a fresh 0..n-1 index ---
metadata = (
    pd.read_csv("../../config/samples.tsv", sep="\t")
    .sort_values(by='species')
    .reset_index(drop=True)
)

# helper lookup derived from the ploidy (defined in ../scripts/tools.py)
numbers = get_numbers_dict(ploidy)


In [14]:

# Estimate, for every treatment group, the fraction of alleles at
# gambiae-vs-coluzzii Ancestry Informative Markers (AIMs) that match each species.
# Results are produced per AIM, per chromosome, and averaged across chromosomes.

# read AIMs
aims = zarr.open("../../resources/gamb_vs_colu.zarr/", mode='r')

## initialize dicts
ancestryPerAim = {}              # chrom -> DataFrame of per-AIM scores
aims_chrom_gamb = {}             # chrom -> {treatment: mean gambiae fraction}
aims_chrom_colu = {}             # chrom -> {treatment: mean coluzzii fraction}
all_gamb = defaultdict(list)     # treatment -> one mean gambiae fraction per chromosome
all_colu = defaultdict(list)     # treatment -> one mean coluzzii fraction per chromosome
n_aims_per_chrom = {}            # chrom -> {treatment: number of AIMs with a score}

for chrom in chroms:

    # read in and filter data
    path = f"../../results/variantAnalysis/vcfs/{dataset}.{chrom}.vcf.gz"
    vcf, geno, acsubpops, pos, depth, snpeff, subpops, pops =  readAndFilterVcf(path=path,
                                                               chrom=chrom,
                                                               samples=metadata,
                                                               numbers=numbers,
                                                               ploidy=ploidy,
                                                               qualflt=qualflt,
                                                               missingfltprop=missingprop)
    aimspos = aims[chrom]['POS'][:]

    # get intersection of aims and our SNPs
    aims_pos_mask, aims_mask_2 = pos.locate_intersection(aimspos)
    our_aims = pos[aims_pos_mask]
    print(f"\n In the data, across all samples there are {our_aims.shape[0]} Ancestry Informative markers on Chromosome {chrom}")

    # get gamb and colu alleles, and subset to aims that we have in the rna-seq data
    aimscolu = aims[chrom]['colu_allele'][:][aims_mask_2]
    aimsgamb = aims[chrom]['gamb_allele'][:][aims_mask_2]

    # get mask that was used in readAndFilterVcf()
    # (recovered by locating the filtered positions inside the unfiltered VCF positions)
    mask = pos.locate_intersection(vcf['variants/POS'])[1]
    ref  = vcf['variants/REF'][mask][aims_pos_mask]
    alt = vcf['variants/ALT'][mask][aims_pos_mask]

    # filter geno array to set of aims
    geno_aims = geno.compress(aims_pos_mask, axis=0)

    totalgambscore = {}   # aim position -> {treatment: gambiae fraction}
    totalcoluscore = {}   # aim position -> {treatment: coluzzii fraction}

    for aim in our_aims:

        gambscore = {}
        coluscore = {}

        # filter arrays down to the row(s) for this AIM position
        mask = our_aims == aim
        ref_ = ref[mask]
        alt_ = alt[mask]
        aimscolu_ = aimscolu[mask]
        aimsgamb_ = aimsgamb[mask]

        gn_aim = geno_aims.compress(mask, axis=0)

        # convert genotypes to nucleotides
        # (assumes the ALT array carries three alternate alleles per site — TODO confirm)
        gn2nucleotide = {0:ref_[0],
                        1:alt_[0][0],
                         2:alt_[0][1],
                         3:alt_[0][2],
                        -1:float("nan")}
        gn = replace_with_dict2_generic(gn_aim, gn2nucleotide)

        # for each sample, get proportion of gambiae/coluzzii alleles
        # alleles that are different to both will be missed here
        for sample in metadata.treatment.unique():
            alleles = gn.take(subpops[sample], axis=1).flatten()

            # at each AIM, do we have gamb or colu alleles
            # NOTE(review): assumes missing calls surface as the text 'nan' after the
            # dictionary replacement above — confirm against replace_with_dict2_generic
            gamb = alleles[alleles != 'nan'] == aimsgamb_
            colu = alleles[alleles != 'nan'] == aimscolu_

            # get proportion of gamb v colu alleles at each locus
            # (NaN when every call for this treatment is missing)
            gambscore[sample] = np.mean(gamb)
            coluscore[sample] = np.mean(colu)

        totalgambscore[aim] = dict(gambscore)
        totalcoluscore[aim] = dict(coluscore)

    # Nothing to summarise if no AIM overlapped our SNPs on this chromosome; skipping
    # avoids reusing the previous chromosome's values (or hitting undefined names).
    if not totalgambscore:
        print(f"No Ancestry Informative markers to summarise on Chromosome {chrom}, skipping")
        continue

    # Summarise ONCE per chromosome, after every AIM has been scored. Doing this
    # inside the AIM loop appended a running mean to all_gamb/all_colu for every
    # AIM, which skewed the genome-wide average and repeated work quadratically.
    gambscores = flip_dict(totalgambscore)   # treatment -> {aim position: fraction}
    coluscores = flip_dict(totalcoluscore)

    prop_gambiae = {}
    prop_colu = {}
    n_aims_per_sample = {}

    for sample in metadata.treatment.unique():

        gamb_arr = np.array(list(gambscores[sample].values()))
        colu_arr = np.array(list(coluscores[sample].values()))

        # mean over AIMs, ignoring AIMs where this treatment had no called alleles
        prop_gambiae[sample] = np.nanmean(gamb_arr)
        prop_colu[sample] = np.nanmean(colu_arr)
        all_gamb[sample].append(prop_gambiae[sample])
        all_colu[sample].append(prop_colu[sample])

        # number of AIMs that actually contributed a (non-NaN) score
        n_aims_per_sample[sample] = gamb_arr.shape[0] - np.sum(np.isnan(gamb_arr))

    # store AIM fractions for each chromosome in outer dict
    aims_chrom_gamb[chrom] = dict(prop_gambiae)
    aims_chrom_colu[chrom] = dict(prop_colu)
    n_aims_per_chrom[chrom] = dict(n_aims_per_sample)

    # Store ancestry score per aim
    ancestryPerAim[chrom] = pd.concat([pd.DataFrame(gambscores).add_suffix("_gamb"), pd.DataFrame(coluscores).add_suffix("_colu")], axis=1)
    ancestryPerAim[chrom]['contig'] = chrom

    # plot and store for each chromosome
    coludf = pd.DataFrame.from_dict(prop_colu, orient='index', columns=['AIM_fraction_coluzzii'])
    gambdf = pd.DataFrame.from_dict(prop_gambiae, orient='index', columns=['AIM_fraction_gambiae'])
    perchromdf = gambdf.merge(coludf, left_index=True, right_index=True)
    aimsperchromdf = pd.DataFrame.from_dict(n_aims_per_sample, orient='index', columns=['n_AIMs'])

    #perchromdf.to_csv(f"results/variantAnalysis/ancestry/AIM_fraction_{chrom}.tsv", sep="\t", index=True)
    #plot_aims(perchromdf, aimsperchromdf, species1="coluzzii", species2="gambiae", figtitle=f"AIM_fraction_{chrom}", total=False)

# re-key the per-chromosome summaries with flip_dict (chrom -> treatment becomes treatment -> chrom)
aims_chrom_gamb = flip_dict(aims_chrom_gamb)
aims_chrom_colu = flip_dict(aims_chrom_colu)
n_aims_per_chrom = flip_dict(n_aims_per_chrom)

# get ancestry per aim for later plotting on chromosome
# (dict keys become the outer index level, giving a (chrom, position) index)
ancestryPerAim = pd.concat(ancestryPerAim, axis=0)

# get genome wide average AIM fractions
# (unweighted mean of the per-chromosome means; not weighted by AIM count)
for k in all_gamb:
    all_gamb[k] = np.nanmean(all_gamb[k])
    all_colu[k] = np.nanmean(all_colu[k])

df1 = pd.DataFrame.from_dict(all_gamb, orient='index',columns=['AIM_fraction_gambiae'])
df2 = pd.DataFrame.from_dict(all_colu, orient='index', columns=['AIM_fraction_coluzzii'])
n_aimsdf = pd.DataFrame.from_dict(n_aims_per_chrom)
#n_aimsdf.to_csv(f"results/variantAnalysis/ancestry/n_AIMS_per_chrom.tsv", sep="\t", index=True)

df = df1.merge(df2, left_index=True, right_index=True)
#df.to_csv(f"results/variantAnalysis/ancestry/AIMs_summary.tsv", sep="\t", index=True)

#plot_aims(df, n_aimsdf, species1="coluzzii", species2="gambiae", figtitle="AIM_fraction_whole_genome", total=True)



-------------- Reading VCF for chromosome 2L --------------
------- Filtering VCF at QUAL=30 and missingness proportion of 0.5 -------
After QUAL filter, 322603 SNPs retained out of 396426 for chromosome 2L
After missingness filter, 247206 SNPs retained out of 322603 for chromosome 2L

 In the data, across all samples there are 53 Ancestry Informative markers on Chromosome 2L

-------------- Reading VCF for chromosome 2R --------------
------- Filtering VCF at QUAL=30 and missingness proportion of 0.5 -------
After QUAL filter, 437795 SNPs retained out of 533960 for chromosome 2R
After missingness filter, 339017 SNPs retained out of 437795 for chromosome 2R

 In the data, across all samples there are 18 Ancestry Informative markers on Chromosome 2R


In [15]:
# Display the per-AIM ancestry table: rows are indexed by (chromosome, AIM position),
# columns are `<treatment>_gamb` / `<treatment>_colu` allele fractions plus `contig`.
ancestryPerAim

Unnamed: 0,Unnamed: 1,BusiaSelected_gamb,BusiaParental_gamb,Kisumu_gamb,BusiaSelected_colu,BusiaParental_colu,Kisumu_colu,contig
2L,181564,1.0,1.0,0.000,0.0,0.0,1.000,2L
2L,210286,1.0,1.0,0.000,0.0,0.0,1.000,2L
2L,441325,1.0,1.0,0.000,0.0,0.0,1.000,2L
2L,494285,1.0,1.0,0.150,0.0,0.0,0.850,2L
2L,770749,1.0,1.0,0.000,0.0,0.0,1.000,2L
...,...,...,...,...,...,...,...,...
2R,58793509,1.0,1.0,0.000,0.0,0.0,1.000,2R
2R,58899136,1.0,1.0,0.000,0.0,0.0,1.000,2R
2R,59071740,1.0,1.0,0.500,0.0,0.0,0.475,2R
2R,60987243,1.0,1.0,0.825,0.0,0.0,0.175,2R


In [12]:
# Rebuild the per-AIM table from `gambscores` / `coluscores` as left behind by the
# AIM loop, i.e. only for whichever chromosome that loop processed last (`chrom`).
# NOTE(review): this rebinds `ancestryPerAim`, replacing the all-chromosome table
# produced by the pd.concat(..., axis=0) step in the main AIM cell, and it depends on
# variables leaked from that loop — confirm this is intended or use a different name.
ancestryPerAim = pd.concat([pd.DataFrame(gambscores).add_suffix("_gamb"), pd.DataFrame(coluscores).add_suffix("_colu")], axis=1)
ancestryPerAim['contig'] = chrom



Unnamed: 0,BusiaSelected_gamb,BusiaParental_gamb,Kisumu_gamb,BusiaSelected_colu,BusiaParental_colu,Kisumu_colu
181564,1.0,1.0,0.0,0.0,0.0,1.0
210286,1.0,1.0,0.0,0.0,0.0,1.0
441325,1.0,1.0,0.0,0.0,0.0,1.0
494285,1.0,1.0,0.15,0.0,0.0,0.85
770749,1.0,1.0,0.0,0.0,0.0,1.0
771465,1.0,1.0,0.0,0.0,0.0,1.0
772056,1.0,1.0,0.0,0.0,0.0,1.0
927247,1.0,1.0,,0.0,0.0,
933927,1.0,1.0,0.0,0.0,0.0,1.0
955705,1.0,1.0,0.0,0.0,0.0,1.0


In [8]:
# Per-AIM fraction of alleles matching the coluzzii AIM allele, one column per
# treatment group. `coluscores` holds the values left over from the AIM loop, so
# this reflects the last chromosome that loop processed.
pd.DataFrame(coluscores)

Unnamed: 0,BusiaSelected,BusiaParental,Kisumu
181564,0.0,0.0,1.0
210286,0.0,0.0,1.0
441325,0.0,0.0,1.0
494285,0.0,0.0,0.85
770749,0.0,0.0,1.0
771465,0.0,0.0,1.0
772056,0.0,0.0,1.0
927247,0.0,0.0,
933927,0.0,0.0,1.0
955705,0.0,0.0,1.0


In [7]:
# Per-AIM fraction of alleles matching the gambiae AIM allele, one column per
# treatment group. `gambscores` holds the values left over from the AIM loop, so
# this reflects the last chromosome that loop processed.
pd.DataFrame(gambscores)

Unnamed: 0,BusiaSelected,BusiaParental,Kisumu
181564,1.0,1.0,0.0
210286,1.0,1.0,0.0
441325,1.0,1.0,0.0
494285,1.0,1.0,0.15
770749,1.0,1.0,0.0
771465,1.0,1.0,0.0
772056,1.0,1.0,0.0
927247,1.0,1.0,
933927,1.0,1.0,0.0
955705,1.0,1.0,0.0
