In [None]:
%matplotlib inline

import allel
import numpy as np
import pandas as pd
import zarr
from pathlib import Path
import scipy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

%run tools.py

### GAARDIAN - Ghana - G123

In [None]:
# Read the VCF with scikit-allel; the result is indexed by field name below (e.g. 'calldata/GT').
vcf = allel.read_vcf("../resources/vcfs/ag3_gaardian_X.biallelic.vcf.gz")

In [None]:
# Wrap the GT calls in a GenotypeArray and convert them with to_haplotypes().
# NOTE(review): despite its name, `geno` holds the haplotype view, not the genotype array.
geno = allel.GenotypeArray(vcf['calldata/GT']).to_haplotypes()

In [None]:
# Display the array returned by to_haplotypes() as the cell output.
geno

In [None]:
# Sample set to analyse and the chromosome arms to load for it.
sample_set = "1244-VO-GH-YAWSON-VMF00149"
chroms = ['2L', '2R', '3L', '3R', 'X']

# Per-arm containers keyed by chromosome name.
snps, pos = {}, {}

# `metadata` is rebound on every pass, so the frame kept afterwards is the one
# returned for the last chromosome in `chroms`.
for chrom in chroms:
    snps[chrom], pos[chrom], metadata = load_arrays_and_metadata("../resources", sample_set, chrom)

# Shorter location label: the text before the first "." in `location`.
metadata['location2'] = metadata['location'].str.split(".").str[0]

In [None]:
from itertools import combinations

In [None]:
def getPopulations(metadata, columns=['species_gambiae_coluzzii', 'location2'], minPopSize=15):
    """
    Find every combination of values in `columns` that has enough samples.

    Parameters
    ----------
    metadata : pandas.DataFrame
        One row per sample; must contain every column named in `columns`.
    columns : list of str
        Columns whose value combinations define a cohort.
    minPopSize : int
        A cohort is kept only when it has strictly more than this many rows.

    Returns
    -------
    pandas.DataFrame
        One row per retained cohort, with a fresh 0..n-1 index and these columns:
        the grouping `columns`, 'cohortText' (values joined with ' | '),
        'cohortNoSpaceText' ('cohortText' with '|' replaced by '.' and spaces
        removed) and 'indices' (list of `metadata` index labels in the cohort).
        The frame is empty, but has the same columns, when no cohort qualifies.
    """
    # work on a private copy so the (mutable) default list is never shared
    columns = list(columns)

    # count rows per combination and keep the combinations that are large enough
    sizes = metadata.groupby(columns).size().reset_index(name='size')
    # .copy() so the column assignments below act on an independent frame
    cohorts = sizes.loc[sizes['size'] > minPopSize, columns].copy()

    idxs = []
    texts = []
    for values in cohorts.itertuples(index=False, name=None):
        # Boolean mask instead of a string-built query: values that contain
        # quotes or other query syntax cannot break the selection.
        mask = np.ones(len(metadata), dtype=bool)
        for col, value in zip(columns, values):
            mask &= (metadata[col] == value).to_numpy()
        idxs.append(metadata.index[mask].tolist())
        texts.append(' | '.join(str(value) for value in values))

    cohorts['cohortText'] = texts
    # plain str.replace: "|" is always a literal here, never a regex alternation
    cohorts['cohortNoSpaceText'] = [text.replace("|", ".").replace(" ", "") for text in texts]
    cohorts['indices'] = idxs
    return cohorts.reset_index(drop=True)

In [None]:
# Cohorts are species x location combinations with more than 5 samples.
cohorts = getPopulations(
    metadata,
    columns=['species_gambiae_coluzzii', 'location2'],
    minPopSize=5,
)
cohorts

In [None]:
# Scratch check: is the chosen statistic one of the two G statistics?
stat = 'G12'

if stat == 'G12' or stat == 'G123':
    print("True")

In [None]:
# Print the sample indices stored for the first cohort only.
for idx, coh in cohorts.head(1).iterrows():
    print(coh['indices'])

In [None]:
# Per-sample cohort label "<species>|<location2>".
# Uses `metadata`: no notebook-level `df` is defined (it is only a local inside
# getPopulations), so the previous expression raised NameError on a fresh kernel.
metadata['species_gambiae_coluzzii'] + "|" + metadata['location2']

In [None]:
# Toy inputs: a handful of sample indices and the integers 0..299.
inds = np.array([0, 1, 200, 201, 205, 210])
a2 = np.arange(300)

In [None]:
# Map each index i to the pair 2i and 2i+1, then return all of them in ascending order.
hapInds = np.sort(np.concatenate([2 * inds, 2 * inds + 1]))
hapInds

In [None]:
?np.sort

How many samples do we have from each species?

In [None]:
# Number of samples per species label.
metadata['species_gambiae_coluzzii'].value_counts()

How many samples do we have from each village?

In [None]:
# Samples per location, split by species. The original cell held only the
# truncated expression `pd.cro`, which raises AttributeError when run.
# NOTE(review): completed as a location x species crosstab — confirm this was the intent.
pd.crosstab(metadata['location2'], metadata['species_gambiae_coluzzii'])

In [None]:
# Number of samples per location label.
metadata['location2'].value_counts()

## G12

In [None]:
def cluster_G(gnalt, cut_height=0.1, metric='euclidean', g=2):
    """
    Hierarchically cluster the samples in one window and compute a G statistic.

    Samples (columns of `gnalt`) are clustered by single linkage on their
    pairwise distances and the tree is cut at `cut_height`. With cluster
    frequencies sorted in descending order p1 >= p2 >= ..., the result is
    (p1 + ... + pg)**2 + sum(p_i**2 for i > g), i.e. G12 for g=2 and G123 for g=3.

    Parameters
    ----------
    gnalt : array_like, shape (n_variants, n_samples)
        Window of per-sample values to cluster.
    cut_height : float
        Height at which the linkage tree is cut.
    metric : str
        Distance metric passed to scipy's pdist. For 'hamming' and 'jaccard'
        the distances are multiplied by the number of variants, so `cut_height`
        is then expressed as a number of SNPs.
    g : int
        Number of most frequent clusters pooled before squaring.

    Returns
    -------
    float
        The G statistic for this window; 1.0 when the window has one sample.
    """
    # Imported here so the function does not rely on a bare `import scipy`
    # having loaded these submodules (not guaranteed on older SciPy releases).
    from scipy.cluster import hierarchy
    from scipy.spatial import distance

    gnalt = np.asarray(gnalt)
    n_samples = gnalt.shape[1]
    if n_samples == 1:
        # one sample forms one cluster at frequency 1; linkage needs >= 2 observations
        return 1.0

    # cluster the genotypes in the window
    dist = distance.pdist(gnalt.T, metric=metric)
    if metric in {'hamming', 'jaccard'}:
        # convert distance to number of SNPs, easier to interpret
        dist *= gnalt.shape[0]

    Z = hierarchy.linkage(dist, method='single')
    cut = hierarchy.cut_tree(Z, height=cut_height)[:, 0]

    # cluster frequencies, largest first
    freqs = np.sort(np.bincount(cut) / n_samples)[::-1]

    # pool the g most frequent clusters, then add the squared remaining frequencies
    return np.sum(freqs[:g])**2 + np.sum(freqs[g:]**2)

def garuds_G(gnalt, pos, cut_height=None, metric='euclidean', window_size=1000, step_size=500, cluster=False, g = 2):
    """
    Compute a windowed G statistic and the median position of each window.

    Parameters
    ----------
    gnalt : array_like, shape (n_variants, n_samples)
        Per-sample values for each variant; every column is treated as one
        multi-locus genotype (MLG).
    pos : array_like, shape (n_variants,)
        Variant positions, in the same order as the rows of `gnalt`.
    cut_height : float, optional
        Tree cut height passed to cluster_G. Required when `cluster` is True.
    metric : str
        Distance metric passed to cluster_G (used only when `cluster` is True).
    window_size, step_size : int
        Window size and step, in number of variants.
    cluster : bool
        True: cluster similar MLGs with cluster_G. False: only identical MLGs
        are grouped, via allel.moving_garud_h.
    g : int
        Number of most frequent groups pooled (2 -> G12, 3 -> G123). Without
        clustering only 1, 2 and 3 are available.

    Returns
    -------
    (stat, midpoint) : tuple of arrays
        The statistic per window and the median position per window.

    Raises
    ------
    ValueError
        If `cluster` is True and `cut_height` is None, or if `cluster` is
        False and `g` is not 1, 2 or 3.
    """
    # Do we want to cluster the Multi-locus genotypes (MLGs), or just group MLGs if they are identical
    if cluster:
        if cut_height is None:
            # scipy's cut_tree with no height returns the full cut tree, whose
            # first column puts every sample in its own cluster
            raise ValueError("cut_height must be given when cluster=True")
        stat = allel.moving_statistic(gnalt, cluster_G, size=window_size, step=step_size,
                                      metric=metric, cut_height=cut_height, g=g)
    else:
        if g not in (1, 2, 3):
            raise ValueError("g must be 1, 2 or 3 when cluster=False")
        # moving_garud_h returns (h1, h12, h123, h2_h1); pick the one matching g
        # (previously the first element, H1, was returned regardless of g)
        stat = allel.moving_garud_h(gnalt, size=window_size, step=step_size)[g - 1]

    midpoint = allel.moving_statistic(pos, np.median, size=window_size, step=step_size)

    return stat, midpoint


### Run G123 on each species (all locations pooled)

In [None]:
for chrom in chroms:

    # the species column was edited to hold 'arabiensis' instead of NA
    for sp in metadata['species_gambiae_coluzzii'].unique():

        # samples of this species; their index values select columns of the call array
        nmeta3 = metadata.loc[metadata['species_gambiae_coluzzii'] == sp]
        flt = nmeta3.index.to_numpy()
        gt_cohort = snps[chrom].take(flt, axis=1)

        log(f"--------- Running G123 on {sp} Chromosome {chrom} ----------")
        log("filter to biallelic segregating sites")
        ac_cohort = gt_cohort.count_alleles(max_allele=3).compute()
        # to_n_alt is used below, so keep only sites that are biallelic
        # and where the reference allele is one of the two alleles
        ref_ac = ac_cohort[:, 0]
        loc_sites = ac_cohort.is_biallelic() & (ref_ac > 0)
        gt_seg = da.compress(loc_sites, gt_cohort, axis=0)
        pos_seg = da.compress(loc_sites, pos[chrom], axis=0)

        log("compute input data for G123")
        pos_seg = pos_seg.compute()
        gn_seg = allel.GenotypeDaskArray(gt_seg).to_n_alt().compute()

        # windowed G123 (g=3) with clustering of similar multi-locus genotypes
        g123, midpoint = garuds_G(
            gnalt=gn_seg,
            pos=pos_seg,
            cut_height=6,
            metric='euclidean',
            window_size=1200,
            step_size=600,
            cluster=True,
            g=3,
        )

        # hand the windows to the shared save/plot helper
        saveAndPlot(
            statName="G123",
            values=g123,
            midpoints=midpoint,
            prefix="../results/selection/G123",
            species=sp,
            chrom=chrom,
            ylim=0.02,
        )

### Run G123 on all locations

In [None]:
# cohorts with fewer samples than this are skipped
min_cohort_size = 20

for chrom in chroms:

    # loop through locations, then species within each location
    for loc in metadata.location2.unique():

        nmeta = metadata[metadata.location2 == loc]

        # have edited .species_gambiae_coluzzii column to contain 'arabiensis' instead of NA
        for sp in nmeta.species_gambiae_coluzzii.unique():

            # filter to species
            nmeta3 = nmeta[nmeta.species_gambiae_coluzzii == sp]
            if nmeta3.shape[0] < min_cohort_size:
                continue
            # index values of this location/species cohort select columns of the call array
            flt = np.array(nmeta3.index)
            gt_cohort = snps[chrom].take(flt, axis=1)

            log(f"--------- Running G123 on {sp} Chromosome {chrom} {loc} ----------")
            log("filter to biallelic segregating sites")
            ac_cohort = gt_cohort.count_alleles(max_allele=3).compute()
            # N.B., if going to use to_n_alt later, need to make sure sites are
            # biallelic and one of the alleles is the reference allele
            ref_ac = ac_cohort[:, 0]
            loc_sites = ac_cohort.is_biallelic() & (ref_ac > 0)
            gt_seg = da.compress(loc_sites, gt_cohort, axis=0)
            pos_seg = da.compress(loc_sites, pos[chrom], axis=0)

            log("compute input data for G123")
            pos_seg = pos_seg.compute()
            # Same pattern as the per-species run: keep the calls as a dask
            # array and compute only the to_n_alt result, instead of wrapping
            # the lazy array in an in-memory GenotypeArray first.
            gn_seg = allel.GenotypeDaskArray(gt_seg).to_n_alt().compute()

            # calculate G123 (g=3) with clustering of similar multi-locus genotypes
            g123, midpoint = garuds_G(gnalt=gn_seg,
                                      pos=pos_seg,
                                      cut_height=6,
                                      metric='euclidean',
                                      window_size=1200,
                                      step_size=600,
                                      cluster=True,
                                      g=3)

            # hand the windows to the shared save/plot helper, tagged with the location
            saveAndPlot(statName=f"G123_{loc}",
                        values=g123,
                        midpoints=midpoint,
                        prefix="../results/selection/G123",
                        species=sp,
                        chrom=chrom,
                        ylim=0.5)