In [1]:
%matplotlib inline

import allel
import numpy as np
import pandas as pd
import zarr
from pathlib import Path
import scipy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [3]:
# Execute tools.py so that the names it defines are available in this notebook.
# NOTE(review): presumably this is where `load_arrays_and_metadata` and `ag3`
# come from -- neither is defined in the cells of this notebook; confirm.
%run tools.py

### GAARDIAN - Ghana - G12

In [4]:
# Sample set to analyse and the chromosome arms to load.
sample_set = "1244-VO-GH-YAWSON-VMF00149"
chroms = ['2L', '2R', '3L', '3R', 'X']

# Per-chromosome containers, keyed by chromosome arm name.
# NOTE(review): the "Run G12" loop further down rebinds `snps` and `pos` to
# single-chromosome arrays, so these dicts do not survive that cell.
snps = {}
pos = {}

for chrom in chroms:
    
    # `metadata` is overwritten on every iteration; only the value returned for
    # the last chromosome is kept.
    # NOTE(review): presumably the metadata is identical for every chromosome -- confirm.
    snps[chrom], pos[chrom], metadata = load_arrays_and_metadata("../resources", sample_set, chrom)
    
# Keep the text before the first "." of `location` as a coarser location label.
metadata['location2'] = metadata['location'].str.split(".").str.get(0)

Let's load the sample metadata and the SNP genotypes and positions.

In [4]:
# Same sample-set identifier as `sample_set` in the loading cell; this name is
# the one used by the "Run G12" loop below.
my_sample_set = '1244-VO-GH-YAWSON-VMF00149'
# Reload the sample metadata through `ag3`; this replaces the `metadata`
# returned by `load_arrays_and_metadata`, including its `location2` column.
metadata = ag3.sample_metadata(sample_sets=my_sample_set)

Let's add a new metadata column that holds the overall village rather than village x house. We split the location on full stops and take the first element.

In [5]:
# Re-derive `location2` because `metadata` was just reloaded: keep the text
# before the first "." of `location`.
metadata['location2'] = metadata['location'].str.split(".").str.get(0)

How many do we have from each species?

In [6]:
# Sample count per value of the `species` column.
# NOTE(review): the "Run G12" loop groups by `species_gambiae_coluzzii`
# instead -- confirm both columns give the same grouping.
metadata.species.value_counts()

coluzzii    422
gambiae      63
Name: species, dtype: int64

How many samples do we have from each village?

In [7]:
# Sample count per village, using the derived `location2` column.
metadata.location2.value_counts()

Domenase         57
Annorkrom        49
New Edubiase     42
Pokukrom         37
Odumto           34
Koniyaw          32
Nkotumso         30
Kokotro          28
Prisintease      24
Kente            24
Watreso          19
Adansi Apagya    16
Wamase           16
Ankaako          12
Mprakyire        10
Anwona            8
Adumanu           6
Yadome            6
Adansi-Krom       6
Bogyawe           5
Asonkore          5
Jacobu            5
Fumso             4
Dompoase          3
Anhwiaso          2
Subin Camp        2
Bogobiri West     2
Mensonso          1
Name: location2, dtype: int64

For each location, how many gambiae / coluzzii do we have?

In [8]:
# Contingency table: one row per village (`location2`), one column per species.
pd.crosstab(metadata.location2, metadata.species)

species,coluzzii,gambiae
location2,Unnamed: 1_level_1,Unnamed: 2_level_1
Adansi Apagya,15,1
Adansi-Krom,6,0
Adumanu,5,1
Anhwiaso,2,0
Ankaako,12,0
Annorkrom,46,3
Anwona,5,3
Asonkore,0,5
Bogobiri West,0,2
Bogyawe,3,2


And what does the dask SNP array look like?

## G12

In [9]:
def cluster_G12(gnalt, cut_height=0.1, metric='euclidean'):
    """
    Hierarchically cluster multi-locus genotypes and compute the G12 statistic.

    The columns of `gnalt` are clustered with single linkage on their pairwise
    distances, the tree is cut at `cut_height`, and each resulting cluster is
    treated as one genotype class. With cluster frequencies sorted so that
    p1 >= p2 >= ..., the statistic is

        G12 = (p1 + p2)**2 + sum(p_i**2 for i > 2)

    Parameters
    ----------
    gnalt : numpy.ndarray, 2-D
        One row per SNP and one column per sample; the columns are the items
        that get clustered.
    cut_height : float, default 0.1
        Height at which the single-linkage tree is cut. For the 'hamming' and
        'jaccard' metrics the distances are rescaled to a number of SNPs, so
        the height is expressed in SNPs for those metrics.
    metric : str, default 'euclidean'
        Distance metric forwarded to `scipy.spatial.distance.pdist`.

    Returns
    -------
    float
        The G12 value for this window.
    """
    # Import the submodules explicitly: a bare `import scipy` does not guarantee
    # that `scipy.spatial` / `scipy.cluster` are loaded on older scipy releases.
    from scipy.cluster import hierarchy
    from scipy.spatial import distance

    n_snps, n_samples = gnalt.shape[0], gnalt.shape[1]

    # pairwise distances between samples (columns) in this window
    dist = distance.pdist(gnalt.T, metric=metric)
    if metric in {'hamming', 'jaccard'}:
        # these metrics are proportions; convert to number of SNPs, easier to interpret
        dist *= n_snps

    # single-linkage tree, cut at the requested height -> one cluster label per sample
    Z = hierarchy.linkage(dist, method='single')
    cut = hierarchy.cut_tree(Z, height=cut_height)[:, 0]

    # cluster frequencies, largest first
    freqs = np.sort(np.bincount(cut) / n_samples)[::-1]

    # the two most frequent clusters are pooled before squaring
    g12 = np.sum(freqs[:2])**2 + np.sum(freqs[2:]**2)

    return g12

def garuds_G12(gnalt, pos, cut_height=None, save=False, prefix=None, name=None, metric='euclidean', window_size=1000, step_size=500, cluster=False, species=None):
    """
    Compute G12 in moving windows, store the windows as .tsv and plot them.

    Parameters
    ----------
    gnalt : array, 2-D
        One row per SNP, one column per sample (see `cluster_G12`).
    pos : array, 1-D
        SNP positions, one per row of `gnalt`; the median position of each
        window is used as its x coordinate.
    cut_height : float, optional
        Tree-cut height forwarded to `cluster_G12`. Required when
        `cluster=True`.
    save : bool, default False
        If True, also write the figure to `{prefix}/{name}.png`.
    prefix, name : str
        Output directory and file stem. The windowed table is always written
        to `{prefix}/{name}.tsv`, independently of `save`.
    metric : str, default 'euclidean'
        Distance metric forwarded to `cluster_G12` (only used if `cluster`).
    window_size, step_size : int
        Window size and step, in number of SNPs.
    cluster : bool, default False
        If True, multi-locus genotypes are clustered with `cluster_G12`;
        otherwise only identical genotypes are grouped, via
        `allel.moving_garud_h`.
    species : str, optional
        'gambiae' is drawn in lightcoral, anything else in dodgerblue.

    Raises
    ------
    ValueError
        If `cluster=True` and `cut_height` is None.
    """
    # Fail fast: with height=None scipy's cut_tree returns the full cut tree,
    # whose first column puts every sample in its own cluster, so G12 would be
    # silently wrong instead of raising.
    if cluster and cut_height is None:
        raise ValueError("cut_height must be provided when cluster=True")

    # Do we want to cluster the Multi-locus genotypes (MLGs), or just group MLGs if they are identical
    if cluster:
        g12 = allel.moving_statistic(gnalt, cluster_G12, size=window_size, step=step_size, metric=metric, cut_height=cut_height)
    else:
        # moving_garud_h returns (h1, h12, h123, h2_h1); G12 is the second value
        _, g12, _, _ = allel.moving_garud_h(gnalt, size=window_size, step=step_size)

    # median SNP position of each window
    midpoint = allel.moving_statistic(pos, np.median, size=window_size, step=step_size)

    # store windowed G12 statistics as .tsv
    df = pd.DataFrame({'midpoint':midpoint, 'G12':g12})
    df.to_csv(f"{prefix}/{name}.tsv", sep="\t", index=False)

    # create plot: x ticks every 2 Mb, y axis reaching at least 0.5
    xtick = np.arange(0, midpoint.max(), 2000000)
    ylim = np.max([0.50, g12.max()])
    line_colour = 'lightcoral' if species == 'gambiae' else 'dodgerblue'

    plt.figure(figsize=[20,10])
    # x/y passed by keyword: current seaborn no longer accepts them positionally
    sns.lineplot(x=midpoint, y=g12, color=line_colour)
    plt.ylim(0, ylim)
    plt.xticks(xtick, rotation=45, ha='right')
    plt.ticklabel_format(style='plain', axis='x')
    plt.title(f"G12 {name}")
    if save:
        plt.savefig(f"{prefix}/{name}.png", format="png")

    plt.show()
    # release the figure so repeated calls in a loop do not accumulate open figures
    plt.close()


### Run G12

In [10]:
# Chromosome arms processed by the loop below. Same arms as the earlier
# `chroms` list, in a different order (3R before 3L).
chroms = ['2L', '2R', '3R', '3L', 'X']
# The `for chrom in chroms` loop below rebinds `chrom`; this value only matters
# for code run outside that loop.
chrom = '2L'

In [15]:
# For every chromosome, village (`location2`) and species with more than 12
# samples: compute windowed G12 with `garuds_G12` and write the .tsv and .png
# under gaardian/. Combinations whose .png already exists are skipped, so the
# cell can be re-run to resume an interrupted run.
for chrom in chroms:
    
    # Load site filters, SNP and POS arrays and filter
    gambcolu_filter = ag3.site_filters(mask="gamb_colu", contig=chrom)
    snps = allel.GenotypeDaskArray(ag3.snp_genotypes(contig=chrom, sample_sets=my_sample_set))
    snps = snps.compress(gambcolu_filter, axis=0)
    # NOTE(review): presumably element 0 of `snp_sites` is the position array -- confirm.
    pos = ag3.snp_sites(contig=chrom)
    pos = allel.SortedIndex(pos[0])[gambcolu_filter]

    # Count alleles per site
    ac = snps.count_alleles()
    # Get boolean array for biallelic or not
    bial_bool = ac.is_biallelic()
    # Filter arrays to biallelic only
    snps = snps.compress(bial_bool, axis=0)
    # `positions` stays aligned with the rows of `snps` for the rest of this chromosome
    positions = pos[bial_bool]

    for loc in metadata.location2.unique():

        # samples collected in this village
        nmeta = metadata[metadata.location2 == loc]

        # have edited .species_gambiae_coluzzii column to contain 'arabiensis' instead of NA
        for sp in nmeta.species_gambiae_coluzzii.unique():

            # skip groups with 12 or fewer samples (at least 13 are required)
            if (nmeta.species_gambiae_coluzzii == sp).sum() <= 12:
                continue
                
            # if file exists ignore and skip; the path matches the .png that
            # garuds_G12 writes for this loc/species/chromosome when save=True
            myfile = Path(f"gaardian/{loc}_{sp}_{chrom}.G12.png")
            if myfile.is_file():
                print(f"skipping {loc}_{sp}_{chrom}, as already done.")
                continue

            #filter to species 
            nmeta3 = nmeta[nmeta.species_gambiae_coluzzii == sp]
            # The metadata index labels are used as column numbers into `snps`.
            # NOTE(review): assumes the metadata index is 0..n-1 in the same
            # order as the genotype columns -- confirm.
            flt = np.array(nmeta3.index)
            # filter to correct loc, year, species individuals
            gn = snps.take(flt, axis=1)

            print(f"--------- Running G12 on {loc} {sp} Chromosome {chrom} ----------")

            # filter to segregating sites only as the phase 3 arrays contain every SNP on the chromosome
            seg = gn.count_alleles().is_segregating()
            gn = gn.compress(seg, axis=0)
            # Rebinds `pos` (previously the site-filtered positions of the whole
            # chromosome); `positions` itself is left untouched for the next group.
            pos = positions[seg]

            print(f"There are {gn.shape[0]} segregating sites and {gn.shape[1]} individuals")

            # convert to 0,1,2 genotypes
            gn_alt = gn.to_n_alt().compute()

            # calculate G12 and plot figs
            # cluster=True with cut_height=6: within each 1000-SNP window the
            # single-linkage tree over Euclidean distances is cut at height 6
            garuds_G12(gnalt=gn_alt, pos=pos, cut_height=6,
                       save=True, 
                       prefix='gaardian',
                       name=f"{loc}_{sp}_{chrom}.G12", 
                       metric='euclidean',
                      window_size=1000,
                      step_size=500, cluster=True, species=sp)

skipping Kente_coluzzii_2L, as already done.
skipping Odumto_coluzzii_2L, as already done.
skipping Watreso_coluzzii_2L, as already done.
skipping Koniyaw_coluzzii_2L, as already done.
skipping Kokotro_coluzzii_2L, as already done.
skipping New Edubiase_gambiae_2L, as already done.
skipping New Edubiase_coluzzii_2L, as already done.
skipping Adansi Apagya_coluzzii_2L, as already done.
skipping Wamase_coluzzii_2L, as already done.
skipping Annorkrom_coluzzii_2L, as already done.
skipping Prisintease_coluzzii_2L, as already done.
skipping Pokukrom_coluzzii_2L, as already done.
skipping Nkotumso_coluzzii_2L, as already done.
skipping Domenase_coluzzii_2L, as already done.
skipping Kente_coluzzii_2R, as already done.
skipping Odumto_coluzzii_2R, as already done.
skipping Watreso_coluzzii_2R, as already done.
skipping Koniyaw_coluzzii_2R, as already done.
skipping Kokotro_coluzzii_2R, as already done.
skipping New Edubiase_gambiae_2R, as already done.
skipping New Edubiase_coluzzii_2R, as a