In [15]:
%matplotlib inline

import malariagen_data
import allel
import numpy as np
import pandas as pd
import zarr
from pathlib import Path
import scipy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

### GAARDIAN - PCA

In [16]:
# Execute tools.py so its definitions are available in this notebook; the PCA
# cell further down calls the `pca` helper defined there (and presumably
# `get_colour_dict` comes from the same file — confirm).
%run tools.py

In [17]:
# Handle used for every data request below (sample sets, metadata, site
# filters, genotypes), pointed at the vo_agam_release bucket.
# NOTE(review): pre=True presumably enables access to pre-release data such as
# v3.4 — confirm against the malariagen_data documentation.
ag3 = malariagen_data.Ag3("gs://vo_agam_release/", pre=True)

In [18]:
# List the sample sets belonging to release v3.4; the bare name on the last
# line displays the table as the cell output.
df_sample_sets = ag3.sample_sets(release="v3.4")
df_sample_sets

Unnamed: 0,sample_set,sample_count,release
0,1191-VO-MULTI-OLOUGHLIN-VMF00106,237,v3.4
1,1191-VO-MULTI-OLOUGHLIN-VMF00140,1095,v3.4
2,1244-VO-GH-YAWSON-VMF00149,485,v3.4


Let's load the sample metadata and the SNP genotypes and positions.

In [19]:
# Sample set analysed in this notebook (485 samples in the v3.4 listing above).
my_sample_set = '1244-VO-GH-YAWSON-VMF00149'
# Per-sample metadata restricted to that sample set.
metadata = ag3.sample_metadata(sample_sets=my_sample_set)

Let's make a new column in the metadata, `location2`, holding the overall village rather than village x house. We split `location` on full stops and take the first element.

In [20]:
# Village = everything in `location` before the first full stop.
metadata['location2'] = metadata['location'].str.split(".").str[0]

How many samples do we have from each species?

In [21]:
# Number of samples per species call, most frequent first.
metadata['species'].value_counts()

coluzzii    422
gambiae      63
Name: species, dtype: int64

### PCA

In [22]:
# Contigs to analyse; the PCA cell below runs one PCA per entry.
chroms = ['2L', '2R', '3R', '3L', 'X']

In [23]:
from collections import defaultdict

# Sample identifiers; passed to pca() in the PCA cell below.
samplenames = metadata['partner_sample_id']

# Group the samples by village: `ind` maps each `location2` value to the list
# of row positions (0-based, in metadata order) of the samples collected there.
# Walking the column directly replaces a per-sample np.isin/np.where scan of
# the whole table (quadratic) that also resolved duplicated
# partner_sample_id values to the first matching row's village.
# NOTE(review): assumes metadata rows are in the same order as the genotype
# sample axis, as the original positional indexing did — confirm.
ind = defaultdict(list)
for s, village in enumerate(metadata['location2']):
    ind[village].append(s)

# Plain-dict snapshot, taken once after the loop so that `subpops` is defined
# even when `metadata` has no rows.
subpops = dict(ind)

In [None]:
# The population labels and colours depend only on the sample metadata, not on
# the chromosome, so they are built once before looping over the contigs.

# Invert subpops (village -> sample indices) into sample index -> village.
d = {p: name for name, inds in subpops.items() for p in inds}

# Store the mapping as a dataframe and get one colour per village.
treatment_indices = pd.DataFrame(list(d.items()), columns=['sample_index', 'name'])
pop_colours = get_colour_dict(treatment_indices['name'], "viridis")

for chrom in chroms:
    # Keep only the sites selected by the gamb_colu site filter (axis 0 is
    # compressed with the filter).
    gambcolu_filter = ag3.site_filters(mask="gamb_colu", contig=chrom)

    # Genotypes must come from the same sample set as `metadata`,
    # `samplenames` and `subpops`; a different hard-coded sample set here
    # would pair the genotypes with the wrong sample labels.
    geno = allel.GenotypeDaskArray(
        ag3.snp_genotypes(contig=chrom, sample_sets=my_sample_set)
    ).compress(gambcolu_filter, axis=0)

    # Run PCA function defined in tools.py
    print(f"Performing PCA on GAARD chromosome {chrom}")
    pca(geno, chrom, 2, "GAARD", samplenames, metadata, pop_colours, prune=False, scaler=None)