In [59]:
import sys
# add the gaardian workflow scripts directory to the module search path (presumably where probetools lives)
sys.path.insert(0, '/home/sanj/projects/gaardian/workflow/scripts/')
import probetools as probe

%run hapclust.py

import numpy as np 
import allel
import bokeh
import bokeh.plotting
import malariagen_data
%matplotlib inline

import bokeh.io as bkio
bkio.output_notebook()

### CNVs and the coeae1f region

In [7]:
# Handle on the Ag3 data resource; the metadata, CNV and plotting calls in this notebook all go through `ag3`.
# NOTE(review): pre=True presumably enables pre-release sample sets — confirm against the malariagen_data docs.
ag3 = malariagen_data.Ag3(pre=True)

In [253]:
# Sample sets analysed throughout this notebook. Only the five uncommented
# identifiers are active; the commented-out ones are earlier alternatives
# that are kept for reference and have no effect.
sample_sets = [
    # Active selection (listed under the "GAARDIAN sample set in Ag3.4" heading).
    "1244-VO-GH-YAWSON-VMF00051",
    "1244-VO-GH-YAWSON-VMF00149",
    "1245-VO-CI-CONSTANT-VMF00054",
    "1253-VO-TG-DJOGBENOU-VMF00052",
    "1237-VO-BJ-DJOGBENOU-VMF00050",
    # --- Disabled alternatives ---
    # Ag1000G phase 3 Ghana sample set in Ag3.0
    # "AG1000G-GH",
    # GAARD Ghana sample set in Ag3.2 (no identifier was listed)
    # Amenga-Etego sample sets in Ag3.3
    # "1190-VO-GH-AMENGA-ETEGO-VMF00013",
    # "1190-VO-GH-AMENGA-ETEGO-VMF00014",
    # "1190-VO-GH-AMENGA-ETEGO-VMF00028",
    # "1190-VO-GH-AMENGA-ETEGO-VMF00029",
    # "1190-VO-GH-AMENGA-ETEGO-VMF00046",
    # "1190-VO-GH-AMENGA-ETEGO-VMF00047",
    # Other Ag1000G sample sets
    # "AG1000G-ML-A",
    # "AG1000G-ML-B",
    # "AG1000G-BF-A",
    # "AG1000G-GN-A",
]

In [211]:
# Overview of the selected sample sets: number of samples per collection
# area and year (rows) broken down by taxon (columns). The sets span several
# countries, not only Ghana.
df_samples = ag3.sample_metadata(sample_sets=sample_sets)

site_year_index = ["country", "admin1_iso", "admin1_name", "admin2_name", "year"]
pivot_samples = df_samples.pivot_table(
    values="sample_id",
    index=site_year_index,
    columns="taxon",
    aggfunc="count",
    fill_value=0,
)
pivot_samples

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,taxon,coluzzii,gambiae,intermediate_gambiae_coluzzii
country,admin1_iso,admin1_name,admin2_name,year,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Benin,BJ-OU,Ouémé,Avrankou,2017,90,0,0
Cote d'Ivoire,CI-CM,Comoe,Sud-Comoe,2017,0,37,1
Ghana,GH-AA,Greater Accra,Accra,2018,264,0,4
Ghana,GH-AA,Greater Accra,Ga East,2017,0,200,0
Ghana,GH-AH,Ashanti,Adansi East,2018,39,40,0
Ghana,GH-AH,Ashanti,Adansi North,2017,0,198,0
Ghana,GH-AH,Ashanti,Adansi North,2018,1,4,0
Ghana,GH-AH,Ashanti,Adansi West,2018,77,5,0
Ghana,GH-AH,Ashanti,Amansie Central,2018,80,7,0
Ghana,GH-AH,Ashanti,Amansie East,2018,58,7,0


In [212]:
# Genomic window on 2L covering the coeae1f/2f locus; reused by the CNV
# queries and coverage plots further down.
coe_region = "2L:28,520,000-28,580,000"

# Gene-level CNV frequencies for An. gambiae samples only, with one cohort
# per admin1 region and year.
cnv_freqs_df = ag3.gene_cnv_frequencies(
    region=coe_region,
    cohorts="admin1_year",
    sample_sets=sample_sets,
    sample_query="taxon == 'gambiae'",
)

# Plot only rows whose max_af exceeds 0.03 (presumably the highest frequency
# seen in any cohort — confirm against the malariagen_data docs).
# The title names An. gambiae so that it agrees with sample_query above; the
# selected sample sets cover several countries, so no single country is named.
ag3.plot_frequencies_heatmap(
    cnv_freqs_df.query("max_af > 0.03"),
    title="Gene CNV frequencies, An. gambiae, coeae1f/2f locus",
)

In [224]:
# Per-sample, per-gene copy number calls for the same window used for the
# frequency heatmap. The window comes from coe_region rather than a repeated
# literal so the two queries cannot drift apart.
cnv_data = ag3.gene_cnv(region=coe_region, sample_sets=sample_sets)

In [225]:
# Flatten the CNV calls to a table and keep the rows for gene AGAP006227 only.
# NOTE(review): this overwrites cnv_data, so the cell cannot be re-run without
# first re-running the cell that loads cnv_data.
cnv_data = cnv_data.to_dataframe()
cnv_data = cnv_data.query("gene_id == 'AGAP006227'")

# Samples whose modal copy number at this gene is above 2.9, and their IDs.
coe_dups = cnv_data[cnv_data["CN_mode"] > 2.9]
sample_names = coe_dups["sample_id"].to_list()

# Keep only the part of each location name before the first "." and then
# attach the CNV calls to the sample metadata. The merge has no explicit key,
# so pandas joins on whichever columns the two tables share.
location_prefix = df_samples["location"].str.split(".").str.get(0)
df_samples["location"] = location_prefix
df_samples = df_samples.merge(cnv_data)

In [226]:
import pandas as pd

# Count samples per location (rows) and modal copy number (columns).
# NOTE(review): 'location2' is not created in any cell visible here — confirm
# where this column comes from before relying on a fresh-kernel run.
pd.crosstab(index=df_samples['location2'], columns=df_samples['CN_mode'])

CN_mode,1,2,3,4,5,6,8,10,12
location2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aboisso,0,38,0,0,0,0,0,0,0
Adansi Apagya,0,16,0,0,0,0,0,0,0
Adansi-Krom,0,6,0,0,0,0,0,0,0
Adumanu,0,6,0,0,0,0,0,0,0
Anhwiaso,0,2,0,0,0,0,0,0,0
Ankaako,0,12,0,0,0,0,0,0,0
Annorkrom,0,49,0,0,0,0,0,0,0
Anwona,0,7,0,0,1,0,0,0,0
Asonkore,0,5,0,0,0,0,0,0,0
Avrankou,1,89,0,0,0,0,0,0,0


### Looking at coverage (HMM)

Let's check how legitimate the CNV looks by inspecting the HMM coverage for a couple of carriers.

In [180]:
# Plot HMM copy-number coverage over the locus for the first two samples
# that carry the duplication.
# sample_names is drawn from every entry in sample_sets, so each sample is
# plotted against its own sample set instead of one hard-coded set.
# Assumes df_samples still carries the `sample_set` column returned by
# ag3.sample_metadata — TODO confirm.
sample_set_of = (
    df_samples
    .drop_duplicates("sample_id")
    .set_index("sample_id")["sample_set"]
)
for sample_id in sample_names[:2]:
    ag3.plot_cnv_hmm_coverage(
        sample_id,
        sample_set=sample_set_of[sample_id],
        region=coe_region,
    )

Looking legit!!

### Comparing CNV status, sweep status, and karyotype

In [227]:
# Alias for the list of sample IDs held in sample_names; this is the name the CNV-status comparison uses.
cnv_ids = sample_names

In [251]:
# Kept so that `warnings` stays available to later cells. No blanket
# filterwarnings('ignore') is installed: the derived columns below are built
# with .assign(), which does not raise SettingWithCopyWarning, and unrelated
# warnings stay visible.
import warnings

# Karyotype calls are read from two result tables (GAARD and GAARDIAN) and stacked.
karyotypes = pd.read_csv("/home/sanj/projects/gaard/results/gaard_karyotypes.tsv", sep="\t")
karyo2 = pd.read_csv("/home/sanj/projects/gaardian/results/karyotypes/gaardian_karyotypes.tsv", sep="\t", index_col=0)
# Rename 'location2' to 'location', presumably so the column matches the first table.
karyo2 = karyo2.rename(columns={'location2':'location'})
karyotypes = pd.concat([karyotypes, karyo2])

# Keep the 2La inversion rows and round the mean genotype to a whole number.
karyo_2la = (
    karyotypes
    .query("inversion == '2La'")
    .assign(mean_genotype=lambda calls: calls['mean_genotype'].round(0))
)

# Attach karyotypes to the sample metadata. The merge has no explicit key, so
# pandas joins on whichever columns the two tables share.
df_samples = karyo_2la.merge(df_samples)

# cnv_status is 1 for samples listed in cnv_ids and 0 otherwise. It is derived
# directly from the membership test so the coding cannot depend on row order.
gamb_samples = (
    df_samples
    .query("aim_species == 'gambiae'")
    .assign(cnv_status=lambda meta: meta['sample_id'].isin(cnv_ids).astype(int))
)

pd.crosstab(gamb_samples['mean_genotype'], gamb_samples['cnv_status'])

cnv_status,0,1
mean_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,223,56
1.0,152,18
2.0,28,0


So the CNVs are found on the 2L+a haplotypic background. Interesting. We probably have multiple sweeps on different karyotypes at the same locus, one spreading from the Togo area and one from Burkina Faso.

Next, we need the IDs of each individual in each sweep