In [1]:
import sys
# add the gaardian workflow scripts directory to the import path (needed for `probetools`)
sys.path.insert(0, '/home/sanj/projects/gaardian/workflow/scripts/')
import probetools as probe

%run hapclust.py
import numpy as np 
import allel
import pandas as pd
import bokeh
import bokeh.plotting
import malariagen_data
%matplotlib inline

import bokeh.io as bkio
bkio.output_notebook()

### CNVs and the coeae1f region

In [2]:
# Handle to the MalariaGEN Ag3 data resource; the metadata, CNV and plotting calls below all go through it.
# NOTE(review): `pre=True` presumably opts in to pre-release sample sets — confirm against the malariagen_data docs.
ag3 = malariagen_data.Ag3(pre=True)

# Start/end coordinates (bp) of the two duplications examined in this notebook.
# NOTE(review): presumably positions on 2L, matching the regions queried below — confirm.
moshi_dup_start = 28_535_653
moshi_dup_end = 28_571_586
gaard_dup_start = 28_542_695
gaard_dup_end = 28_551_033

# [start, end] pairs for each duplication.
moshi_cnv_breakpoints = [moshi_dup_start, moshi_dup_end]
gamb_cnv_breakpoints = [gaard_dup_start, gaard_dup_end]  # baguida and obuasi CNV

In [3]:
# Sample sets analysed throughout the notebook (passed as `sample_sets` to the Ag3 API).
cohorts = [
    # Ag1000G phase 3 sample sets in Ag3.0
    "AG1000G-GH",
    "AG1000G-ML-A",
    "AG1000G-BF-A",
    "AG1000G-BF-B",
    "AG1000G-GN-A",
    "AG1000G-GN-B",
    "AG1000G-TZ",
    # Amenta-Etego sample sets in Ag3.3: none listed
    # GAARDIAN sample set in Ag3.4
    "1244-VO-GH-YAWSON-VMF00149",
    # GAARD sample sets in Ag3.2 (GH, CI, TG, BJ per the set IDs)
    "1244-VO-GH-YAWSON-VMF00051",
    "1245-VO-CI-CONSTANT-VMF00054",
    "1253-VO-TG-DJOGBENOU-VMF00052",
    "1237-VO-BJ-DJOGBENOU-VMF00050",
]


# Contig and coordinate window (bp) for the locus of interest.
# NOTE(review): none of the cells visible here read these three names — confirm they are still needed.
contig, start, end = '2L', 28_520_000, 28_580_000

In [4]:
# inspect all samples available from Ghana
df_samples = ag3.sample_metadata(sample_sets=cohorts)
pivot_samples = (
    df_samples
    .pivot_table(
        index=["country", "admin1_iso", "admin1_name", "admin2_name", "year"], 
        columns="taxon", 
        values="sample_id",
        aggfunc="count",
        fill_value=0
    )
)

Load sample metadata:   0%|          | 0/12 [00:00<?, ?it/s]

In [5]:
# Preview the first five rows of the per-location taxon counts.
pivot_samples.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,taxon,arabiensis,coluzzii,gambiae,gcx3,intermediate_gambiae_coluzzii
country,admin1_iso,admin1_name,admin2_name,year,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Benin,BJ-OU,Ouémé,Avrankou,2017,0,90,0,0,0
Burkina Faso,BF-09,Hauts-Bassins,Houet,2012,0,82,98,0,1
Burkina Faso,BF-09,Hauts-Bassins,Houet,2014,3,53,46,0,0
Cote d'Ivoire,CI-CM,Comoe,Sud-Comoe,2017,0,0,37,0,1
Ghana,GH-AA,Greater Accra,Accra,2018,0,264,0,0,4


In [20]:
# Display the list of sample sets held in `cohorts` (output only; nothing is assigned).
cohorts

['AG1000G-GH',
 'AG1000G-ML-A',
 'AG1000G-BF-A',
 'AG1000G-BF-B',
 'AG1000G-GN-A',
 'AG1000G-GN-B',
 'AG1000G-TZ',
 '1244-VO-GH-YAWSON-VMF00149',
 '1244-VO-GH-YAWSON-VMF00051',
 '1245-VO-CI-CONSTANT-VMF00054',
 '1253-VO-TG-DJOGBENOU-VMF00052',
 '1237-VO-BJ-DJOGBENOU-VMF00050']

In [15]:
# Gene-level CNV frequencies per admin1/year cohort across the coeae1f/2f window.
coe_region = "2L:28,530,000-28,580,000"

cnv_freqs_df = ag3.gene_cnv_frequencies(
    sample_sets=cohorts,
    region=coe_region,
    cohorts="admin1_year",
)

# Plot only the rows whose `max_af` exceeds 0.1.
ag3.plot_frequencies_heatmap(
    cnv_freqs_df.loc[cnv_freqs_df["max_af"] > 0.1],
    title="Gene CNV frequencies, coeae1f/2f locus",
)

Load CNV HMM data:   0%|          | 0/258 [00:00<?, ?it/s]

Compute modal gene copy number:   0%|          | 0/15 [00:00<?, ?it/s]

In [12]:
def get_copy_number(myregion, cohorts, min_copy_number=2.9):
    """Fetch gene CNV calls and the IDs of samples with an elevated modal copy number.

    Reads the notebook-level ``ag3`` handle, so that must exist before calling.

    Parameters
    ----------
    myregion
        Passed unchanged as ``region`` to ``ag3.gene_cnv``.
    cohorts
        Passed unchanged as ``sample_sets`` to ``ag3.gene_cnv``.
    min_copy_number : float, default 2.9
        Rows whose ``CN_mode`` is strictly greater than this value count as
        carrying the amplification. The default reproduces the threshold that
        was previously hard-coded.

    Returns
    -------
    tuple
        ``(cnv_data, sample_names)``: the full CNV table as a DataFrame, and a
        NumPy array of ``sample_id`` values taken from the rows above the threshold.
        NOTE(review): if the table holds several rows per sample, IDs can repeat
        in ``sample_names`` — confirm against the ``gene_cnv`` output layout.
    """
    cnv_data = ag3.gene_cnv(region=myregion, sample_sets=cohorts).to_dataframe()

    # Boolean mask instead of a query string so the threshold can be a parameter.
    is_amplified = cnv_data["CN_mode"] > min_copy_number
    sample_names = cnv_data.loc[is_amplified, "sample_id"].to_numpy()

    return cnv_data, sample_names

In [23]:
# CNV table and carrier sample IDs for gene AGAP006227 in the AG1000G-TZ sample set.
cnv_df, names = get_copy_number(myregion='AGAP006227', cohorts=['AG1000G-TZ'])

Load CNV HMM data:   0%|          | 0/12 [00:00<?, ?it/s]

Compute modal gene copy number:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
# Write the sample IDs currently held in `names` to a tab-separated file.
# pandas' default index=True/header=True apply, so an index column and a header row are written too.
pd.Series(names).to_csv("../../results/cnv_tz_ids.tsv", sep="\t")

In [65]:
# Attach sample metadata to the CNV calls (default inner join on the shared columns).
# NOTE(review): this rebinds `cnv_df` from itself, so re-running the cell merges the
# already-merged frame — re-run the cell that creates `cnv_df` first.
cnv_df = pd.merge(df_samples, cnv_df)

In [17]:
#pd.crosstab(cnv_df['sample_set'], cnv_df['CN_mode'])

### Looking at coverage (HMM)

Let's check how legitimate the CNV looks by inspecting the HMM coverage calls for individual carriers.

In [31]:
# Sample set whose CNV carriers are plotted; uncomment the alternative to inspect that set instead.
# set_ = "1244-VO-GH-YAWSON-VMF00051"
set_ = "AG1000G-GN-A"

# Plotting window; this rebinds `coe_region` to a wider span than the one used for the heatmap.
coe_region = "2L:28,400,000-28,700,000"

cnv_df, names = get_copy_number(myregion='AGAP006227', cohorts=set_)

# HMM coverage plots for at most the first eight carriers.
for s in names[:8]:
    ag3.plot_cnv_hmm_coverage(s, sample_set=set_, region=coe_region)

Load CNV HMM data:   0%|          | 0/12 [00:00<?, ?it/s]

Compute modal gene copy number:   0%|          | 0/1 [00:00<?, ?it/s]

Looking legit!!

### Comparing CNV status, sweep status, and karyotype

In [227]:
# IDs of samples carrying the AGAP006227 amplification.
# `sample_names` only exists as a local inside `get_copy_number`, so the previous
# `cnv_ids = sample_names` raised NameError on a fresh kernel; derive the IDs explicitly.
# NOTE(review): using every sample set in `cohorts` is inferred from the crosstab
# below, which spans all gambiae samples — confirm this was the intended set.
cnv_ids = get_copy_number('AGAP006227', cohorts)[1]

In [251]:
import warnings
# NOTE(review): blanket suppression is kept so later cells behave as before, but it
# hides every warning from here on; consider removing it or narrowing the filter.
warnings.filterwarnings('ignore')

# Karyotype calls from the two result files, stacked into one table.
karyotypes = pd.read_csv("/home/sanj/projects/gaard/results/gaard_karyotypes.tsv", sep="\t")
karyo2 = pd.read_csv("/home/sanj/projects/gaardian/results/karyotypes/gaardian_karyotypes.tsv", sep="\t", index_col=0)
karyo2 = karyo2.rename(columns={'location2':'location'})
karyotypes = pd.concat([karyotypes, karyo2])

# Keep the 2La calls and round the mean genotype to a whole number.
# `.assign` builds a new frame, avoiding assignment into a `query()` result.
karyo_2la = (
    karyotypes
    .query("inversion == '2La'")
    .assign(mean_genotype=lambda frame: frame['mean_genotype'].round(0))
)

# NOTE(review): `df_samples` is rebound to the karyotype-merged frame (later cells may
# rely on that), so this cell consumes its own output if it is re-run.
df_samples = karyo_2la.merge(df_samples)

# cnv_status: 1 if the sample ID is in `cnv_ids`, else 0.
# The previous `pd.factorize(...)[0]` numbered values by order of first appearance,
# so the 0/1 meaning flipped whenever the first gambiae row was a CNV carrier.
gamb_samples = (
    df_samples
    .query("aim_species == 'gambiae'")
    .assign(cnv_status=lambda frame: np.isin(frame['sample_id'], cnv_ids).astype(int))
)

pd.crosstab(gamb_samples['mean_genotype'], gamb_samples['cnv_status'])

cnv_status,0,1
mean_genotype,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,223,56
1.0,152,18
2.0,28,0


So the CNVs are found on the 2L+a haplotypic background. Interesting. We probably have multiple sweeps on different karyotypes at the same locus, one spreading from the Togo area and one from Burkina Faso.

Next, we need the IDs of each individual in each sweep