In [None]:
import sys
# probe and locusPocus are assumed to be project-local modules; nothing is added to sys.path here, so they must already be importable
import probe
import locusPocus

%run hapclust.py
import numpy as np 
import allel
import pandas as pd
import bokeh
import bokeh.plotting
import malariagen_data
%matplotlib inline

import bokeh.io as bkio
bkio.output_notebook()

### CNVs and the coeae1f region

In [None]:
# Data-access client used for every query in this notebook.
# NOTE(review): pre=True presumably opts in to pre-release sample sets — confirm against the malariagen_data docs.
ag3 = malariagen_data.Ag3(pre=True)

# Start/end coordinates of the two duplications examined in this notebook.
moshi_dup_start = 28535653
moshi_dup_end = 28571586
gaard_dup_start = 28542695
gaard_dup_end = 28551033

# [start, end] pairs for each duplication
moshi_cnv_breakpoints = [moshi_dup_start, moshi_dup_end]
gamb_cnv_breakpoints = [gaard_dup_start, gaard_dup_end]  # baguida and obuasi CNV

In [None]:
# span of the Moshi duplication (end coordinate minus start coordinate)
moshi_dup_end - moshi_dup_start

In [None]:
# span of the GAARD duplication (end coordinate minus start coordinate)
gaard_dup_end-gaard_dup_start

In [None]:
# Sample sets queried throughout the notebook.
# NOTE(review): the release groupings are carried over from the original
# annotations and do not obviously match the entries under them — confirm.
sample_sets = [
    # Ag1000G phase 3 sample sets in Ag3.0
    "AG1000G-GH",
    "AG1000G-ML-A",
    "AG1000G-BF-A",
    "AG1000G-BF-B",
    "AG1000G-GN-A",
    "AG1000G-GN-B",
    "AG1000G-TZ",
    # Amenta-Etego sample sets in Ag3.3 (no entries are listed under this heading)
    # GAARDIAN sample set in Ag3.4
    "1244-VO-GH-YAWSON-VMF00149",
    # GAARD Ghana sample set in Ag3.2 (the CI/TG/BJ sets below are listed under the same heading)
    "1244-VO-GH-YAWSON-VMF00051",
    "1245-VO-CI-CONSTANT-VMF00054",
    "1253-VO-TG-DJOGBENOU-VMF00052",
    "1237-VO-BJ-DJOGBENOU-VMF00050",
]

# Window of interest: contig name plus start and end coordinates.
contig, start, end = "2L", 28_520_000, 28_580_000

In [None]:
# Count the samples of each taxon per location and year, across every
# sample set requested above (not a single country).
df_samples = ag3.sample_metadata(sample_sets=sample_sets)

location_year_index = ["country", "admin1_iso", "admin1_name", "admin2_name", "year"]
pivot_samples = pd.pivot_table(
    df_samples,
    values="sample_id",
    index=location_year_index,
    columns="taxon",
    aggfunc="count",
    fill_value=0,
)

In [None]:
# preview the first five rows of the sample-count pivot table
pivot_samples.head(5)

In [None]:
# Region string passed to the CNV frequency query.
coe_region = "2L:28,510,000-28,580,000"

# Gene CNV frequencies for that region, with cohorts given as "admin2_year".
cnv_freqs_df = ag3.gene_cnv_frequencies(
    sample_sets=sample_sets,
    cohorts="admin2_year",
    region=coe_region,
)

In [None]:
# display the CNV frequency table
cnv_freqs_df

In [None]:
# scratch check: take the columns whose name contains 'frq', then show how each
# label splits on "_" once the "frq_" text has been removed
coh_labels = cnv_freqs_df.filter(like='frq').columns
coh_labels.str.replace("frq_", "").str.split("_") 

In [None]:
"TZ-26".split("-")[0]

In [None]:
# Two-letter codes and the names that replace them in the frequency column labels.
rename_dict = dict(
    TZ='Tanzania',
    GH='Ghana',
    BF='BurkinaFaso',
    ML='Mali',
    TG='Togo',
)

def reorder_labels(df_freqs, rename_dict):
    """Reorder and relabel the frequency columns of a frequency table.

    Every column whose name contains ``frq`` is treated as a frequency column
    named ``frq_<code>-<suffix>_<admin2>_<taxon>_<year>``. Each one is
    relabelled ``frq_<taxon>_<admin2>_<code>_<year>``, the frequency columns
    are sorted by that label, and ``<code>`` is then swapped for its entry in
    ``rename_dict`` when one exists. All other columns are appended unchanged
    after the frequency columns.

    Only the ``<code>`` field is renamed. A blanket substring replace over the
    whole label would also rewrite any admin2 name that happens to contain one
    of the codes.

    Parameters
    ----------
    df_freqs : pandas.DataFrame
        Table with ``frq_``-prefixed columns plus any other columns.
    rename_dict : dict
        Maps a code (the text before the first ``-`` of the first label
        field) to the name that should replace it. Codes without an entry are
        kept as they are.

    Returns
    -------
    pandas.DataFrame
        New frame with the relabelled frequency columns first, followed by
        the remaining columns. ``df_freqs`` itself is not modified.

    Raises
    ------
    ValueError
        If a frequency column does not split into at least four
        ``_``-separated fields once ``frq_`` has been removed.
    """
    # extract frequency column names
    coh_labels = df_freqs.filter(like='frq').columns

    sort_keys = []       # labels still carrying the raw code; these define the column order
    display_labels = []  # final labels, with the code swapped via rename_dict
    for label in coh_labels:
        parts = label.replace("frq_", "").split("_")
        if len(parts) < 4:
            raise ValueError(
                f"cannot parse frequency column {label!r}: expected "
                "'frq_<code>_<admin2>_<taxon>_<year>'"
            )
        code = parts[0].split('-')[0]
        taxon, year = parts[-2], parts[-1]
        # any extra underscores are assumed to belong to the admin2 name;
        # for four-field labels this is simply parts[1]
        admin2 = "_".join(parts[1:-2])
        sort_keys.append(f"{taxon}_{admin2}_{code}_{year}")
        display_labels.append(f"frq_{taxon}_{admin2}_{rename_dict.get(code, code)}_{year}")

    # sort on the raw-code labels so the column order does not depend on rename_dict
    sort_idxs = np.argsort(sort_keys, kind="stable")

    # split df into frq columns and misc columns, then reorder and rename the frq part
    df_freqs_misc = df_freqs.drop(columns=coh_labels)
    df_cohorts = df_freqs.loc[:, coh_labels].iloc[:, sort_idxs]
    df_cohorts.columns = [display_labels[i] for i in sort_idxs]

    # join with misc columns
    return pd.concat([df_cohorts, df_freqs_misc], axis=1)


In [None]:
# preview the relabelled, reordered frequency table (the result is displayed, not stored)
reorder_labels(cnv_freqs_df, rename_dict)

In [None]:
# Keep only the rows and frequency columns whose maximum frequency exceeds the
# threshold, then re-attach the matching 'label' values.
min_cnv_frq = 0.1  # minimum peak frequency for a row / column to be kept

labels = cnv_freqs_df['label']
frq_only = cnv_freqs_df.filter(like="frq")
pop_bool = frq_only.max(axis=0) > min_cnv_frq  # columns to keep
cnv_bool = frq_only.max(axis=1) > min_cnv_frq  # rows to keep

# .assign builds a new frame, so nothing is written into a slice of the
# filtered table (the pattern that raises SettingWithCopyWarning).
# NOTE: this overwrites cnv_freqs_df with the filtered table.
cnv_freqs_df = frq_only.loc[cnv_bool, pop_bool].assign(label=labels[cnv_bool])

In [None]:
# Query the gene CNV frequencies again (cnv_freqs_df was overwritten by the
# filtering step earlier in the notebook) and draw them as a heatmap.
cnv_freqs_df = ag3.gene_cnv_frequencies(
    sample_sets=sample_sets,
    cohorts="admin2_year",
    region="2L:28,510,000-28,580,000",
)

heatmap_title = "Gene CNV frequencies, coeae1f/2f locus"
ag3.plot_frequencies_heatmap(cnv_freqs_df, title=heatmap_title)

In [None]:
def get_copy_number(myregion, sample_sets, cn_threshold=2.9):
    """Fetch gene CNV calls and list the samples above a copy-number cut-off.

    Uses the notebook-level ``ag3`` client.

    Parameters
    ----------
    myregion : str
        Region or gene identifier passed to ``ag3.gene_cnv`` as ``region``.
    sample_sets : str or list of str
        Sample set(s) passed to ``ag3.gene_cnv``.
    cn_threshold : float, optional
        A sample is reported when its ``CN_mode`` is strictly greater than
        this value. Defaults to 2.9, the cut-off previously hard-coded here.

    Returns
    -------
    cnv_data : pandas.DataFrame
        The complete ``gene_cnv`` result converted with ``to_dataframe()``.
    sample_names : numpy.ndarray
        ``sample_id`` values of the rows whose ``CN_mode`` exceeds
        ``cn_threshold``.
    """
    cnv_data = ag3.gene_cnv(region=myregion, sample_sets=sample_sets).to_dataframe()

    # rows with a missing CN_mode compare False and are therefore excluded
    above_cutoff = cnv_data["CN_mode"] > cn_threshold
    sample_names = cnv_data.loc[above_cutoff, "sample_id"].to_numpy()

    return cnv_data, sample_names

In [None]:
# AG1000G-TZ only: copy-number table for AGAP006227 and the IDs above the cut-off
cnv_df, names = get_copy_number(myregion='AGAP006227', sample_sets=['AG1000G-TZ'])

# write those IDs to a tab-separated file
tz_ids = pd.Series(names)
tz_ids.to_csv("../../results/cnv_tz_ids.tsv", sep="\t")

# attach sample metadata; no `on` is given, so the join uses the columns both frames share
cnv_df = pd.merge(df_samples, cnv_df)

### Looking at coverage (HMM)

Let's check how legitimate the CNV looks by inspecting the HMM coverage plots.

In [None]:
# NOTE(review): set_ is assigned here but not referenced in the cells visible in this section
#set_ = "1244-VO-GH-YAWSON-VMF00051"
set_ = "AG1000G-GN-A"
# copy-number table for AGAP006227 across all sample_sets, plus the sample IDs with CN_mode > 2.9
cnv_df, names = get_copy_number('AGAP006227', sample_sets=sample_sets)

In [None]:
#cnv_haps_df = pd.DataFrame(np.repeat(cnv_df.values, 2, axis=0), columns=cnv_df.columns)
# raw gene CNV result for AGAP006227 (not converted to a DataFrame here)
cnv_data = ag3.gene_cnv(region="AGAP006227", sample_sets=sample_sets)
# NOTE: this replaces `names` with the sample_id values of every sample in the result, not only those above the copy-number cut-off
names = cnv_data['sample_id'].values

### Plot copy number vs n haps in cluster

In [None]:
# metahaps table from locusPocus, with its index moved back into ordinary columns
metahaps = locusPocus.metahaps.reset_index()
# sample metadata for the sample sets analysed here
metadata = ag3.sample_metadata(sample_sets)

# left join: every row of cnv_df is kept, metadata columns are added where they match
cnv_data = pd.merge(cnv_df, metadata, how='left')

In [None]:
import plotly.express as px

# For each haplotype cluster, collect the cnv_data rows of the samples picked below,
# tagged with the cluster key.
cnv_di = {}
for k, idx in locusPocus.clusters_idx.items():

    meta = metahaps.iloc[idx, :]
    # keep only the rows that repeat an earlier row within this cluster
    # NOTE(review): presumably this selects samples with both haplotypes in the cluster — confirm
    meta = meta[meta.duplicated()]
    # NOTE: `names` is a notebook-level variable; after the loop it holds the
    # list for the last cluster, which is what later cells reading `names` see
    names = meta['sample_id'].to_list()

    # .assign returns a new frame, so the cluster tag is not written into the
    # slice returned by query (which raises SettingWithCopyWarning)
    cnv_di[k] = cnv_data.query("sample_id in @names").assign(cluster=k)

In [None]:
# Count the metahaps rows for every (sample_id, hap_cluster) pair and return
# them as a flat table with a 'count' column.
pair_cols = ['sample_id', 'hap_cluster']
pair_counts = metahaps[pair_cols].value_counts()
cnv_genotypes = pair_counts.reset_index(name='count')

In [None]:
# Build a per-sample table with one column per haplotype cluster (the number of
# metahaps rows that sample has in the cluster) alongside its CN_mode.
metadata = ag3.sample_metadata(sample_sets).set_index("sample_id")

for k in locusPocus.clusters_idx:

    cluster_counts = (
        cnv_genotypes
        .query(f"hap_cluster == '{k}'")
        .set_index("sample_id")["count"]
    )

    # fail loudly, as a label-based assignment would, if a sample is missing from metadata
    unknown_ids = cluster_counts.index.difference(metadata.index)
    if len(unknown_ids) > 0:
        raise KeyError(f"sample_id values not found in metadata: {list(unknown_ids)}")

    # Samples without a haplotype in this cluster get 0. Only the cluster
    # column is filled: a blanket metadata.fillna(0) would also turn missing
    # values in unrelated metadata columns into zeros.
    metadata[k] = cluster_counts.reindex(metadata.index).fillna(0)

# cluster labels, used later to pick the per-cluster columns out of metadata
hap_labels = locusPocus.clusters_df['Haplotype cluster'].to_list()

metadata = metadata.reset_index()

cnv_df = cnv_data[['sample_id', 'CN_mode']]

# left join keeps every sample; CN_mode stays missing where there is no match
metadata = metadata.merge(cnv_df, how='left')

In [None]:
# display the haplotype cluster table from locusPocus
locusPocus.clusters_df

In [None]:
# Scatter CN_mode against the per-sample haplotype count for the C1 and C4 clusters.
meta_cnvs = (
    metadata[["sample_id", "CN_mode"] + hap_labels]
    .melt(value_name='hap_count', var_name='hap_cluster', id_vars=['sample_id', 'CN_mode'])
    # NOTE(review): this sample is dropped without a recorded reason — document why
    .query("sample_id != 'AV0263-C'")
    # explicit copy, so the jitter below is written to an independent frame rather than a slice
    .copy()
)

# samples with CN_mode > 2.9, split by whether they come from Tanzania
dup2_ids = metadata.query("country == 'Tanzania' & CN_mode >2.9")['sample_id']
dup1_ids = metadata.query("country != 'Tanzania' & CN_mode >2.9")['sample_id']

# Small Gaussian jitter so overlapping points become visible. The generator is
# seeded so the figure is identical on every run.
# NOTE(review): trendline='ols' below is fitted to these jittered values.
jitter_rng = np.random.default_rng(42)
jitter_sd = 0.05
n_points = meta_cnvs.shape[0]
meta_cnvs['CN_mode'] = meta_cnvs['CN_mode'] + jitter_rng.normal(0, jitter_sd, size=n_points)
meta_cnvs['hap_count'] = meta_cnvs['hap_count'] + jitter_rng.normal(0, jitter_sd, size=n_points)

# the y-axis label is the same for both clusters
cnv_copy_number = 'Coeae1f/2f copy number'

for cluster in ['C1', 'C4']:  #locusPocus.clusters_idx.keys()]:
    meta_cnv2 = meta_cnvs.query("hap_cluster == @cluster")

    # drop one group of CN_mode > 2.9 samples from each cluster's plot
    if cluster == 'C1':
        meta_cnv2 = meta_cnv2.query("sample_id not in @dup1_ids")
    elif cluster == 'C4':
        meta_cnv2 = meta_cnv2.query("sample_id not in @dup2_ids")

    fig = px.scatter(
        meta_cnv2,
        y="CN_mode",
        x="hap_count",
        color_discrete_sequence=['aquamarine'] if cluster == 'C1' else ['navy'],
        labels={"CN_mode": cnv_copy_number, 'hap_count': f"Number of haplotypes in {cluster} cluster"},
        height=400, width=500,
        range_x=[-0.3, 2.15],
        # NOTE(review): '-inf' is an unusual upper bound for a numeric axis — confirm plotly treats it as intended
        range_y=[0, '-inf'],
        hover_name='sample_id',
        trendline='ols',
    )

    # integer ticks only on the haplotype-count axis
    fig.update_layout(xaxis=dict(tickmode='linear', tick0=0, dtick=1))

    fig.show()

In [None]:
# display the haplotype cluster table from locusPocus again
locusPocus.clusters_df

In [None]:
# inspect the current contents of cnv_df
cnv_df

In [None]:
# histogram of the CN_mode values in cnv_df
cnv_df['CN_mode'].hist()

In [None]:
# Look up one sample in the metadata returned when no sample_sets argument is given.
meta = ag3.sample_metadata()
meta.loc[meta["sample_id"] == 'VBS45429-5563STDY8794371']

In [None]:
# HMM coverage plots for the first five entries of `names`
# (`names` holds whatever the most recently executed cell assigned to it).
hmm_region = "2L:28,520,000-28,650,000"
for sample_id in names[:5]:
    print(sample_id)
    ag3.plot_cnv_hmm_coverage(sample_id, sample_set=sample_sets, region=hmm_region)

In [None]:
# HMM coverage plots for two individual samples over the same region
ag3.plot_cnv_hmm_coverage('VBS45429-5563STDY8794371', sample_set=sample_sets, region="2L:28,520,000-28,650,000")
ag3.plot_cnv_hmm_coverage('BL0293-C', sample_set=sample_sets, region="2L:28,520,000-28,650,000")

## Looking legit!!

### Comparing CNV status, sweep status, and karyotype

In [None]:
# IDs of the samples whose CN_mode at AGAP006227 exceeds the cut-off used by get_copy_number.
# `sample_names` exists only as a local variable inside get_copy_number, so
# reading it here raises NameError on a fresh kernel; the IDs are recomputed instead.
cnv_ids = get_copy_number('AGAP006227', sample_sets=sample_sets)[1]

In [None]:
import warnings
# NOTE(review): this silences every warning for the rest of the session, not just for this cell
warnings.filterwarnings('ignore')

# NOTE(review): absolute, machine-specific paths — these files must exist at exactly these locations
karyotypes = pd.read_csv("/home/sanj/projects/gaard/results/gaard_karyotypes.tsv", sep="\t")
karyo2 = pd.read_csv("/home/sanj/projects/gaardian/results/karyotypes/gaardian_karyotypes.tsv", sep="\t", index_col=0)
# align the second table's column name with the first before stacking them
karyo2 = karyo2.rename(columns={'location2':'location'})
karyotypes = pd.concat([karyotypes, karyo2])

# 2La rows only, with the mean genotype rounded to a whole number;
# .assign returns a new frame instead of writing into the query result
karyo_2la = (
    karyotypes
    .query("inversion == '2La'")
    .assign(mean_genotype=lambda d: d['mean_genotype'].round(0))
)

# NOTE: overwrites df_samples with the karyotype-annotated table (joined on the shared columns)
df_samples = karyo_2la.merge(df_samples)

gamb_samples = df_samples.query("aim_species == 'gambiae'")

# 1 = sample_id is in cnv_ids, 0 = it is not. pd.factorize numbers values by
# order of first appearance, so with it the meaning of 0 and 1 depended on
# which kind of sample happened to come first in the table.
gamb_samples = gamb_samples.assign(
    cnv_status=np.isin(gamb_samples['sample_id'], cnv_ids).astype(int)
)

pd.crosstab(gamb_samples['mean_genotype'], gamb_samples['cnv_status'])

So the CNVs are found on the 2L+a haplotypic background. Interesting. We probably have multiple sweeps on different karyotypes at the same locus, one spreading from the Togo area and one from Burkina Faso.

Next, we need the IDs of each individual in each sweep