In [4]:
%matplotlib inline

#import malariagen_data
import allel
import numpy as np
import pandas as pd
import dask
from dask.diagnostics import ProgressBar
import zarr
from pathlib import Path
import scipy
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

In [5]:
# Execute tools.py so that the names it defines are loaded into this notebook's namespace.
# NOTE(review): `inversionDict` and `compkaryo`, used further down, are not defined anywhere
# in the visible notebook, so they presumably come from this script — confirm.
%run tools.py

### Karyotyping the GAARDIAN data

In [6]:
# Handle on the Ag3 data resource stored in the given cloud bucket.
# `pre=True` is passed straight to the constructor (presumably to expose pre-release
# sample sets — TODO confirm against the malariagen_data documentation).
# NOTE(review): `import malariagen_data` is commented out in the imports cell, so this name
# must be provided by something else (e.g. `%run tools.py`) — verify on a fresh kernel.
ag3 = malariagen_data.Ag3("gs://vo_agam_release/", pre=True)

In [7]:
# Fetch the table of sample sets belonging to release v3.4 and display it.
df_sample_sets = ag3.sample_sets(release="v3.4")
df_sample_sets

Unnamed: 0,sample_set,sample_count,release
0,1191-VO-MULTI-OLOUGHLIN-VMF00106,237,v3.4
1,1191-VO-MULTI-OLOUGHLIN-VMF00140,1095,v3.4
2,1244-VO-GH-YAWSON-VMF00149,485,v3.4


In [8]:
# Sample set analysed below (listed with 485 samples in the v3.4 table above).
my_sample_set = '1244-VO-GH-YAWSON-VMF00149'
# Sample metadata restricted to that sample set.
metadata = ag3.sample_metadata(sample_sets=my_sample_set)

In [9]:
# Short location label: everything in `location` before the first "." character.
# Note this adds the column to `metadata` in place.
location_tokens = metadata['location'].str.split(".")
metadata['location2'] = location_tokens.str[0]

## compkaryo

In [34]:
# Results of compkaryo, keyed by inversion name: one DataFrame per inversion.
invDict = {}

# The first element of each inversionDict entry is used as the contig for that inversion.
for inversion, inversion_info in inversionDict.items():
    chrom = inversion_info[0]

    # Genotype calls for the chosen sample set on this contig, wrapped for scikit-allel.
    snps = allel.GenotypeDaskArray(
        ag3.snp_genotypes(contig=chrom, sample_sets=my_sample_set)
    )
    # The first item returned by snp_sites() is used as the site positions.
    pos = allel.SortedIndex(ag3.snp_sites(contig=chrom)[0])

    callset = dict(geno=snps, pos=pos, chrom=chrom)
    print(f"--- Running CompKaryo {inversion}--- ")
    av_gts, total_sites, num_0, num_1, num_2 = compkaryo(callset, inversion)
    # total_sites comes back lazy; evaluate it before storing.
    total_sites = total_sites.compute()

    # NOTE(review): the column is labelled 'partner_sample_id' but is filled from
    # metadata['sample_id']; the export cell renames it to 'sample_id' before merging.
    # Assumes compkaryo returns per-sample values in the same order as the rows of
    # `metadata` — TODO confirm.
    invDict[inversion] = pd.DataFrame(
        {
            'partner_sample_id': metadata['sample_id'],
            'inversion': inversion,
            'mean_genotype': av_gts,
            'total_snp_tags': total_sites,
        }
    )
    

--- Running CompKaryo 2La--- 
--- Running CompKaryo 2Rj--- 
--- Running CompKaryo 2Rb--- 
--- Running CompKaryo 2Rc_col--- 
--- Running CompKaryo 2Rc_gam--- 
--- Running CompKaryo 2Rd--- 
--- Running CompKaryo 2Ru--- 


In [35]:
# Per-sample annotations (both sample identifiers, short location label, species call)
# that get attached to the karyotype calls in the next cell.
df = metadata.loc[
    :,
    ['sample_id', 'partner_sample_id', 'location2', 'species_gambiae_coluzzii'],
]

In [None]:
# Stack the per-inversion tables, attach the sample annotations and write the result out.
# The compkaryo tables label the sample identifier 'partner_sample_id' although it holds
# `sample_id` values, hence the rename before joining on 'sample_id' (the only column the
# two frames share). set_index/reset_index moves the real 'partner_sample_id' to the front.
gaardian_karyotypes = (
    pd.concat(invDict.values())
    .rename(columns={'partner_sample_id': 'sample_id'})
    .merge(df, on='sample_id')
    .set_index('partner_sample_id')
    .reset_index()
    .drop(columns='sample_id')
)
# NOTE(review): this writes into the working directory, while the loading cell below reads
# "../gaardian_karyotypes.tsv" — confirm which location is intended.
gaardian_karyotypes.to_csv("gaardian_karyotypes.tsv", sep="\t")

### Load karyo data

In [7]:
# Reload the saved karyotype calls; the first column of the file is used as the index.
# NOTE(review): the export cell above writes "gaardian_karyotypes.tsv" to the working
# directory, whereas this reads from the parent directory — confirm the file location.
gaardian_karyotypes = pd.read_csv("../gaardian_karyotypes.tsv", sep="\t", index_col=0)

In [8]:
# Display the loaded table: one row per sample and inversion.
gaardian_karyotypes

Unnamed: 0,partner_sample_id,inversion,mean_genotype,total_snp_tags,location2,species_gambiae_coluzzii
0,WA-2001,2La,0.000000,209,Adumanu,gambiae
1,WA-2001,2Rj,0.060606,99,Adumanu,gambiae
2,WA-2001,2Rb,0.123209,349,Adumanu,gambiae
3,WA-2001,2Rc_col,0.438596,57,Adumanu,gambiae
4,WA-2001,2Rc_gam,0.061224,49,Adumanu,gambiae
...,...,...,...,...,...,...
3390,WA-2500,2Rb,0.086207,348,Domenase,coluzzii
3391,WA-2500,2Rc_col,0.122807,57,Domenase,coluzzii
3392,WA-2500,2Rc_gam,0.020408,49,Domenase,coluzzii
3393,WA-2500,2Rd,0.081633,147,Domenase,coluzzii


In [56]:
def get_karyotype_freq(dfpath, inversion, species, metadata):
    """
    Compute the per-location frequency of one inversion within one species.

    Each sample's mean tag-SNP genotype is rounded to the nearest whole number
    and that value is used as the sample's allele count. For every location the
    frequency is the sum of these counts divided by twice the number of samples.

    Parameters
    ----------
    dfpath : str, path or file-like
        Tab-separated karyotype table, passed to ``pd.read_csv`` with the first
        column as index. Must contain the columns 'species_gambiae_coluzzii',
        'inversion', 'mean_genotype' and 'location2'.
    inversion : str
        Value to select in the 'inversion' column; also used to name the
        output column ``f'{inversion}_frequency'``.
    species : str
        Value to select in the 'species_gambiae_coluzzii' column.
    metadata : pandas.DataFrame
        Sample metadata with 'location2', 'latitude' and 'longitude' columns.
        Coordinates are averaged per location.

    Returns
    -------
    pandas.DataFrame
        One row per location present in both the karyotype table and
        `metadata`, with columns 'location2', 'n' (number of samples with a
        karyotype), ``f'{inversion}_frequency'``, 'latitude' and 'longitude'.
    """
    karyotypes = pd.read_csv(dfpath, sep="\t", index_col=0)

    is_target = (
        (karyotypes['species_gambiae_coluzzii'] == species)
        & (karyotypes['inversion'] == inversion)
    )
    # Work on an explicit copy: assigning a column to a slice of `karyotypes`
    # is what triggered pandas' SettingWithCopyWarning.
    karyo = karyotypes.loc[is_target].copy()
    karyo['karyotype'] = karyo['mean_genotype'].round()
    # Samples without a karyotype are left out of both the count and the sum.
    karyo = karyo.dropna(subset=['karyotype'])

    # Aggregate directly per location. This avoids the value_counts()/rename
    # round trip, whose resulting column name differs between pandas versions.
    df = karyo.groupby('location2').agg(
        n=('karyotype', 'count'),
        total_alleles=('karyotype', 'sum'),
    )
    # Two chromosome copies per sample, hence the factor of 2.
    df[f'{inversion}_frequency'] = df['total_alleles'] / (df['n'] * 2)
    df = df.drop(columns=['total_alleles']).reset_index()

    # Mean coordinates of each location.
    metalocs = (
        metadata[['location2', 'latitude', 'longitude']]
        .groupby('location2')
        .agg({'latitude': 'mean', 'longitude': 'mean'})
        .reset_index()
    )

    # Inner join on the location name; made explicit rather than relying on
    # whichever columns the two frames happen to share.
    karyo_meta = pd.merge(df, metalocs, on='location2')
    return karyo_meta
    

In [57]:
# 2Rb frequency per location among coluzzii samples, with each location's mean coordinates.
# The function re-reads the karyotype table from disk rather than using `gaardian_karyotypes`.
get_karyotype_freq("../gaardian_karyotypes.tsv", "2Rb", "coluzzii", metadata)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,location2,n,2Rb_frequency,latitude,longitude
0,Adansi Apagya,15,0.033333,5.991625,-1.371125
1,Adansi-Krom,6,0.0,6.170833,-1.8355
2,Adumanu,5,0.0,6.303333,-1.6815
3,Anhwiaso,2,0.0,6.144,-1.469
4,Ankaako,12,0.083333,6.175833,-1.793083
5,Annorkrom,46,0.021739,5.97,-1.697
6,Anwona,5,0.1,6.089,-1.51625
7,Bogyawe,3,0.0,6.3888,-1.548
8,Domenase,57,0.017544,6.047947,-1.961877
9,Dompoase,1,0.0,6.308667,-1.533333
