In [None]:
import allel

In [None]:
import numpy as np
import pandas as pd
%matplotlib inline
from numba import njit
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import probe as probe

# vectorized haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Great-circle distance between two points (or arrays of points) on a sphere.

    Slightly modified version of http://stackoverflow.com/a/29546836/2901002

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like of numbers
        Coordinates of the first and second point. They are broadcast
        against each other, so a single point can be compared with a whole
        array of points.
    to_radians : bool, default True
        If True the coordinates are taken to be decimal degrees and are
        converted to radians first; if False they are used as given.
    earth_radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit.

    Returns
    -------
    numpy scalar or array
        ``earth_radius * 2 * arcsin(sqrt(a))`` with ``a`` the haversine term.
    """
    if to_radians:
        # Convert each coordinate on its own rather than stacking the four
        # inputs into one array: stacking only works when all inputs have
        # the same length and breaks when scalars and arrays are mixed.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(coord, dtype=float))
            for coord in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2 - lat1) / 2.0) ** 2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2.0) ** 2

    # Floating-point rounding can leave sqrt(a) a hair above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    return earth_radius * 2 * np.arcsin(np.minimum(np.sqrt(a), 1.0))

In [None]:
# Contig names used as dictionary keys and in file paths throughout the notebook.
contigs = ['2L', '2R', '3R', '3L', 'X']
# Tab-separated sample metadata and the table of doubleton variant pairs.
metadata = pd.read_csv("../../config/metadata.tsv", sep="\t")
dblton = pd.read_csv("../../results/f2variantPairs.tsv", sep="\t")

In [None]:
# Default pair of sample indices; passed to genome_wide_dxy in later cells.
idx1 = 351
idx2 = 353

In [None]:
# Genotype calls and variant positions for every contig, keyed by contig name.
snps, pos = {}, {}

for contig in contigs:
    # probe.loadZarrArrays receives the genotype, position and site-filter
    # paths of this contig and hands back a (genotypes, positions) pair.
    genotypes, positions = probe.loadZarrArrays(
        genotypePath=f"../../resources/snp_genotypes/all/1244-VO-GH-YAWSON-VMF00149/{contig}/calldata/GT/",
        positionsPath=f"../../resources/snp_genotypes/all/sites/{contig}/variants/POS/",
        siteFilterPath=f"../../resources/site_filters/dt_20200416/gamb_colu/{contig}/variants/filter_pass/",
    )
    snps[contig] = genotypes
    pos[contig] = positions
          
  

### Load Relatedness and doubleton data

In [None]:
# Pairwise relatedness table; columns 'a' and 'b' hold the row positions of the
# two samples in `metadata`.
rel = pd.read_csv("../../results/relatedness/ngsRelate.ag3_gaardian", sep="\t")
metadata['order'] = np.arange(0, len(metadata))

# Number of doubleton rows per sample pair. groupby().size() names the count
# column explicitly; DataFrame.value_counts() labels it 0 on older pandas and
# 'count' on pandas >= 2.0, so renaming column 0 silently does nothing there.
n_dbltons = dblton.groupby(['idx1', 'idx2']).size().reset_index(name='n_doubletons')

# Attach the metadata of both samples: columns shared by the two metadata
# copies end up suffixed `_x` (sample a) and `_y` (sample b).
rel = rel.merge(metadata, left_on='a', right_on='order').merge(metadata, left_on='b', right_on='order')
rel = rel.rename(columns={'a':'idx1', 'b':'idx2'})
rel = rel.merge(n_dbltons, on=['idx1', 'idx2'])

# Species composition of the pair: the two species labels concatenated.
rel['spcomp'] = rel['species_gambiae_coluzzii_x'] + rel['species_gambiae_coluzzii_y']
#rel = rel.query("spcomp == 'coluzziicoluzzii' | spcomp == 'gambiaegambiae'")

# Per-contig f2 haplotype tables, each tagged with its contig name and the
# haplotype length (end - start).
f2Haps = {}
for contig in contigs:
    hap_table = pd.read_csv(f"../../results/f2variants/f2HapLengths.{contig}.tsv", sep="\t", index_col=0)
    f2Haps[contig] = hap_table.assign(
        contig=contig,
        size=hap_table['end'] - hap_table['start'],
    )

# Stack the contigs into one frame and join the haplotypes onto the doubleton
# table via their shared columns (after renaming 'dblton_pos' to 'pos').
f2df = pd.concat(f2Haps, axis=0).reset_index(drop=True)
f2haps = dblton.merge(f2df.rename(columns={'dblton_pos':'pos'}))

# Great-circle distance between the two coordinate pairs of each row.
f2haps['distance'] = haversine(
    f2haps['latitude'],
    f2haps['longitude'],
    f2haps['latitude2'],
    f2haps['longitude2'],
)

## Great-circle distance between the two samples of every pair
rel['distance'] = haversine(
    lat1=rel['latitude_y'],
    lon1=rel['longitude_y'],
    lat2=rel['latitude_x'],
    lon2=rel['longitude_x'],
)

## Summed f2 haplotype length per sample pair, joined onto the relatedness table
totf2HapLength = f2haps.groupby(['idx1','idx2'])['size'].sum().reset_index()
rel = rel.merge(totf2HapLength)


# Bin the KING estimate into kinship classes.
# The upper bounds are cumulative and np.select takes the first matching
# condition, so every value in [-1, 0.5] gets a class. Separate closed
# intervals such as [-1, 0.0442] and [0.0443, 0.0884] would leave values
# like 0.04425 unmatched. Values outside [-1, 0.5] and missing values keep
# the default 'Unknown'.
king = rel['KING']
king_in_range = king.between(-1, 0.5, inclusive='both')

rel['kinship'] = np.select(
    [
        king_in_range & (king <= 0.0442),
        king_in_range & (king <= 0.0884),
        king_in_range & (king <= 0.177),
        king_in_range & (king <= 0.354),
        king_in_range,
    ],
    [
        'Unrelated',
        '3rd-Degree',
        '2nd-Degree',
        '1st Degree (full sib)',
        'Dup/Twin'
    ],
    default='Unknown'
)

In [None]:
# KING distribution for pairs in which both samples are labelled gambiae.
ax = rel.query("spcomp == 'gambiaegambiae'")['KING'].hist()
ax.set_xlabel("KING")
ax.set_ylabel("Number of sample pairs")
ax.set_title("spcomp == 'gambiaegambiae'")

In [None]:
# KING distribution for pairs in which both samples are labelled coluzzii.
ax = rel.query("spcomp == 'coluzziicoluzzii'")['KING'].hist()
ax.set_xlabel("KING")
ax.set_ylabel("Number of sample pairs")
ax.set_title("spcomp == 'coluzziicoluzzii'")

In [None]:
# KING distribution for mixed-species pairs. `spcomp` concatenates the two
# species labels in sample order, so a mixed pair can appear under either
# spelling; both are selected.
ax = rel.query("spcomp in ['coluzziigambiae', 'gambiaecoluzzii']")['KING'].hist()
ax.set_xlabel("KING")
ax.set_ylabel("Number of sample pairs")
ax.set_title("spcomp in ['coluzziigambiae', 'gambiaecoluzzii']")

In [None]:
import plotly.express as px

In [None]:
# Partner sample IDs of the first sample in each pair (`_x` = metadata merged on 'a').
rel['partner_sample_id_x']

In [None]:
# Geographic distance against KING for every sample pair, coloured by kinship
# class. Every encoding is given as a column name of `rel`, so Plotly Express
# resolves x, y, colour and hover fields from the one frame instead of mixing
# detached Series with `data_frame`.
px.scatter(
    rel,
    x='distance',
    y='KING',
    color='kinship',
    hover_data=[
        'idx1',
        'idx2',
        'location_x',
        'location_y',
        'species_gambiae_coluzzii_x',
        'partner_sample_id_x',
        'species_gambiae_coluzzii_y',
        'partner_sample_id_y',
    ],
)

In [None]:
# Number of sample pairs with more than 1800 doubleton rows.
(rel['n_doubletons'] > 1800).sum()

In [None]:
# Largest doubleton count observed for any sample pair.
rel['n_doubletons'].max()

In [None]:
# Look up the pair whose first sample is WA-2009 and whose second sample is WA-2014.
rel.query("partner_sample_id_y == 'WA-2014' & partner_sample_id_x == 'WA-2009'")

In [None]:
# Tab-separated karyotype calls; the first column becomes the index.
karyo_df = pd.read_csv("../../results/karyotypes/gaardian_karyotypes.tsv", sep="\t", index_col=0)
# 2La calls for the two samples WA-2361 and WA-2363.
karyo_df.query("partner_sample_id in ['WA-2361', 'WA-2363'] & inversion == '2La'")

In [None]:
# Restrict to the 2La inversion, then build one (sample id, mean genotype)
# table per side of a pair. The `_x` / `_y` column names line up with the
# suffixed metadata columns in `rel`.
karyo_df = karyo_df.query("inversion == '2La'")
karyo_df1, karyo_df2 = (
    karyo_df[['partner_sample_id', 'mean_genotype']].rename(
        columns={
            'partner_sample_id': f'partner_sample_id_{side}',
            'mean_genotype': f'karyo_{side}',
        }
    )
    for side in ('x', 'y')
)

In [None]:
# Attach the 2La mean genotype of each sample in the pair; both merges join on
# whatever columns the two frames share.
rel = rel.merge(karyo_df1)
rel = rel.merge(karyo_df2)

In [None]:
# List the columns available on the merged pair table.
rel.columns

In [None]:
# Rounded absolute difference in 2La mean genotype between the two samples,
# for pairs whose KING lies strictly between -0.05 and 0.05.
ax = (
    rel.query("KING < 0.05 and KING > -0.05")[['karyo_x', 'karyo_y']]
    .assign(karyotype_difference=lambda pairs: np.round(np.abs(pairs['karyo_x'] - pairs['karyo_y'])))
    ['karyotype_difference']
    .hist()
)
ax.set_xlabel("Pairwise 2La karyotype difference")
ax.set_title("KING < 0.05 and KING > -0.05")

In [None]:
# Highest KING value among all pairs.
rel['KING'].max()

In [None]:
# Same karyotype-difference histogram, restricted to pairs with KING below -0.05.
ax = (
    rel.query("KING < -0.05")[['karyo_x', 'karyo_y']]
    .assign(karyotype_difference=lambda pairs: np.round(np.abs(pairs['karyo_x'] - pairs['karyo_y'])))
    ['karyotype_difference']
    .hist()
)
ax.set_xlabel("Pairwise 2La karyotype difference")
ax.set_title("KING < -0.05")

In [None]:
def genome_wide_dxy(snps, pos, idx1, idx2, size, fst=False):
    """
    Plot windowed divergence (or Hudson's Fst) between two samples along the genome.

    Parameters
    ----------
    snps : dict
        Maps contig name to a genotype container. `.take`, `.count_alleles`
        and `.compress` are called on it (presumably a scikit-allel genotype
        array; confirm against probe.loadZarrArrays). Must contain the keys
        '2R', '2L', '3R', '3L' and 'X'.
    pos : dict
        Maps contig name to the variant positions matching `snps[contig]`;
        indexed with a boolean mask.
    idx1, idx2 : int
        Column indices of the two samples in each genotype container.
    size : int
        Window size. Passed as `size` to `allel.moving_hudson_fst` and
        `allel.moving_statistic` when `fst` is truthy, and to
        `allel.windowed_divergence` otherwise.
    fst : bool, default False
        If truthy plot moving-window Hudson's Fst (y-axis -1..1); otherwise
        plot windowed divergence (y-axis 0..0.05).

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    # The three panels below are built from exactly these contigs.
    arms = ('2R', '2L', '3R', '3L', 'X')

    stat = {}
    midpoints = {}

    for contig in arms:
        # Keep only the two samples, then only sites segregating between them.
        geno = snps[contig].take([idx1, idx2], axis=1)
        seg = geno.count_alleles().is_segregating()
        geno = geno.compress(seg, axis=0)
        posi = pos[contig][seg]

        # Allele counts of each sample on its own.
        ac1 = geno.take([0], axis=1).count_alleles()
        ac2 = geno.take([1], axis=1).count_alleles()

        # Truthiness test, matching the y-limit selection further down
        # (an identity check against True would treat e.g. fst=1 differently).
        if fst:
            stat[contig] = allel.moving_hudson_fst(ac1, ac2, size=size)
            midpoints[contig] = allel.moving_statistic(posi, np.median, size=size)
        else:
            stat[contig], windows, _, _ = allel.windowed_divergence(posi, ac1, ac2, size=size)
            midpoints[contig] = np.median(windows, axis=1)

    ## plotting
    # Left arms are appended after right arms, shifted by the largest
    # midpoint of the right arm so each chromosome shares one x-axis.
    panels = [
        (
            "Chromosome 2",
            'red',
            np.append(midpoints['2R'], midpoints['2R'].max() + midpoints['2L']),
            np.append(stat['2R'], stat['2L']),
        ),
        (
            "Chromosome 3",
            'dodgerblue',
            np.append(midpoints['3R'], midpoints['3R'].max() + midpoints['3L']),
            np.append(stat['3R'], stat['3L']),
        ),
        (
            "Chromosome X",
            'green',
            midpoints['X'],
            stat['X'],
        ),
    ]

    # Panel widths are proportional to the plotted length of each chromosome.
    f, axes = plt.subplots(
        1, 3, figsize=[20, 4],
        gridspec_kw={'width_ratios': [positions.max() for _, _, positions, _ in panels]},
    )

    y_limits = (-1, 1) if fst else (0, 0.05)

    for ax, (title, color, positions, values) in zip(axes, panels):
        sns.despine(top=True, right=True, left=True, bottom=False, ax=ax)
        sns.scatterplot(ax=ax, x=positions, y=values, color=color)
        # Tick positions only: text properties such as fontsize are accepted
        # by set_xticks solely together with explicit labels.
        ax.set_xticks(np.arange(0, positions.max(), 10000000))
        ax.set_title(title)
        ax.ticklabel_format(style='sci', scilimits=(6, 6), axis='x')
        ax.set_ylim(*y_limits)

    f.supxlabel('Genome Position')

    plt.show()


## genome_wide_dxy(snps, pos, 351, 353, 10_000, fst=True)

In [None]:
# Windowed divergence between samples 351 and 353, size=500,000.
genome_wide_dxy(snps, pos, idx1=351, idx2=353, size=500_000, fst=False)

In [None]:
# Moving-window Hudson's Fst between samples 85 and 277, size=10,000.
genome_wide_dxy(snps, pos, idx1=85, idx2=277, size=10_000, fst=True)

In [None]:
# KING value of the pair WA-2009 (first sample) / WA-2014 (second sample).
rel.query("partner_sample_id_y == 'WA-2014' & partner_sample_id_x == 'WA-2009'")['KING']

In [None]:
# sex_call of the first sample in every pair with KING above 0.15.
rel.query("KING > 0.15")['sex_call_x']

In [None]:
# Moving-window Hudson's Fst between samples 217 and 408, size=10,000.
genome_wide_dxy(snps, pos, idx1=217, idx2=408, size=10_000, fst=True)

In [None]:
# Windowed divergence between samples 8 and 12, size=500,000.
genome_wide_dxy(snps, pos, idx1=8, idx2=12, size=500_000)

In [None]:
# Moving-window Hudson's Fst between samples 23 and 44, size=10,000.
genome_wide_dxy(snps, pos, idx1=23, idx2=44, size=10_000, fst=True)

In [None]:
# Windowed divergence between samples 23 and 44, size=500,000.
genome_wide_dxy(snps, pos, idx1=23, idx2=44, size=500_000)

In [None]:
# Windowed divergence between samples 33 and 278, size=100,000.
genome_wide_dxy(snps, pos, idx1=33, idx2=278, size=100_000)

In [None]:
# Windowed divergence for the default sample pair (idx1, idx2), size=100,000.
genome_wide_dxy(snps, pos, idx1=idx1, idx2=idx2, size=100_000)

In [None]:
# Pairs whose summed f2 haplotype length (`size`) exceeds 200,000,000.
rel.query("size > 200_000_000")

In [None]:
# Relatedness row for the sample pair (8, 12).
rel.query("idx1 == 8 & idx2 == 12")

In [None]:
# Moving-window Hudson's Fst between samples 8 and 12, size=10,000.
genome_wide_dxy(snps, pos, idx1=8, idx2=12, size=10_000, fst=True)

In [None]:
# Moving-window Hudson's Fst for the default sample pair (idx1, idx2), size=10,000.
genome_wide_dxy(snps, pos, idx1=idx1, idx2=idx2, size=10_000, fst=True)

In [None]:
# Same call as the previous cell: Hudson's Fst for (idx1, idx2), size=10,000.
genome_wide_dxy(snps, pos, idx1=idx1, idx2=idx2, size=10_000, fst=True)