In [None]:
## import numpy as np
%matplotlib inline
import allel
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import seaborn as sns

import probe

# vectorized haversine function
def haversine(lat1, lon1, lat2, lon2, to_radians=True, earth_radius=6371):
    """
    Great-circle distance between two sets of points on a sphere.

    Slightly modified version of http://stackoverflow.com/a/29546836/2901002

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : scalar or array-like of numbers
        Coordinates of the first and second points. The inputs only need to
        broadcast against each other, so scalars and equal-length
        arrays/Series may be mixed.
    to_radians : bool, default True
        If True the coordinates are treated as decimal degrees and converted
        to radians; if False they are used as given (already in radians).
    earth_radius : float, default 6371
        Radius of the sphere. The result is expressed in the same unit
        (6371 corresponds to kilometres).

    Returns
    -------
    Distance(s) along the sphere surface, one value per broadcast element.
    """
    if to_radians:
        # Convert each coordinate on its own rather than stacking the four
        # inputs into one array, which required identical shapes.
        lat1, lon1, lat2, lon2 = (
            np.radians(np.asarray(coord, dtype=float))
            for coord in (lat1, lon1, lat2, lon2)
        )

    a = np.sin((lat2-lat1)/2.0)**2 + \
        np.cos(lat1) * np.cos(lat2) * np.sin((lon2-lon1)/2.0)**2

    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would make arcsin return NaN.
    a = np.minimum(a, 1.0)

    return earth_radius * 2 * np.arcsin(np.sqrt(a))

In [None]:
# Contig names iterated over by the per-contig loops in this notebook.
contigs = ['2L', '2R', '3R', '3L', 'X']

# Both inputs are tab-separated; paths are relative to the notebook location.
metadata, dblton = (
    pd.read_csv(tsv_path, sep="\t")
    for tsv_path in ("../../config/metadata.tsv", "../../results/f2variantPairs.tsv")
)

In [None]:
# Per-contig haplotype tables keyed by contig name; each one is tagged with
# its contig and given a `size` column (end - start).
f2Haps = {}
for contig in contigs:
    hap_table = pd.read_csv(f"../../results/f2variants/f2HapLengths.{contig}.tsv", sep="\t", index_col=0)
    hap_table['contig'] = contig
    hap_table['size'] = hap_table['end'] - hap_table['start']
    f2Haps[contig] = hap_table

# Stack all contigs into one frame, then attach the haplotype columns to the
# doubleton pairs. No join keys are given, so pandas joins on every column
# name the two frames share (after renaming dblton_pos -> pos).
f2df = pd.concat(f2Haps, axis=0).reset_index(drop=True)
f2haps = dblton.merge(f2df.rename(columns={'dblton_pos':'pos'}))

# Distance between the two locations recorded for each doubleton pair.
f2haps['distance'] = haversine(
    f2haps['latitude'],
    f2haps['longitude'],
    f2haps['latitude2'],
    f2haps['longitude2'],
)

#f2haps['dist_bin'] = pd.cut(f2haps['distance'], 4)

In [None]:
# Display the merged doubleton / haplotype table for inspection.
f2haps

In [None]:
# Ordinary least-squares fit of f2 haplotype size (y) on pairwise distance (x).
scipy.stats.linregress(f2haps['distance'], f2haps['size'])

In [None]:
# Histogram (100 bins) of haplotype sizes below the cutoff in the query.
# The literal 100_0000 evaluates to 1000000 (one million).
# NOTE(review): the unusual digit grouping suggests 100_000 or 1_000_000 was
# intended — confirm which.
f2haps.query("size < 100_0000")['size'].hist(bins=100)

In [None]:
# Print summary statistics of haplotype size per contig, formatting every
# statistic in fixed-point notation instead of scientific notation.
for contig in contigs:
    size_summary = f2Haps[contig]['size'].describe()
    print(contig, size_summary.map(lambda stat: format(stat, 'f')))
    print("\n")

In [None]:
# One scatter plot per contig: haplotype start position against haplotype size.
for contig in contigs:
    contig_haps = f2Haps[contig]
    fig, ax = plt.subplots()
    sns.scatterplot(x=contig_haps['start'], y=contig_haps['size'], alpha=0.3, ax=ax)
    ax.set_title(contig)
    plt.show()
    

So there does seem to be an excess of large f2 haplotypes around the centromeres - regions of low recombination - which makes sense. There also appears to be a clear spike at Gste2, though no spike at VGSC. What if we try calculating doubleton density in windows?

def plot_density(pos, window_size, title):
    """
    Plot the density of positions in consecutive fixed-size windows.

    Parameters
    ----------
    pos : array-like
        Positions to count; passed unchanged to ``allel.windowed_count``
        (presumably must be sorted ascending — verify against the allel docs).
    window_size : int
        Window width; each window's count is divided by this value.
    title : str or None
        Axes title; omitted when falsy.

    Shows the figure and returns nothing.
    """
    fig, ax = plt.subplots(figsize=(30, 10))
    sns.despine(ax=ax, offset=5)

    counts, windows = allel.windowed_count(pos, size=window_size)
    # Each window is drawn at the mean of its two bounds.
    window_centres = np.mean(windows, axis=1)
    density = counts/window_size

    ax.plot(window_centres, density)
    ax.set_xlabel('Position (bp)')
    ax.set_ylabel('Density (bp$^{-1}$)')
    if title:
        ax.set_title(title)
    plt.show()
    
# Doubleton density along each contig in 50 kb windows.
for contig in contigs:
    contig_doubletons = dblton.query("contig == @contig")
    plot_density(pos=contig_doubletons['pos'], window_size=50000, title=contig)

In [None]:
# Smooth the size-vs-position relationship per contig by taking moving
# medians of both the doubleton position and the haplotype size with the same
# window settings (size=5000, step=1000 — presumably counted in records, not
# bp; confirm against allel.moving_statistic).
for contig in contigs:
    df = f2Haps[contig]

    window_kwargs = dict(size=5000, step=1000)
    midpoints = allel.moving_statistic(df['dblton_pos'], np.median, **window_kwargs)
    sizes = allel.moving_statistic(df['size'], np.median, **window_kwargs)

    fig, ax = plt.subplots()
    sns.scatterplot(x=midpoints, y=sizes, alpha=0.3, ax=ax)
    ax.set_title(contig)
    plt.show()

#### Integrate relatedness

In [None]:
# Pairwise relatedness estimates; samples are identified by the integer
# columns `a` and `b`.
rel = pd.read_csv("../../results/relatedness/ngsRelate.ag3_gaardian", sep="\t")

# Match `a`/`b` to metadata by row position.
# NOTE(review): assumes the relatedness run used the same sample order as
# metadata.tsv — confirm.
metadata['order'] = np.arange(0,len(metadata))

# Number of doubletons shared by each sample pair. Naming the count column via
# reset_index(name=...) works whether value_counts returns an unnamed Series
# (pandas 1.x) or one named "count" (pandas >= 2.0); renaming column 0 only
# worked in the former case and silently left `n_doubletons` missing otherwise.
n_dbltons = dblton.value_counts(['idx1', 'idx2']).reset_index(name='n_doubletons')

# After the second merge, metadata columns carry the suffix _x for sample `a`
# and _y for sample `b`.
rel = rel.merge(metadata, left_on='a', right_on='order').merge(metadata, left_on='b', right_on='order')
rel = rel.rename(columns={'a':'idx1', 'b':'idx2'})
rel = rel.merge(n_dbltons, on=['idx1', 'idx2'])

# Keep only within-species pairs. The explicit copy avoids
# SettingWithCopyWarning when columns are added to the filtered frame below.
rel['spcomp'] = rel['species_gambiae_coluzzii_x'] + rel['species_gambiae_coluzzii_y']
rel = rel.query("spcomp == 'coluzziicoluzzii' | spcomp == 'gambiaegambiae'").copy()

## distance column 
rel['distance'] = haversine(rel['latitude_y'], rel['longitude_y'], rel['latitude_x'], rel['longitude_x'])

# Total f2 haplotype length shared by each pair, joined on the pair ids.
totf2HapLength = f2haps.groupby(['idx1','idx2']).agg({'size':'sum'}).reset_index()
rel = rel.merge(totf2HapLength, on=['idx1', 'idx2'])


# Classify each pair by its KING kinship estimate.
# The upper bounds are cumulative and np.select takes the first matching
# condition, so the classes tile [-1, 0.5] with no gaps:
#   [-1, 0.0442], (0.0442, 0.0884], (0.0884, 0.177], (0.177, 0.354], (0.354, 0.5]
# Previously the ranges started at 0.0443 / 0.0885 / 0.178 / 0.355, so values
# falling between two ranges (e.g. 0.04425) were labelled 'Unknown'.
# Every value that was classified before keeps its label; NaN and values
# outside [-1, 0.5] are still 'Unknown'.
king = rel['KING']
king_in_range = king.between(-1, 0.5, inclusive='both')

rel['kinship'] = np.select(
    [
        king_in_range & (king <= 0.0442),
        king_in_range & (king <= 0.0884),
        king_in_range & (king <= 0.177),
        king_in_range & (king <= 0.354),
        king_in_range
    ], 
    [
        'Unrelated', 
        '3rd-Degree',
        '2nd-Degree',
        '1st Degree (full sib)',
        'Dup/Twin'
    ], 
    default='Unknown'
)

In [None]:
# Inspect pairs whose KING estimate is below -0.05.
rel.query("KING < -0.05")

In [None]:
# Four equal-width distance classes. With an integer `bins`, pd.cut derives the
# edges from the observed distance range, so these hard-coded labels are only
# accurate if distances span roughly 0-70 km — TODO confirm.
dist_labels_4 = ['0-17km', '17-34km', '34-51km', '51-70km']
f2haps['dist_bins'] = pd.cut(f2haps['distance'], bins=len(dist_labels_4), labels=dist_labels_4)

# Natural log of haplotype size.
f2haps['size_log'] = np.log(f2haps['size'])

In [None]:
# Median (despite the variable name) of log haplotype size per distance bin.
# Dropping the index leaves the medians keyed by bin position 0..n-1 under
# 'size_log'.
f2hapmean_dict = (
    f2haps
    .groupby("dist_bins")
    .agg({'size_log':'median'})
    .reset_index(drop=True)
    .to_dict()
)
f2hapmean_dict

In [None]:
# Ridge plot: one KDE of log haplotype size per distance bin, stacked
# vertically with overlap. Columns are renamed to the short names used below:
# g = distance bin, x = log haplotype size.
df = f2haps[['size_log', 'dist_bins']].rename(columns={'dist_bins':'g', 'size_log':'x'})

# NOTE(review): this figure does not appear to be used — the grid below is
# drawn on `g.figure` — so it may only produce an empty output; confirm and remove.
plt.figure(figsize=[10,10])
# Transparent axes background so overlapping rows do not hide each other.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})

# Initialize the FacetGrid object
pal = sns.cubehelix_palette(10, rot=-.25, light=.7)
g = sns.FacetGrid(df, row="g", hue="g", aspect=15, height=2, palette=pal)
# Draw the densities in a few steps
# First pass: filled density in the row's hue colour.
g.map(sns.kdeplot, "x",
      bw_adjust=.5, clip_on=False,
      fill=True, alpha=1, linewidth=1.5)
# Second pass: white outline over the same density.
g.map(sns.kdeplot, "x", clip_on=False, color="w", lw=2, bw_adjust=.5)

# passing color=None to refline() uses the hue mapping
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)


# Define and use a simple function to label the plot in axes coordinates
def label(x, color, label):
    """Write `label` in bold, in `color`, at the lower left of the current axes."""
    ax = plt.gca()
    ax.text(0, .2, label, fontweight="bold", color=color,
            ha="left", va="center", transform=ax.transAxes)


# flatten axes into a 1-d array
axes = g.axes.flatten()

# iterate through the axes
# Dashed vertical line at the i-th entry of f2hapmean_dict['size_log'].
# Assumes the grid rows are in the same order as those entries — TODO confirm.
for i, ax in enumerate(axes):
    ax.axvline(f2hapmean_dict['size_log'][i], ls='--', c='black')


g.map(label, "x")
g.set_xlabels("log distribution of f2 haplotype size")

# Set the subplots to overlap
g.figure.subplots_adjust(hspace=-.25)

# Remove axes details that don't play well with overlap
g.set_titles("")
g.set(yticks=[], ylabel="")
g.despine(bottom=True, left=True)

In [None]:
# Preview the relatedness table. The cell previously contained only `re`,
# which is not defined or imported anywhere above and raises NameError on a
# fresh kernel, halting Run All.
rel.head()

In [None]:
# KING estimate against geographic distance, coloured by kinship class.
# Pairs with KING <= -0.2 are left out of the plot.
fig, ax = plt.subplots(figsize=[14,10])
sns.scatterplot(
    data=rel.query("KING > -0.2"),
    x='distance',
    y='KING',
    hue='kinship',
    alpha=0.95,
    s=120,
    linewidth=0.4,
    edgecolor='white',
    ax=ax,
)
plt.show()

In [None]:
# Shared doubleton count against geographic distance, coloured by kinship class.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=[14,10])
sns.scatterplot(
    data=rel,
    x='distance',
    y='n_doubletons',
    hue='kinship',
    alpha=0.8,
    s=120,
    linewidth=0.4,
    edgecolor='white',
    ax=ax,
)
plt.show()

In [None]:
# Same relationship as the previous plot, drawn on a white axes background
# and without marker outlines.
plt.rcParams['axes.facecolor'] = 'white'
fig, ax = plt.subplots(figsize=[14,10])
sns.scatterplot(
    data=rel,
    x='distance',
    y='n_doubletons',
    hue='kinship',
    alpha=0.95,
    s=120,
    ax=ax,
)
plt.show()

In [None]:
# Histogram of all f2 haplotype sizes (no size filter, default binning).
f2haps['size'].hist()

In [None]:
# Locations of both samples for pairs with KING above 0.177, the upper bound
# of the '2nd-Degree' class in the kinship column.
rel.query("KING > 0.177")[['location2_y', 'location2_x']]

In [None]:
# Inspect the largest haplotypes: size above 10,000,000.
f2haps.query("size > 10_000_000")

In [None]:
# Replace the earlier 4-class `dist_bins` column with seven equal-width classes.
# pd.cut derives the edges from the observed range, so these labels are only
# accurate if distances span roughly 0-70 km — TODO confirm.
dist_labels_7 = ['0-10km', '10-20km', '20-30km', '30-40km', '40-50km', '50-60km', '60-70km']
f2haps['dist_bins'] = pd.cut(f2haps['distance'], bins=len(dist_labels_7), labels=dist_labels_7)

In [None]:
# Overwrite `size_log` with base-2 logs. The column was previously filled with
# natural logs (np.log), so plots made before and after this cell use
# different scales.
f2haps['size_log'] = np.log2(f2haps['size'])

In [None]:
# Median haplotype size per distance bin, written as a tab-separated file in
# the current working directory.
dist_bin_medians = f2haps.groupby('dist_bins').agg({'size':'median'})
dist_bin_medians.to_csv("f2_hap_medians.tsv", sep="\t")

In [None]:
# KDE of log haplotype size, one curve per distance bin, with a rug of the
# individual observations.
sns.displot(data=f2haps, x='size_log', hue='dist_bins', kind='kde', palette='tab10', rug=True)

In [None]:
# Same KDE on the untransformed size, restricted to haplotypes strictly
# between 10,000 and 300,000.
sns.displot(data=f2haps.query("size < 300_000 & size > 10_000"), x='size', hue='dist_bins', kind='kde', palette='tab10', rug=True)

In [None]:
# Pairs with KING strictly between 0.0884 and 0.177 (the range labelled
# '2nd-Degree' in the kinship column), shown with distance and both locations.
rel2s = rel.query("KING > 0.0884 & KING < 0.177")
rel2s[['KING','distance', 'location2_y', 'location2_x']]

In [None]:
# Pairs with KING strictly between 0.0442 and 0.0884 (the range labelled
# '3rd-Degree' in the kinship column).
rel.query("KING > 0.0442 & KING < 0.0884")

In [None]:
# Pairs below the 0.0442 KING threshold whose `_y` sample is coluzzii.
rel.query("KING < 0.0442 & species_gambiae_coluzzii_y == 'coluzzii'")

In [None]:
# Pairs with KING strictly between 0.0442 and 0.0884 whose `_y` sample is coluzzii.
rel3s = rel.query("KING > 0.0442 & KING < 0.0884 &  species_gambiae_coluzzii_y == 'coluzzii'")

In [None]:
# Distribution of geographic distance among the pairs selected into rel3s.
rel3s['distance'].hist()