In [1]:
import functions
from dismal import blocking
import pyranges as pr
from collections import Counter
import numpy as np
import scipy
from joblib import Parallel, delayed
from sklearn.ensemble import RandomForestRegressor
import os
import sqlite3

## Real (simulated) data

In [2]:
blocks = pr.read_bed("../simulations/stdpopsim_simulations/dismal_ponabe/PonAbe_blocks_1932.bed")
blocks_df = blocks.df.rename(columns={"ThickStart": "Sample1", "ThickEnd": "Sample2", "ItemRGB": "NumSegSites"}).iloc[:, [0,1,2,6,7,8]].sample(50000)

  return {k: v for k, v in df.groupby(grpby_key)}


In [3]:
pop1_samples = ["Bornean_0", "Bornean_1", "Bornean_2"]
pop2_samples = ["Sumatran_3", "Sumatran_4", "Sumatran_5"]

s1_counter = Counter(blocks_df["NumSegSites"][(blocks_df["Sample1"].isin(pop1_samples)) & (blocks_df["Sample2"].isin(pop1_samples))])
s2_counter = Counter(blocks_df["NumSegSites"][(blocks_df["Sample1"].isin(pop2_samples)) & (blocks_df["Sample2"].isin(pop2_samples))])
s3_counter = Counter(blocks_df["NumSegSites"][(blocks_df["Sample1"].isin(pop1_samples)) & (blocks_df["Sample2"].isin(pop2_samples))])

S = np.array([functions.counter_to_arr(counter, 1932) for counter in [s1_counter, s2_counter, s3_counter]], dtype="int_")

In [4]:
num_blocks_per_state = S.sum(axis=1)
num_blocks_per_state

array([10019,  9986, 29995])

In [5]:
X_test = np.array([np.concatenate(S), ])

## Generate training set

In [7]:
n = 5000

X_train, y_train = functions.generate_training_set(blocklen=1932,
                                                   mutation_rate=2e-8,
                                                   recombination_rate=1.5e-8,
                                                   num_blocks_per_state=num_blocks_per_state,
                                                   n=n, n_cpus=-1,
                                                   saveto="fpongo_trainset_{n}.npz")

Generating training data of length 5000 of 50000 blocks each on 7 cores


## Fit RF

In [9]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

In [13]:
y_pred = rf.predict(X_test)

In [14]:
y_pred

array([[19.63052397, 15.27835279,  4.42824325,  9.03368509, 22.17156758,
        19.69968796]])

In [11]:
functions.reparameterise(y_pred[0,0], y_pred[0,1], y_pred[0,2], y_pred[0,3], y_pred[0,4], y_pred[0, 5], 
                         blocklen=1939,
                         mutation_rate=2e-8)

(126550.56712437692,
 98493.76476293722,
 28547.210250899563,
 2286435.9432465,
 0.00017519927473784588,
 0.0002000094930348675)

If no good:
* Consider alternative parameterisation (Nes, generations)
* Consider if there are ways to narrow down prior - dxy estimate of split time?