# Compute CI

In [1]:
import os
import glob
import sys
import re
import pandas as pd
import moments
import pickle
import gzip
from IPython.display import Markdown as md
import warnings

# Helper for showing YAML files (demes models, option files) with syntax
# highlighting in the notebook output.
def print_model(file_path):
    """Render a YAML file as a fenced, syntax-highlighted Markdown block.

    Parameters
    ----------
    file_path : str or path-like
        Path of the YAML file to display.

    Returns
    -------
    IPython.display.Markdown
        Rendered by the notebook when it is the last expression of a cell.
    """
    with open(file_path) as f:
        text = f.read()
    # The closing fence has to start on its own line. A file without a
    # trailing newline would otherwise get the fence glued to its last line,
    # which breaks the code block.
    if text and not text.endswith("\n"):
        text += "\n"
    return md("```yaml\n" + text + "```")
# Install a blanket 'ignore' filter: no category or module is given, so every
# Python warning raised after this point is suppressed.
# NOTE(review): this also hides warnings coming from the libraries used below;
# consider narrowing the filter to the specific warning that was noisy.
warnings.filterwarnings('ignore')

In [2]:
# Variant category analysed in this notebook; it is interpolated into every
# input path below.
varcat = "intronic"

# Demes model (YAML) for this category: displayed with print_model and passed
# to the uncertainty estimation at the end of the notebook.
best_guest = f"results/best-guest-NAT-EXPANSION-{varcat}.yml"

# Options file (YAML) naming the model parameters and their constraints.
options = f"data/FULL-MODEL-options/full-mdl-{varcat}.yml"

In [3]:
print_model(best_guest)

```yaml
description: The Gutenkunst et al. (2009) three-population model of human history,
  modified by replacing CEU with IBS.
time_units: years
generation_time: 29
doi: ['https://doi.org/10.1371/journal.pgen.1000695']
demes:
- name: ancestral
  description: Equilibrium/root population
  epochs:
  - {end_time: 483457.24308765423, start_size: 13580.373657849503}
- name: AMH
  description: Anatomically modern humans
  ancestors: [ancestral]
  epochs:
  - {end_time: 88347.64825867464, start_size: 27142.07281765073}
- name: OOA
  description: Bottleneck out-of-Africa population
  ancestors: [AMH]
  epochs:
  - {end_time: 51166.128542006336, start_size: 1834.8116479610317}
- name: YRI
  description: Yoruba in Ibadan, Nigeria
  ancestors: [AMH]
  epochs:
  - {end_time: 0, start_size: 27142.07281765073}
- name: IBS
  description: Iberian populations in Spain (IBS).
  ancestors: [OOA]
  epochs:
  - {end_time: 0, start_size: 2761.3347067250797, end_size: 26462.05884107293}
- name: CHB
  description: Han Chinese in Beijing, China
  ancestors: [OOA]
  epochs:
  - {end_time: 0, start_size: 1954.52946761033, end_size: 15363.53634144032}
- name: MXB
  description: Native American,  Mexico.
  start_time: 32555.12765744615
  ancestors: [CHB]
  epochs:
  - {end_time: 0, start_size: 1312.8044812722449, end_size: 38461.62164227266}
migrations:
- demes: [YRI, IBS]
  rate: 2.501606466512543e-05
- demes: [YRI, CHB]
  rate: 3.2948319408915263e-06
- demes: [IBS, CHB]
  rate: 6.62400743683469e-05
- demes: [YRI, OOA]
  rate: 0.00015859499514691533
```

In [4]:
print_model(options)

```yaml
parameters:
- name: TA
  description: Time before present of ancestral expansion
  values:
  - demes:
      ancestral:
        epochs:
          0: end_time
- name: TB
  description: Time of YRI-OOA split
  values:
  - demes:
      AMH:
        epochs:
          0: end_time
- name: TF
  description: Time of IBS-CHB split
  values:
  - demes:
      OOA:
       epochs:
         0: end_time
- name: Ne
  description: ancestral effective population size
  values:
  - demes:
      ancestral:
        epochs:
          0: start_size
- name: NA
  description: expansion size in AMH and YRI
  values:
  - demes:
      AMH:
        epochs:
          0: start_size
      YRI:
        epochs:
          0: start_size
- name: NB
  description: Bottleneck size for Eurasian populations
  values:
  - demes:
      OOA:
        epochs:
          0: start_size
- name: NEu0
  description: initial IBS size
  values:
  - demes:
      IBS:
        epochs:
          0: start_size
- name: NEuF
  description: final IBS size
  values:
  - demes:
      IBS:
        epochs:
          0: end_size
- name: NAs0
  description: initial CHB size
  values:
  - demes:
      CHB:
        epochs:
          0: start_size
- name: NAsF
  description: final CHB size
  values:
  - demes:
      CHB:
        epochs:
          0: end_size
- name: mAfB
  description: migration rate between Africa and OOA bottleneck
  values:
  - migrations:
      0: rate
- name: mAfEu
  description: migration rate between Africa and Europe
  values:
  - migrations:
      1: rate
- name: mAfAs
  description: migration rate between Africa and E Asia
  values:
  - migrations:
      2: rate
- name: mEuAs
  description: migration rate between Europe and E Asia
  values:
  - migrations:
      3: rate
- name: TN
  description: Time of MXB branching from CHB.
  upper_bound: 51166.128542006336
  values:
  - demes:
      MXB: start_time
- name: NCmxI
  description: Initial population size in MXB
  values:
  - demes:
      MXB:
        epochs:
          0: start_size
- name: NCmxF
  description: Final population size in MXB
  values:
  - demes:
      MXB:
        epochs:
          0: end_size
constraints:
- params: [TA, TB]
  constraint: greater_than
- params: [TB, TF]
  constraint: greater_than
```

## BOOTSTRAPPED DATA


In [5]:
def varcat_ml(varcat):
    '''Return the category label used in the mL file names.

    Only 'intronic' is spelled differently there ('introns'); any other
    value is returned unchanged.
    '''
    return 'introns' if varcat == 'intronic' else varcat

# Glob pattern matching one mL text file per chunk for the selected category.
# The path is split over two adjacent literals only to keep the lines short.
pattern = (
    '../../data/220113-ConstructBoostrapedDatasets/'
    f'data/mL-noncoding/mLs/mL_{varcat_ml(varcat)}_chunk_*.txt'
)

# Every mL file currently present on disk.
mL_files = glob.glob(pattern)

def read_mL_from_file(mL_file):
    '''Read the mL value stored on the first line of a text file.

    The first line may carry an ``mL:`` label; the label is removed and the
    rest of the line is parsed as a float. Later lines are ignored.

    Parameters
    ----------
    mL_file : str or path-like
        Path of the text file.

    Returns
    -------
    float
        The parsed mL value.

    Raises
    ------
    ValueError
        If the file is empty or its first line is not a number.
    '''
    # The context manager closes the handle even when parsing fails, and
    # readline() avoids loading the whole file just to use its first line.
    with open(mL_file, 'r') as f:
        first_line = f.readline()
    if not first_line:
        raise ValueError(f'{mL_file} is empty; expected a line like "mL: <value>"')
    return float(first_line.replace('mL:', '').strip())

def get_boostrap_id_from_file(file):
    '''Extract the chunk (bootstrap) number from a data file name.

    Only the base name is inspected, so directory names never match.

    Parameters
    ----------
    file : str or path-like
        Path whose base name contains ``_chunk_<digits>``.

    Returns
    -------
    str
        The digits following ``_chunk_``, as a string.

    Raises
    ------
    ValueError
        If the base name contains no ``_chunk_<digits>`` part.
    '''
    name = os.path.basename(file)
    # Raw string: '\d' in a plain literal is an invalid escape sequence.
    # '+' instead of '*' so that an empty id is never returned.
    match = re.search(r'_chunk_(\d+)', name)
    if match is None:
        raise ValueError(f'No "_chunk_<number>" part in file name: {name!r}')
    return match.group(1)


# Bootstrap id (string) -> mL value read from the matching file.
mLs = dict(zip(
    map(get_boostrap_id_from_file, mL_files),
    map(read_mL_from_file, mL_files),
))

In [None]:
# Glob pattern matching one pickled, gzip-compressed spectrum per chunk for
# the selected category. This rebinds `pattern`, which held the mL pattern.
pattern = (
    '../../data/220113-ConstructBoostrapedDatasets/data/jSFS/spectrums/'
    f'spectrum_chunk_*_cat_{varcat}.pkl.gz'
)



def load_spectrum(sf_file):
    '''Load one gzip-compressed pickled spectrum, then project and fold it.

    Prints the base name of the file, unpickles its content, calls
    ``project`` with a size of 20 for each of 4 entries and returns the
    result of calling ``fold()`` on the projection.
    '''
    print(f'Loading data {os.path.basename(sf_file)}')
    with gzip.open(sf_file, "rb") as handle:
        # NOTE(review): pickle.load can execute arbitrary code; only use it
        # on files produced by this project.
        spectrum = pickle.load(handle)

    # PROJECT AND FOLD
    projected = spectrum.project([20, 20, 20, 20])
    return projected.fold()


# TODO: only the first 5 files are loaded, for testing; remove the slice to
# run on every chunk.
# glob returns matches in a filesystem-dependent order, so sort before slicing
# to get the same subset on every run and on every machine.
sf_files = sorted(glob.glob(pattern))[:5]

# Bootstrap id (string) -> projected, folded spectrum.
spectrums = {
    get_boostrap_id_from_file(f): load_spectrum(f) for f in sf_files
}

Loading data spectrum_chunk_6_cat_intronic.pkl.gz


In [None]:
# Ids of the spectra that were loaded. Both lists below are built in this
# order, so bootstraps[k] and bootstraps_mL[k] belong to the same chunk.
boostrap_ids = spectrums.keys()

# Per-chunk mL values, and their total.
bootstraps_mL = [mLs[chunk_id] for chunk_id in boostrap_ids]
mL = sum(bootstraps_mL)

# Per-chunk spectra, and their sum (later passed to uncerts as `data`).
bootstraps = [spectrums[chunk_id] for chunk_id in boostrap_ids]
data = sum(bootstraps)

In [None]:
# Pick one bootstrap spectrum to inspect (index 1, i.e. the second one loaded).
sf = bootstraps[1]
# Display its `folded` attribute; load_spectrum calls .fold() on every spectrum.
sf.folded

In [None]:
# Display the shape of the inspected spectrum.
sf.shape

In [None]:
# Display `pop_ids` of the inspected spectrum (presumably the population
# labels of its axes; verify against the moments documentation).
sf.pop_ids

## Estimate CI: GIM method

In [None]:
# I don't want to infer the model, I just want to get the params
# and the likelihood.
#ret = moments.Demes.Inference.optimize(
#    best_guest, #best_guest
#    options,
#    data,
#    maxiter=2,
#    verbose=1,
#    uL=mL,
#    perturb=1,
#    output=None,
#    overwrite=False
#)

In [None]:
# Estimate parameter uncertainties with moments' `uncerts`, using the "GIM"
# method. Inputs: the model and options files, the summed spectrum (`data`)
# with the total mL, and the per-chunk spectra with their mL values as the
# bootstrap replicates.
# NOTE(review): std_err presumably holds one value per parameter listed in
# the options file; confirm the ordering against the moments documentation.
std_err = moments.Demes.Inference.uncerts(
    best_guest,
    options,
    data,
    bootstraps=bootstraps,
    uL=mL,
    bootstraps_uL=bootstraps_mL,
    method="GIM",
)
print(std_err)