### Canine Pop Group Benchmarking

```
// Trying to emulate this:
df
    .withColumn("mask1", expr("array_max(transform(filter(genotypes, g -> g.population == 0), g -> array_max(g.calls))) > 0"))
    .withColumn("mask2", expr("array_max(transform(filter(genotypes, g -> g.population == 1), g -> array_max(g.calls))) > 0"))
    .withColumn("mask3", expr("array_max(transform(filter(genotypes, g -> g.population == 2), g -> array_max(g.calls))) > 0"))
    .withColumn("mask", $"mask1" || $"mask2" || $"mask3")
    .filter($"mask")
    .count
```

In [1]:
import hail as hl
import pandas as pd
import numpy as np
import plotnine as pn
import functools 
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
import os.path as osp
%run ../nb.py
%run ../organism/canine/files.py
%run ../organism/canine/common.py
temp_dir = osp.expanduser('~/data/gwas/tmp/canine')
gab.register_timeop_magic(get_ipython(), 'hail')
hl.init()

Running on Apache Spark version 2.4.4
SparkUI available at http://a783b4e25167:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/benchmark/hail-20200211-2226-0.2.30-2ae07d872f43.log


In [2]:
hl.ReferenceGenome(**load_reference_genome('../organism/canine/data/reference_genome.json'))

ReferenceGenome(name=canine, contigs=['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '41'], lengths={'1': 122670980, '2': 85416217, '3': 91858198, '4': 88267880, '5': 88908300, '6': 77552613, '7': 80858461, '8': 74057381, '9': 61043804, '10': 69316974, '11': 74388336, '12': 72480470, '13': 63232306, '14': 60959782, '15': 64187680, '16': 59511764, '17': 64281982, '18': 55763074, '19': 53735656, '20': 58114749, '21': 50855586, '22': 61382644, '23': 52291577, '24': 47651928, '25': 51628093, '26': 38939728, '27': 45753342, '28': 41164216, '29': 41841565, '30': 40196606, '31': 39786599, '32': 38745890, '33': 31361794, '34': 42089769, '35': 26506199, '36': 30798114, '37': 30897806, '38': 23903967, '39': 123833839, '41': 6608343}, x_contigs=['39'], y_contigs=[], mt_contigs=['41'], par=[])

In [3]:
mt = hl.read_matrix_table(osp.join(temp_dir, 'mt_ref_qc_1.mt'))
mt.count()

(149845, 1350)

In [4]:
breeds = ['ACKR', 'MAST', 'CARD', 'MPIN', 'TIBM', 'TURV', 'SAMO', 'MALT', 'POM', 'SALU']

In [5]:
df = get_fam(ORGANISM_CANINE_REF_DIR, PLINK_FILE_REF).assign(breed=get_breed)
ht = hl.Table.from_pandas(df[['fid', 'iid', 'breed']].rename(columns={'fid': 'fam_id', 'iid': 's'})).key_by('s')
mt = mt.annotate_cols(breed=ht[mt.s].breed)
cht = mt.cols()
cht.select(cht.fam_id, cht.breed).show(3)

2020-02-11 22:26:55 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'
2020-02-11 22:26:57 Hail: INFO: Coerced sorted dataset


s,fam_id,breed
str,str,str
"""ACKR_09007""","""ACKR""","""ACKR"""
"""ACKR_09030""","""ACKR""","""ACKR"""
"""ACKR_09032""","""ACKR""","""ACKR"""


In [15]:
%%time
def run_qc(mt, breeds):
    masks = {}
    for i, br in enumerate(breeds):
        mt = mt.annotate_rows(**{'mask' + str(i): hl.agg.fraction(hl.is_defined(mt.GT)) > .9 + .001 * i})
    mt = mt.filter_rows(functools.reduce(lambda m1, m2: m1 & m2, [mt['mask' + str(i)] for i in range(len(breeds))]))
    return mt
run_qc(mt, breeds[:8]).count()

CPU times: user 115 ms, sys: 4.12 ms, total: 119 ms
Wall time: 3.16 s


(149845, 1350)

In [25]:
#mtq = hl.variant_qc(mt)
mt.group_cols_by(mt.breed).aggregate_cols(call_rate=hl.agg.fraction(hl.is_defined(mt.GT)))

2020-02-11 22:58:59 Hail: ERROR: scope violation: 'GroupedMatrixTable.aggregate_cols' expects an expression indexed by []
    Found indices ['row'], with unexpected indices ['row']. Invalid fields:
        'variant_qc' (indices ['row'])


ExpressionException: scope violation: 'GroupedMatrixTable.aggregate_cols' expects an expression indexed by []
    Found indices ['row'], with unexpected indices ['row']. Invalid fields:
        'variant_qc' (indices ['row'])

In [None]:
g.aggregate_rows(call_rate=)

In [None]:
%%time
def run_qc(mt, breeds):
    masks = {}
    for i, br in enumerate(breeds):
        mt = mt.annotate_rows(**{'mask' + str(i): hl.agg.fraction(hl.is_defined(mt.GT)) > .9 + .001 * i})
    mt = mt.filter_rows(functools.reduce(lambda m1, m2: m1 & m2, [mt['mask' + str(i)] for i in range(len(breeds))]))
    return mt
run_qc(mt, breeds[:8]).count()

In [7]:
%%time
def run_qc(mt, breeds):
    masks = {}
    for i, br in enumerate(breeds):
        print(br)
        mtf = hl.variant_qc(mt.filter_cols(mt.breed == br))
        mt = mt.annotate_rows(**{'mask' + str(i): mtf.rows()[mt.row_key].variant_qc.p_value_hwe > 10e-6})
        #mt = mt.annotate_rows(**{'mask' + str(i): mtf.rows()[mt.row_key].variant_qc.call_rate > .9})
    mt = mt.filter_rows(functools.reduce(lambda m1, m2: m1 & m2, [mt['mask' + str(i)] for i in range(len(breeds))]))
    return mt
#mtt.aggregate_rows(hl.agg.counter(functools.reduce(lambda m1, m2: m1 & m2, [mtt["mask0"], mtt["mask1"], mtt["mask2"]])))

CPU times: user 16 µs, sys: 0 ns, total: 16 µs
Wall time: 23.1 µs


In [8]:
%%time
run_qc(mt, breeds[:2]).count()

ACKR
MAST
CPU times: user 201 ms, sys: 15.4 ms, total: 217 ms
Wall time: 6.24 s


(149845, 1350)

In [9]:
%%time
run_qc(mt, breeds[:3]).count()

ACKR
MAST
CARD
CPU times: user 352 ms, sys: 7.69 ms, total: 359 ms
Wall time: 7.81 s


(149845, 1350)

In [10]:
%%time
run_qc(mt, breeds[:4]).count()

ACKR
MAST
CARD
MPIN
CPU times: user 700 ms, sys: 10.7 ms, total: 711 ms
Wall time: 10.7 s


(149845, 1350)

In [11]:
%%time
run_qc(mt, breeds[:5]).count()

ACKR
MAST
CARD
MPIN
TIBM
CPU times: user 1.61 s, sys: 35.9 ms, total: 1.65 s
Wall time: 14.7 s


(149845, 1350)

In [12]:
%%time
run_qc(mt, breeds[:6]).count()

ACKR
MAST
CARD
MPIN
TIBM
TURV
CPU times: user 4.32 s, sys: 35 ms, total: 4.35 s
Wall time: 22.4 s


(149845, 1350)

In [13]:
%%time
run_qc(mt, breeds[:7]).count()

ACKR
MAST
CARD
MPIN
TIBM
TURV
SAMO
CPU times: user 12.3 s, sys: 85.6 ms, total: 12.3 s
Wall time: 40.6 s


(149845, 1350)

In [14]:
%%time
run_qc(mt, breeds[:8]).count()

ACKR
MAST
CARD
MPIN
TIBM
TURV
SAMO
MALT
CPU times: user 35.9 s, sys: 331 ms, total: 36.2 s
Wall time: 1min 33s


(149845, 1350)

In [None]:
%%time
def hwe_maf_qc(mt):
    cts = {}
    res = mt
    # TODO: Is there a more efficient way to do this?  It may make more sense 
    # to use boolean reduction on N mask arrays instead of successive joins
    for br in mts.keys():
        mtf = hl.variant_qc(mt.filter_cols(mt.breed == br))
        #mtf = mtf.filter_rows((mtf.variant_qc.p_value_hwe > threshold_hwep) & (hl.min(mtf.variant_qc.AF) > threshold_maf))
        mtf = mtf.filter_rows((mtf.variant_qc.p_value_hwe > threshold_hwep))
        #cts[br] = mtf.count()[0]
        res = res.semi_join_rows(mtf.rows())
    #print('Number of variants after filtering by breed-specific QC thresholds: ', cts)
    return res

mt_qc = hwe_maf_qc(mt)
mt_qc.count()