## Canine GWAS Alignment

- Align the (now fairly small) set of reference variants to the target variants
    - This step is absent from the UKBB analysis, presumably because the 1KG data and the UKBB data are already aligned to the same reference build (GRCh38)
    - This will involve joining variants by locus and resolving strand/allele flips
    - This will result in a merged target + reference dataset that contains **exactly** the same variants (it is crucial that the reference data PCA include only variants that are going to be present in the target dataset to be projected)

In [1]:
import hail as hl
import pandas as pd
import numpy as np
import plotnine as pn
import plotly.express as px
import os.path as osp
# Execute the shared helper scripts inside this notebook's namespace so the
# names they define become available to later cells.
# NOTE(review): presumably nb.py supplies `gab`, paths.py the *_DIR / *_FILE
# constants and common.py helpers such as `plink_files` and
# `load_reference_genome` -- confirm against those scripts.
%run ../../nb.py
%run paths.py
%run common.py
# Register a timing magic on the running IPython shell under the name 'hail'
# (exact behavior is defined by `gab`, which is not visible here).
gab.register_timeop_magic(get_ipython(), 'hail')
# Start the Hail / Spark context; must run before any other Hail call below.
hl.init()

Running on Apache Spark version 2.4.4
SparkUI available at http://a783b4e25167:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.30-2ae07d872f43
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/organism/canine/hail-20200213-0002-0.2.30-2ae07d872f43.log


In [None]:
%%capture
# Build a Hail ReferenceGenome from the keyword arguments that
# `load_reference_genome` returns for REF_GENOME_FILE.
# NOTE(review): presumably this is what makes the 'canine' genome name usable
# by later cells -- confirm the name stored in REF_GENOME_FILE.
hl.ReferenceGenome(**load_reference_genome(REF_GENOME_FILE))

In [35]:
# Load the reference MatrixTable stored under WORK_DIR and report its
# dimensions as (n_variants, n_samples).
# NOTE(review): presumably this is the output of the reference QC step -- confirm.
mt_ref = hl.read_matrix_table(
    osp.join(WORK_DIR, f'{REF_QC_RES_FILE}.mt')
)
mt_ref.count()

(36395, 1350)

In [13]:
# Import the target PLINK fileset against the 'canine' reference genome and
# report its dimensions as (n_variants, n_samples).
mt_tgt = hl.import_plink(
    *plink_files(ORGANISM_CANINE_TGT_DIR, PLINK_FILE_TGT),
    reference_genome='canine',
    # Do not skip loci that are inconsistent with the reference genome.
    skip_invalid_loci=False,
)
mt_tgt.count()

2020-02-13 00:11:17 Hail: INFO: Found 4342 samples in fam file.
2020-02-13 00:11:17 Hail: INFO: Found 160727 variants in bim file.
2020-02-13 00:11:17 Hail: INFO: Coerced sorted dataset


(160727, 4342)

In [9]:
def get_alt_allele_freq(mt):
    """Tabulate genotype calls by their number of alternate alleles.

    Aggregates over every entry of ``mt``, histogramming
    ``GT.n_alt_alleles()`` into three bins spanning 0 to 2.

    Args:
        mt: Hail MatrixTable with a ``GT`` call entry field.

    Returns:
        pandas DataFrame with one row per bin and the columns
        ``n_alt_alleles`` and ``count``.

    Raises:
        AssertionError: if the row with ``n_alt_alleles == 0`` is not the one
            that sorts last by ``count``.
    """
    hist = mt.aggregate_entries(hl.agg.hist(mt.GT.n_alt_alleles(), 0, 2, 3))
    freq = pd.Series(hist.bin_freq)
    freq = freq.rename('count').rename_axis('n_alt_alleles').reset_index()
    # Sanity check: zero alternate alleles must be the most frequent call.
    # NOTE(review): presumably guards against swapped ref/alt coding -- confirm.
    most_common = freq.sort_values('count')['n_alt_alleles'].iloc[-1]
    assert most_common == 0
    return freq

In [10]:
# Genotype counts by number of alternate alleles for the target dataset; the
# helper also asserts that the zero-alt-allele call is the most frequent one.
get_alt_allele_freq(mt_tgt)

2020-02-13 00:10:45 Hail: INFO: Coerced sorted dataset


Unnamed: 0,n_alt_alleles,count
0,0,422763319
1,1,181266267
2,2,93838364


In [36]:
# Same tabulation for the reference dataset, for comparison with the target.
get_alt_allele_freq(mt_ref)

Unnamed: 0,n_alt_alleles,count
0,0,26895996
1,1,14281635
2,2,7907143


In [37]:
# Outer-join the reference and target variant rows on locus alone, keeping each
# side's alleles under its own name so that allele order can be compared.
ht = (
    mt_ref.rows()
    .key_by('locus')
    .rename({'alleles': 'alleles_ref'})
    .drop('rsid', 'cm_position')
    .join(
        mt_tgt.rows()
        .key_by('locus')
        .rename({'alleles': 'alleles_tgt'})
        .drop('rsid', 'cm_position'),
        how='outer',
    )
)
# Collapse each allele array into a single string (e.g. ['A', 'C'] -> 'AC').
ht = ht.annotate(
    astr_ref=hl.delimit(ht.alleles_ref, ''),
    astr_tgt=hl.delimit(ht.alleles_tgt, ''),
)
ht.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Row fields:
    'locus': locus<canine> 
    'alleles_ref': array<str> 
    'allele_str': str 
    'alleles_tgt': array<str> 
    'astr_ref': str 
    'astr_tgt': str 
----------------------------------------
Key: ['locus']
----------------------------------------


In [39]:
# Count target variants by allele pair; the first two alleles are sorted
# first so that e.g. A/C and C/A fall into the same 'A|C' bucket.
mt_tgt.aggregate_rows(
    hl.agg.counter(
        hl.delimit(
            hl.sorted(hl.array([mt_tgt.alleles[0], mt_tgt.alleles[1]])),
            '|',
        )
    )
)

2020-02-13 00:27:07 Hail: INFO: Coerced sorted dataset


{'A|C': 26543, 'A|G': 122575, 'A|T': 6124, 'C|G': 5485}

In [38]:
# Cross-tabulate reference allele strings (rows) against target allele strings
# (columns); combinations that never occur are shown as 0.
(
    ht.to_pandas()
    .groupby(['astr_ref', 'astr_tgt'])
    .size()
    .unstack()
    .fillna(0)
    .astype(int)
)

2020-02-13 00:24:51 Hail: INFO: Coerced sorted dataset


astr_tgt,AC,CA,AG,GA
astr_ref,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
AC,1449,61,0,0
AG,0,0,6180,307
CA,79,1600,0,0
CT,0,0,364,7995
GA,0,0,385,8130
GT,74,1631,0,0
TC,0,0,6219,343
TG,1437,78,0,0
