## REGENIE Unit Test Development

In [1]:
import sgkit_plink
import pandas as pd
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
from sgkit.stats import regenie
from sgkit.tests import test_regenie
from sgkit.stats.association import linear_regression
from sklearn.metrics import r2_score as sk_r2
from sgkit.stats.regenie import r2_score as sg_r2
from pathlib import Path
import sgkit as sg
import yaml
%load_ext autoreload
%autoreload 2

### Load Simulated Data

In [2]:
# Names of the simulated dataset and the parameter set used throughout this notebook.
dataset, paramset = 'sim_sm_03', 'wgr_01'

In [3]:
# Inputs are read from data/dataset/<dataset>; the result directory name
# combines the dataset and paramset as <dataset>-<paramset>.
dataset_dir = Path(f'data/dataset/{dataset}')
result_dir = Path(f'data/result/{dataset}-{paramset}')
dataset_dir, result_dir

(PosixPath('data/dataset/sim_sm_03'),
 PosixPath('data/result/sim_sm_03-wgr_01'))

In [4]:
# Parse config.yml and pick out the entries for the selected dataset and paramset.
# NOTE(review): yaml.FullLoader is less restrictive than yaml.safe_load; if
# config.yml only holds plain mappings/lists/scalars, safe_load would be the
# more conservative choice -- confirm before changing.
with open('config.yml') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)
ds_config = config['datasets'][dataset]
ps_config = config['paramsets'][paramset]
ds_config, ps_config

({'n_variants': 250,
  'n_samples': 50,
  'n_covars': 3,
  'n_contigs': 10,
  'n_traits': 1},
 {'variant_block_size': 10, 'sample_block_size': 10, 'alphas': [1000]})

In [5]:
# Read the PLINK data stored under <dataset_dir>/genotypes into a dataset.
genotypes_path = str(dataset_dir / 'genotypes')
ds = sgkit_plink.read_plink(genotypes_path, bim_int_contig=True)
ds

Unnamed: 0,Array,Chunk
Bytes,500 B,500 B
Shape,"(250,)","(250,)"
Count,5 Tasks,1 Chunks
Type,int16,numpy.ndarray
"Array Chunk Bytes 500 B 500 B Shape (250,) (250,) Count 5 Tasks 1 Chunks Type int16 numpy.ndarray",250  1,

Unnamed: 0,Array,Chunk
Bytes,500 B,500 B
Shape,"(250,)","(250,)"
Count,5 Tasks,1 Chunks
Type,int16,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1000 B,1000 B
Shape,"(250,)","(250,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray
"Array Chunk Bytes 1000 B 1000 B Shape (250,) (250,) Count 3 Tasks 1 Chunks Type int32 numpy.ndarray",250  1,

Unnamed: 0,Array,Chunk
Bytes,1000 B,1000 B
Shape,"(250,)","(250,)"
Count,3 Tasks,1 Chunks
Type,int32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,500 B,250 B
Shape,"(250, 2)","(250, 1)"
Count,15 Tasks,2 Chunks
Type,|S1,numpy.ndarray
"Array Chunk Bytes 500 B 250 B Shape (250, 2) (250, 1) Count 15 Tasks 2 Chunks Type |S1 numpy.ndarray",2  250,

Unnamed: 0,Array,Chunk
Bytes,500 B,250 B
Shape,"(250, 2)","(250, 1)"
Count,15 Tasks,2 Chunks
Type,|S1,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 200 B 200 B Shape (50,) (50,) Count 4 Tasks 1 Chunks Type numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,25.00 kB,25.00 kB
Shape,"(250, 50, 2)","(250, 50, 2)"
Count,2 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 25.00 kB 25.00 kB Shape (250, 50, 2) (250, 50, 2) Count 2 Tasks 1 Chunks Type int8 numpy.ndarray",2  50  250,

Unnamed: 0,Array,Chunk
Bytes,25.00 kB,25.00 kB
Shape,"(250, 50, 2)","(250, 50, 2)"
Count,2 Tasks,1 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,25.00 kB,25.00 kB
Shape,"(250, 50, 2)","(250, 50, 2)"
Count,3 Tasks,1 Chunks
Type,bool,numpy.ndarray
"Array Chunk Bytes 25.00 kB 25.00 kB Shape (250, 50, 2) (250, 50, 2) Count 3 Tasks 1 Chunks Type bool numpy.ndarray",2  50  250,

Unnamed: 0,Array,Chunk
Bytes,25.00 kB,25.00 kB
Shape,"(250, 50, 2)","(250, 50, 2)"
Count,3 Tasks,1 Chunks
Type,bool,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1000 B,1000 B
Shape,"(250,)","(250,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 1000 B 1000 B Shape (250,) (250,) Count 4 Tasks 1 Chunks Type numpy.ndarray",250  1,

Unnamed: 0,Array,Chunk
Bytes,1000 B,1000 B
Shape,"(250,)","(250,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 200 B 200 B Shape (50,) (50,) Count 4 Tasks 1 Chunks Type numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 200 B 200 B Shape (50,) (50,) Count 4 Tasks 1 Chunks Type numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,
"Array Chunk Bytes 200 B 200 B Shape (50,) (50,) Count 4 Tasks 1 Chunks Type numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,200 B,200 B
Shape,"(50,)","(50,)"
Count,4 Tasks,1 Chunks
Type,numpy.ndarray,

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(50,)","(50,)"
Count,3 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 50 B 50 B Shape (50,) (50,) Count 3 Tasks 1 Chunks Type int8 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(50,)","(50,)"
Count,3 Tasks,1 Chunks
Type,int8,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(50,)","(50,)"
Count,3 Tasks,1 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 50 B 50 B Shape (50,) (50,) Count 3 Tasks 1 Chunks Type int8 numpy.ndarray",50  1,

Unnamed: 0,Array,Chunk
Bytes,50 B,50 B
Shape,"(50,)","(50,)"
Count,3 Tasks,1 Chunks
Type,int8,numpy.ndarray


### Apply REGENIE

In [6]:
# Load the covariates for the selected dataset through the test-suite helper
# and preview the first rows.
df_cov = test_regenie.load_covariates(dataset_dir)
df_cov.head()

Unnamed: 0_level_0,X000,X001,X002
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S0000001,1.403492,0.187771,1.14602
S0000002,1.854252,1.652777,-0.902967
S0000003,0.634048,-0.362843,0.012637
S0000004,0.124066,-0.067924,1.644158
S0000005,0.455337,-0.090256,0.585722


In [7]:
# Load the traits for the selected dataset through the test-suite helper
# and preview the first rows.
df_trait = test_regenie.load_traits(dataset_dir)
df_trait.head()

Unnamed: 0_level_0,Y0000
sample_id,Unnamed: 1_level_1
S0000001,-1.030415
S0000002,0.405862
S0000003,1.107564
S0000004,1.018582
S0000005,0.121986


In [8]:
# Pull plain arrays out of the data frames and the dataset for the REGENIE call.
X = df_cov.values
Y = df_trait.values
# Summing over the 'ploidy' dimension collapses the per-allele calls into a
# single value per (variant, sample) entry.
G = ds['call/genotype'].sum(dim='ploidy').values
contigs = ds['variant/contig'].values
# Show the shapes as a quick sanity check.
tuple(arr.shape for arr in (G, X, Y, contigs))

((250, 50), (50, 3), (50, 1), (250,))

In [9]:
# Run the REGENIE transform with the block sizes and alphas taken from the
# selected paramset. G is transposed before being passed in.
# The private _glow_adj_* switches are all enabled; they are grouped here so
# they are easy to spot.
glow_adjustments = dict(
    _glow_adj_dof=True,
    _glow_adj_alpha=True,
    _glow_adj_scaling=True,
)
res = regenie.regenie_transform(
    G.T,
    X,
    Y,
    contigs,
    variant_block_size=ps_config['variant_block_size'],
    sample_block_size=ps_config['sample_block_size'],
    normalize=True,
    alphas=ps_config['alphas'],
    orthogonalize=False,
    **glow_adjustments
)

In [10]:
# Display the object returned by regenie.regenie_transform.
res

Unnamed: 0,Array,Chunk
Bytes,12.00 kB,80 B
Shape,"(30, 1, 50, 1)","(1, 1, 10, 1)"
Count,3982 Tasks,150 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 12.00 kB 80 B Shape (30, 1, 50, 1) (1, 1, 10, 1) Count 3982 Tasks 150 Chunks Type float64 numpy.ndarray",30  1  1  50  1,

Unnamed: 0,Array,Chunk
Bytes,12.00 kB,80 B
Shape,"(30, 1, 50, 1)","(1, 1, 10, 1)"
Count,3982 Tasks,150 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50, 1)","(10, 1)"
Count,5021 Tasks,5 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 400 B 80 B Shape (50, 1) (10, 1) Count 5021 Tasks 5 Chunks Type float64 numpy.ndarray",1  50,

Unnamed: 0,Array,Chunk
Bytes,400 B,80 B
Shape,"(50, 1)","(10, 1)"
Count,5021 Tasks,5 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,4.00 kB,80 B
Shape,"(10, 50, 1)","(1, 10, 1)"
Count,9806 Tasks,50 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 4.00 kB 80 B Shape (10, 50, 1) (1, 10, 1) Count 9806 Tasks 50 Chunks Type float64 numpy.ndarray",1  50  10,

Unnamed: 0,Array,Chunk
Bytes,4.00 kB,80 B
Shape,"(10, 50, 1)","(1, 10, 1)"
Count,9806 Tasks,50 Chunks
Type,float64,numpy.ndarray


### R2 Score

In [11]:
# Compute reference R2 values with scikit-learn's r2_score for small
# hand-written (prediction, truth) pairs, then print one tuple per line with a
# trailing comma.
# NOTE(review): this rebinds `res`, which earlier in the notebook held the
# regenie_transform result.
pairs = [
    ([1, 1], [1, 2]),
    ([1, 0], [1, 2]),
    ([1, -1, 3], [1, 2, 3]),
    ([0, -1, 2], [1, 2, 3]),
    ([3, 2, 1], [1, 2, 3]),
    ([0, 0, 0], [1, 2, 3]),
    ([1.1, 2.1, 3.1], [1, 2, 3]),
    ([1.1, 1.9, 3.0], [1, 2, 3]),
    ([1, 2, 3], [1, 2, 3]),
    ([1, 1, 1], [1, 1, 1]),
    ([1, 1, 1], [1, 2, 3]),
    ([1, 2, 3], [1, 1, 1]),
]
res = []
for y_pred, y_true in pairs:
    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    # sk_r2 is called with the truth first and the prediction second.
    # The sg_r2(yp, yt) comparison column is currently disabled.
    score = sk_r2(y_true, y_pred)
    res.append((list(y_pred), list(y_true), score))
print('\n'.join(f'{row},' for row in res))

([1, 1], [1, 2], -1.0),
([1, 0], [1, 2], -7.0),
([1, -1, 3], [1, 2, 3], -3.5),
([0, -1, 2], [1, 2, 3], -4.5),
([3, 2, 1], [1, 2, 3], -3.0),
([0, 0, 0], [1, 2, 3], -6.0),
([1.1, 2.1, 3.1], [1, 2, 3], 0.985),
([1.1, 1.9, 3.0], [1, 2, 3], 0.99),
([1, 2, 3], [1, 2, 3], 1.0),
([1, 1, 1], [1, 1, 1], 1.0),
([1, 1, 1], [1, 2, 3], -1.5),
([1, 2, 3], [1, 1, 1], 0.0),
