**Step 1: Whole-genome regression (ridge reduction, ridge regression, LOCO offsets)**

In [1]:
# Run-wide toggle. When enabled, covariates.csv is loaded and standardized and
# output file names carry no suffix; when disabled, an empty covariate frame is
# used and output file names get the "_nocovariate" suffix.
WITH_COVARIATE = True

In [2]:
import glow
# Attach Glow to the Spark session; new_session=False presumably reuses the
# session passed in rather than creating another one -- confirm against the Glow docs.
# NOTE(review): `spark` is never assigned in the visible cells; presumably the
# runtime (pyspark shell / managed notebook) injects the active SparkSession.
glow.register(spark, new_session=False)

import pandas as pd 
from pathlib import Path

from pyspark.sql.functions import col

import xarray as xr
import zarr

In [3]:
# Input data: download the archive referenced below, unpack it, and point `dsdir`
# at the `sim_sm_02` directory (adjust the path to match your checkout).
#   Issue:   https://github.com/pystatgen/sgkit/issues/622
#   Archive: https://github.com/pystatgen/sgkit/files/6779496/regenie_simulations.zip
dsdir = Path('../../../../sgkit/tests/test_regenie/dataset/sim_sm_02')

# NOTE: despite its name, `genotypes_vcf` refers to a PLINK .bed file (it is read
# with the 'plink' Spark format), not a VCF.
genotypes_vcf = dsdir.joinpath('genotypes.bed')
covariates_csv = dsdir.joinpath('covariates.csv')
continuous_phenotypes_csv = dsdir.joinpath('traits.csv')

In [4]:
# Read the PLINK genotypes and derive a 'values' column by passing the genotype
# states through glow.mean_substitute.
variants = spark.read.format('plink').load(str(genotypes_vcf), famDelimiter='\t')
genotypes = variants.withColumn('values', glow.mean_substitute(glow.genotype_states(col('genotypes'))))

# Phenotypes, indexed by sample ID and standardized column-wise:
# subtract the column mean, divide by the population standard deviation (ddof=0).
label_df = pd.read_csv(continuous_phenotypes_csv, index_col='sample_id')
label_df = (label_df - label_df.mean()) / label_df.std(ddof=0)

# Use a truthiness test rather than `is True`: an identity check silently takes
# the no-covariate branch for truthy values that are not the `True` singleton
# (e.g. 1 or a numpy boolean).
if WITH_COVARIATE:
    # Covariates get the same column-wise standardization as the phenotypes.
    covariate_df = pd.read_csv(covariates_csv, index_col='sample_id')
    covariate_df = (covariate_df - covariate_df.mean()) / covariate_df.std(ddof=0)
    covariate_suffix = ""
else:
    # Empty frame stands in for "no covariates"; the suffix tags output file names.
    covariate_df = pd.DataFrame()
    covariate_suffix = "_nocovariate"

In [5]:
# Blocking parameters handed to glow.block_variants_and_samples below.
variants_per_block = 10
sample_block_count = 5

# Take the sample IDs Glow reports for the genotype frame and drop the first two
# characters of each.
# NOTE(review): presumably this strips a prefix introduced when the PLINK files
# are read, so the IDs line up with the `sample_id` index of the CSV inputs -- confirm.
sample_ids = list(map(lambda raw_id: raw_id[2:], glow.get_sample_ids(genotypes)))

block_df, sample_blocks = glow.block_variants_and_samples(
    genotypes,
    sample_ids,
    variants_per_block,
    sample_block_count,
)

                                                                                

In [6]:
# Set up the ridge reduction stage from the blocked genotypes, the standardized
# phenotypes, the sample-block mapping, and the covariates prepared above.
# No alpha values are passed here; the recorded output shows a generated alpha grid.
reduction = glow.RidgeReduction(block_df, label_df, sample_blocks, covariate_df)

The label DataFrame is quantitative. Reduction/regression for quantitative phenotypes will be applied.
Generated alphas: [252.52525252525254, 333.3333333333333, 500.0, 1000.0, 25000.0]


In [7]:
# Run the reduction stage (fit and transform in a single call, going by the method name).
# The returned frame is not referenced again in the visible cells; the next stage
# is built from the `reduction` object itself.
reduced_block_df = reduction.fit_transform()

In [8]:
# Build the regression stage directly from the reduction object, then fit it.
# fit() returns two frames, unpacked here as `model_df` and `cv_df`
# (presumably the fitted model and cross-validation results -- inferred from the names).
# Neither is referenced again in the visible cells.
regression = glow.RidgeRegression.from_ridge_reduction(reduction)
model_df, cv_df = regression.fit()

                                                                                

Generated alphas: [151.51515151515153, 200.0, 300.0, 600.0, 15000.0]


In [None]:
# Generate the per-chromosome predictions (the recorded output logs one
# "Generating predictions for chromosome N" line per contig); "loco" presumably
# stands for leave-one-chromosome-out. The result is later re-indexed on
# ('sample_id', 'contigName') for export and passed as `offset_df` to the
# linear regression.
continuous_offsets = regression.transform_loco()

                                                                                

Inferred chromosomes: ['7', '3', '8', '5', '6', '9', '1', '10', '4', '2']
Generating predictions for chromosome 1.


21/09/17 15:42:01 WARN DAGScheduler: Broadcasting large task binary with size 1278.7 KiB
21/09/17 15:42:04 WARN DAGScheduler: Broadcasting large task binary with size 1294.0 KiB
                                                                                

Generating predictions for chromosome 10.


21/09/17 15:42:16 WARN DAGScheduler: Broadcasting large task binary with size 1269.4 KiB
21/09/17 15:42:19 WARN DAGScheduler: Broadcasting large task binary with size 1287.0 KiB
                                                                                

Generating predictions for chromosome 2.


21/09/17 15:42:31 WARN DAGScheduler: Broadcasting large task binary with size 1269.5 KiB
21/09/17 15:42:33 WARN DAGScheduler: Broadcasting large task binary with size 1287.1 KiB
                                                                                

Generating predictions for chromosome 3.


21/09/17 15:42:45 WARN DAGScheduler: Broadcasting large task binary with size 1269.4 KiB
21/09/17 15:42:48 WARN DAGScheduler: Broadcasting large task binary with size 1287.0 KiB
                                                                                

Generating predictions for chromosome 4.


21/09/17 15:43:00 WARN DAGScheduler: Broadcasting large task binary with size 1269.4 KiB
21/09/17 15:43:02 WARN DAGScheduler: Broadcasting large task binary with size 1287.0 KiB
                                                                                

Generating predictions for chromosome 5.


21/09/17 15:43:14 WARN DAGScheduler: Broadcasting large task binary with size 1269.4 KiB
21/09/17 15:43:17 WARN DAGScheduler: Broadcasting large task binary with size 1287.0 KiB
                                                                                

Generating predictions for chromosome 6.


21/09/17 15:43:29 WARN DAGScheduler: Broadcasting large task binary with size 1269.4 KiB
21/09/17 15:43:32 WARN DAGScheduler: Broadcasting large task binary with size 1287.0 KiB
                                                                                

Generating predictions for chromosome 7.


21/09/17 15:43:44 WARN DAGScheduler: Broadcasting large task binary with size 1269.5 KiB
21/09/17 15:43:46 WARN DAGScheduler: Broadcasting large task binary with size 1287.1 KiB
                                                                                

Generating predictions for chromosome 8.




**Export offsets to a zipped Zarr store**

In [None]:
# Turn the ('sample_id', 'contigName') index levels into ordinary columns so both
# can be re-coded.
co_df = continuous_offsets.reset_index(level=['sample_id', 'contigName'])

# Re-code both identifiers as zero-based integers:
#   sample_id  -> int(last two characters) - 1
#   contigName -> int(name) - 1
# NOTE(review): keeping only the last two characters presumably relies on the
# dataset having fewer than 100 numerically-suffixed sample IDs -- confirm before
# reusing this on another dataset.
co_df['sample_id'] = co_df['sample_id'].apply(
    lambda m: int(str(m)[-2:]) - 1
)
co_df['contigName'] = co_df['contigName'].apply(
    lambda m: int(str(m)) - 1
)
co_df = co_df.rename(columns={
    'sample_id': 'samples',
    'contigName': 'contigs'
})

co_df = co_df.set_index(['samples', 'contigs'])

# One data variable per phenotype column, with dimensions (samples, contigs).
co = co_df.to_xarray()

# Stack the five per-phenotype variables along a new 'outcomes' dimension, then
# call transpose() with no arguments, which reverses the dimension order.
co = co.assign(
    regenie_loco_prediction=xr.concat(
        [co.Y0000, co.Y0001, co.Y0002, co.Y0003, co.Y0004],
        'outcomes',
    )
)
co['regenie_loco_prediction'] = co['regenie_loco_prediction'].transpose()

# Write next to the input data. Deriving the path from `dsdir` means a relocated
# dataset directory only has to be changed in one place.
glow_offset_path = dsdir / f'glow_offsets{covariate_suffix}.zarr.zip'
store = zarr.ZipStore(str(glow_offset_path), mode='w')
try:
    co.to_zarr(store)
finally:
    # Always close the store so the zip archive is finalized even if the write fails.
    store.close()

**Step 2: Association testing (linear regression using the Step 1 offsets)**

In [None]:
# Prepare the genotype frame for association testing. This rebinds `genotypes`,
# replacing the Step 1 frame of the same name.
split_variants = glow.transform('split_multiallelics', variants)
gt_values = glow.mean_substitute(glow.genotype_states(col('genotypes')))

# Keep only the columns carried into the regression and cache the result.
genotypes = (
    split_variants
    .withColumn('gt', gt_values)
    .select('contigName', 'start', 'names', 'gt')
    .cache()
)

In [None]:
# Run Glow's linear-regression GWAS over the prepared genotypes, using the
# standardized phenotypes and covariates, reading genotype values from the 'gt'
# column, and supplying the Step 1 predictions as offsets.
lin_reg_df = glow.gwas.linear_regression(
    genotypes,
    label_df,
    covariate_df,
    offset_df=continuous_offsets,
    values_column='gt'
  )

In [None]:
# Directory that receives the GWAS result CSV written in the next cell
# (relative to the notebook's working directory).
output_path = Path('../../../../sgkit/tests/test_regenie/result/sim_sm_02-wgr_02')

In [None]:
# Convert to pandas
wgr_gwas = lin_reg_df.toPandas()

path = output_path / f'gwas_loco{covariate_suffix}.csv'
wgr_gwas.to_csv(path, index=False)