**Step 1**

In [7]:
import glow
glow.register(spark, new_session=False)

import pandas as pd
from pathlib import Path

from pyspark.sql.functions import col

import xarray as xr
import zarr

In [5]:
# Test-fixture location. Download the zip archive attached to the issue below,
# unpack it into the target directory, and adjust this path as needed:
# https://github.com/pystatgen/sgkit/issues/622
dsdir = Path('../../../../sgkit/tests/test_regenie/dataset/sim_sm_02')

# NOTE(review): despite its name, `genotypes_vcf` points at a `.bed` file;
# the name is kept because the loading cell refers to it.
genotypes_vcf = dsdir.joinpath('genotypes.bed')
covariates_csv = dsdir.joinpath('covariates.csv')
continuous_phenotypes_csv = dsdir.joinpath('traits.csv')

In [None]:
def _standardize(frame):
    """Centre every column on its mean and scale it by its ddof=0 standard deviation."""
    return (frame - frame.mean()) / frame.std(ddof=0)


# Read the genotype file through Spark's `plink` reader (tab-delimited .fam),
# then add a numeric `values` column built from `glow.genotype_states` passed
# through `glow.mean_substitute`.
variants = spark.read.format('plink').load(str(genotypes_vcf), famDelimiter='\t')
genotype_values = glow.mean_substitute(glow.genotype_states(col('genotypes')))
genotypes = variants.withColumn('values', genotype_values)

# Phenotypes and covariates are both indexed by `sample_id` and standardized
# column-wise before being handed to Glow.
label_df = _standardize(pd.read_csv(continuous_phenotypes_csv, index_col='sample_id'))
covariate_df = _standardize(pd.read_csv(covariates_csv, index_col='sample_id'))

In [6]:
# Block sizes passed to `glow.block_variants_and_samples` below.
variants_per_block = 10
sample_block_count = 5

# Drop the first two characters of every sample ID reported by Glow.
# NOTE(review): presumably this strips a prefix so the IDs line up with the
# `sample_id` index of the phenotype/covariate tables — confirm.
raw_sample_ids = glow.get_sample_ids(genotypes)
sample_ids = list(map(lambda raw_id: raw_id[2:], raw_sample_ids))

block_df, sample_blocks = glow.block_variants_and_samples(
    genotypes,
    sample_ids,
    variants_per_block,
    sample_block_count,
)

In [None]:
# Build the ridge-reduction object from the blocked genotypes, the standardized
# phenotypes, the sample-block mapping and the standardized covariates.
reduction = glow.RidgeReduction(
    block_df,
    label_df,
    sample_blocks,
    covariate_df,
)

In [8]:
# Fit the reduction and apply it in one call. The returned frame is kept for
# inspection only: no later visible cell reads `reduced_block_df`; the
# regression cell is built from `reduction` itself.
reduced_block_df = reduction.fit_transform()

In [None]:
# Derive the ridge regression from the reduction object, fit it, and unpack the
# two frames that `fit()` returns.
regression = glow.RidgeRegression.from_ridge_reduction(reduction)
fit_result = regression.fit()
model_df, cv_df = fit_result

In [None]:
# Offsets produced by the fitted regression. The export cell resets the
# `sample_id` and `contigName` index levels of this frame, and the GWAS cell
# passes it as `offset_df`.
# NOTE(review): "loco" presumably means leave-one-chromosome-out — confirm against the Glow docs.
continuous_offsets = regression.transform_loco()

**Export Offsets**

In [None]:
# Flatten the (sample_id, contigName) index into ordinary columns so both keys
# can be recoded.
co_df = continuous_offsets.reset_index(level=['sample_id', 'contigName'])

# Recode both keys as zero-based integers: the sample key from the last two
# characters of its ID, the contig key from the whole contig name.
# NOTE(review): the two-character slice assumes every sample ID ends in a
# two-digit, 1-based number — confirm for larger datasets.
co_df['sample_id'] = co_df['sample_id'].apply(lambda m: int(str(m)[-2:]) - 1)
co_df['contigName'] = co_df['contigName'].apply(lambda m: int(str(m)) - 1)

co_df = co_df.rename(columns={'sample_id': 'samples', 'contigName': 'contigs'})
co_df = co_df.set_index(['samples', 'contigs'])

co = co_df.to_xarray()

# Stack the per-trait variables along a new `outcomes` dimension, in this order.
outcome_names = ['Y0000', 'Y0001', 'Y0002', 'Y0003', 'Y0004']
co = co.assign(
    regenie_loco_prediction=xr.concat([co[name] for name in outcome_names], 'outcomes')
)
# transpose() without arguments reverses the dimension order of the stacked array.
co['regenie_loco_prediction'] = co['regenie_loco_prediction'].transpose()

# Expand `~` explicitly: the zip store is opened from a plain string path, and a
# leading `~` would otherwise be treated as a literal directory name.
glow_offset_path = Path(
    '~/sgkit/sgkit/tests/test_regenie/dataset/sim_sm_02/glow_offsets.zarr.zip'
).expanduser()

# The class is `zarr.ZipStore` (the original `zarr.Zipstore` does not exist).
# Close the store even if the write fails so the zip archive is finalized.
store = zarr.ZipStore(str(glow_offset_path), mode='w')
try:
    co.to_zarr(store)
finally:
    store.close()

**Step 2**

In [11]:
# Step 2 input: split multiallelic variants, add a numeric `gt` column built
# from the genotype states, keep only the columns listed below, and cache.
# NOTE(review): this rebinds `genotypes`, which the Step 1 cells used for a
# frame that still had the `genotypes` column; re-running those cells after
# this one would see this narrower frame instead.
split_variants = glow.transform('split_multiallelics', variants)
gt_values = glow.mean_substitute(glow.genotype_states(col('genotypes')))
genotypes = (
    split_variants
    .withColumn('gt', gt_values)
    .select('contigName', 'start', 'names', 'gt')
    .cache()
)

In [51]:
# Run Glow's linear-regression GWAS on the `gt` column, passing the standardized
# phenotypes and covariates plus the offsets computed in Step 1.
gwas_options = dict(offset_df=continuous_offsets, values_column='gt')
lin_reg_df = glow.gwas.linear_regression(
    genotypes,
    label_df,
    covariate_df,
    **gwas_options,
)

In [69]:
# Directory that receives the GWAS results. `~` is expanded here so the path is
# valid for every consumer, not only for libraries that expand it themselves.
output_path = Path('~/sgkit/sgkit/tests/test_regenie/result/sim_sm_02-wgr_02').expanduser()


In [70]:
# Collect the Spark GWAS result into a pandas frame on the driver.
wgr_gwas = lin_reg_df.toPandas()

# Write it as `gwas_loco.csv` under the output directory, without the index column.
path = output_path.joinpath('gwas_loco.csv')
wgr_gwas.to_csv(path, index=False)
