### SNP-Seek GWAS Export

In [2]:
import os.path as osp

import dask
import dask.array as da
import hail as hl
import numpy as np
import pandas as pd
# Make dask run task graphs with its single-threaded scheduler
dask.config.set(scheduler='single-threaded')
# Shared notebook setup script; presumably defines TUTORIAL_DIR, which the next line uses -- TODO confirm
%run ../../nb.py
# Presumably defines the data path constants such as ORGANISM_RICE_3KG_GWAS_DIR -- TODO confirm
%run $TUTORIAL_DIR/files.py
# Start Hail with its default configuration
hl.init() 

Running on Apache Spark version 2.4.4
SparkUI available at http://8352602c2ab9:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.32-a5876a0a2853
LOGGING: writing to /home/eczech/repos/gwas-analysis/notebooks/organism/rice/hail-20200514-1735-0.2.32-a5876a0a2853.log


In [4]:
# Directory that holds every downloaded input file. The constant is not defined in
# this notebook (presumably it comes from the %run'd files.py -- TODO confirm).
data_dir = ORGANISM_RICE_3KG_GWAS_DIR
data_dir

'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset'

### Downloads

In [6]:
%%time
%%bash -s "$data_dir"
# $1 is the notebook's data_dir (passed in through `-s`). It is quoted so paths
# containing spaces work, and created if missing so a first run does not fail on cd.
set -e; mkdir -p "$1"; cd "$1"
# Download "3K RG 1M GWAS SNP Dataset, all chromosomes" from https://snp-seek.irri.org/_download.zul
for ext in bed bim fam; do
  file="pruned_v2.1.${ext}"
  # Skip files that already exist so re-running the cell neither repeats the
  # transfer nor leaves wget's numbered duplicates (pruned_v2.1.bed.1, ...).
  if [ ! -f "${file}" ]; then
    # Download under a temporary name first: an interrupted transfer must not be
    # mistaken for a complete file on the next run.
    wget -q -O "${file}.part" "https://3kricegenome.s3.amazonaws.com/snpseek-dl/3k-pruned-v2.1/${file}"
    mv "${file}.part" "${file}"
  fi
done

CPU times: user 14.4 ms, sys: 5.61 ms, total: 20.1 ms
Wall time: 2min 25s


In [7]:
%%time
%%bash -s "$data_dir"
# $1 is the notebook's data_dir (passed in through `-s`). It is quoted so paths
# containing spaces work, and created if missing so a first run does not fail on cd.
set -e; mkdir -p "$1"; cd "$1"

# Fetch a URL into the current directory unless the file is already present.
# The transfer goes to a temporary name first so that an interrupted download
# is never mistaken for a complete file when the cell is re-run.
fetch() {
  local url="$1"
  local file="${url##*/}"
  if [ ! -f "${file}" ]; then
    wget -q -O "${file}.part" "${url}"
    mv "${file}.part" "${file}"
  fi
}

# Download phenotypes
fetch "https://s3-ap-southeast-1.amazonaws.com/oryzasnp-atcg-irri-org/3kRG-phenotypes/3kRG_PhenotypeData_v20170411.xlsx"

# Download accession mapping from:
# Genomic variation in 3,010 diverse accessions of Asian cultivated rice
# https://www.nature.com/articles/s41586-018-0063-9
# Supplementary Data 1
fetch "https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-018-0063-9/MediaObjects/41586_2018_63_MOESM3_ESM.xlsx"

CPU times: user 6.4 ms, sys: 16 ms, total: 22.4 ms
Wall time: 4.14 s


In [10]:
# List the size of every file in data_dir, followed by a grand total (-c)
!du -ch $data_dir/*

1.1M	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/3kRG_PhenotypeData_v20170411.xlsx
304K	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/41586_2018_63_MOESM3_ESM.xlsx
730M	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/pruned_v2.1.bed
26M	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/pruned_v2.1.bim
104K	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/pruned_v2.1.fam
757M	total


In [23]:
# Load the PLINK binary fileset (bed/bim/fam) from data_dir as a Hail MatrixTable
plink_paths = {
    ext: osp.join(data_dir, 'pruned_v2.1.' + ext)
    for ext in ('bed', 'bim', 'fam')
}
mt = hl.import_plink(**plink_paths)

2020-05-14 12:46:49 Hail: INFO: Found 3024 samples in fam file.
2020-05-14 12:46:49 Hail: INFO: Found 1011601 variants in bim file.


In [24]:
# Collect every sample id from the column table. Hail warns that cols() returns
# the table sorted by the column key rather than in matrix column order.
sids = mt.cols().s.collect()
sids[:10]

2020-05-14 12:47:03 Hail: WARN: cols(): Resulting column table is sorted by 'col_key'.
    To preserve matrix table column order, first unkey columns with 'key_cols_by()'


['B001',
 'B002',
 'B003',
 'B004',
 'B005',
 'B006',
 'B007',
 'B008',
 'B009',
 'B010']

### Phenotypes

In [32]:
def get_accessions():
    """Load the accession metadata and return its id columns under short names.

    Reads the 'Table 1 Metadata' sheet of the supplementary workbook found in
    ``data_dir`` (column names are taken from the second spreadsheet row) and
    keeps only the four identifier columns, renamed to snake_case.
    """
    # Spreadsheet column -> name used throughout this notebook
    renames = {
        '3K_DNA_IRIS_UNIQUE_ID': 'iris_id',
        'Genetic_Stock_Accession': 'gs_acc',
        'Genetic_Stock_varname': 'gs_variety_name',
        'IRGC_Accno_source': 'igrc_acc_src',
    }
    xlsx_path = osp.join(data_dir, '41586_2018_63_MOESM3_ESM.xlsx')
    sheet = pd.read_excel(xlsx_path, sheet_name='Table 1 Metadata', header=1)
    return sheet[list(renames)].rename(columns=renames)
df_acc = get_accessions()

# Every accession listed in the spreadsheet must be a sample id in the genotyping data
unknown_ids = ~df_acc['iris_id'].isin(sids)
assert not unknown_ids.any()

# Remove the 'IRGC' label from the accession ids; both columns are checked
# before either one is modified
irgc_cols = ['gs_acc', 'igrc_acc_src']
for col in irgc_cols:
    assert df_acc[col].str.contains('IRGC').all()
for col in irgc_cols:
    df_acc[col] = df_acc[col].str.replace('IRGC', '').str.strip()

df_acc.head()

Unnamed: 0,iris_id,gs_acc,gs_variety_name,igrc_acc_src
0,B001,135900,Heibiao,
1,B002,136041,Sansuijin,
2,B003,136088,Zaoshengbai,
3,B004,136031,Qiuguangtengxi 104,
4,B005,136067,Wanshi,


In [89]:
# Phenotypes from the "Use new descriptors" group in the "Annotations" tab
# indicating phenotypes for which the encodings are the same regardless of
# the 2007 refactorings.
# get_phenotypes() selects exactly these columns from the phenotype sheet and
# prefixes each name with 'pt_'.
target_phenotypes = [
    'APANTH_REPRO',
    'APSH',
    'APCO_REV_POST',
    'APCO_REV_REPRO',
    'AWCO_LREV',
    'AWCO_REV',
    'AWDIST',
    'BLANTHPR_VEG',
    'BLANTHDI_VEG',
    'BLPUB_VEG',
    'BLSCO_ANTH_VEG',
    'BLSCO_REV_VEG',
    'CCO_REV_VEG',
    'CUAN_REPRO',
    'ENDO',
    'FLA_EREPRO',
    'FLA_REPRO',
    'INANTH',
    'LIGCO_REV_VEG',
    'LIGSH',
    'LPCO_REV_POST',
    'LPPUB',
    'LSEN',
    'NOANTH',
    'PEX_REPRO',
    'PTH',
    'SCCO_REV',
    'SECOND_BR_REPRO',
    'SLCO_REV',
    'SPKF',
    'SLLT_CODE',   
]


def get_phenotypes():
    """Load the phenotype sheet, keeping the id columns plus the target phenotypes.

    The identifier columns are renamed to snake_case and every column listed in
    ``target_phenotypes`` is returned with a ``pt_`` prefix.
    """
    # Spreadsheet column -> name used throughout this notebook
    id_renames = {
        'Seqno': 'seq_no',
        'STOCK_ID': 'stock_id',
        'GS_ACCNO': 'gs_acc',
        'NAME': 'gs_variety_name',
        'Source_Accno': 'igrc_acc_src',
    }
    xlsx_path = osp.join(data_dir, '3kRG_PhenotypeData_v20170411.xlsx')
    raw = pd.read_excel(xlsx_path, sheet_name='Phenotype Data')

    renamed = raw.rename(columns=id_renames)
    kept = renamed[list(id_renames.values()) + target_phenotypes]
    return kept.rename(columns={name: 'pt_' + name for name in target_phenotypes})
# Load the phenotype table and preview the first rows
df_pt = get_phenotypes()
df_pt.head()

Unnamed: 0,seq_no,stock_id,gs_acc,gs_variety_name,igrc_acc_src,pt_APANTH_REPRO,pt_APSH,pt_APCO_REV_POST,pt_APCO_REV_REPRO,pt_AWCO_LREV,...,pt_LPPUB,pt_LSEN,pt_NOANTH,pt_PEX_REPRO,pt_PTH,pt_SCCO_REV,pt_SECOND_BR_REPRO,pt_SLCO_REV,pt_SPKF,pt_SLLT_CODE
0,1,16,121316.0,CIWINI SML::IRGC 50642-1,50642,,,,20.0,,...,4.0,9.0,,5.0,3.0,10.0,1.0,20.0,4.0,3.0
1,2,17,117426.0,ARC 13829::IRGC 42469-1,42469,,,,80.0,,...,4.0,9.0,,9.0,3.0,10.0,1.0,80.0,5.0,3.0
2,3,18,117466.0,DOM ZARD::IRGC 12881-1,12881,,,,52.0,,...,3.0,9.0,,9.0,1.0,10.0,1.0,20.0,4.0,3.0
3,4,19,117425.0,ARC 10497::IRGC 12485-1,12485,,,,80.0,,...,,9.0,,9.0,2.0,10.0,,80.0,4.0,3.0
4,5,20,121499.0,SHANKA::IRGC 67848-1,67848,,,,20.0,,...,4.0,7.0,,7.0,2.0,10.0,1.0,20.0,5.0,3.0


In [90]:
def get_lkp(df, c):
    """Build a lookup from integer accession number to IRIS sample id.

    Parameters
    ----------
    df : pandas.DataFrame
        Accession frame containing column ``c`` and an ``iris_id`` column.
    c : str
        Name of the accession-number column to use as the lookup key.

    Returns
    -------
    dict
        ``{int(accession number): iris_id}`` for every row where both the
        accession number and the iris id are present.

    Notes
    -----
    The keys are cast to ``int`` in column ``c`` itself, because callers look
    values up with ``int(...)`` keys. Previously the cast result was always
    stored in ``gs_acc`` and the global ``df_acc`` was read instead of ``df``,
    so a lookup built on any other column kept its original (string) keys and
    never matched.
    """
    return (
        df
        .dropna(subset=[c])
        # The callable receives the already-filtered frame, so no NaN reaches astype
        .assign(**{c: lambda d: d[c].astype(int)})
        .set_index(c)['iris_id']
        .dropna()
        .to_dict()
    )
# One accession-number -> iris_id lookup per accession column
gs_acc_to_iris, igrc_acc_src_to_iris = (
    get_lkp(df_acc, key_col) for key_col in ('gs_acc', 'igrc_acc_src')
)

In [114]:
def get_row(r):
    """Return phenotype row ``r`` with an ``iris_id`` entry appended.

    The id is looked up through ``gs_acc_to_iris`` (by ``r['gs_acc']``) and
    ``igrc_acc_src_to_iris`` (by ``r['igrc_acc_src']``). The ``gs_acc`` match
    wins when only it is found, or when both agree; ``iris_id`` is ``None``
    when neither lookup matches.

    Parameters
    ----------
    r : pandas.Series
        One phenotype row containing ``gs_acc`` and ``igrc_acc_src``.

    Returns
    -------
    pandas.Series
        ``r`` followed by one extra ``iris_id`` entry.

    Raises
    ------
    ValueError
        If the two lookups resolve to different iris ids.
    """
    def lookup(lkp, value):
        # Missing (NaN / None) or non-numeric accession numbers cannot be
        # converted to the integer keys of the lookups: treat them as no match.
        try:
            return lkp.get(int(value))
        except (TypeError, ValueError):
            return None

    iid1 = lookup(gs_acc_to_iris, r['gs_acc'])
    iid2 = lookup(igrc_acc_src_to_iris, r['igrc_acc_src'])

    if iid1 and iid2 and iid1 != iid2:
        raise ValueError(f'Found conflict: {iid1} != {iid2}')

    iris_id = iid1 or iid2 or None
    # Series.append was removed in pandas 2.0; pd.concat is the equivalent call
    return pd.concat([r, pd.Series({'iris_id': iris_id})])
        
# Resolve the iris id for every phenotype row
matched_rows = [get_row(row) for _, row in df_pt.iterrows()]
dfm = pd.DataFrame(matched_rows)

# Prefix the non-phenotype columns with 'acc_' and order them ahead of the 'pt_' columns
dfm = dfm.rename(columns=lambda name: name if name.startswith('pt_') else 'acc_' + name)
acc_cols = [name for name in dfm.columns if name.startswith('acc_')]
pt_cols = [name for name in dfm.columns if name.startswith('pt_')]
dfm = dfm[acc_cols + pt_cols]

# Finally, keep only the rows where iris_id is present since this is the
# link to the sample id in the plink fam file
has_iris_id = dfm['acc_iris_id'].notnull()
dfm = dfm[has_iris_id]
assert dfm['acc_seq_no'].notnull().all()
dfm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2113 entries, 0 to 2264
Data columns (total 37 columns):
acc_seq_no             2113 non-null int64
acc_stock_id           2113 non-null int64
acc_gs_acc             2113 non-null float64
acc_gs_variety_name    2113 non-null object
acc_igrc_acc_src       2113 non-null int64
acc_iris_id            2113 non-null object
pt_APANTH_REPRO        91 non-null float64
pt_APSH                133 non-null float64
pt_APCO_REV_POST       552 non-null float64
pt_APCO_REV_REPRO      2108 non-null float64
pt_AWCO_LREV           133 non-null float64
pt_AWCO_REV            2112 non-null float64
pt_AWDIST              30 non-null float64
pt_BLANTHPR_VEG        133 non-null float64
pt_BLANTHDI_VEG        13 non-null float64
pt_BLPUB_VEG           2112 non-null float64
pt_BLSCO_ANTH_VEG      133 non-null float64
pt_BLSCO_REV_VEG       2111 non-null float64
pt_CCO_REV_VEG         2110 non-null float64
pt_CUAN_REPRO          2111 non-null float64
pt_ENDO     

### Merge

In [95]:
# Convert the merged frame to a Hail table, then key it by the iris sample id
ht = hl.Table.from_pandas(dfm)
ht = ht.key_by('acc_iris_id')
ht

<hail.table.Table at 0x7f11ea5a2a90>

In [122]:
# Join the accession/phenotype fields onto the matrix table columns by sample id.
# Indexing the keyed table with mt.s yields a struct of all its non-key fields,
# which can be unpacked directly as keyword arguments. This avoids collecting
# the whole table into pandas just to enumerate its column names.
mte = mt.annotate_cols(**ht[mt.s])
# Keep only the samples that were matched to a phenotype row
mte = mte.filter_cols(hl.is_defined(mte.acc_seq_no))
# Remove these column fields from the export
mte = mte.drop('fam_id', 'pat_id', 'mat_id', 'is_female', 'is_case')
mte.describe()

----------------------------------------
Global fields:
    None
----------------------------------------
Column fields:
    's': str
    'acc_seq_no': int64
    'acc_stock_id': int64
    'acc_gs_acc': float64
    'acc_gs_variety_name': str
    'acc_igrc_acc_src': int64
    'pt_APANTH_REPRO': float64
    'pt_APSH': float64
    'pt_APCO_REV_POST': float64
    'pt_APCO_REV_REPRO': float64
    'pt_AWCO_LREV': float64
    'pt_AWCO_REV': float64
    'pt_AWDIST': float64
    'pt_BLANTHPR_VEG': float64
    'pt_BLANTHDI_VEG': float64
    'pt_BLPUB_VEG': float64
    'pt_BLSCO_ANTH_VEG': float64
    'pt_BLSCO_REV_VEG': float64
    'pt_CCO_REV_VEG': float64
    'pt_CUAN_REPRO': float64
    'pt_ENDO': float64
    'pt_FLA_EREPRO': float64
    'pt_FLA_REPRO': float64
    'pt_INANTH': float64
    'pt_LIGCO_REV_VEG': float64
    'pt_LIGSH': float64
    'pt_LPCO_REV_POST': float64
    'pt_LPPUB': float64
    'pt_LSEN': float64
    'pt_NOANTH': float64
    'pt_PEX_REPRO': float64
    'pt_PTH': float64
 

In [123]:
# Pull the annotated column (sample) table into pandas for inspection
df_cols = mte.cols().to_pandas()
df_cols.head(5)

2020-05-14 16:58:52 Hail: INFO: Coerced sorted dataset


Unnamed: 0,s,acc_seq_no,acc_stock_id,acc_gs_acc,acc_gs_variety_name,acc_igrc_acc_src,pt_APANTH_REPRO,pt_APSH,pt_APCO_REV_POST,pt_APCO_REV_REPRO,...,pt_LPPUB,pt_LSEN,pt_NOANTH,pt_PEX_REPRO,pt_PTH,pt_SCCO_REV,pt_SECOND_BR_REPRO,pt_SLCO_REV,pt_SPKF,pt_SLLT_CODE
0,IRIS_313-10000,335,387,125907.0,SUWEON 311::IRGC 61890-1,61890,,,,20.0,...,2.0,9.0,,7.0,1.0,10.0,1.0,20.0,5.0,1.0
1,IRIS_313-10001,336,388,125692.0,C 662083::IRGC 62101-1,62101,,,,20.0,...,2.0,7.0,,5.0,2.0,10.0,1.0,20.0,4.0,3.0
2,IRIS_313-10002,103,129,125955.0,BW 295-5::IRGC 63098-1,63098,,,20.0,20.0,...,4.0,7.0,,7.0,3.0,10.0,1.0,20.0,4.0,1.0
3,IRIS_313-10007,337,389,125749.0,GARURA::IRGC 64111-1,64111,,,,10.0,...,4.0,3.0,,5.0,3.0,10.0,1.0,20.0,4.0,3.0
4,IRIS_313-10010,338,390,125818.0,LALKA (LAL DHAN)::IRGC 64946-1,64946,,,,20.0,...,2.0,5.0,,7.0,3.0,10.0,1.0,20.0,4.0,3.0


In [124]:
# Non-null counts per column show how sparsely each phenotype is populated
df_cols.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2113 entries, 0 to 2112
Data columns (total 37 columns):
s                      2113 non-null object
acc_seq_no             2113 non-null int64
acc_stock_id           2113 non-null int64
acc_gs_acc             2113 non-null float64
acc_gs_variety_name    2113 non-null object
acc_igrc_acc_src       2113 non-null int64
pt_APANTH_REPRO        91 non-null float64
pt_APSH                133 non-null float64
pt_APCO_REV_POST       552 non-null float64
pt_APCO_REV_REPRO      2108 non-null float64
pt_AWCO_LREV           133 non-null float64
pt_AWCO_REV            2112 non-null float64
pt_AWDIST              30 non-null float64
pt_BLANTHPR_VEG        133 non-null float64
pt_BLANTHDI_VEG        13 non-null float64
pt_BLPUB_VEG           2112 non-null float64
pt_BLSCO_ANTH_VEG      133 non-null float64
pt_BLSCO_REV_VEG       2111 non-null float64
pt_CCO_REV_VEG         2110 non-null float64
pt_CUAN_REPRO          2111 non-null float64
pt_ENDO     

### Export

In [6]:
# All exported artifacts are written beneath this sub-directory of data_dir
export_dir = osp.join(data_dir, 'rg-3k-gwas-export')
export_dir

'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export'

In [125]:
# Write the merged MatrixTable to disk, replacing any previous export at this path
path = osp.join(export_dir, 'rg-3k-gwas-export.mt')
mte.write(path, overwrite=True)
path

2020-05-14 17:00:00 Hail: INFO: Coerced sorted dataset
2020-05-14 17:01:15 Hail: INFO: wrote matrix table with 1011601 rows and 2113 columns in 23 partitions to /home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.mt


'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.mt'

In [127]:
# Total on-disk size of the export written by the previous cell
!du -sh $path

471M	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.mt


In [132]:
# Export column (sample) data as csv, omitting the pandas index
path = osp.join(export_dir, 'rg-3k-gwas-export.cols.csv')
mte.cols().to_pandas().to_csv(path, index=False)
path

'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.cols.csv'

In [134]:
# Export row (variant) data as gzip-compressed csv, omitting the pandas index.
# No compression argument is passed: to_csv's default compression='infer'
# selects gzip from the '.gz' suffix of the path.
path = osp.join(export_dir, 'rg-3k-gwas-export.rows.csv.gz')
mte.rows().to_pandas().to_csv(path, index=False)
path

2020-05-14 17:10:01 Hail: INFO: Coerced sorted dataset


'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.rows.csv.gz'

In [131]:
# Export call data (mean imputed) as zarr
# First step: build a BlockMatrix of alternate-allele counts per call, with
# mean imputation enabled and centering/normalization turned off.
bm = hl.linalg.BlockMatrix.from_entry_expr(mte.GT.n_alt_alleles(), mean_impute=True, center=False, normalize=False)
bm

2020-05-14 17:04:09 Hail: INFO: Coerced sorted dataset
2020-05-14 17:04:12 Hail: INFO: Coerced sorted dataset
2020-05-14 17:08:32 Hail: INFO: Wrote all 247 blocks of 1011601 x 2113 matrix with block size 4096.


<hail.linalg.blockmatrix.BlockMatrix at 0x7f120b440450>

In [135]:
%%time
# Load the whole float64 call array into memory (need ~70G RAM to handle this)
# The resulting array alone is ~17 GB (1011601 x 2113 values x 8 bytes).
ca = bm.to_numpy()

CPU times: user 18.2 ms, sys: 10.4 s, total: 10.5 s
Wall time: 1min 31s


In [153]:
# Check the element type, dimensions and memory footprint (bytes) of the dense call array
ca.dtype, ca.shape, ca.nbytes

(dtype('float64'), (1011601, 2113), 17100103304)

In [144]:
# Wrap the in-memory array in dask so that rounding and the int8 cast are
# evaluated chunk by chunk at write time, avoiding full-size in-memory copies
ca_chunked = da.asarray(ca, chunks=10000)
cad = da.round(ca_chunked).astype(np.int8)
cad

Unnamed: 0,Array,Chunk
Bytes,2.14 GB,21.13 MB
Shape,"(1011601, 2113)","(10000, 2113)"
Count,307 Tasks,102 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 2.14 GB 21.13 MB Shape (1011601, 2113) (10000, 2113) Count 307 Tasks 102 Chunks Type int8 numpy.ndarray",2113  1011601,

Unnamed: 0,Array,Chunk
Bytes,2.14 GB,21.13 MB
Shape,"(1011601, 2113)","(10000, 2113)"
Count,307 Tasks,102 Chunks
Type,int8,numpy.ndarray


In [156]:
# Destination of the zarr store holding the int8 call matrix
path = osp.join(export_dir, 'rg-3k-gwas-export.calls.zarr')
path

'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.calls.zarr'

In [147]:
%%time
# Evaluate the dask array and write it to the zarr store at `path`
cad.to_zarr(path)
path

CPU times: user 12.8 s, sys: 2.57 s, total: 15.4 s
Wall time: 8.98 s


'/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.calls.zarr'

In [148]:
# Total on-disk size of the zarr store written by the previous cell
!du -sh $path

582M	/home/eczech/data/gwas/rice-snpseek/1M_GWAS_SNP_Dataset/rg-3k-gwas-export.calls.zarr


### Upload

Uploads go to: https://console.cloud.google.com/storage/browser/public-gwas-datasets/rice-3k-snpseek