## PySnpTools EDA

- [GitHub](https://github.com/fastlmm/pysnptools)
- [Tutorial](https://nbviewer.jupyter.org/github/fastlmm/PySnpTools/blob/master/doc/ipynb/tutorial.ipynb)
- [Homepage](https://fastlmm.github.io/)
- [Docs](https://fastlmm.github.io/PySnpTools/)

In [1]:
from pysnptools.snpreader import Bed, SnpReader
import pandas as pd
import numpy as np
# Note: count_A1=True is necessary for this HapMap data since A1 is the alternate allele rather than
# the reference (with count_A1=False, homozygous alternate is by far the most common count, which
# makes no sense).
# NOTE(review): the path below is an absolute, machine-specific location -- adjust it before re-running.
data = Bed('/home/eczech/data/gwas/tutorial/1_QC_GWAS/HapMap_3_r3_1', count_A1=True)
print(data) # prints the reader specification (file path + options), not the genotype data itself

Bed('/home/eczech/data/gwas/tutorial/1_QC_GWAS/HapMap_3_r3_1',count_A1=True)


In [2]:
# Reader naming: sid = SNP (variant) id, iid = individual (sample) id.
(data.sid_count, data.iid_count) # expected: (1457897, 165), i.e. SNP count first, then individuals

(1457897, 165)

In [3]:
# The reader is indexed as [iid, sid]: keep the first 5 individuals and the first 15 SNPs.
ds = data[:5, :15]
# Displayed SNP-first, so this should show (15, 5).
(ds.sid_count, ds.iid_count)

(15, 5)

In [4]:
# Read the 5 x 15 subset and keep a handle on its `.val` array for inspection.
snpd = ds.read()
arr = snpd.val

In [5]:
# Expect a float64 numpy array shaped (individuals, SNPs) = (5, 15).
type(arr), arr.dtype, arr.shape

(numpy.ndarray, dtype('float64'), (5, 15))

In [6]:
# One row per SNP; columns are [chromosome, genetic distance, basepair position].
# The genetic distance column is all zeros for these SNPs.
snpd.pos

array([[     1,      0, 556738],
       [     1,      0, 557616],
       [     1,      0, 718814],
       [     1,      0, 742584],
       [     1,      0, 744045],
       [     1,      0, 750775],
       [     1,      0, 758311],
       [     1,      0, 766409],
       [     1,      0, 769185],
       [     1,      0, 782343],
       [     1,      0, 789326],
       [     1,      0, 798632],
       [     1,      0, 828418],
       [     1,      0, 836671],
       [     1,      0, 843817]])

In [7]:
# Genotype matrix for the subset: one row per individual, one column per SNP.
# With count_A1=True the values are presumably A1 allele counts -- confirm against the PySnpTools docs.
arr

array([[0., 0., 0., 1., 1., 1., 1., 0., 1., 0., 0., 0., 1., 0., 1.],
       [0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 1., 1., 0., 1., 1., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.]])

In [8]:
# Read the entire dataset into memory. This rebinds `arr`, which previously held the 5 x 15 subset.
# NOTE(review): ~240M values; at the float64 dtype seen for the subset that is roughly 1.9 GB -- confirm
# the machine has enough RAM before re-running.
arr = data.read().val

In [9]:
# Total number of genotype calls and the (individuals, SNPs) shape of the full matrix.
arr.size, arr.shape

(240553005, (165, 1457897))

In [12]:
# Tally how often each genotype value occurs across the whole matrix.
# NaN (missing call) is recoded to -1 first so it gets its own bucket.
genotype_values, genotype_counts = np.unique(np.nan_to_num(arr, nan=-1), return_counts=True)
pd.Series(genotype_counts, index=genotype_values)

-1.0       630620
 0.0    158635633
 1.0     65468332
 2.0     15818420
dtype: int64

### Dask Array Reader

In [2]:
import dask.array as da

class BedArray(object):
    """Array-like view over a PySnpTools reader, suitable for ``dask.array.from_array``.

    The wrapped reader is indexed as ``[individual, SNP]``; this adapter exposes the
    transposed layout ``(SNP, individual)`` and re-encodes every call as ``uint8``:
    NaN (missing) becomes 0 and each non-missing value ``v`` becomes ``v + 1``.

    Attributes:
        bed: the wrapped reader; must provide ``sid_count``, ``iid_count`` and
            subscripting that yields an object with ``read(dtype=...)``.
        shape: ``(sid_count, iid_count)``.
        dtype: ``np.uint8``.
        ndim: always 2.
    """

    def __init__(self, bed):
        self.bed = bed
        n_snps, n_individuals = bed.sid_count, bed.iid_count
        self.shape = (n_snps, n_individuals)
        self.dtype = np.uint8
        self.ndim = 2

    def __getitem__(self, idx):
        """Return the requested ``(SNP, individual)`` block as a ``uint8`` array.

        ``idx`` must be a tuple of per-axis indexers in ``(SNP, individual)`` order.
        """
        assert isinstance(idx, tuple)
        # The reader expects (individual, SNP), so flip the key before subsetting.
        reader_key = tuple(reversed(idx))
        snp_data = self.bed[reader_key].read(dtype=np.float32)
        # Transpose back to (SNP, individual); NaN -> -1 so the +1 shift maps missing to 0.
        shifted = np.nan_to_num(snp_data.val.T, nan=-1) + 1
        return shifted.astype(np.uint8)

In [3]:
# Wrap the reader; the wrapper reports its shape as (SNPs, individuals), i.e. transposed
# relative to the reader's own [individual, SNP] indexing.
ba = BedArray(data)
ba.shape

(1457897, 165)

In [4]:
# Build a lazy dask array over the wrapper: each chunk is 10,000 SNPs x all 165 individuals.
# This rebinds `ba` from the plain BedArray wrapper to a dask array.
# NOTE(review): lock=False assumes chunks can be read from the file concurrently -- confirm.
ba = da.from_array(BedArray(data), chunks=(10000, 165), lock=False)

In [5]:
# Evaluate every chunk and collect the result in memory (rebinds `arr` once more).
arr = ba.compute()
# Expect ((1457897, 165), uint8): SNPs x individuals in BedArray's encoding.
arr.shape, arr.dtype

((1457897, 165), dtype('uint8'))

In [6]:
# Expect (0, 3): BedArray maps missing calls to 0 and shifts every other value up by 1.
arr.min(), arr.max()

(0, 3)