In [1]:
# default_exp genodata

In [1]:
#hide
%load_ext autoreload
%autoreload 2

# Genodata module
> Read genotype data (PLINK binary or BGEN) and extract subsets by region, variant, or sample.

In [111]:
#export
import numpy as np
import pandas as pd
import dask.array as da
from bgen_reader import open_bgen  
from pandas_plink import read_plink
from pandas_plink.bed_reader import lib, ffi
try:
    from pybgen.parallel import ParallelPyBGEN as PyBGEN
except Exception:
    # Best-effort fallback to the serial reader. `Exception` (rather than a bare
    # `except`) keeps KeyboardInterrupt / SystemExit from being swallowed.
    print('Can not import ParallelPyBGEN. import PyBGEN instead')
    from pybgen import PyBGEN

In [35]:
#export
from math import floor
from pathlib import Path
from typing import Optional, Union
from tqdm import tqdm
from numpy import ascontiguousarray, empty, float32, float64, nan_to_num, uint8, uint64, arange, full
from xarray import DataArray
from pandas import DataFrame, array

In [112]:
# export
def read_bgen(file, sample_file=None,pybgen=True):
    '''Open a BGEN file and build its variant (bim) and sample (fam) tables.

    Parameters
    ----------
    file
        Path to the ``.bgen`` file.
    sample_file
        Optional path to the matching ``.sample`` file. When ``None`` no sample
        table is read and ``fam`` is returned as ``None``.
    pybgen
        ``True`` (default) opens the file with ``PyBGEN``; ``False`` uses
        ``bgen_reader.open_bgen``.

    Returns
    -------
    tuple
        ``(bim, fam, bg)`` where ``bim`` is a DataFrame with the columns
        ``chrom, snp, cm, pos, a0, a1, i`` (``snp`` is rewritten as
        ``chr<chrom>:<pos>:<a0>:<a1>``), ``fam`` is a DataFrame with the columns
        ``fid, iid, missing, sex`` (or ``None``) and ``bg`` is the opened reader.
    '''
    bim_columns = ['chrom','snp','cm','pos','a0','a1','i']
    if pybgen:
        bg = PyBGEN(file,probs_only=True)
        records = [[int(t.chrom),t.name,0.0,t.pos,t.a1,t.a2,i]
                   for i,t in enumerate(bg.iter_variant_info())]
        bim = pd.DataFrame(records,columns=bim_columns)
        bim['snp'] = 'chr'+bim[['chrom','pos','a0','a1']].astype(str).agg(':'.join, axis=1)
    else:
        bg = open_bgen(file,verbose=False)
        snp,aa0,aa1 = [],[],[]
        for c,p,alleles in zip(bg.chromosomes,bg.positions,bg.allele_ids):
            # NOTE(review): assumes exactly two comma-separated alleles per variant.
            a0,a1 = alleles.split(',')
            aa0.append(a0)
            aa1.append(a1)
            snp.append(':'.join(['chr'+str(int(c)),str(p),a0,a1]))  # '05' first change to int, then change to str
        # Same column layout as the PyBGEN branch: `cm` is needed by write_bim and
        # `i` is the running variant index.
        bim = pd.DataFrame({'chrom':bg.chromosomes.astype(int),'snp':snp,'cm':0.0,
                            'pos':bg.positions,'a0':aa0,'a1':aa1,
                            'i':np.arange(len(snp))},columns=bim_columns)
    if sample_file is None:
        fam = None
    else:
        # skiprows=1 drops the first line, so the second line is consumed as the
        # header and then overwritten by the fixed column names below.
        # sep=r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
        fam = pd.read_csv(sample_file, header=0, sep=r'\s+', quotechar='"',skiprows=1)
        fam.columns = ['fid','iid','missing','sex'] #Fix me: assumes a four-column sample file
    return bim,fam,bg

In [None]:
#export
def read_bim(fn):
    '''Read a whitespace-delimited, header-less BIM file into a DataFrame.

    Parameters
    ----------
    fn
        Path to the ``.bim`` file.

    Returns
    -------
    pandas.DataFrame
        Columns ``chrom, snp, cm, pos, a0, a1`` plus ``i``, the 0-based row number.
    '''
    header = ["chrom", "snp", "cm","pos","a0", "a1"]
    # sep=r"\s+" is the documented equivalent of the deprecated delim_whitespace=True.
    df = pd.read_csv(fn,sep=r"\s+",header=None,names=header,compression=None,engine="c",iterator=False)
    df["i"] = range(df.shape[0])
    return df

In [113]:
# export
def bgen2dask(bgen,index,step=500):
    '''Convert the variants selected by `index` into one dask array.

    `index` is consumed in batches of `step` entries. Each batch is read with
    ``bgen.read`` (samples x variants x values), reduced to the position of the
    largest value along the last axis and stored as int8. The batches are joined
    along the variant axis and transposed, so the result is variants x samples.
    '''
    total = len(index)

    def _batch(start):
        # One batch: argmax over the last axis turns the per-genotype values into 0/1/2... codes.
        values = bgen.read(index[start:min(total,start+step)])  #samples x variants
        return da.from_array(values.argmax(axis=2).astype(np.int8))

    batches = [_batch(start) for start in range(0,total,step)]
    return da.concatenate(batches,axis=1).T

In [126]:
# export
def pybgen_region(bgen,region,step=100):
    '''Read all variants of a region from a PyBGEN reader into a dask array.

    Parameters
    ----------
    bgen
        Reader exposing ``iter_variants_in_region(chrom, start, end)``.
    region
        ``[chrom, start, end]``; ``chrom`` is an integer and is zero-padded to two
        digits for the query (5 -> '05').
    step
        Number of variants per dask chunk (must be >= 1).

    Returns
    -------
    dask array
        variants x samples, int8; each entry is the position of the largest
        value in that sample's row of the variant's matrix.

    Raises
    ------
    ValueError
        If ``step`` is smaller than 1.
    '''
    if step < 1:
        raise ValueError('step must be a positive integer')
    chrom = '0'+str(region[0]) if region[0]<10 else str(region[0])
    chunks,buffer = [],[]
    for _,v in bgen.iter_variants_in_region(chrom,region[1],region[2]):
        buffer.append(v.argmax(axis=1).astype(np.int8))
        # Flush on a full buffer so every chunk (except possibly the last) holds
        # exactly `step` variants and an empty buffer is never flushed mid-stream.
        if len(buffer) == step:
            chunks.append(da.from_array(np.array(buffer)))
            buffer = []
    # Flush the remainder; when nothing was read at all this keeps the previous
    # behaviour of returning an empty array.
    if buffer or not chunks:
        chunks.append(da.from_array(np.array(buffer)))
    return(da.concatenate(chunks,axis=0))

In [115]:
# export
def extract_bed(geno,idx,row=True,step=500,region=None):  #row = True by variants, row = False by samples
    '''Extract a subset of genotypes as a variants x samples array.

    Parameters
    ----------
    geno
        A dask array (variants x samples), a ``PyBGEN`` reader, or any other reader
        handled by ``bgen2dask`` (e.g. ``bgen_reader.open_bgen``).
    idx
        Boolean mask or integer positions of the variants (``row=True``) or
        samples (``row=False``) to keep.
    row
        ``True`` to subset variants, ``False`` to subset samples.
    step
        Batch size used when genotypes are read from a BGEN reader.
    region
        ``[chrom, start, end]``. Required for a ``PyBGEN`` reader, which is queried
        by region only; ``idx`` and ``row`` are not used in that case.

    Raises
    ------
    ValueError
        If ``geno`` is a ``PyBGEN`` reader and no ``region`` is given.
    '''
    if isinstance(geno,da.core.Array):
        return geno[idx,:] if row else geno[:,idx]
    if isinstance(geno,PyBGEN):
        if region is None:
            raise ValueError('A region [chrom, start, end] is required to extract genotypes from a PyBGEN reader')
        return pybgen_region(geno,region,step)
    positions = _bool_mask_to_positions(idx)  # the reader must be given numeric indices
    if row:
        return bgen2dask(geno,positions,step)
    # Sample subset: hard-call every variant first so the result has the same
    # variants x samples layout as the other branches, then pick the sample columns.
    all_variants = bgen2dask(geno,np.arange(geno.nvariants),step)
    return all_variants[:,positions]


def _bool_mask_to_positions(idx):
    '''Turn a boolean mask into a list of integer positions; return any other index unchanged.'''
    items = list(idx)
    if items and isinstance(items[0],(bool,np.bool_)):
        return np.flatnonzero(np.asarray(items,dtype=bool)).tolist()
    return idx

In [1]:
# export
class Genodata:
    '''Genotype data kept as three aligned parts.

    Attributes
    ----------
    bim : variant table (one row per variant, with ``chrom``, ``snp``, ``pos``, ...).
    fam : sample table (one row per sample, with ``iid``, ...).
    bed : genotypes; either a dask array (variants x samples) or an opened BGEN reader.
    idx : the index used by the most recent ``extractbyidx`` call.
    '''
    def __init__(self,geno_path,sample_path=None):
        self.bim,self.fam,self.bed = self.read_geno(geno_path,sample_path)

    def __repr__(self):
        return "bim:% s \n fam:% s \n bed:%s" % (self.bim, self.fam, self.bed)

    def read_geno(self,geno_file,sample_file):
        '''Read a PLINK ``.bed`` or a ``.bgen`` file and return ``(bim, fam, bed)``.

        For BGEN input without ``sample_file`` the sample file is looked up next to
        the genotype file with the ``.sample`` extension.
        Raises ``ValueError`` for any other file extension.
        '''
        geno_file = str(geno_file)  # accept pathlib.Path as well as str
        if geno_file.endswith('.bed'):
            bim,fam,bed =  read_plink(geno_file[:-4], verbose=False)
            bim.snp = 'chr'+bim[['chrom','pos','a0','a1']].astype(str).agg(':'.join, axis=1)
        elif geno_file.endswith('.bgen'):
            if sample_file is None:
                # Swap only the trailing extension; str.replace would also rewrite
                # a '.bgen' that appears earlier in the path.
                sample_file = geno_file[:-len('.bgen')] + '.sample'
            bim,fam,bed = read_bgen(geno_file,sample_file)
        else:
            raise ValueError('Plesae provide the genotype files with PLINK binary format or BGEN format')
        bim.chrom = bim.chrom.astype(int)
        bim.pos = bim.pos.astype(int)
        return bim,fam,bed

    def geno_in_stat(self,stat,notin=False):
        '''Keep the variants listed in ``stat.SNP`` (or the complement when ``notin``).'''
        variants = stat.SNP
        self.extractbyvariants(variants,notin)

    def geno_in_unr(self,unr):
        '''Keep the samples listed in ``unr.IID``.'''
        samples = unr.IID
        self.extractbysamples(samples)

    def extractbyregion(self,region):
        '''Keep the variants on chromosome ``region[0]`` with ``region[1] <= pos <= region[2]``.'''
        bim = self.bim
        idx = (bim.chrom == region[0]) & (bim.pos >= region[1]) & (bim.pos <= region[2])
        n_snps = int(idx.sum())
        print('this region',region,'has',n_snps,'SNPs in Genodata')
        if n_snps == 0:
            raise ValueError('The extraction is empty')
        #update bim,bed
        self.extractbyidx(idx,row=True,region=region)

    def extractbyvariants(self,variants,notin=False):  #variants is list or pd.Series
        '''Keep the variants whose ``snp`` id is in ``variants`` (or is not, when ``notin``).'''
        idx = self.bim.snp.isin(variants)
        if notin:
            idx = ~idx
        if not idx.any():
            raise ValueError('The extraction is empty')
        #update bim,bed
        self.extractbyidx(idx,row=True)

    def extractbysamples(self,samples,notin=False): #samples is list or pd.Series
        '''Keep the samples whose ``iid`` (compared as strings) is in ``samples`` (or is not, when ``notin``).'''
        samples = pd.Series(samples,dtype=str)
        idx = self.fam.iid.astype(str).isin(samples)
        if notin:
            idx = ~idx
        if not idx.any():
            raise ValueError('The extraction is empty')
        #update fam,bed
        self.extractbyidx(idx,row=False)

    def extractbyidx(self,idx,row=True,region=None):
        '''get subset of genodata by index
        if index is numbers, the order of genodata will be sorted by the order of index.
        if row = True, extract by variants. Otherwise, extract by samples.
        Raises ValueError when the index is empty.'''
        idx = list(idx)
        if len(idx) == 0:
            raise ValueError('The extraction is empty')
        is_mask = isinstance(idx[0],(bool,np.bool_))
        if is_mask:
            # Normalise numpy booleans to plain bool so downstream type checks see a mask.
            idx = [bool(flag) for flag in idx]
        self.idx = idx
        if row:
            #update bim
            self.bim = self.bim[idx] if is_mask else self.bim.iloc[idx]
        else:
            #update fam
            self.fam = self.fam[idx] if is_mask else self.fam.iloc[idx]
        self.bed = extract_bed(self.bed,idx,row,region=region)

    def export_plink(self, bed: Union[str, Path], bim: Optional[Union[str, Path]] = None, fam: Optional[Union[str, Path]] = None,row: str = "variant",verbose: bool = True):
        '''Write this object as a PLINK BED/BIM/FAM file set.

        ``bim`` and ``fam`` default to the ``bed`` path with the matching suffix.
        Delegates to the module-level ``write_plink`` so both entry points share one implementation.
        '''
        write_plink(self, bed, bim=bim, fam=fam, row=row, verbose=verbose)



In [85]:
#export
def write_plink(
    G,
    bed: Union[str, Path],
    bim: Optional[Union[str, Path]] = None,
    fam: Optional[Union[str, Path]] = None,
    row: str = "variant",
    verbose: bool = True,
):
    """
    Write a genotype object as a PLINK 1 binary file set.

    A PLINK 1 binary file set consists of three files:

    - BED: containing the genotype.
    - BIM: containing variant information.
    - FAM: containing sample information.

    Parameters
    ----------
    G
        Object with ``bed`` (two-dimensional genotype matrix), ``bim`` (variant
        table) and ``fam`` (sample table) attributes.
    bed
        Path to the BED file to write.
    bim
        Path to the BIM file. When :const:`None` it is ``bed`` with the suffix
        replaced by ``.bim``.
    fam
        Path to the FAM file. When :const:`None` it is ``bed`` with the suffix
        replaced by ``.fam``.
    row
        Passed on to ``write_bed``; :const:`"variant"` is the default.
    verbose
        :const:`True` for progress information; :const:`False` otherwise.

    Raises
    ------
    ValueError
        If ``G.bed`` is not two-dimensional.
    """
    if G.bed.ndim != 2:
        raise ValueError("G has to be bidimensional")

    bed = Path(bed)
    bim = Path(bed.with_suffix(".bim") if bim is None else bim)
    fam = Path(bed.with_suffix(".fam") if fam is None else fam)
    quiet = not verbose

    # Genotypes first, then the two text tables.
    write_bed(bed, G.bed, row, verbose)

    _echo("Writing FAM... ", end="", disable=quiet)
    write_fam(fam, G.fam)
    _echo("done.", disable=quiet)

    _echo("Writing BIM... ", end="", disable=quiet)
    write_bim(bim, G.bim)
    _echo("done.", disable=quiet)


def _echo(msg: str, end: str = "\n", disable: bool = False):
    if not disable:
        print(msg, end=end, flush=True)


def write_fam(filepath: Path, df):
    """
    Write the sample table as a tab-separated, header-less FAM file.

    Parameters
    ----------
    filepath
        Destination path.
    df
        DataFrame holding at least the columns ``fid, iid, father, mother,
        gender, trait``; only these are written, in this order.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    cols = ["fid", "iid", "father","mother","gender","trait"]
    table = df[cols]
    csv_kwargs = dict(index=False, sep="\t", header=False, encoding="ascii")
    try:
        # `lineterminator` is the spelling since pandas 1.5; the old
        # `line_terminator` keyword was removed in pandas 2.0.
        table.to_csv(filepath, lineterminator="\n", **csv_kwargs)
    except TypeError:
        # pandas < 1.5 only accepts the old spelling.
        table.to_csv(filepath, line_terminator="\n", **csv_kwargs)


def write_bim(filepath: Path, df):
    """
    Write the variant table as a tab-separated, header-less BIM file.

    Parameters
    ----------
    filepath
        Destination path.
    df
        DataFrame holding at least the columns ``chrom, snp, cm, pos, a0, a1``;
        only these are written, in this order.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    cols = ["chrom","snp","cm","pos","a0","a1"]
    table = df[cols]
    csv_kwargs = dict(index=False, sep="\t", header=False, encoding="ascii")
    try:
        # `lineterminator` is the spelling since pandas 1.5; the old
        # `line_terminator` keyword was removed in pandas 2.0.
        table.to_csv(filepath, lineterminator="\n", **csv_kwargs)
    except TypeError:
        # pandas < 1.5 only accepts the old spelling.
        table.to_csv(filepath, line_terminator="\n", **csv_kwargs)

In [81]:
#export
def write_bed(filepath: Path, G, row='variant', verbose=True):
    """
    Write the genotype matrix ``G`` to a BED file.

    ``G`` is converted to a dask array when it is not one already and is
    transposed when ``row`` is not ``"variant"``. The header is written first,
    then the matrix is written in row chunks of at most
    ``floor(256 * 1024 * 1024 / ncols)`` rows (at least one row per chunk).

    Raises ``RuntimeError`` when the header cannot be written and ``ValueError``
    when a computed chunk is neither float32 nor float64.
    """
    matrix = G if isinstance(G,da.core.Array) else da.asanyarray(G)

    variant_rows = row == "variant"
    if not variant_rows:
        matrix = matrix.T

    status = lib.write_bed_header(str(filepath).encode(), 1 if variant_rows else 0)
    if status != 0:
        raise RuntimeError(f"Failure while writing BED file {filepath}.")

    nrows = matrix.shape[0]
    ncols = matrix.shape[1]

    # Bound the number of elements held in memory per chunk.
    rows_per_chunk = min(max(1, floor((1024 * 1024 * 256) / ncols)), nrows)
    matrix = matrix.rechunk((rows_per_chunk, ncols))

    offset = 0
    for height in tqdm(matrix.chunks[0], "Writing BED", disable=not verbose):
        block = matrix[offset : offset + height, :].compute()
        if block.dtype not in [float32, float64]:
            raise ValueError(
                "Unsupported data type. "
                "Please, provide a dosage matrix in either "
                "float32 or float64 format."
            )

        _write_bed_chunk(filepath, block)
        offset += height


def _write_bed_chunk(filepath: Path, X) -> None:
    """
    Hand one chunk of the genotype matrix to ``lib.write_bed_chunk``.

    ``X`` is modified in place: NaN entries are replaced by 3.0 before the chunk
    is converted to a C-contiguous ``uint8`` copy.
    NOTE(review): 3 is presumably the missing-genotype code expected by the
    native writer, and the chunk is presumably appended to the file -- confirm
    against the pandas_plink C library.

    Raises ``RuntimeError`` when the native call returns a non-zero status.
    """

    base_type = uint8
    base_size = base_type().nbytes
    base_repr = "uint8_t"

    # copy=False: NaN -> 3.0 is applied directly to the caller's array.
    nan_to_num(X, False, 3.0)
    G = ascontiguousarray(X, base_type)
    assert G.flags.aligned

    # numpy strides are in bytes; dividing by the item size gives them in elements.
    strides = empty(2, uint64)
    strides[:] = G.strides
    strides //= base_size

    # The native function receives the column count first, then the row count,
    # followed by raw pointers to the data and to the stride pair.
    e = lib.write_bed_chunk(
        str(filepath).encode(),
        G.shape[1],
        G.shape[0],
        ffi.cast(f"{base_repr} *", G.ctypes.data),
        ffi.cast("uint64_t *", strides.ctypes.data),
    )
    if e != 0:
        raise RuntimeError(f"Failure while writing BED file {filepath}.")

In [86]:
write_plink(geno,'test.bed')


Writing BED:   0%|          | 0/1 [00:00<?, ?it/s][A
Writing BED: 100%|██████████| 1/1 [00:04<00:00,  4.77s/it][A

Writing FAM... 




done.
Writing BIM... done.


In [70]:
from pandas_plink import read_plink1_bin

In [87]:
geno1 = Genodata('test.bed')

In [88]:
geno1

bim:      chrom                  snp   cm        pos a0 a1     i
0         1    chr1:55039741:G:C  0.0   55039741  G  C     0
1         1    chr1:55039742:G:A  0.0   55039742  G  A     1
2         1    chr1:55039749:G:C  0.0   55039749  G  C     2
3         1    chr1:55039750:T:C  0.0   55039750  T  C     3
4         1    chr1:55039753:T:C  0.0   55039753  T  C     4
...     ...                  ...  ...        ... .. ..   ...
1408     11  chr11:116832956:T:C  0.0  116832956  T  C  1408
1409     11  chr11:116832976:C:G  0.0  116832976  C  G  1409
1410     11  chr11:116832977:T:G  0.0  116832977  T  G  1410
1411     11  chr11:116832978:T:G  0.0  116832978  T  G  1411
1412     11  chr11:116832980:T:C  0.0  116832980  T  C  1412

[1413 rows x 7 columns] 
 fam:            fid      iid father mother gender trait       i
0       1000019  1000019      0      0      2    -9       0
1       1000078  1000078      0      0      2    -9       1
2       1000081  1000081      0      0      1    -9  

# Test

In [6]:
geno_path ='/home/dmc2245/UKBiobank/data/exome_files/project_VCF/072721_run/plink/ukb23156_c1.merged.filtered.bed'

In [117]:
region = [5,272741,1213528-900000]
geno_path = 'MWE_region_extraction/ukb23156_c5.merged.filtered.5_272741_1213528.bed'
sumstats_path = 'MWE_region_extraction/090321_UKBB_Hearing_aid_f3393_expandedwhite_6436cases_96601ctrl_PC1_2_f3393.regenie.snp_stats'
pheno_path = None
unr_path = 'MWE_region_extraction/UKB_genotypedatadownloaded083019.090221_sample_variant_qc_final_callrate90.filtered.extracted.white_europeans.filtered.092821_ldprun_unrelated.filtered.prune.txt'
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
imp_sumstats_path = 'MWE_region_extraction/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats'
imp_ref = 'hg19'

output_sumstats = 'test.snp_stats'
output_LD = 'test_corr.csv'

#main(region,geno_path,sumstats_path,pheno_path,unr_path,imp_geno_path,imp_sumstats_path,imp_ref,output_sumstats,output_LD)

In [16]:
from pandas_plink import Chunk

In [17]:
Chunk(512,512)

Chunk(nsamples=512, nvariants=512)

In [12]:
exome_geno.extractbyvariants(exome_geno.bim.snp[:50])

In [13]:
exome_geno.extractbysamples(exome_geno.fam.iid[:60])

In [6]:
from cugg.sumstat import *

In [8]:
region = [5, 272741, 1213528]

In [7]:
imput_sumstats = Sumstat('/home/dmc2245/UKBiobank/results/REGENIE_results/results_imputed_data/2021_10_07_f3393_500K/100521_UKBB_Hearing_aid_f3393_expandedwhite_15601cases_237318ctrl_500k_PC1_PC2_f3393.regenie.snp_stats.gz')

In [10]:
imput_sumstats.extractbyregion(region)

In [11]:
imput_sumstats

sumstat:         CHR      POS REF ALT                SNP      BETA        SE         P
6767726    5   272851   A   G    chr5:272851:A:G  0.357496  0.888197  0.687318
6767727    5   272906   A   C    chr5:272906:A:C -0.003007  0.019764  0.879070
6767728    5   273143   A   G    chr5:273143:A:G -0.013693  0.016716  0.412684
6767729    5   273160   G   C    chr5:273160:G:C  0.235713  0.348772  0.499145
6767730    5   273534   C   T    chr5:273534:C:T  0.050095  0.139496  0.719509
...      ...      ...  ..  ..                ...       ...       ...       ...
6776191    5  1213094   C   T   chr5:1213094:C:T -0.015881  0.023298  0.495462
6776192    5  1213134   G   A   chr5:1213134:G:A -1.142280  1.344380  0.395509
6776193    5  1213223   C   T   chr5:1213223:C:T -0.003009  0.013631  0.825270
6776194    5  1213404   T  TC  chr5:1213404:T:TC -0.039146  0.117837  0.739735
6776195    5  1213510   C   T   chr5:1213510:C:T  0.009318  0.012922  0.470845

[8470 rows x 8 columns]

In [None]:

bgen = PyBGEN(geno_file)
sample_file = geno_file.replace('.bgen', '.sample')
if not os.path.isfile(sample_file):
    if not os.path.isfile(${bgen_sample_path:r}):
        raise ValueError(f"Cannot find the matching sample file ``{sample_file}`` for ``{geno_file}``.\nYou can specify path to sample file for all BGEN files using ``--bgen-sample-path``.")
    else:
        sample_file = ${bgen_sample_path:r}
bgen_fam = pd.read_csv(sample_file, header=0, delim_whitespace=True, quotechar='"',skiprows=1)
bgen_fam.columns = ['fid','iid','missing','sex']
geno = [bgen,bgen_fam]

In [127]:
#imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
bgen_sample_path = '/home/dmc2245/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb32285_imputedindiv.sample'
imput_geno = Genodata(imp_geno_path,bgen_sample_path)

In [128]:
imput_geno.extractbyregion(region)

In [130]:
imput_geno.extractbyvariants(list(imput_geno.bim.snp[10:20]))

In [134]:
imput_geno.extractbysamples(list(imput_geno.fam.iid[50:100]))

In [135]:
imput_geno

bim:    chrom              snp   cm     pos a0 a1   i
10      5  chr5:273143:A:G  0.0  273143  A  G  10
11      5  chr5:273160:G:C  0.0  273160  G  C  11
12      5  chr5:273209:A:G  0.0  273209  A  G  12
13      5  chr5:273212:T:C  0.0  273212  T  C  13
14      5  chr5:273237:T:G  0.0  273237  T  G  14
15      5  chr5:273261:C:T  0.0  273261  C  T  15
16      5  chr5:273267:G:A  0.0  273267  G  A  16
17      5  chr5:273310:A:G  0.0  273310  A  G  17
18      5  chr5:273319:A:G  0.0  273319  A  G  18
19      5  chr5:273326:T:C  0.0  273326  T  C  19 
 fam:        fid      iid  missing  sex
50  1426232  1426232        0    1
51  3769979  3769979        0    1
52  1751431  1751431        0    2
53  4658109  4658109        0    2
54  3538677  3538677        0    2
55  3542585  3542585        0    2
56  3249692  3249692        0    2
57  2299845  2299845        0    1
58  4136172  4136172        0    2
59  5406314  5406314        0    1
60  5987848  5987848        0    2
61  3872614  3872614

In [103]:
region

[5, 272741, 313528]

In [10]:
from pybgen import PyBGEN

In [120]:
bgen = PyBGEN(imp_geno_path,probs_only=True)

In [121]:
pybgen_region(bgen,region)

Unnamed: 0,Array,Chunk
Bytes,789.75 MiB,46.48 MiB
Shape,"(1699, 487409)","(100, 487409)"
Count,34 Tasks,17 Chunks
Type,int8,numpy.ndarray
"Array Chunk Bytes 789.75 MiB 46.48 MiB Shape (1699, 487409) (100, 487409) Count 34 Tasks 17 Chunks Type int8 numpy.ndarray",487409  1699,

Unnamed: 0,Array,Chunk
Bytes,789.75 MiB,46.48 MiB
Shape,"(1699, 487409)","(100, 487409)"
Count,34 Tasks,17 Chunks
Type,int8,numpy.ndarray


In [64]:
for t,g in bgen.iter_variants_in_region('0'+str(region[0]) if region[0]<10 else str(region[0]),region[1],region[2]):
    print(t)

In [5]:
import pandas as pd

In [46]:
tmp = bgen.iter_variants()

In [60]:
genos = []
for i,v in zip(range(bgen.nb_variants),bgen):
    geno = []
    if i % 100000 ==0:
        geno.append(v.argmax(axis=1).astype(np.int8))
        print(i,j)

0 (<Variant rs537688122 chr05:272856_A/G>, array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.]]))


In [None]:
    genos = []
    n = len(index)
    for i in range(0,n,step):
        onecode_geno = bgen.read(index[i:min(n,i+step)])  #samples x variants
        geno = onecode_geno.argmax(axis=2).astype(np.int8)
        genos.append(da.from_array(geno))

In [59]:
1002 %10000

1002

In [48]:
tmp

PyBGEN(487,409 samples; 6,070,641 variants)

In [33]:
a = tmp.next()

In [25]:
import numpy as np

In [37]:
a[1]

array([[1.        , 0.        , 0.        ],
       [0.83921569, 0.16078431, 0.        ],
       [0.96078431, 0.03921569, 0.        ],
       ...,
       [1.        , 0.        , 0.        ],
       [0.00392157, 0.95686275, 0.03921569],
       [0.85490196, 0.14509804, 0.        ]])

In [34]:
aa = a[1].argmax(axis=1).astype(np.int8)

In [35]:
pd.Series(aa).value_counts()

0    455061
1     31952
2       396
dtype: int64

In [6]:
tmp = []
for i,t in enumerate(bgen.iter_variant_info()):
    tmp.append([int(t.chrom),t.name,0.0,t.pos,t.a1,t.a2,i])
tmp = pd.DataFrame(tmp,columns=['chrom','snp','cm','pos','a0','a1','i'])
tmp.snp = 'chr'+tmp[['chrom','pos','a0','a1']].astype(str).agg(':'.join, axis=1)

In [7]:
tmp

Unnamed: 0,chrom,snp,cm,pos,a0,a1,i
0,5,rs546183826,0.0,10043,T,A,0
1,5,rs564373080,0.0,10055,T,A,1
2,5,rs528775171,0.0,10056,A,C,2
3,5,rs547354230,0.0,10058,C,A,3
4,5,rs562245928,0.0,10059,C,A,4
...,...,...,...,...,...,...,...
6070636,5,rs552848768,0.0,180902794,G,A,6070636
6070637,5,rs572660833,0.0,180902887,T,A,6070637
6070638,5,rs544891279,0.0,180903491,G,T,6070638
6070639,5,rs558980847,0.0,180904360,A,T,6070639


In [14]:
list(bgen.iter_variant_info())[0]

<Variant rs546183826 chr05:10043_T/A>

In [28]:
idx = imput_geno.idx

In [29]:
if type(list(idx)[0]) is bool:
    pd_idx = pd.Series(idx)
    idx = list(pd_idx[pd_idx].index)

In [31]:
len(idx)

8470

In [33]:
idx[1:10]

[10535, 10541, 10542, 10556, 10567, 10569, 10570, 10573, 10574]

In [46]:
imp_geno_path = 'MWE_region_extraction/ukb_imp_chr5_v3_05_272856_1213643.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)

reading -- time=0:00:00.00, thread 1 of 1, part 1 of 1


array([[[1., 0., 0.]],

       [[1., 0., 0.]],

       [[1., 0., 0.]],

       ...,

       [[1., 0., 0.]],

       [[1., 0., 0.]],

       [[1., 0., 0.]]])

In [2]:
imp_geno_path = '/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen'
bgen = open_bgen(imp_geno_path)
bgen.read(1)

KeyError: 'max_combinations'

In [38]:
imput_geno.bed

<bgen_reader._bgen2.open_bgen at 0x2b1a4e207e80>

In [41]:
bgen

<bgen_reader._bgen2.open_bgen at 0x2b1dfde98b20>

In [25]:
imput_geno.geno_in_stat(imput_sumstats.ss)

[False, False, False, False, False, False, False, False, False, False]
8470 [10529, 10535, 10541, 10542, 10556, 10567, 10569, 10570, 10573, 10574]


KeyError: 'max_combinations'

In [4]:
read_bgen('/mnt/mfs/statgen/archive/UKBiobank_Yale_transfer/ukb39554_imputeddataset/ukb_imp_chr5_v3.bgen')

(         chrom                 snp        pos a0 a1
 0            5      chr5:10043:T:A      10043  T  A
 1            5      chr5:10055:T:A      10055  T  A
 2            5      chr5:10056:A:C      10056  A  C
 3            5      chr5:10058:C:A      10058  C  A
 4            5      chr5:10059:C:A      10059  C  A
 ...        ...                 ...        ... .. ..
 6070636      5  chr5:180902794:G:A  180902794  G  A
 6070637      5  chr5:180902887:T:A  180902887  T  A
 6070638      5  chr5:180903491:G:T  180903491  G  T
 6070639      5  chr5:180904360:A:T  180904360  A  T
 6070640      5  chr5:180904689:T:C  180904689  T  C
 
 [6070641 rows x 5 columns],
 None,
 <bgen_reader._bgen2.open_bgen at 0x2b1a4e295c70>)