## BGEN Sample and Variant Metadata EDA

In [1]:
import pandas as pd

#### Sample EDA

In [4]:
files = !gsutil ls gs://rs-ukb/raw-data/gt-imputation/*.sample

In [5]:
files

['gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr10_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr11_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr12_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr13_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr14_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr15_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr16_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr17_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr18_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr19_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr1_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr20_v3_s487296.sample',
 'gs://rs-ukb/raw-data/gt-imputation/ukb59384_imp_chr21_v3_s487296.sample',
 'gs://rs-ukb

In [6]:
def load(path):
    cols = [("id1", "int32"), ("id2", "int32"), ("missing", str), ("sex", str)]
    return pd.read_csv(
        path,
        sep=" ",
        dtype=dict(cols),
        names=[c[0] for c in cols],
        header=0,
        skiprows=1,  # Skip the first non-header row
    )

In [15]:
pd.set_option('max.info_rows', 100000000)
df = pd.concat([
    load(path).assign(file=path.split('/')[-1])
    for path in files
])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11696198 entries, 0 to 486756
Data columns (total 5 columns):
 #   Column   Non-Null Count     Dtype 
---  ------   --------------     ----- 
 0   id1      11696198 non-null  int32 
 1   id2      11696198 non-null  int32 
 2   missing  11696198 non-null  object
 3   sex      11696198 non-null  object
 4   file     11696198 non-null  object
dtypes: int32(2), object(3)
memory usage: 446.2+ MB


In [13]:
df.groupby('file').size()

file
ukb59384_imp_chr10_v3_s487296.sample    487409
ukb59384_imp_chr11_v3_s487296.sample    487409
ukb59384_imp_chr12_v3_s487296.sample    487409
ukb59384_imp_chr13_v3_s487296.sample    487409
ukb59384_imp_chr14_v3_s487296.sample    487409
ukb59384_imp_chr15_v3_s487296.sample    487409
ukb59384_imp_chr16_v3_s487296.sample    487409
ukb59384_imp_chr17_v3_s487296.sample    487409
ukb59384_imp_chr18_v3_s487296.sample    487409
ukb59384_imp_chr19_v3_s487296.sample    487409
ukb59384_imp_chr1_v3_s487296.sample     487409
ukb59384_imp_chr20_v3_s487296.sample    487409
ukb59384_imp_chr21_v3_s487296.sample    487409
ukb59384_imp_chr22_v3_s487296.sample    487409
ukb59384_imp_chr2_v3_s487296.sample     487409
ukb59384_imp_chr3_v3_s487296.sample     487409
ukb59384_imp_chr4_v3_s487296.sample     487409
ukb59384_imp_chr5_v3_s487296.sample     487409
ukb59384_imp_chr6_v3_s487296.sample     487409
ukb59384_imp_chr7_v3_s487296.sample     487409
ukb59384_imp_chr8_v3_s487296.sample     487409
ukb59384

In [14]:
(df['id1'] == df['id2']).value_counts()

True    11696198
dtype: int64

In [16]:
df['missing'].value_counts()

0    11696198
Name: missing, dtype: int64

In [17]:
df['sex'].value_counts()

2    6342279
1    5351185
0       2734
Name: sex, dtype: int64

In [30]:
df['id1'].max()

6025055

#### Variant EDA

In [2]:
from pyspark.sql.session import SparkSession
from pyspark.sql.types import StringType, StructType, IntegerType, FloatType, StructField
import pyspark.sql.functions as F
import os
import os.path as osp
os.environ['JAVA_HOME'] = osp.join(os.environ['CONDA_PREFIX'].replace('ukb-analysis', 'pyspark'), 'jre')
spark = SparkSession.builder.getOrCreate()

In [15]:
!gsutil -m cp gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr*_v3.txt /tmp/ukb_mfi/

Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr10_v3.txt...
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr11_v3.txt...              
==> NOTE: You are downloading one or more large file(s), which would            
run significantly faster if you enabled sliced object downloads. This
feature is enabled by default but requires that compiled crcmod be
installed (see "gsutil help crcmod").

Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr12_v3.txt...
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr13_v3.txt...              
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr15_v3.txt...              
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr16_v3.txt...
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr18_v3.txt...              
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr14_v3.txt...              
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr1_v3.txt...               
Copying gs://rs-ukb/raw-data/gt-imputation/ukb_mfi_chr17

In [4]:
schema = StructType([
    StructField("id", StringType()),
    StructField("rsid", StringType()),
    StructField("position", IntegerType()),
    StructField("allele1_ref", StringType()),
    StructField("allele2_ref", StringType()),
    StructField("maf", FloatType()),
    StructField("minor_allele", StringType()),
    StructField("info", FloatType())
])
df = spark.read.csv('/tmp/ukb_mfi/*.txt', sep='\t', schema=schema)
df

DataFrame[id: string, rsid: string, position: int, allele1_ref: string, allele2_ref: string, maf: float, minor_allele: string, info: float]

In [8]:
df.agg(*[F.mean(F.col(c).isNotNull().cast('int')) for c in df.columns]).show()

KeyboardInterrupt: 

In [None]:
df.isnull().mean().compute()

In [None]:
df['rsid'].value_counts().tail(25).compute()

In [10]:
chrs = pd.Series(files).str.extract(r'chr(.*)_', expand=False).tolist()
chrs

['10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '19',
 '1',
 '20',
 '21',
 '22',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 'XY',
 'X']

In [12]:
def load(path):
    cols = [
        ("id", str),
        ("rsid", str),
        ("position", "int32"),
        ("allele1_ref", str),
        ("allele2_alt", str),
        ("maf", float),
        ("minor_allele", str),
        ("info", float),
    ]
    return pd.read_csv(path, sep="\t", names=[c[0] for c in cols], dtype=dict(cols))

In [13]:
pd.set_option('max.info_rows', 100000000)
df = pd.concat([
    load(path).assign(chr=chrs[i])
    for i, path in enumerate(files)
])
df.info()

KeyboardInterrupt: 