# UK Biobank (UKB) - Filtering and QC

[UKB Nature paper, "The UK Biobank resource with deep phenotyping and genomic data"](https://www.nature.com/articles/s41586-018-0579-z)

In [None]:
import hail as hl

# Initialise Hail with its default settings before any tables are read or written.
hl.init()

## UKB SNP QC

Import the UKB SNP QC file as a Hail table to get the SNPs used in PCA.

### Create SNP QC Hail table:

In [None]:
# Import UKB SNP QC file as Hail table.
# `delimiter` is a regular expression, so it is written as a raw string:
# '\s' inside a plain string literal is an invalid escape sequence.
snp_qc_ht = hl.import_table(
    'gs://fc-9a7c5487-04c9-4182-b3ec-13de7f6b409b/genotype/ukb_snp_qc.txt',
    delimiter=r'\s+',
    impute=True,
    # Pin these two types instead of relying on imputation: the chromosome codes
    # are remapped as strings below and `position` is passed to `hl.locus`.
    types={'chromosome': hl.tstr, 'position': hl.tint32},
    min_partitions=4
)
# NOTE(review): the `_1`/`_2`/`_3` suffixes look like de-duplicated repeats of a
# `PC9_loading` header column standing in for PCs 19, 29 and 39 -- confirm
# against the file header.
snp_qc_ht = snp_qc_ht.rename({
    'PC9_loading_1': 'PC19_loading',
    'PC9_loading_2': 'PC29_loading',
    'PC9_loading_3': 'PC39_loading'
})

# Need to update contigs 23, 24, 25, 26 to X, Y, X, MT.
# Contigs 1-22 map to themselves; note that both 23 and 25 become X.
chr_mapping = hl.dict({
    **{str(k): str(k) for k in range(1, 23)},
    **{'23': 'X', '24': 'Y', '25': 'X', '26': 'MT'}
})
snp_qc_ht = snp_qc_ht.annotate(chromosome=chr_mapping[snp_qc_ht.chromosome])

# Key by locus/alleles and write out SNP QC Hail table
snp_qc_ht = snp_qc_ht.annotate(
    locus=hl.locus(snp_qc_ht.chromosome, snp_qc_ht.position, reference_genome='GRCh37'),
    alleles=[snp_qc_ht.allele1_ref, snp_qc_ht.allele2_alt]
)
snp_qc_ht = snp_qc_ht.key_by('locus', 'alleles')
# Re-select every non-key field by name (the key fields are kept by `select`).
snp_qc_ht = snp_qc_ht.select(*list(snp_qc_ht.row_value))
# Reuse the table already on disk if this cell has been run before.
snp_qc_ht = snp_qc_ht.checkpoint('gs://ukb-data/variants/ukb_snp_qc.ht', overwrite=False, _read_if_exists=True)

# Total rows vs. rows with a distinct (locus, alleles) key, to spot duplicates.
print(f'snp_qc_ht, count: {snp_qc_ht.count()}')
print(f'snp_qc_ht, distinct count: {snp_qc_ht.distinct().count()}')
snp_qc_ht.show()

### Create Hail table with 147,604 SNPs used in PCA:

In [None]:
# Reload the SNP QC table written above and keep only the rows flagged
# `in_PCA == 1` (expected: 147,604 SNPs used in PCA).
snp_qc_ht = hl.read_table('gs://ukb-data/variants/ukb_snp_qc.ht')
used_in_pca = snp_qc_ht.in_PCA == 1
snp_in_pca_ht = snp_qc_ht.filter(used_in_pca).checkpoint(
    'gs://ukb-data/variants/ukb_snp_in_pca_147604.ht',
    overwrite=False,
    _read_if_exists=True,
)

n_snps_in_pca = snp_in_pca_ht.count()
print(f'snp_in_pca_ht, count: {n_snps_in_pca}')

## UKB Sample QC

### Create sample QC Hail table:

In [None]:
# Import the UKB sample QC file as a Hail table, letting Hail impute column types.
# `delimiter` is a regular expression, so it is written as a raw string:
# '\s' inside a plain string literal is an invalid escape sequence.
sample_qc_ht = hl.import_table(
    'gs://ukb31063/ukb31063.sample_qc.tsv.bgz',
    delimiter=r'\s+',
    impute=True,
    min_partitions=4
)
# Reuse the table already on disk if this cell has been run before.
sample_qc_ht = sample_qc_ht.checkpoint('gs://ukb-data/samples/ukb_sample_qc.ht', overwrite=False, _read_if_exists=True)
print(f'sample_qc_ht count: {sample_qc_ht.count()}')
sample_qc_ht.show()

### Withdrawn participants:

The most recent withdrawn participants file is located at: [gs://ukb31063/ukb31063.withdrawn_participants.20210809.csv](https://storage.googleapis.com/ukb31063/ukb31063.withdrawn_participants.20210809.csv).

In [None]:
# Create Hail table from the most recently updated withdrawn participants file.
# The file has no header row, so Hail names the first column `f0`.
# `delimiter` is a regular expression, so it is written as a raw string:
# '\s' inside a plain string literal is an invalid escape sequence.
wd_samples_ht = hl.import_table(
    'gs://ukb31063/ukb31063.withdrawn_participants.20210809.csv',
    delimiter=r'\s+',
    no_header=True,
    impute=True
)
# Rename the ID column to `s` and key the table by it.
wd_samples_ht = wd_samples_ht.rename({'f0': 's'})
wd_samples_ht = wd_samples_ht.key_by('s')
# Reuse the table already on disk if this cell has been run before.
wd_samples_ht = wd_samples_ht.checkpoint('gs://ukb-data/samples/withdrawn_samples_20210809.ht', overwrite=False, _read_if_exists=True)

print(f'wd_samples_ht count: {wd_samples_ht.count()}')
wd_samples_ht.describe()

## White British UKB subset

Source: 

https://github.com/Nealelab/UK_Biobank_GWAS/tree/master/imputed-v2-gwas

> **Primary sample QC parameters for GWAS from ukb_sqc_v2.txt file:**
>   * in.Phasing.Input.chr1_22==1
>   * in.white.British.ancestry.subset==1
>   * used.in.pca.calculation==1
>   * excess.relatives==0
>   * putative.sex.chromosome.aneuploidy==0
>  
> **Additional QC parameters**
>   * Samples withdrawn in UK Biobank update = 8
>   * Samples redacted = 3 ([-3,-2,-1] in the sample ID) 
> 
> **Pre/post QC sample counts**
>   * Imputed samples removed from QC file = 151180
>   * Imputed samples retained in QC file = 337199
>   * NOTE: all samples retained are in the .bgen files
>   * NOTE: The ukb_sqc_v2.txt file has more samples than the .bgen files, but the same number of samples as the application specific .sample file

After applying the sample QC filters from the v2 GWAS we end up with 337208 samples, and we still need to remove samples from withdrawn participants.

In [None]:
sample_qc_ht = hl.read_table('gs://ukb-data/samples/ukb_sample_qc.ht')
wd_samples_ht = hl.read_table('gs://ukb-data/samples/withdrawn_samples_20210809.ht')

# Sample QC filters used in the v2 GWAS, one named predicate per criterion
wb_phasing_input = sample_qc_ht['in.Phasing.Input.chr1_22'] == 1
wb_white_british = sample_qc_ht['in.white.British.ancestry.subset'] == 1
wb_used_in_pca = sample_qc_ht['used.in.pca.calculation'] == 1
wb_no_excess_relatives = sample_qc_ht['excess.relatives'] == 0
wb_no_sex_aneuploidy = sample_qc_ht['putative.sex.chromosome.aneuploidy'] == 0

samples_v2_ht = sample_qc_ht.filter(
    wb_phasing_input
    & wb_white_british
    & wb_used_in_pca
    & wb_no_excess_relatives
    & wb_no_sex_aneuploidy
)
print('Removed samples not satisfying sample QC parameters.')
wb_n_after_qc = samples_v2_ht.count()
print(f'samples_v2_ht count: {wb_n_after_qc}')
print()

# Redacted samples carry a negative ID; keep strictly positive IDs only
wb_has_positive_id = samples_v2_ht.id > 0
samples_v2_ht = samples_v2_ht.filter(wb_has_positive_id)
print('Removed redacted samples.')
wb_n_after_redaction = samples_v2_ht.count()
print(f'samples_v2_ht count: {wb_n_after_redaction}')
print()

# Withdrawn participants: collect their IDs locally and drop any matching sample
wb_withdrawn_ids = wd_samples_ht.s.collect()
withdrawn_samples = hl.set(wb_withdrawn_ids)
wb_is_withdrawn = withdrawn_samples.contains(samples_v2_ht['id'])
samples_v2_ht = samples_v2_ht.filter(~wb_is_withdrawn)
print('Removed withdrawn participant samples.')
wb_n_after_withdrawal = samples_v2_ht.count()
print(f'samples_v2_ht count: {wb_n_after_withdrawal}')
print()

# Persist the final White British sample table (does not overwrite an existing table)
samples_v2_ht = samples_v2_ht.checkpoint(
    'gs://ukb-data/samples/wb_337111.ht',
    overwrite=False,
)

### Load and filter UKB genotype MatrixTable to White British Subset:

In [None]:
# Rows to keep: the SNPs used in PCA
variants_ht = hl.read_table('gs://ukb-data/variants/ukb_snp_in_pca_147604.ht')

# Columns to keep: White British samples, keyed by the sample ID as a string `s`
samples_ht = hl.read_table('gs://ukb-data/samples/wb_337111.ht')
samples_ht = samples_ht.annotate(s=hl.str(samples_ht.id)).key_by('s')

# Subset the full genotype MatrixTable on both axes, then repartition
gt_mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')
gt_mt = gt_mt.semi_join_cols(samples_ht)
gt_mt = gt_mt.semi_join_rows(variants_ht)
gt_mt = gt_mt.repartition(256)

# Write the subset out, then read it back so later work uses the stored copy
gt_mt.write(
    'gs://ukb-data/genotypes/337111-samples/gt_147604_337111.mt',
    overwrite=False,
)
gt_mt = hl.read_matrix_table('gs://ukb-data/genotypes/337111-samples/gt_147604_337111.mt')
wb_mt_dimensions = gt_mt.count()
print(wb_mt_dimensions)
gt_mt.describe()

## Pan-UKB subset

In [None]:
sample_qc_ht = hl.read_table('gs://ukb-data/samples/ukb_sample_qc.ht')
wd_samples_ht = hl.read_table('gs://ukb-data/samples/withdrawn_samples_20210809.ht')

# Sample QC filters used in the v2 GWAS, minus 'in.white.British.ancestry.subset==1',
# one named predicate per criterion
pan_phasing_input = sample_qc_ht['in.Phasing.Input.chr1_22'] == 1
pan_used_in_pca = sample_qc_ht['used.in.pca.calculation'] == 1
pan_no_excess_relatives = sample_qc_ht['excess.relatives'] == 0
pan_no_sex_aneuploidy = sample_qc_ht['putative.sex.chromosome.aneuploidy'] == 0

samples_panukb_ht = sample_qc_ht.filter(
    pan_phasing_input
    & pan_used_in_pca
    & pan_no_excess_relatives
    & pan_no_sex_aneuploidy
)
print('Removed samples not satisfying sample QC parameters.')
pan_n_after_qc = samples_panukb_ht.count()
print(f'samples_panukb_ht count: {pan_n_after_qc}')
print()

# Redacted samples carry a negative ID; keep strictly positive IDs only
pan_has_positive_id = samples_panukb_ht.id > 0
samples_panukb_ht = samples_panukb_ht.filter(pan_has_positive_id)
print('Removed redacted samples.')
pan_n_after_redaction = samples_panukb_ht.count()
print(f'samples_panukb_ht count: {pan_n_after_redaction}')
print()

# Withdrawn participants: collect their IDs locally and drop any matching sample
pan_withdrawn_ids = wd_samples_ht.s.collect()
withdrawn_samples = hl.set(pan_withdrawn_ids)
pan_is_withdrawn = withdrawn_samples.contains(samples_panukb_ht['id'])
samples_panukb_ht = samples_panukb_ht.filter(~pan_is_withdrawn)
print('Removed withdrawn participant samples.')
pan_n_after_withdrawal = samples_panukb_ht.count()
print(f'samples_panukb_ht count: {pan_n_after_withdrawal}')
print()

# Persist the final Pan-UKB sample table (does not overwrite an existing table)
samples_panukb_ht = samples_panukb_ht.checkpoint(
    'gs://ukb-data/samples/pan_ukb_406696.ht',
    overwrite=False,
)

### Load and filter UKB genotype MatrixTable to Pan-UKB subset:

In [None]:
# Rows to keep: the SNPs used in PCA
variants_ht = hl.read_table('gs://ukb-data/variants/ukb_snp_in_pca_147604.ht')

# Columns to keep: Pan-UKB samples, keyed by the sample ID as a string `s`
samples_ht = hl.read_table('gs://ukb-data/samples/pan_ukb_406696.ht')
samples_ht = samples_ht.annotate(s=hl.str(samples_ht.id)).key_by('s')

# Subset the full genotype MatrixTable on both axes, then repartition
gt_mt = hl.read_matrix_table('gs://ukb31063/ukb31063.genotype.mt')
gt_mt = gt_mt.semi_join_cols(samples_ht)
gt_mt = gt_mt.semi_join_rows(variants_ht)
gt_mt = gt_mt.repartition(256)

# Write the subset out, then read it back so later work uses the stored copy
gt_mt.write(
    'gs://ukb-data/genotypes/406696-samples/gt_147604_406696.mt',
    overwrite=False,
)
gt_mt = hl.read_matrix_table('gs://ukb-data/genotypes/406696-samples/gt_147604_406696.mt')
pan_mt_dimensions = gt_mt.count()
print(pan_mt_dimensions)
gt_mt.describe()

## UK Birth Coordinates (for PC score heatmaps)

In [None]:
# Hacky way to just get the fields we need: read the phenotype CSV as raw text
# lines and pick columns by position, avoiding issues with a full table import.
# Output field name -> zero-based column position in the comma-split line.
# NOTE(review): a plain split on "," assumes no quoted value before column 375
# contains a comma -- confirm for this file.
birth_coord_columns = {
    '_129_0_0': 370,
    '_129_1_0': 371,
    '_129_2_0': 372,
    '_130_0_0': 373,
    '_130_1_0': 374,
    '_130_2_0': 375,
}

lines_ht = hl.import_lines('gs://ukb31063/ukb31063.phenotypes.20191008.csv.bgz')
lines_ht = lines_ht.annotate(text_split=lines_ht.text.split(","))

# Pull out the sample ID (column 0) and the six coordinate columns, stripping quotes
lines_ht = lines_ht.annotate(
    s=lines_ht.text_split[0].replace('"', ''),
    **{
        field: lines_ht.text_split[column].replace('"', '')
        for field, column in birth_coord_columns.items()
    },
)
lines_ht = lines_ht.select('s', *birth_coord_columns)

# Drop the header line, whose first column is the literal 'eid'
lines_ht = lines_ht.filter(lines_ht.s != 'eid')

# Convert strings to int32; an empty string becomes a missing value
lines_ht = lines_ht.annotate(
    s=hl.int32(lines_ht.s),
    **{
        field: hl.if_else(lines_ht[field] == "", hl.missing(hl.tint32), hl.int32(lines_ht[field]))
        for field in birth_coord_columns
    },
)
lines_ht = lines_ht.key_by('s')
lines_ht = lines_ht.checkpoint(
    'gs://ukb-data/samples/uk_birth_coordinates-checkpoint.ht',
    overwrite=False,
)

### Create Hail Table containing all samples with defined birth coordinates:

In [None]:
uk_birth_coordinates_ht = hl.read_table('gs://ukb-data/samples/uk_birth_coordinates-checkpoint.ht')

# Get first available UK birth coordinates (if available).
# `hl.coalesce` returns its first non-missing argument, or missing when every
# argument is missing, so no temporary fields are needed.
# NOTE(review): -1 counts as a defined value here, so a sample whose first
# defined instance is -1 is removed by the filter below even if a later
# instance holds a real coordinate -- confirm this is intended.
uk_birth_coordinates_ht = uk_birth_coordinates_ht.annotate(
    north_coord_129=hl.coalesce(uk_birth_coordinates_ht._129_0_0,
                                uk_birth_coordinates_ht._129_1_0,
                                uk_birth_coordinates_ht._129_2_0),
    east_coord_130=hl.coalesce(uk_birth_coordinates_ht._130_0_0,
                               uk_birth_coordinates_ht._130_1_0,
                               uk_birth_coordinates_ht._130_2_0)
)

# Filter to only samples with defined UK birth coordinates and write out table.
# Both coordinates must be present and different from -1.
uk_birth_coordinates_ht = uk_birth_coordinates_ht.filter(
    hl.is_defined(uk_birth_coordinates_ht.north_coord_129) & (uk_birth_coordinates_ht.north_coord_129 != -1) &
    hl.is_defined(uk_birth_coordinates_ht.east_coord_130) & (uk_birth_coordinates_ht.east_coord_130 != -1)
)
# Collapse to a single partition before writing.
uk_birth_coordinates_ht = uk_birth_coordinates_ht.naive_coalesce(1)
uk_birth_coordinates_ht = uk_birth_coordinates_ht.checkpoint('gs://ukb-data/samples/uk_birth_coordinates.ht',
                                                             overwrite=False)

### Pan-UKB subset:

In [None]:
# When False, an existing checkpoint is read back instead of being rewritten
overwrite = False
uk_birth_coordinates_ht = hl.read_table('gs://ukb-data/samples/uk_birth_coordinates.ht')

# Pan-UKB samples table, re-keyed by `s` (renamed from `id`) to match the
# key of the birth coordinates table
pan_ukb_sample_ht = (
    hl.read_table('gs://ukb-data/samples/pan_ukb_406696.ht')
    .rename({'id': 's'})
    .key_by('s')
)

# Keep only the birth coordinate rows whose sample is in the pan-UKB subset, then checkpoint
uk_birth_coordinates_406696_ht = uk_birth_coordinates_ht.semi_join(pan_ukb_sample_ht).checkpoint(
    'gs://ukb-data/samples/pan_ukb_406696-uk_birth_coordinates.ht',
    overwrite=overwrite,
    _read_if_exists=not overwrite,
)
pan_ukb_coord_rows = uk_birth_coordinates_406696_ht.count()
print(f'Pan-UKB birth coordinates row count: {pan_ukb_coord_rows}.')

### White British subset:

In [None]:
# When False, an existing checkpoint is read back instead of being rewritten
overwrite = False
uk_birth_coordinates_ht = hl.read_table('gs://ukb-data/samples/uk_birth_coordinates.ht')

# WB samples table, re-keyed by `s` (renamed from `id`) to match the key of
# the birth coordinates table
wb_sample_ht = (
    hl.read_table('gs://ukb-data/samples/wb_337111.ht')
    .rename({'id': 's'})
    .key_by('s')
)

# Keep only the birth coordinate rows whose sample is in the WB subset, then checkpoint
uk_birth_coordinates_337111_ht = uk_birth_coordinates_ht.semi_join(wb_sample_ht).checkpoint(
    'gs://ukb-data/samples/wb_337111-uk_birth_coordinates.ht',
    overwrite=overwrite,
    _read_if_exists=not overwrite,
)
wb_coord_rows = uk_birth_coordinates_337111_ht.count()
print(f'WB birth coordinates row count: {wb_coord_rows}.')